diff options
Diffstat (limited to 'fs')
640 files changed, 28368 insertions, 14298 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6894b085f0ee..620d93489539 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, } init_rwsem(&v9ses->rename_sem); - rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&v9ses->bdi, "9p"); if (rc) { kfree(v9ses->aname); kfree(v9ses->uname); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 5594505e6e73..b40133796b87 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { @@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 296482fc77a9..3662f1d1d9cf 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -832,7 +832,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, * moved b under k and client parallely did a lookup for * k/b. */ - res = d_materialise_unique(dentry, inode); + res = d_splice_alias(inode, dentry); if (!res) v9fs_fid_add(dentry, fid); else if (!IS_ERR(res)) @@ -1127,7 +1127,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) } /* Write all dirty data */ - if (S_ISREG(dentry->d_inode->i_mode)) + if (d_is_reg(dentry)) filemap_write_and_wait(dentry->d_inode->i_mapping); retval = p9_client_wstat(fid, &wstat); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 02b64f4e576a..6054c16b8fae 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -826,8 +826,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; - p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", - dir->i_ino, dentry->d_name.name, omode, + p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) diff --git a/fs/Kconfig b/fs/Kconfig index 664991afe0c0..ec35851e5b71 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -13,13 +13,6 @@ if BLOCK source "fs/ext2/Kconfig" source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" - -config FS_XIP -# execute in place - bool - depends on EXT2_FS_XIP - default y - source "fs/jbd/Kconfig" source "fs/jbd2/Kconfig" @@ -40,6 +33,21 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" +config FS_DAX + bool "Direct Access (DAX) support" + depends on MMU + depends on !(ARM || MIPS || SPARC) + help + Direct Access (DAX) can be used on memory-backed block devices. + If the block device supports DAX and the filesystem supports DAX, + then you can avoid using the pagecache to buffer I/Os. Turning + on this option will compile in support for DAX; you will need to + mount the filesystem using the -o dax option. + + If you do not have a block device that is capable of using this, + or if unsure, say N. Saying Y will increase the size of the kernel + by about 5kB. + endif # BLOCK # Posix ACL utility routines @@ -165,6 +173,7 @@ config HUGETLB_PAGE def_bool HUGETLBFS source "fs/configfs/Kconfig" +source "fs/efivarfs/Kconfig" endmenu @@ -209,7 +218,6 @@ source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" source "fs/f2fs/Kconfig" -source "fs/efivarfs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 370b24cee4d8..270c48148f79 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -30,6 +30,9 @@ config COMPAT_BINFMT_ELF config ARCH_BINFMT_ELF_RANDOMIZE_PIE bool +config ARCH_BINFMT_ELF_STATE + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y @@ -146,13 +149,6 @@ config BINFMT_EM86 later load the module when you want to use a Linux/Intel binary. The module will be called binfmt_em86. If unsure, say Y. -config BINFMT_SOM - tristate "Kernel support for SOM binaries" - depends on PARISC && HPUX - help - SOM is a binary executable format inherited from HP/UX. Say - Y here to be able to load and execute SOM binaries directly. - config BINFMT_MISC tristate "Kernel support for MISC binaries" ---help--- diff --git a/fs/Makefile b/fs/Makefile index 34a1b9dea6dd..a88ac4838c9e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o \ - stack.o fs_struct.o statfs.o fs_pin.o + stack.o fs_struct.o statfs.o fs_pin.o nsfs.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o @@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o @@ -37,7 +38,6 @@ obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o -obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o @@ -104,7 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ -obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/ +obj-$(CONFIG_OVERLAY_FS) += overlayfs/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ obj-$(CONFIG_OMFS_FS) += omfs/ diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 9bca88159725..c8764bd7497d 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -30,6 +30,8 @@ #define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) #define AFFS_AC_MASK (AFFS_AC_SIZE-1) +#define AFFSNAMEMAX 30U + struct affs_ext_key { u32 ext; /* idx of the extended block */ u32 key; /* block number */ @@ -135,8 +137,10 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void secs_to_datestamp(time_t secs, struct affs_date *ds); extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); +__printf(3, 4) extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); +__printf(3, 4) extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); extern bool affs_nofilenametruncate(const struct dentry *dentry); diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index abc853968fed..388da1ea815d 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -10,8 +10,6 @@ #include "affs.h" -static char ErrorBuffer[256]; - /* * Functions for accessing Amiga-FFS structures. */ @@ -32,7 +30,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) ino = bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]); - pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino); + pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino); dir_bh = affs_bread(sb, dir->i_ino); if (!dir_bh) @@ -82,8 +80,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) sb = dir->i_sb; rem_ino = rem_bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]); - pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n", - __func__, (u32)dir->i_ino, rem_ino, offset); + pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino, + rem_ino, offset); bh = affs_bread(sb, dir->i_ino); if (!bh) @@ -125,7 +123,7 @@ affs_fix_dcache(struct inode *inode, u32 entry_ino) { struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { if (entry_ino == (u32)(long)dentry->d_fsdata) { dentry->d_fsdata = (void *)inode->i_ino; break; @@ -444,30 +442,30 @@ mode_to_prot(struct inode *inode) void affs_error(struct super_block *sb, const char *function, const char *fmt, ...) { - va_list args; - - va_start(args,fmt); - vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); - va_end(args); + struct va_format vaf; + va_list args; - pr_crit("error (device %s): %s(): %s\n", sb->s_id, - function,ErrorBuffer); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); if (!(sb->s_flags & MS_RDONLY)) pr_warn("Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; + va_end(args); } void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) { - va_list args; + struct va_format vaf; + va_list args; - va_start(args,fmt); - vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(device %s): %s(): %pV\n", sb->s_id, function, &vaf); va_end(args); - - pr_warn("(device %s): %s(): %s\n", sb->s_id, - function,ErrorBuffer); } bool @@ -485,11 +483,10 @@ affs_check_name(const unsigned char *name, int len, bool notruncate) { int i; - if (len > 30) { + if (len > AFFSNAMEMAX) { if (notruncate) return -ENAMETOOLONG; - else - len = 30; + len = AFFSNAMEMAX; } for (i = 0; i < len; i++) { if (name[i] < ' ' || name[i] == ':' @@ -510,7 +507,7 @@ affs_check_name(const unsigned char *name, int len, bool notruncate) int affs_copy_name(unsigned char *bstr, struct dentry *dentry) { - int len = min(dentry->d_name.len, 30u); + u32 len = min(dentry->d_name.len, AFFSNAMEMAX); *bstr++ = len; memcpy(bstr, dentry->d_name.name, len); diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index c8de51185c23..675148950fed 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -99,7 +99,6 @@ err_bh_read: err_range: affs_error(sb, "affs_free_block","Block %u outside partition", block); - return; } /* diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 59f07bec92a6..ac4f318aafba 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -54,8 +54,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) u32 ino; int error = 0; - pr_debug("%s(ino=%lu,f_pos=%lx)\n", - __func__, inode->i_ino, (unsigned long)ctx->pos); + pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); if (ctx->pos < 2) { file->private_data = (void *)0; @@ -115,11 +114,11 @@ inside: break; } - namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); + namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], + (u8)AFFSNAMEMAX); name = AFFS_TAIL(sb, fh_bh)->name + 1; - pr_debug("readdir(): dir_emit(\"%.*s\", " - "ino=%u), hash=%d, f_pos=%x\n", - namelen, name, ino, hash_pos, (u32)ctx->pos); + pr_debug("readdir(): dir_emit(\"%.*s\", ino=%u), hash=%d, f_pos=%llx\n", + namelen, name, ino, hash_pos, ctx->pos); if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN)) goto done; diff --git a/fs/affs/file.c b/fs/affs/file.c index 1ed590aafecf..a91795e01a7f 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -12,35 +12,10 @@ * affs regular file handling primitives */ +#include <linux/aio.h> #include "affs.h" -#if PAGE_SIZE < 4096 -#error PAGE_SIZE must be at least 4096 -#endif - -static int affs_grow_extcache(struct inode *inode, u32 lc_idx); -static struct buffer_head *affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext); -static inline struct buffer_head *affs_get_extblock(struct inode *inode, u32 ext); static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); -static int affs_file_open(struct inode *inode, struct file *filp); -static int affs_file_release(struct inode *inode, struct file *filp); - -const struct file_operations affs_file_operations = { - .llseek = generic_file_llseek, - .read = new_sync_read, - .read_iter = generic_file_read_iter, - .write = new_sync_write, - .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, - .open = affs_file_open, - .release = affs_file_release, - .fsync = affs_file_fsync, - .splice_read = generic_file_splice_read, -}; - -const struct inode_operations affs_file_inode_operations = { - .setattr = affs_notify_change, -}; static int affs_file_open(struct inode *inode, struct file *filp) @@ -205,8 +180,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext) ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); if (ext < AFFS_I(inode)->i_extcnt) goto read_ext; - if (ext > AFFS_I(inode)->i_extcnt) - BUG(); + BUG_ON(ext > AFFS_I(inode)->i_extcnt); bh = affs_alloc_extblock(inode, bh, ext); if (IS_ERR(bh)) return bh; @@ -223,8 +197,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext) struct buffer_head *prev_bh; /* allocate a new extended block */ - if (ext > AFFS_I(inode)->i_extcnt) - BUG(); + BUG_ON(ext > AFFS_I(inode)->i_extcnt); /* get previous extended block */ prev_bh = affs_get_extblock(inode, ext - 1); @@ -324,8 +297,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul struct buffer_head *ext_bh; u32 ext; - pr_debug("%s(%u, %lu)\n", - __func__, (u32)inode->i_ino, (unsigned long)block); + pr_debug("%s(%lu, %llu)\n", __func__, inode->i_ino, + (unsigned long long)block); BUG_ON(block > (sector_t)0x7fffffffUL); @@ -355,7 +328,9 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul /* store new block */ if (bh_result->b_blocknr) - affs_warning(sb, "get_block", "block already set (%x)", bh_result->b_blocknr); + affs_warning(sb, "get_block", + "block already set (%llx)", + (unsigned long long)bh_result->b_blocknr); AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); @@ -377,7 +352,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul return 0; err_big: - affs_error(inode->i_sb,"get_block","strange block request %d", block); + affs_error(inode->i_sb, "get_block", "strange block request %llu", + (unsigned long long)block); return -EIO; err_ext: // unlock cache @@ -412,6 +388,29 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) } } +static ssize_t +affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (rw == WRITE) { + loff_t size = offset + count; + + if (AFFS_I(inode)->mmu_private < size) + return 0; + } + + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block); + if (ret < 0 && (rw & WRITE)) + affs_write_failed(mapping, offset + count); + return ret; +} + static int affs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -438,6 +437,7 @@ const struct address_space_operations affs_aops = { .writepage = affs_writepage, .write_begin = affs_write_begin, .write_end = generic_write_end, + .direct_IO = affs_direct_IO, .bmap = _affs_bmap }; @@ -509,7 +509,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to) u32 bidx, boff, bsize; u32 tmp; - pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino, + pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, page->index, to); BUG_ON(to > PAGE_CACHE_SIZE); kmap(page); @@ -545,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) u32 size, bsize; u32 tmp; - pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize); + pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize); bsize = AFFS_SB(sb)->s_data_blksize; bh = NULL; size = AFFS_I(inode)->mmu_private; @@ -614,7 +614,7 @@ affs_readpage_ofs(struct file *file, struct page *page) u32 to; int err; - pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index); + pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index); to = PAGE_CACHE_SIZE; if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) { to = inode->i_size & ~PAGE_CACHE_MASK; @@ -637,8 +637,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping pgoff_t index; int err = 0; - pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino, - (unsigned long long)pos, (unsigned long long)pos + len); + pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, + pos + len); if (pos > AFFS_I(inode)->mmu_private) { /* XXX: this probably leaves a too-big i_size in case of * failure. Should really be updating i_size at write_end time @@ -687,9 +687,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, * due to write_begin. */ - pr_debug("%s(%u, %llu, %llu)\n", - __func__, (u32)inode->i_ino, (unsigned long long)pos, - (unsigned long long)pos + len); + pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, + pos + len); bsize = AFFS_SB(sb)->s_data_blksize; data = page_address(page); @@ -700,8 +699,10 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, boff = tmp % bsize; if (boff) { bh = affs_bread_ino(inode, bidx, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + written = PTR_ERR(bh); + goto err_first_bh; + } tmp = min(bsize - boff, to - from); BUG_ON(boff + tmp > bsize || tmp > bsize); memcpy(AFFS_DATA(bh) + boff, data + from, tmp); @@ -713,14 +714,16 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, bidx++; } else if (bidx) { bh = affs_bread_ino(inode, bidx - 1, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + written = PTR_ERR(bh); + goto err_first_bh; + } } while (from + bsize <= to) { prev_bh = bh; bh = affs_getemptyblk_ino(inode, bidx); if (IS_ERR(bh)) - goto out; + goto err_bh; memcpy(AFFS_DATA(bh), data + from, bsize); if (buffer_new(bh)) { AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); @@ -752,7 +755,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, prev_bh = bh; bh = affs_bread_ino(inode, bidx, 1); if (IS_ERR(bh)) - goto out; + goto err_bh; tmp = min(bsize, to - from); BUG_ON(tmp > bsize); memcpy(AFFS_DATA(bh), data + from, tmp); @@ -791,12 +794,13 @@ done: if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; +err_first_bh: unlock_page(page); page_cache_release(page); return written; -out: +err_bh: bh = prev_bh; if (!written) written = PTR_ERR(bh); @@ -837,8 +841,8 @@ affs_truncate(struct inode *inode) struct buffer_head *ext_bh; int i; - pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n", - (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size); + pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n", + inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size); last_blk = 0; ext = 0; @@ -867,7 +871,8 @@ affs_truncate(struct inode *inode) // lock cache ext_bh = affs_get_extblock(inode, ext); if (IS_ERR(ext_bh)) { - affs_warning(sb, "truncate", "unexpected read error for ext block %u (%d)", + affs_warning(sb, "truncate", + "unexpected read error for ext block %u (%ld)", ext, PTR_ERR(ext_bh)); return; } @@ -914,7 +919,8 @@ affs_truncate(struct inode *inode) struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); u32 tmp; if (IS_ERR(bh)) { - affs_warning(sb, "truncate", "unexpected read error for last block %u (%d)", + affs_warning(sb, "truncate", + "unexpected read error for last block %u (%ld)", ext, PTR_ERR(bh)); return; } @@ -961,3 +967,19 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) mutex_unlock(&inode->i_mutex); return ret; } +const struct file_operations affs_file_operations = { + .llseek = generic_file_llseek, + .read = new_sync_read, + .read_iter = generic_file_read_iter, + .write = new_sync_write, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .open = affs_file_open, + .release = affs_file_release, + .fsync = affs_file_fsync, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations affs_file_inode_operations = { + .setattr = affs_notify_change, +}; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index e217c511459b..6f34510449e8 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -13,8 +13,6 @@ #include <linux/gfp.h> #include "affs.h" -extern const struct inode_operations affs_symlink_inode_operations; - struct inode *affs_iget(struct super_block *sb, unsigned long ino) { struct affs_sb_info *sbi = AFFS_SB(sb); @@ -348,9 +346,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 u32 block = 0; int retval; - pr_debug("%s(dir=%u, inode=%u, \"%*s\", type=%d)\n", - __func__, (u32)dir->i_ino, - (u32)inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name, type); + pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__, + dir->i_ino, inode->i_ino, dentry, type); retval = -EIO; bh = affs_bread(sb, inode->i_ino); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 035bd31556fc..ffb7bd82c2a5 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -64,15 +64,16 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; - int i; + int retval; + u32 len; - i = affs_check_name(qstr->name, qstr->len, notruncate); - if (i) - return i; + retval = affs_check_name(qstr->name, qstr->len, notruncate); + if (retval) + return retval; hash = init_name_hash(); - i = min(qstr->len, 30u); - for (; i > 0; name++, i--) + len = min(qstr->len, AFFSNAMEMAX); + for (; len > 0; name++, len--) hash = partial_name_hash(toupper(*name), hash); qstr->hash = end_name_hash(hash); @@ -114,10 +115,10 @@ static inline int __affs_compare_dentry(unsigned int len, * If the names are longer than the allowed 30 chars, * the excess is ignored, so their length may differ. */ - if (len >= 30) { - if (name->len < 30) + if (len >= AFFSNAMEMAX) { + if (name->len < AFFSNAMEMAX) return 1; - len = 30; + len = AFFSNAMEMAX; } else if (len != name->len) return 1; @@ -156,10 +157,10 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) const u8 *name = dentry->d_name.name; int len = dentry->d_name.len; - if (len >= 30) { - if (*name2 < 30) + if (len >= AFFSNAMEMAX) { + if (*name2 < AFFSNAMEMAX) return 0; - len = 30; + len = AFFSNAMEMAX; } else if (len != *name2) return 0; @@ -173,9 +174,9 @@ int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len) { toupper_t toupper = affs_get_toupper(sb); - int hash; + u32 hash; - hash = len = min(len, 30u); + hash = len = min(len, AFFSNAMEMAX); for (; len > 0; len--) hash = (hash * 13 + toupper(*name++)) & 0x7ff; @@ -190,8 +191,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) toupper_t toupper = affs_get_toupper(sb); u32 key; - pr_debug("%s(\"%.*s\")\n", - __func__, (int)dentry->d_name.len, dentry->d_name.name); + pr_debug("%s(\"%pd\")\n", __func__, dentry); bh = affs_bread(sb, dir->i_ino); if (!bh) @@ -219,8 +219,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) struct buffer_head *bh; struct inode *inode = NULL; - pr_debug("%s(\"%.*s\")\n", - __func__, (int)dentry->d_name.len, dentry->d_name.name); + pr_debug("%s(\"%pd\")\n", __func__, dentry); affs_lock_dir(dir); bh = affs_find_entry(dir, dentry); @@ -250,9 +249,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) int affs_unlink(struct inode *dir, struct dentry *dentry) { - pr_debug("%s(dir=%d, %lu \"%.*s\")\n", - __func__, (u32)dir->i_ino, dentry->d_inode->i_ino, - (int)dentry->d_name.len, dentry->d_name.name); + pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, + dentry->d_inode->i_ino, dentry); return affs_remove_header(dentry); } @@ -264,9 +262,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) struct inode *inode; int error; - pr_debug("%s(%lu,\"%.*s\",0%ho)\n", - __func__, dir->i_ino, (int)dentry->d_name.len, - dentry->d_name.name,mode); + pr_debug("%s(%lu,\"%pd\",0%ho)\n", + __func__, dir->i_ino, dentry, mode); inode = affs_new_inode(dir); if (!inode) @@ -294,9 +291,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int error; - pr_debug("%s(%lu,\"%.*s\",0%ho)\n", - __func__, dir->i_ino, (int)dentry->d_name.len, - dentry->d_name.name, mode); + pr_debug("%s(%lu,\"%pd\",0%ho)\n", + __func__, dir->i_ino, dentry, mode); inode = affs_new_inode(dir); if (!inode) @@ -321,9 +317,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int affs_rmdir(struct inode *dir, struct dentry *dentry) { - pr_debug("%s(dir=%u, %lu \"%.*s\")\n", - __func__, (u32)dir->i_ino, dentry->d_inode->i_ino, - (int)dentry->d_name.len, dentry->d_name.name); + pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, + dentry->d_inode->i_ino, dentry); return affs_remove_header(dentry); } @@ -338,9 +333,8 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) int i, maxlen, error; char c, lc; - pr_debug("%s(%lu,\"%.*s\" -> \"%s\")\n", - __func__, dir->i_ino, (int)dentry->d_name.len, - dentry->d_name.name, symname); + pr_debug("%s(%lu,\"%pd\" -> \"%s\")\n", + __func__, dir->i_ino, dentry, symname); maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1; inode = affs_new_inode(dir); @@ -409,9 +403,8 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - pr_debug("%s(%u, %u, \"%.*s\")\n", - __func__, (u32)inode->i_ino, (u32)dir->i_ino, - (int)dentry->d_name.len,dentry->d_name.name); + pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino, + dentry); return affs_add_entry(dir, inode, dentry, ST_LINKFILE); } @@ -424,10 +417,8 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh = NULL; int retval; - pr_debug("%s(old=%u,\"%*s\" to new=%u,\"%*s\")\n", - __func__, (u32)old_dir->i_ino, (int)old_dentry->d_name.len, - old_dentry->d_name.name, (u32)new_dir->i_ino, - (int)new_dentry->d_name.len, new_dentry->d_name.name); + pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, + old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); retval = affs_check_name(new_dentry->d_name.name, new_dentry->d_name.len, diff --git a/fs/affs/super.c b/fs/affs/super.c index f754ab68a840..4cf0e9113fb6 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -432,39 +432,39 @@ got_root: sb->s_flags |= MS_RDONLY; } switch (chksum) { - case MUFS_FS: - case MUFS_INTLFFS: - case MUFS_DCFFS: - sbi->s_flags |= SF_MUFS; - /* fall thru */ - case FS_INTLFFS: - case FS_DCFFS: - sbi->s_flags |= SF_INTL; - break; - case MUFS_FFS: - sbi->s_flags |= SF_MUFS; - break; - case FS_FFS: - break; - case MUFS_OFS: - sbi->s_flags |= SF_MUFS; - /* fall thru */ - case FS_OFS: - sbi->s_flags |= SF_OFS; - sb->s_flags |= MS_NOEXEC; - break; - case MUFS_DCOFS: - case MUFS_INTLOFS: - sbi->s_flags |= SF_MUFS; - case FS_DCOFS: - case FS_INTLOFS: - sbi->s_flags |= SF_INTL | SF_OFS; - sb->s_flags |= MS_NOEXEC; - break; - default: - pr_err("Unknown filesystem on device %s: %08X\n", - sb->s_id, chksum); - return -EINVAL; + case MUFS_FS: + case MUFS_INTLFFS: + case MUFS_DCFFS: + sbi->s_flags |= SF_MUFS; + /* fall thru */ + case FS_INTLFFS: + case FS_DCFFS: + sbi->s_flags |= SF_INTL; + break; + case MUFS_FFS: + sbi->s_flags |= SF_MUFS; + break; + case FS_FFS: + break; + case MUFS_OFS: + sbi->s_flags |= SF_MUFS; + /* fall thru */ + case FS_OFS: + sbi->s_flags |= SF_OFS; + sb->s_flags |= MS_NOEXEC; + break; + case MUFS_DCOFS: + case MUFS_INTLOFS: + sbi->s_flags |= SF_MUFS; + case FS_DCOFS: + case FS_INTLOFS: + sbi->s_flags |= SF_INTL | SF_OFS; + sb->s_flags |= MS_NOEXEC; + break; + default: + pr_err("Unknown filesystem on device %s: %08X\n", + sb->s_id, chksum); + return -EINVAL; } if (mount_flags & SF_VERBOSE) { @@ -584,7 +584,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = free; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); - buf->f_namelen = 30; + buf->f_namelen = AFFSNAMEMAX; return 0; } @@ -602,6 +602,7 @@ static void affs_kill_sb(struct super_block *sb) affs_free_bitmap(sb); affs_brelse(sbi->s_root_bh); kfree(sbi->s_prefix); + mutex_destroy(&sbi->s_bmlock); kfree(sbi); } } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index a1645b88fe8a..4ec35e9130e1 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -26,7 +26,7 @@ static int afs_readdir(struct file *file, struct dir_context *ctx); static int afs_d_revalidate(struct dentry *dentry, unsigned int flags); static int afs_d_delete(const struct dentry *dentry); static void afs_d_release(struct dentry *dentry); -static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl); @@ -391,10 +391,11 @@ static int afs_readdir(struct file *file, struct dir_context *ctx) * - if afs_dir_iterate_block() spots this function, it'll pass the FID * uniquifier through dtype */ -static int afs_lookup_filldir(void *_cookie, const char *name, int nlen, - loff_t fpos, u64 ino, unsigned dtype) +static int afs_lookup_filldir(struct dir_context *ctx, const char *name, + int nlen, loff_t fpos, u64 ino, unsigned dtype) { - struct afs_lookup_cookie *cookie = _cookie; + struct afs_lookup_cookie *cookie = + container_of(ctx, struct afs_lookup_cookie, ctx); _enter("{%s,%u},%s,%u,,%llu,%u", cookie->name.name, cookie->name.len, name, nlen, @@ -433,7 +434,7 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry, }; int ret; - _enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name); + _enter("{%lu},%p{%pd},", dir->i_ino, dentry, dentry); /* search the directory */ ret = afs_dir_iterate(dir, &cookie.ctx, key); @@ -465,8 +466,8 @@ static struct inode *afs_try_auto_mntpt( struct afs_vnode *vnode = AFS_FS_I(dir); struct inode *inode; - _enter("%d, %p{%s}, {%x:%u}, %p", - ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key); + _enter("%d, %p{%pd}, {%x:%u}, %p", + ret, dentry, dentry, vnode->fid.vid, vnode->fid.vnode, key); if (ret != -ENOENT || !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags)) @@ -501,8 +502,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, vnode = AFS_FS_I(dir); - _enter("{%x:%u},%p{%s},", - vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name); + _enter("{%x:%u},%p{%pd},", + vnode->fid.vid, vnode->fid.vnode, dentry, dentry); ASSERTCMP(dentry->d_inode, ==, NULL); @@ -588,11 +589,11 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) vnode = AFS_FS_I(dentry->d_inode); if (dentry->d_inode) - _enter("{v={%x:%u} n=%s fl=%lx},", - vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + _enter("{v={%x:%u} n=%pd fl=%lx},", + vnode->fid.vid, vnode->fid.vnode, dentry, vnode->flags); else - _enter("{neg n=%s}", dentry->d_name.name); + _enter("{neg n=%pd}", dentry); key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell); if (IS_ERR(key)) @@ -607,7 +608,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) afs_validate(dir, key); if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { - _debug("%s: parent dir deleted", dentry->d_name.name); + _debug("%pd: parent dir deleted", dentry); goto out_bad; } @@ -625,16 +626,16 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (!dentry->d_inode) goto out_bad; if (is_bad_inode(dentry->d_inode)) { - printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n", - parent->d_name.name, dentry->d_name.name); + printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n", + dentry); goto out_bad; } /* if the vnode ID has changed, then the dirent points to a * different file */ if (fid.vnode != vnode->fid.vnode) { - _debug("%s: dirent changed [%u != %u]", - dentry->d_name.name, fid.vnode, + _debug("%pd: dirent changed [%u != %u]", + dentry, fid.vnode, vnode->fid.vnode); goto not_found; } @@ -643,8 +644,8 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) * been deleted and replaced, and the original vnode ID has * been reused */ if (fid.unique != vnode->fid.unique) { - _debug("%s: file deleted (uq %u -> %u I:%u)", - dentry->d_name.name, fid.unique, + _debug("%pd: file deleted (uq %u -> %u I:%u)", + dentry, fid.unique, vnode->fid.unique, dentry->d_inode->i_generation); spin_lock(&vnode->lock); @@ -656,14 +657,14 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) case -ENOENT: /* the filename is unknown */ - _debug("%s: dirent not found", dentry->d_name.name); + _debug("%pd: dirent not found", dentry); if (dentry->d_inode) goto not_found; goto out_valid; default: - _debug("failed to iterate dir %s: %d", - parent->d_name.name, ret); + _debug("failed to iterate dir %pd: %d", + parent, ret); goto out_bad; } @@ -681,8 +682,7 @@ not_found: spin_unlock(&dentry->d_lock); out_bad: - _debug("dropping dentry %s/%s", - parent->d_name.name, dentry->d_name.name); + _debug("dropping dentry %pd2", dentry); dput(parent); key_put(key); @@ -698,7 +698,7 @@ out_bad: */ static int afs_d_delete(const struct dentry *dentry) { - _enter("%s", dentry->d_name.name); + _enter("%pd", dentry); if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto zap; @@ -721,7 +721,7 @@ zap: */ static void afs_d_release(struct dentry *dentry) { - _enter("%s", dentry->d_name.name); + _enter("%pd", dentry); } /* @@ -740,8 +740,8 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s},%ho", - dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); + _enter("{%x:%u},{%pd},%ho", + dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { @@ -801,8 +801,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s}", - dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); + _enter("{%x:%u},{%pd}", + dvnode->fid.vid, dvnode->fid.vnode, dentry); key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { @@ -843,8 +843,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s}", - dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); + _enter("{%x:%u},{%pd}", + dvnode->fid.vid, dvnode->fid.vnode, dentry); ret = -ENAMETOOLONG; if (dentry->d_name.len >= AFSNAMEMAX) @@ -917,8 +917,8 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s},%ho,", - dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode); + _enter("{%x:%u},{%pd},%ho,", + dvnode->fid.vid, dvnode->fid.vnode, dentry, mode); key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { @@ -980,10 +980,10 @@ static int afs_link(struct dentry *from, struct inode *dir, vnode = AFS_FS_I(from->d_inode); dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%x:%u},{%s}", + _enter("{%x:%u},{%x:%u},{%pd}", vnode->fid.vid, vnode->fid.vnode, dvnode->fid.vid, dvnode->fid.vnode, - dentry->d_name.name); + dentry); key = afs_request_key(dvnode->volume->cell); if (IS_ERR(key)) { @@ -1025,8 +1025,8 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, dvnode = AFS_FS_I(dir); - _enter("{%x:%u},{%s},%s", - dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, + _enter("{%x:%u},{%pd},%s", + dvnode->fid.vid, dvnode->fid.vnode, dentry, content); ret = -EINVAL; @@ -1093,11 +1093,11 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); - _enter("{%x:%u},{%x:%u},{%x:%u},{%s}", + _enter("{%x:%u},{%x:%u},{%x:%u},{%pd}", orig_dvnode->fid.vid, orig_dvnode->fid.vnode, vnode->fid.vid, vnode->fid.vnode, new_dvnode->fid.vid, new_dvnode->fid.vnode, - new_dentry->d_name.name); + new_dentry); key = afs_request_key(orig_dvnode->volume->cell); if (IS_ERR(key)) { diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 294671288449..8a1d38ef0fc2 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -462,8 +462,8 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr) struct key *key; int ret; - _enter("{%x:%u},{n=%s},%x", - vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + _enter("{%x:%u},{n=%pd},%x", + vnode->fid.vid, vnode->fid.vnode, dentry, attr->ia_valid); if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 9682c33d5daf..938c5ab06d5a 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -106,14 +106,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - _enter("%p,%p{%p{%s},%s}", - dir, - dentry, - dentry->d_parent, - dentry->d_parent ? - dentry->d_parent->d_name.name : (const unsigned char *) "", - dentry->d_name.name); - + _enter("%p,%p{%pd2}", dir, dentry, dentry); return ERR_PTR(-EREMOTE); } @@ -122,14 +115,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir, */ static int afs_mntpt_open(struct inode *inode, struct file *file) { - _enter("%p,%p{%p{%s},%s}", - inode, file, - file->f_path.dentry->d_parent, - file->f_path.dentry->d_parent ? - file->f_path.dentry->d_parent->d_name.name : - (const unsigned char *) "", - file->f_path.dentry->d_name.name); - + _enter("%p,%p{%pD2}", inode, file, file); return -EREMOTE; } @@ -146,7 +132,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) bool rwpath = false; int ret; - _enter("{%s}", mntpt->d_name.name); + _enter("{%pd}", mntpt); BUG_ON(!mntpt->d_inode); @@ -242,7 +228,7 @@ struct vfsmount *afs_d_automount(struct path *path) { struct vfsmount *newmnt; - _enter("{%s}", path->dentry->d_name.name); + _enter("{%pd}", path->dentry); newmnt = afs_mntpt_do_automount(path->dentry); if (IS_ERR(newmnt)) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 03a3beb17004..dbc732e9a5c0 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg, _debug("- range %u-%u%s", offset, to, msg->msg_flags ? " [more]" : ""); - msg->msg_iov = (struct iovec *) iov; - msg->msg_iovlen = 1; + iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, + iov, 1, to - offset); /* have to change the state *before* sending the last * packet as RxRPC might give us the reply before it @@ -384,8 +384,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, msg.msg_name = NULL; msg.msg_namelen = 0; - msg.msg_iov = (struct iovec *) iov; - msg.msg_iovlen = 1; + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, + call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = (call->send_pages ? MSG_MORE : 0); @@ -770,7 +770,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, void afs_send_empty_reply(struct afs_call *call) { struct msghdr msg; - struct iovec iov[1]; + struct kvec iov[1]; _enter(""); @@ -778,8 +778,7 @@ void afs_send_empty_reply(struct afs_call *call) iov[0].iov_len = 0; msg.msg_name = NULL; msg.msg_namelen = 0; - msg.msg_iov = iov; - msg.msg_iovlen = 0; + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0); /* WTF? */ msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -806,7 +805,7 @@ void afs_send_empty_reply(struct afs_call *call) void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) { struct msghdr msg; - struct iovec iov[1]; + struct kvec iov[1]; int n; _enter(""); @@ -815,8 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) iov[0].iov_len = len; msg.msg_name = NULL; msg.msg_namelen = 0; - msg.msg_iov = iov; - msg.msg_iovlen = 1; + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 2b607257820c..d142a2449e65 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) volume->cell = params->cell; volume->vid = vlocation->vldb.vid[params->type]; - ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY); + ret = bdi_setup_and_register(&volume->bdi, "afs"); if (ret) goto error_bdi; diff --git a/fs/afs/write.c b/fs/afs/write.c index ab6adfd52516..c13cb08964ed 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -682,14 +682,13 @@ int afs_writeback_all(struct afs_vnode *vnode) */ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = file->f_mapping->host; + struct inode *inode = file_inode(file); struct afs_writeback *wb, *xwb; - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(inode); int ret; - _enter("{%x:%u},{n=%s},%d", - vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, + _enter("{%x:%u},{n=%pD},%d", + vnode->fid.vid, vnode->fid.vnode, file, datasync); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); @@ -273,19 +273,43 @@ static void aio_free_ring(struct kioctx *ctx) static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) { + vma->vm_flags |= VM_DONTEXPAND; vma->vm_ops = &generic_file_vm_ops; return 0; } +static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + struct kioctx_table *table; + int i, res = -EINVAL; + + spin_lock(&mm->ioctx_lock); + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + for (i = 0; i < table->nr; i++) { + struct kioctx *ctx; + + ctx = table->table[i]; + if (ctx && ctx->aio_ring_file == file) { + if (!atomic_read(&ctx->dead)) { + ctx->user_id = ctx->mmap_base = vma->vm_start; + res = 0; + } + break; + } + } + + rcu_read_unlock(); + spin_unlock(&mm->ioctx_lock); + return res; +} + static const struct file_operations aio_ring_fops = { .mmap = aio_ring_mmap, + .mremap = aio_ring_remap, }; -static int aio_set_page_dirty(struct page *page) -{ - return 0; -} - #if IS_ENABLED(CONFIG_MIGRATION) static int aio_migratepage(struct address_space *mapping, struct page *new, struct page *old, enum migrate_mode mode) @@ -357,7 +381,7 @@ out: #endif static const struct address_space_operations aio_ctx_aops = { - .set_page_dirty = aio_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, #if IS_ENABLED(CONFIG_MIGRATION) .migratepage = aio_migratepage, #endif @@ -412,7 +436,6 @@ static int aio_setup_ring(struct kioctx *ctx) pr_debug("pid(%d) page[%d]->count=%d\n", current->pid, i, page_count(page)); SetPageUptodate(page); - SetPageDirty(page); unlock_page(page); ctx->ring_pages[i] = page; @@ -708,6 +731,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) err_cleanup: aio_nr_sub(ctx->max_reqs); err_ctx: + atomic_set(&ctx->dead, 1); + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); aio_free_ring(ctx); err: mutex_unlock(&ctx->ring_lock); @@ -729,11 +755,12 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, { struct kioctx_table *table; - if (atomic_xchg(&ctx->dead, 1)) + spin_lock(&mm->ioctx_lock); + if (atomic_xchg(&ctx->dead, 1)) { + spin_unlock(&mm->ioctx_lock); return -EINVAL; + } - - spin_lock(&mm->ioctx_lock); table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; @@ -1108,6 +1135,13 @@ static long aio_read_events_ring(struct kioctx *ctx, long ret = 0; int copy_ret; + /* + * The mutex can block and wake us up and that will cause + * wait_event_interruptible_hrtimeout() to schedule without sleeping + * and repeat. This should be rare enough that it doesn't cause + * peformance issues. See the comment in read_events() for more detail. + */ + sched_annotate_sleep(); mutex_lock(&ctx->ring_lock); /* Access to ->ring_pages here is protected by ctx->ring_lock. */ @@ -1221,8 +1255,12 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ - wait_event_interruptible_hrtimeout(ctx->wait, - aio_read_events(ctx, min_nr, nr, event, &ret), until); + if (until.tv64 == 0) + aio_read_events(ctx, min_nr, nr, event, &ret); + else + wait_event_interruptible_hrtimeout(ctx->wait, + aio_read_events(ctx, min_nr, nr, event, &ret), + until); if (!ret && signal_pending(current)) ret = -EINTR; @@ -1255,7 +1293,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) ret = -EINVAL; if (unlikely(ctx || nr_events == 0)) { - pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n", + pr_debug("EINVAL: ctx %lu nr_events %u\n", ctx, nr_events); goto out; } @@ -1303,7 +1341,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) return ret; } - pr_debug("EINVAL: io_destroy: invalid context id\n"); + pr_debug("EINVAL: invalid context id\n"); return -EINVAL; } @@ -1485,7 +1523,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || ((ssize_t)iocb->aio_nbytes < 0) )) { - pr_debug("EINVAL: io_submit: overflow check\n"); + pr_debug("EINVAL: overflow check\n"); return -EINVAL; } diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index aaf96cb25452..ac7d921ed984 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) */ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { - struct autofs_dev_ioctl tmp; + struct autofs_dev_ioctl tmp, *res; if (copy_from_user(&tmp, in, sizeof(tmp))) return ERR_PTR(-EFAULT); @@ -106,7 +106,11 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i if (tmp.size > (PATH_MAX + sizeof(tmp))) return ERR_PTR(-ENAMETOOLONG); - return memdup_user(in, tmp.size); + res = memdup_user(in, tmp.size); + if (!IS_ERR(res)) + res->size = tmp.size; + + return res; } static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 683a5b9ce22a..11dd118f75e2 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -41,8 +41,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) struct path path = {.mnt = mnt, .dentry = dentry}; int status = 1; - DPRINTK("dentry %p %.*s", - dentry, (int)dentry->d_name.len, dentry->d_name.name); + DPRINTK("dentry %p %pd", dentry, dentry); path_get(&path); @@ -85,7 +84,7 @@ static struct dentry *get_next_positive_subdir(struct dentry *prev, spin_lock(&root->d_lock); if (prev) - next = prev->d_u.d_child.next; + next = prev->d_child.next; else { prev = dget_dlock(root); next = prev->d_subdirs.next; @@ -99,13 +98,13 @@ cont: return NULL; } - q = list_entry(next, struct dentry, d_u.d_child); + q = list_entry(next, struct dentry, d_child); spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); /* Already gone or negative dentry (under construction) - try next */ if (!d_count(q) || !simple_positive(q)) { spin_unlock(&q->d_lock); - next = q->d_u.d_child.next; + next = q->d_child.next; goto cont; } dget_dlock(q); @@ -155,13 +154,13 @@ again: goto relock; } spin_unlock(&p->d_lock); - next = p->d_u.d_child.next; + next = p->d_child.next; p = parent; if (next != &parent->d_subdirs) break; } } - ret = list_entry(next, struct dentry, d_u.d_child); + ret = list_entry(next, struct dentry, d_child); spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); /* Negative dentry - try next */ @@ -192,8 +191,7 @@ static int autofs4_direct_busy(struct vfsmount *mnt, unsigned long timeout, int do_now) { - DPRINTK("top %p %.*s", - top, (int) top->d_name.len, top->d_name.name); + DPRINTK("top %p %pd", top, top); /* If it's busy update the expiry counters */ if (!may_umount_tree(mnt)) { @@ -221,8 +219,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, struct autofs_info *top_ino = autofs4_dentry_ino(top); struct dentry *p; - DPRINTK("top %p %.*s", - top, (int)top->d_name.len, top->d_name.name); + DPRINTK("top %p %pd", top, top); /* Negative dentry - give up */ if (!simple_positive(top)) @@ -230,8 +227,7 @@ static int autofs4_tree_busy(struct vfsmount *mnt, p = NULL; while ((p = get_next_positive_dentry(p, top))) { - DPRINTK("dentry %p %.*s", - p, (int) p->d_name.len, p->d_name.name); + DPRINTK("dentry %p %pd", p, p); /* * Is someone visiting anywhere in the subtree ? @@ -277,13 +273,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, { struct dentry *p; - DPRINTK("parent %p %.*s", - parent, (int)parent->d_name.len, parent->d_name.name); + DPRINTK("parent %p %pd", parent, parent); p = NULL; while ((p = get_next_positive_dentry(p, parent))) { - DPRINTK("dentry %p %.*s", - p, (int) p->d_name.len, p->d_name.name); + DPRINTK("dentry %p %pd", p, p); if (d_mountpoint(p)) { /* Can we umount this guy */ @@ -368,8 +362,7 @@ static struct dentry *should_expire(struct dentry *dentry, * offset (autofs-5.0+). */ if (d_mountpoint(dentry)) { - DPRINTK("checking mountpoint %p %.*s", - dentry, (int)dentry->d_name.len, dentry->d_name.name); + DPRINTK("checking mountpoint %p %pd", dentry, dentry); /* Can we umount this guy */ if (autofs4_mount_busy(mnt, dentry)) @@ -381,9 +374,8 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { - DPRINTK("checking symlink %p %.*s", - dentry, (int)dentry->d_name.len, dentry->d_name.name); + if (dentry->d_inode && d_is_symlink(dentry)) { + DPRINTK("checking symlink %p %pd", dentry, dentry); /* * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. @@ -479,8 +471,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, return NULL; found: - DPRINTK("returning %p %.*s", - expired, (int)expired->d_name.len, expired->d_name.name); + DPRINTK("returning %p %pd", expired, expired); ino->flags |= AUTOFS_INF_EXPIRING; smp_mb(); ino->flags &= ~AUTOFS_INF_NO_RCU; @@ -489,7 +480,7 @@ found: spin_lock(&sbi->lookup_lock); spin_lock(&expired->d_parent->d_lock); spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); + list_move(&expired->d_parent->d_subdirs, &expired->d_child); spin_unlock(&expired->d_lock); spin_unlock(&expired->d_parent->d_lock); spin_unlock(&sbi->lookup_lock); @@ -512,8 +503,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); - DPRINTK("waiting for expire %p name=%.*s", - dentry, dentry->d_name.len, dentry->d_name.name); + DPRINTK("waiting for expire %p name=%pd", dentry, dentry); status = autofs4_wait(sbi, dentry, NFY_NONE); wait_for_completion(&ino->expire_complete); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d76d083f2f06..7e44fdd03e2d 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -108,8 +108,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) struct dentry *dentry = file->f_path.dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - DPRINTK("file=%p dentry=%p %.*s", - file, dentry, dentry->d_name.len, dentry->d_name.name); + DPRINTK("file=%p dentry=%p %pd", file, dentry, dentry); if (autofs4_oz_mode(sbi)) goto out; @@ -279,8 +278,7 @@ static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; - DPRINTK("waiting for mount name=%.*s", - dentry->d_name.len, dentry->d_name.name); + DPRINTK("waiting for mount name=%pd", dentry); status = autofs4_wait(sbi, dentry, NFY_MOUNT); DPRINTK("mount wait done status=%d", status); } @@ -340,8 +338,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; - DPRINTK("dentry=%p %.*s", - dentry, dentry->d_name.len, dentry->d_name.name); + DPRINTK("dentry=%p %pd", dentry, dentry); /* The daemon never triggers a mount. */ if (autofs4_oz_mode(sbi)) @@ -374,7 +371,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * having d_mountpoint() true, so there's no need to call back * to the daemon. */ - if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { + if (dentry->d_inode && d_is_symlink(dentry)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -428,8 +425,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; - DPRINTK("dentry=%p %.*s", - dentry, dentry->d_name.len, dentry->d_name.name); + DPRINTK("dentry=%p %pd", dentry, dentry); /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { @@ -489,7 +485,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * an incorrect ELOOP error return. */ if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || - (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) + (dentry->d_inode && d_is_symlink(dentry))) status = -EISDIR; } spin_unlock(&sbi->fs_lock); @@ -504,7 +500,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u struct autofs_info *ino; struct dentry *active; - DPRINTK("name = %.*s", dentry->d_name.len, dentry->d_name.name); + DPRINTK("name = %pd", dentry); /* File name too long to exist */ if (dentry->d_name.len > NAME_MAX) @@ -558,8 +554,7 @@ static int autofs4_dir_symlink(struct inode *dir, size_t size = strlen(symname); char *cp; - DPRINTK("%s <- %.*s", symname, - dentry->d_name.len, dentry->d_name.name); + DPRINTK("%s <- %pd", symname, dentry); if (!autofs4_oz_mode(sbi)) return -EACCES; @@ -687,7 +682,7 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry) /* only consider parents below dentrys in the root */ if (IS_ROOT(parent->d_parent)) return; - d_child = &dentry->d_u.d_child; + d_child = &dentry->d_child; /* Set parent managed if it's becoming empty */ if (d_child->next == &parent->d_subdirs && d_child->prev == &parent->d_subdirs) @@ -701,8 +696,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) struct autofs_info *ino = autofs4_dentry_ino(dentry); struct autofs_info *p_ino; - DPRINTK("dentry %p, removing %.*s", - dentry, dentry->d_name.len, dentry->d_name.name); + DPRINTK("dentry %p, removing %pd", dentry, dentry); if (!autofs4_oz_mode(sbi)) return -EACCES; @@ -744,8 +738,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m if (!autofs4_oz_mode(sbi)) return -EACCES; - DPRINTK("dentry %p, creating %.*s", - dentry, dentry->d_name.len, dentry->d_name.name); + DPRINTK("dentry %p, creating %pd", dentry, dentry); BUG_ON(!ino); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index afd2b4408adf..861b1e1c4777 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -15,161 +15,14 @@ #include <linux/namei.h> #include <linux/poll.h> - -static loff_t bad_file_llseek(struct file *file, loff_t offset, int whence) -{ - return -EIO; -} - -static ssize_t bad_file_read(struct file *filp, char __user *buf, - size_t size, loff_t *ppos) -{ - return -EIO; -} - -static ssize_t bad_file_write(struct file *filp, const char __user *buf, - size_t siz, loff_t *ppos) -{ - return -EIO; -} - -static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - return -EIO; -} - -static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - return -EIO; -} - -static int bad_file_readdir(struct file *file, struct dir_context *ctx) -{ - return -EIO; -} - -static unsigned int bad_file_poll(struct file *filp, poll_table *wait) -{ - return POLLERR; -} - -static long bad_file_unlocked_ioctl(struct file *file, unsigned cmd, - unsigned long arg) -{ - return -EIO; -} - -static long bad_file_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return -EIO; -} - -static int bad_file_mmap(struct file *file, struct vm_area_struct *vma) -{ - return -EIO; -} - static int bad_file_open(struct inode *inode, struct file *filp) { return -EIO; } -static int bad_file_flush(struct file *file, fl_owner_t id) -{ - return -EIO; -} - -static int bad_file_release(struct inode *inode, struct file *filp) -{ - return -EIO; -} - -static int bad_file_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - return -EIO; -} - -static int bad_file_aio_fsync(struct kiocb *iocb, int datasync) -{ - return -EIO; -} - -static int bad_file_fasync(int fd, struct file *filp, int on) -{ - return -EIO; -} - -static int bad_file_lock(struct file *file, int cmd, struct file_lock *fl) -{ - return -EIO; -} - -static ssize_t bad_file_sendpage(struct file *file, struct page *page, - int off, size_t len, loff_t *pos, int more) -{ - return -EIO; -} - -static unsigned long bad_file_get_unmapped_area(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - return -EIO; -} - -static int bad_file_check_flags(int flags) -{ - return -EIO; -} - -static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl) -{ - return -EIO; -} - -static ssize_t bad_file_splice_write(struct pipe_inode_info *pipe, - struct file *out, loff_t *ppos, size_t len, - unsigned int flags) -{ - return -EIO; -} - -static ssize_t bad_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - return -EIO; -} - static const struct file_operations bad_file_ops = { - .llseek = bad_file_llseek, - .read = bad_file_read, - .write = bad_file_write, - .aio_read = bad_file_aio_read, - .aio_write = bad_file_aio_write, - .iterate = bad_file_readdir, - .poll = bad_file_poll, - .unlocked_ioctl = bad_file_unlocked_ioctl, - .compat_ioctl = bad_file_compat_ioctl, - .mmap = bad_file_mmap, .open = bad_file_open, - .flush = bad_file_flush, - .release = bad_file_release, - .fsync = bad_file_fsync, - .aio_fsync = bad_file_aio_fsync, - .fasync = bad_file_fasync, - .lock = bad_file_lock, - .sendpage = bad_file_sendpage, - .get_unmapped_area = bad_file_get_unmapped_area, - .check_flags = bad_file_check_flags, - .flock = bad_file_flock, - .splice_write = bad_file_splice_write, - .splice_read = bad_file_splice_read, }; static int bad_inode_create (struct inode *dir, struct dentry *dentry, diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 4cf61ec6b7a8..e089f1985fca 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -172,8 +172,8 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) char *utfname; const char *name = dentry->d_name.name; - befs_debug(sb, "---> %s name %s inode %ld", __func__, - dentry->d_name.name, dir->i_ino); + befs_debug(sb, "---> %s name %pd inode %ld", __func__, + dentry, dir->i_ino); /* Convert to UTF-8 */ if (BEFS_SB(sb)->nls) { @@ -191,8 +191,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) } if (ret == BEFS_BT_NOT_FOUND) { - befs_debug(sb, "<--- %s %s not found", __func__, - dentry->d_name.name); + befs_debug(sb, "<--- %s %pd not found", __func__, dentry); return ERR_PTR(-ENOENT); } else if (ret != BEFS_OK || offset == 0) { @@ -222,10 +221,9 @@ befs_readdir(struct file *file, struct dir_context *ctx) size_t keysize; unsigned char d_type; char keybuf[BEFS_NAME_LEN + 1]; - const char *dirname = file->f_path.dentry->d_name.name; - befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld", - __func__, dirname, inode->i_ino, ctx->pos); + befs_debug(sb, "---> %s name %pD, inode %ld, ctx->pos %lld", + __func__, file, inode->i_ino, ctx->pos); more: result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1, @@ -233,8 +231,8 @@ more: if (result == BEFS_ERR) { befs_debug(sb, "<--- %s ERROR", __func__); - befs_error(sb, "IO error reading %s (inode %lu)", - dirname, inode->i_ino); + befs_error(sb, "IO error reading %pD (inode %lu)", + file, inode->i_ino); return -EIO; } else if (result == BEFS_BT_END) { @@ -271,18 +269,14 @@ more: } ctx->pos++; goto more; - - befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos); - - return 0; } static struct inode * befs_alloc_inode(struct super_block *sb) { - struct befs_inode_info *bi; - bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep, - GFP_KERNEL); + struct befs_inode_info *bi; + + bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 929dec08c348..4c556680fa74 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -292,8 +292,8 @@ static int load_aout_binary(struct linux_binprm * bprm) if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) { printk(KERN_WARNING - "fd_offset is not page aligned. Please convert program: %s\n", - bprm->file->f_path.dentry->d_name.name); + "fd_offset is not page aligned. Please convert program: %pD\n", + bprm->file); } if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { @@ -375,8 +375,8 @@ static int load_aout_library(struct file *file) if (printk_ratelimit()) { printk(KERN_WARNING - "N_TXTOFF is not page aligned. Please convert library: %s\n", - file->f_path.dentry->d_name.name); + "N_TXTOFF is not page aligned. Please convert library: %pD\n", + file); } vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d8fc0605b9d2..995986b8e36b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -386,6 +386,127 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) ELF_PAGESTART(cmds[first_idx].p_vaddr); } +/** + * load_elf_phdrs() - load ELF program headers + * @elf_ex: ELF header of the binary whose program headers should be loaded + * @elf_file: the opened ELF binary file + * + * Loads ELF program headers from the binary file elf_file, which has the ELF + * header pointed to by elf_ex, into a newly allocated array. The caller is + * responsible for freeing the allocated data. Returns an ERR_PTR upon failure. + */ +static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, + struct file *elf_file) +{ + struct elf_phdr *elf_phdata = NULL; + int retval, size, err = -1; + + /* + * If the size of this structure has changed, then punt, since + * we will be doing the wrong thing. + */ + if (elf_ex->e_phentsize != sizeof(struct elf_phdr)) + goto out; + + /* Sanity check the number of program headers... */ + if (elf_ex->e_phnum < 1 || + elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr)) + goto out; + + /* ...and their total size. */ + size = sizeof(struct elf_phdr) * elf_ex->e_phnum; + if (size > ELF_MIN_ALIGN) + goto out; + + elf_phdata = kmalloc(size, GFP_KERNEL); + if (!elf_phdata) + goto out; + + /* Read in the program headers */ + retval = kernel_read(elf_file, elf_ex->e_phoff, + (char *)elf_phdata, size); + if (retval != size) { + err = (retval < 0) ? retval : -EIO; + goto out; + } + + /* Success! */ + err = 0; +out: + if (err) { + kfree(elf_phdata); + elf_phdata = NULL; + } + return elf_phdata; +} + +#ifndef CONFIG_ARCH_BINFMT_ELF_STATE + +/** + * struct arch_elf_state - arch-specific ELF loading state + * + * This structure is used to preserve architecture specific data during + * the loading of an ELF file, throughout the checking of architecture + * specific ELF headers & through to the point where the ELF load is + * known to be proceeding (ie. SET_PERSONALITY). + * + * This implementation is a dummy for architectures which require no + * specific state. + */ +struct arch_elf_state { +}; + +#define INIT_ARCH_ELF_STATE {} + +/** + * arch_elf_pt_proc() - check a PT_LOPROC..PT_HIPROC ELF program header + * @ehdr: The main ELF header + * @phdr: The program header to check + * @elf: The open ELF file + * @is_interp: True if the phdr is from the interpreter of the ELF being + * loaded, else false. + * @state: Architecture-specific state preserved throughout the process + * of loading the ELF. + * + * Inspects the program header phdr to validate its correctness and/or + * suitability for the system. Called once per ELF program header in the + * range PT_LOPROC to PT_HIPROC, for both the ELF being loaded and its + * interpreter. + * + * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load + * with that return code. + */ +static inline int arch_elf_pt_proc(struct elfhdr *ehdr, + struct elf_phdr *phdr, + struct file *elf, bool is_interp, + struct arch_elf_state *state) +{ + /* Dummy implementation, always proceed */ + return 0; +} + +/** + * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header + * @ehdr: The main ELF header + * @has_interp: True if the ELF has an interpreter, else false. + * @state: Architecture-specific state preserved throughout the process + * of loading the ELF. + * + * Provides a final opportunity for architecture code to reject the loading + * of the ELF & cause an exec syscall to return an error. This is called after + * all program headers to be checked by arch_elf_pt_proc have been. + * + * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load + * with that return code. + */ +static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, + struct arch_elf_state *state) +{ + /* Dummy implementation, always proceed */ + return 0; +} + +#endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ /* This is much more generalized than the library routine read function, so we keep this separate. Technically the library read function @@ -394,16 +515,15 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct file *interpreter, unsigned long *interp_map_addr, - unsigned long no_base) + unsigned long no_base, struct elf_phdr *interp_elf_phdata) { - struct elf_phdr *elf_phdata; struct elf_phdr *eppnt; unsigned long load_addr = 0; int load_addr_set = 0; unsigned long last_bss = 0, elf_bss = 0; unsigned long error = ~0UL; unsigned long total_size; - int retval, i, size; + int i; /* First of all, some simple consistency checks */ if (interp_elf_ex->e_type != ET_EXEC && @@ -414,40 +534,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, if (!interpreter->f_op->mmap) goto out; - /* - * If the size of this structure has changed, then punt, since - * we will be doing the wrong thing. - */ - if (interp_elf_ex->e_phentsize != sizeof(struct elf_phdr)) - goto out; - if (interp_elf_ex->e_phnum < 1 || - interp_elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr)) - goto out; - - /* Now read in all of the header information */ - size = sizeof(struct elf_phdr) * interp_elf_ex->e_phnum; - if (size > ELF_MIN_ALIGN) - goto out; - elf_phdata = kmalloc(size, GFP_KERNEL); - if (!elf_phdata) - goto out; - - retval = kernel_read(interpreter, interp_elf_ex->e_phoff, - (char *)elf_phdata, size); - error = -EIO; - if (retval != size) { - if (retval < 0) - error = retval; - goto out_close; - } - - total_size = total_mapping_size(elf_phdata, interp_elf_ex->e_phnum); + total_size = total_mapping_size(interp_elf_phdata, + interp_elf_ex->e_phnum); if (!total_size) { error = -EINVAL; - goto out_close; + goto out; } - eppnt = elf_phdata; + eppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; @@ -474,7 +568,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, *interp_map_addr = map_addr; error = map_addr; if (BAD_ADDR(map_addr)) - goto out_close; + goto out; if (!load_addr_set && interp_elf_ex->e_type == ET_DYN) { @@ -493,7 +587,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, eppnt->p_memsz > TASK_SIZE || TASK_SIZE - eppnt->p_memsz < k) { error = -ENOMEM; - goto out_close; + goto out; } /* @@ -523,7 +617,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, */ if (padzero(elf_bss)) { error = -EFAULT; - goto out_close; + goto out; } /* What we have mapped so far */ @@ -532,13 +626,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, /* Map the last of the bss segment */ error = vm_brk(elf_bss, last_bss - elf_bss); if (BAD_ADDR(error)) - goto out_close; + goto out; } error = load_addr; - -out_close: - kfree(elf_phdata); out: return error; } @@ -554,11 +645,12 @@ out: static unsigned long randomize_stack_top(unsigned long stack_top) { - unsigned int random_variable = 0; + unsigned long random_variable = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - random_variable = get_random_int() & STACK_RND_MASK; + random_variable = (unsigned long) get_random_int(); + random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } #ifdef CONFIG_STACK_GROWSUP @@ -575,10 +667,9 @@ static int load_elf_binary(struct linux_binprm *bprm) int load_addr_set = 0; char * elf_interpreter = NULL; unsigned long error; - struct elf_phdr *elf_ppnt, *elf_phdata; + struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; unsigned long elf_bss, elf_brk; int retval, i; - unsigned int size; unsigned long elf_entry; unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; @@ -589,6 +680,7 @@ static int load_elf_binary(struct linux_binprm *bprm) struct elfhdr elf_ex; struct elfhdr interp_elf_ex; } *loc; + struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; loc = kmalloc(sizeof(*loc), GFP_KERNEL); if (!loc) { @@ -611,26 +703,10 @@ static int load_elf_binary(struct linux_binprm *bprm) if (!bprm->file->f_op->mmap) goto out; - /* Now read in all of the header information */ - if (loc->elf_ex.e_phentsize != sizeof(struct elf_phdr)) - goto out; - if (loc->elf_ex.e_phnum < 1 || - loc->elf_ex.e_phnum > 65536U / sizeof(struct elf_phdr)) - goto out; - size = loc->elf_ex.e_phnum * sizeof(struct elf_phdr); - retval = -ENOMEM; - elf_phdata = kmalloc(size, GFP_KERNEL); + elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file); if (!elf_phdata) goto out; - retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, - (char *)elf_phdata, size); - if (retval != size) { - if (retval >= 0) - retval = -EIO; - goto out_free_ph; - } - elf_ppnt = elf_phdata; elf_bss = 0; elf_brk = 0; @@ -699,12 +775,21 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_ppnt = elf_phdata; for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) - if (elf_ppnt->p_type == PT_GNU_STACK) { + switch (elf_ppnt->p_type) { + case PT_GNU_STACK: if (elf_ppnt->p_flags & PF_X) executable_stack = EXSTACK_ENABLE_X; else executable_stack = EXSTACK_DISABLE_X; break; + + case PT_LOPROC ... PT_HIPROC: + retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt, + bprm->file, false, + &arch_state); + if (retval) + goto out_free_dentry; + break; } /* Some simple consistency checks for the interpreter */ @@ -716,8 +801,36 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Verify the interpreter has a valid arch */ if (!elf_check_arch(&loc->interp_elf_ex)) goto out_free_dentry; + + /* Load the interpreter program headers */ + interp_elf_phdata = load_elf_phdrs(&loc->interp_elf_ex, + interpreter); + if (!interp_elf_phdata) + goto out_free_dentry; + + /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ + elf_ppnt = interp_elf_phdata; + for (i = 0; i < loc->interp_elf_ex.e_phnum; i++, elf_ppnt++) + switch (elf_ppnt->p_type) { + case PT_LOPROC ... PT_HIPROC: + retval = arch_elf_pt_proc(&loc->interp_elf_ex, + elf_ppnt, interpreter, + true, &arch_state); + if (retval) + goto out_free_dentry; + break; + } } + /* + * Allow arch code to reject the ELF at this point, whilst it's + * still possible to return an error to the code that invoked + * the exec syscall. + */ + retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state); + if (retval) + goto out_free_dentry; + /* Flush all traces of the currently running executable */ retval = flush_old_exec(bprm); if (retval) @@ -725,7 +838,7 @@ static int load_elf_binary(struct linux_binprm *bprm) /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ - SET_PERSONALITY(loc->elf_ex); + SET_PERSONALITY2(loc->elf_ex, &arch_state); if (elf_read_implies_exec(loc->elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -890,7 +1003,7 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_entry = load_elf_interp(&loc->interp_elf_ex, interpreter, &interp_map_addr, - load_bias); + load_bias, interp_elf_phdata); if (!IS_ERR((void *)elf_entry)) { /* * load_elf_interp() returns relocation @@ -917,6 +1030,7 @@ static int load_elf_binary(struct linux_binprm *bprm) } } + kfree(interp_elf_phdata); kfree(elf_phdata); set_binfmt(&elf_format); @@ -981,6 +1095,7 @@ out_ret: /* error cleanup */ out_free_dentry: + kfree(interp_elf_phdata); allow_write_access(interpreter); if (interpreter) fput(interpreter); @@ -1994,18 +2109,6 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, shdr4extnum->sh_info = segs; } -static size_t elf_core_vma_data_size(struct vm_area_struct *gate_vma, - unsigned long mm_flags) -{ - struct vm_area_struct *vma; - size_t size = 0; - - for (vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) - size += vma_dump_size(vma, mm_flags); - return size; -} - /* * Actual dumper * @@ -2017,7 +2120,8 @@ static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; mm_segment_t fs; - int segs; + int segs, i; + size_t vma_data_size = 0; struct vm_area_struct *vma, *gate_vma; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; @@ -2026,6 +2130,7 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; + elf_addr_t *vma_filesz = NULL; /* * We no longer stop all VM operations. @@ -2093,7 +2198,20 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += elf_core_vma_data_size(gate_vma, cprm->mm_flags); + vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL); + if (!vma_filesz) + goto end_coredump; + + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma)) { + unsigned long dump_size; + + dump_size = vma_dump_size(vma, cprm->mm_flags); + vma_filesz[i++] = dump_size; + vma_data_size += dump_size; + } + + offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2113,7 +2231,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (vma = first_vma(current, gate_vma); vma != NULL; + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma)) { struct elf_phdr phdr; @@ -2121,7 +2239,7 @@ static int elf_core_dump(struct coredump_params *cprm) phdr.p_offset = offset; phdr.p_vaddr = vma->vm_start; phdr.p_paddr = 0; - phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); + phdr.p_filesz = vma_filesz[i++]; phdr.p_memsz = vma->vm_end - vma->vm_start; offset += phdr.p_filesz; phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; @@ -2149,12 +2267,12 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->written)) goto end_coredump; - for (vma = first_vma(current, gate_vma); vma != NULL; + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma)) { unsigned long addr; unsigned long end; - end = vma->vm_start + vma_dump_size(vma, cprm->mm_flags); + end = vma->vm_start + vma_filesz[i++]; for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { struct page *page; @@ -2187,6 +2305,7 @@ end_coredump: cleanup: free_note_info(&info); kfree(shdr4extnum); + kfree(vma_filesz); kfree(phdr4note); kfree(elf); out: diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index f37b08cea1f7..490538536cb4 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -42,6 +42,10 @@ static int load_em86(struct linux_binprm *bprm) return -ENOEXEC; } + /* Need to be able to load the file after exec */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index fd8beb9657a2..97aff2879cda 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -1,21 +1,14 @@ /* - * binfmt_misc.c + * binfmt_misc.c * - * Copyright (C) 1997 Richard Günther + * Copyright (C) 1997 Richard Günther * - * binfmt_misc detects binaries via a magic or filename extension and invokes - * a specified wrapper. This should obsolete binfmt_java, binfmt_em86 and - * binfmt_mz. - * - * 1997-04-25 first version - * [...] - * 1997-05-19 cleanup - * 1997-06-26 hpa: pass the real filename rather than argv[0] - * 1997-06-30 minor cleanup - * 1997-08-09 removed extension stripping, locking cleanup - * 2001-02-28 AV: rewritten into something that resembles C. Original didn't. + * binfmt_misc detects binaries via a magic or filename extension and invokes + * a specified wrapper. See Documentation/binfmt_misc.txt for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> @@ -30,8 +23,13 @@ #include <linux/mount.h> #include <linux/syscalls.h> #include <linux/fs.h> +#include <linux/uaccess.h> -#include <asm/uaccess.h> +#ifdef DEBUG +# define USE_DEBUG 1 +#else +# define USE_DEBUG 0 +#endif enum { VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */ @@ -41,9 +39,9 @@ static LIST_HEAD(entries); static int enabled = 1; enum {Enabled, Magic}; -#define MISC_FMT_PRESERVE_ARGV0 (1<<31) -#define MISC_FMT_OPEN_BINARY (1<<30) -#define MISC_FMT_CREDENTIALS (1<<29) +#define MISC_FMT_PRESERVE_ARGV0 (1 << 31) +#define MISC_FMT_OPEN_BINARY (1 << 30) +#define MISC_FMT_CREDENTIALS (1 << 29) typedef struct { struct list_head list; @@ -87,20 +85,24 @@ static Node *check_file(struct linux_binprm *bprm) char *p = strrchr(bprm->interp, '.'); struct list_head *l; + /* Walk all the registered handlers. */ list_for_each(l, &entries) { Node *e = list_entry(l, Node, list); char *s; int j; + /* Make sure this one is currently enabled. */ if (!test_bit(Enabled, &e->flags)) continue; + /* Do matching based on extension if applicable. */ if (!test_bit(Magic, &e->flags)) { if (p && !strcmp(e->magic, p + 1)) return e; continue; } + /* Do matching based on magic & mask. */ s = bprm->buf + e->offset; if (e->mask) { for (j = 0; j < e->size; j++) @@ -123,7 +125,7 @@ static Node *check_file(struct linux_binprm *bprm) static int load_misc_binary(struct linux_binprm *bprm) { Node *fmt; - struct file * interp_file = NULL; + struct file *interp_file = NULL; char iname[BINPRM_BUF_SIZE]; const char *iname_addr = iname; int retval; @@ -131,7 +133,7 @@ static int load_misc_binary(struct linux_binprm *bprm) retval = -ENOEXEC; if (!enabled) - goto _ret; + goto ret; /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); @@ -140,25 +142,30 @@ static int load_misc_binary(struct linux_binprm *bprm) strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE); read_unlock(&entries_lock); if (!fmt) - goto _ret; + goto ret; + + /* Need to be able to load the file after exec */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; if (!(fmt->flags & MISC_FMT_PRESERVE_ARGV0)) { retval = remove_arg_zero(bprm); if (retval) - goto _ret; + goto ret; } if (fmt->flags & MISC_FMT_OPEN_BINARY) { /* if the binary should be opened on behalf of the * interpreter than keep it open and assign descriptor - * to it */ - fd_binary = get_unused_fd(); - if (fd_binary < 0) { - retval = fd_binary; - goto _ret; - } - fd_install(fd_binary, bprm->file); + * to it + */ + fd_binary = get_unused_fd_flags(0); + if (fd_binary < 0) { + retval = fd_binary; + goto ret; + } + fd_install(fd_binary, bprm->file); /* if the binary is not readable than enforce mm->dumpable=0 regardless of the interpreter's permissions */ @@ -171,32 +178,32 @@ static int load_misc_binary(struct linux_binprm *bprm) bprm->interp_flags |= BINPRM_FLAGS_EXECFD; bprm->interp_data = fd_binary; - } else { - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; - } + } else { + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; + } /* make argv[1] be the path to the binary */ - retval = copy_strings_kernel (1, &bprm->interp, bprm); + retval = copy_strings_kernel(1, &bprm->interp, bprm); if (retval < 0) - goto _error; + goto error; bprm->argc++; /* add the interp as argv[0] */ - retval = copy_strings_kernel (1, &iname_addr, bprm); + retval = copy_strings_kernel(1, &iname_addr, bprm); if (retval < 0) - goto _error; - bprm->argc ++; + goto error; + bprm->argc++; /* Update interp in case binfmt_script needs it. */ retval = bprm_change_interp(iname, bprm); if (retval < 0) - goto _error; + goto error; - interp_file = open_exec (iname); - retval = PTR_ERR (interp_file); - if (IS_ERR (interp_file)) - goto _error; + interp_file = open_exec(iname); + retval = PTR_ERR(interp_file); + if (IS_ERR(interp_file)) + goto error; bprm->file = interp_file; if (fmt->flags & MISC_FMT_CREDENTIALS) { @@ -207,23 +214,23 @@ static int load_misc_binary(struct linux_binprm *bprm) memset(bprm->buf, 0, BINPRM_BUF_SIZE); retval = kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE); } else - retval = prepare_binprm (bprm); + retval = prepare_binprm(bprm); if (retval < 0) - goto _error; + goto error; retval = search_binary_handler(bprm); if (retval < 0) - goto _error; + goto error; -_ret: +ret: return retval; -_error: +error: if (fd_binary > 0) sys_close(fd_binary); bprm->interp_flags = 0; bprm->interp_data = 0; - goto _ret; + goto ret; } /* Command parsers */ @@ -247,39 +254,44 @@ static char *scanarg(char *s, char del) return NULL; } } + s[-1] ='\0'; return s; } -static char * check_special_flags (char * sfs, Node * e) +static char *check_special_flags(char *sfs, Node *e) { - char * p = sfs; + char *p = sfs; int cont = 1; /* special flags */ while (cont) { switch (*p) { - case 'P': - p++; - e->flags |= MISC_FMT_PRESERVE_ARGV0; - break; - case 'O': - p++; - e->flags |= MISC_FMT_OPEN_BINARY; - break; - case 'C': - p++; - /* this flags also implies the - open-binary flag */ - e->flags |= (MISC_FMT_CREDENTIALS | - MISC_FMT_OPEN_BINARY); - break; - default: - cont = 0; + case 'P': + pr_debug("register: flag: P (preserve argv0)\n"); + p++; + e->flags |= MISC_FMT_PRESERVE_ARGV0; + break; + case 'O': + pr_debug("register: flag: O (open binary)\n"); + p++; + e->flags |= MISC_FMT_OPEN_BINARY; + break; + case 'C': + pr_debug("register: flag: C (preserve creds)\n"); + p++; + /* this flags also implies the + open-binary flag */ + e->flags |= (MISC_FMT_CREDENTIALS | + MISC_FMT_OPEN_BINARY); + break; + default: + cont = 0; } } return p; } + /* * This registers a new binary format, it recognises the syntax * ':name:type:offset:magic:mask:interpreter:flags' @@ -292,6 +304,8 @@ static Node *create_entry(const char __user *buffer, size_t count) char *buf, *p; char del; + pr_debug("register: received %zu bytes\n", count); + /* some sanity checks */ err = -EINVAL; if ((count < 11) || (count > MAX_REGISTER_LENGTH)) @@ -299,7 +313,7 @@ static Node *create_entry(const char __user *buffer, size_t count) err = -ENOMEM; memsize = sizeof(Node) + count + 8; - e = kmalloc(memsize, GFP_USER); + e = kmalloc(memsize, GFP_KERNEL); if (!e) goto out; @@ -307,98 +321,173 @@ static Node *create_entry(const char __user *buffer, size_t count) memset(e, 0, sizeof(Node)); if (copy_from_user(buf, buffer, count)) - goto Efault; + goto efault; del = *p++; /* delimeter */ - memset(buf+count, del, 8); + pr_debug("register: delim: %#x {%c}\n", del, del); + + /* Pad the buffer with the delim to simplify parsing below. */ + memset(buf + count, del, 8); + /* Parse the 'name' field. */ e->name = p; p = strchr(p, del); if (!p) - goto Einval; + goto einval; *p++ = '\0'; if (!e->name[0] || !strcmp(e->name, ".") || !strcmp(e->name, "..") || strchr(e->name, '/')) - goto Einval; + goto einval; + + pr_debug("register: name: {%s}\n", e->name); + + /* Parse the 'type' field. */ switch (*p++) { - case 'E': e->flags = 1<<Enabled; break; - case 'M': e->flags = (1<<Enabled) | (1<<Magic); break; - default: goto Einval; + case 'E': + pr_debug("register: type: E (extension)\n"); + e->flags = 1 << Enabled; + break; + case 'M': + pr_debug("register: type: M (magic)\n"); + e->flags = (1 << Enabled) | (1 << Magic); + break; + default: + goto einval; } if (*p++ != del) - goto Einval; + goto einval; + if (test_bit(Magic, &e->flags)) { - char *s = strchr(p, del); + /* Handle the 'M' (magic) format. */ + char *s; + + /* Parse the 'offset' field. */ + s = strchr(p, del); if (!s) - goto Einval; + goto einval; *s++ = '\0'; e->offset = simple_strtoul(p, &p, 10); if (*p++) - goto Einval; + goto einval; + pr_debug("register: offset: %#x\n", e->offset); + + /* Parse the 'magic' field. */ e->magic = p; p = scanarg(p, del); if (!p) - goto Einval; - p[-1] = '\0'; + goto einval; if (!e->magic[0]) - goto Einval; + goto einval; + if (USE_DEBUG) + print_hex_dump_bytes( + KBUILD_MODNAME ": register: magic[raw]: ", + DUMP_PREFIX_NONE, e->magic, p - e->magic); + + /* Parse the 'mask' field. */ e->mask = p; p = scanarg(p, del); if (!p) - goto Einval; - p[-1] = '\0'; - if (!e->mask[0]) + goto einval; + if (!e->mask[0]) { e->mask = NULL; + pr_debug("register: mask[raw]: none\n"); + } else if (USE_DEBUG) + print_hex_dump_bytes( + KBUILD_MODNAME ": register: mask[raw]: ", + DUMP_PREFIX_NONE, e->mask, p - e->mask); + + /* + * Decode the magic & mask fields. + * Note: while we might have accepted embedded NUL bytes from + * above, the unescape helpers here will stop at the first one + * it encounters. + */ e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX); if (e->mask && string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) - goto Einval; + goto einval; if (e->size + e->offset > BINPRM_BUF_SIZE) - goto Einval; + goto einval; + pr_debug("register: magic/mask length: %i\n", e->size); + if (USE_DEBUG) { + print_hex_dump_bytes( + KBUILD_MODNAME ": register: magic[decoded]: ", + DUMP_PREFIX_NONE, e->magic, e->size); + + if (e->mask) { + int i; + char *masked = kmalloc(e->size, GFP_KERNEL); + + print_hex_dump_bytes( + KBUILD_MODNAME ": register: mask[decoded]: ", + DUMP_PREFIX_NONE, e->mask, e->size); + + if (masked) { + for (i = 0; i < e->size; ++i) + masked[i] = e->magic[i] & e->mask[i]; + print_hex_dump_bytes( + KBUILD_MODNAME ": register: magic[masked]: ", + DUMP_PREFIX_NONE, masked, e->size); + + kfree(masked); + } + } + } } else { + /* Handle the 'E' (extension) format. */ + + /* Skip the 'offset' field. */ p = strchr(p, del); if (!p) - goto Einval; + goto einval; *p++ = '\0'; + + /* Parse the 'magic' field. */ e->magic = p; p = strchr(p, del); if (!p) - goto Einval; + goto einval; *p++ = '\0'; if (!e->magic[0] || strchr(e->magic, '/')) - goto Einval; + goto einval; + pr_debug("register: extension: {%s}\n", e->magic); + + /* Skip the 'mask' field. */ p = strchr(p, del); if (!p) - goto Einval; + goto einval; *p++ = '\0'; } + + /* Parse the 'interpreter' field. */ e->interpreter = p; p = strchr(p, del); if (!p) - goto Einval; + goto einval; *p++ = '\0'; if (!e->interpreter[0]) - goto Einval; - - - p = check_special_flags (p, e); + goto einval; + pr_debug("register: interpreter: {%s}\n", e->interpreter); + /* Parse the 'flags' field. */ + p = check_special_flags(p, e); if (*p == '\n') p++; if (p != buf + count) - goto Einval; + goto einval; + return e; out: return ERR_PTR(err); -Efault: +efault: kfree(e); return ERR_PTR(-EFAULT); -Einval: +einval: kfree(e); return ERR_PTR(-EINVAL); } @@ -417,7 +506,7 @@ static int parse_command(const char __user *buffer, size_t count) return -EFAULT; if (!count) return 0; - if (s[count-1] == '\n') + if (s[count - 1] == '\n') count--; if (count == 1 && s[0] == '0') return 1; @@ -434,7 +523,7 @@ static void entry_status(Node *e, char *page) { char *dp; char *status = "disabled"; - const char * flags = "flags: "; + const char *flags = "flags: "; if (test_bit(Enabled, &e->flags)) status = "enabled"; @@ -448,19 +537,15 @@ static void entry_status(Node *e, char *page) dp = page + strlen(page); /* print the special flags */ - sprintf (dp, "%s", flags); - dp += strlen (flags); - if (e->flags & MISC_FMT_PRESERVE_ARGV0) { - *dp ++ = 'P'; - } - if (e->flags & MISC_FMT_OPEN_BINARY) { - *dp ++ = 'O'; - } - if (e->flags & MISC_FMT_CREDENTIALS) { - *dp ++ = 'C'; - } - *dp ++ = '\n'; - + sprintf(dp, "%s", flags); + dp += strlen(flags); + if (e->flags & MISC_FMT_PRESERVE_ARGV0) + *dp++ = 'P'; + if (e->flags & MISC_FMT_OPEN_BINARY) + *dp++ = 'O'; + if (e->flags & MISC_FMT_CREDENTIALS) + *dp++ = 'C'; + *dp++ = '\n'; if (!test_bit(Magic, &e->flags)) { sprintf(dp, "extension .%s\n", e->magic); @@ -488,7 +573,7 @@ static void entry_status(Node *e, char *page) static struct inode *bm_get_inode(struct super_block *sb, int mode) { - struct inode * inode = new_inode(sb); + struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); @@ -528,13 +613,14 @@ static void kill_node(Node *e) /* /<entry> */ static ssize_t -bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos) +bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { Node *e = file_inode(file)->i_private; ssize_t res; char *page; - if (!(page = (char*) __get_free_page(GFP_KERNEL))) + page = (char *) __get_free_page(GFP_KERNEL); + if (!page) return -ENOMEM; entry_status(e, page); @@ -553,20 +639,28 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, int res = parse_command(buffer, count); switch (res) { - case 1: clear_bit(Enabled, &e->flags); - break; - case 2: set_bit(Enabled, &e->flags); - break; - case 3: root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&root->d_inode->i_mutex); - - kill_node(e); - - mutex_unlock(&root->d_inode->i_mutex); - dput(root); - break; - default: return res; + case 1: + /* Disable this handler. */ + clear_bit(Enabled, &e->flags); + break; + case 2: + /* Enable this handler. */ + set_bit(Enabled, &e->flags); + break; + case 3: + /* Delete this handler. */ + root = dget(file->f_path.dentry->d_sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + + kill_node(e); + + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + break; + default: + return res; } + return count; } @@ -654,26 +748,36 @@ bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } -static ssize_t bm_status_write(struct file * file, const char __user * buffer, +static ssize_t bm_status_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { int res = parse_command(buffer, count); struct dentry *root; switch (res) { - case 1: enabled = 0; break; - case 2: enabled = 1; break; - case 3: root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&root->d_inode->i_mutex); - - while (!list_empty(&entries)) - kill_node(list_entry(entries.next, Node, list)); - - mutex_unlock(&root->d_inode->i_mutex); - dput(root); - break; - default: return res; + case 1: + /* Disable all handlers. */ + enabled = 0; + break; + case 2: + /* Enable all handlers. */ + enabled = 1; + break; + case 3: + /* Delete all handlers. */ + root = dget(file->f_path.dentry->d_sb->s_root); + mutex_lock(&root->d_inode->i_mutex); + + while (!list_empty(&entries)) + kill_node(list_entry(entries.next, Node, list)); + + mutex_unlock(&root->d_inode->i_mutex); + dput(root); + break; + default: + return res; } + return count; } @@ -690,14 +794,16 @@ static const struct super_operations s_ops = { .evict_inode = bm_evict_inode, }; -static int bm_fill_super(struct super_block * sb, void * data, int silent) +static int bm_fill_super(struct super_block *sb, void *data, int silent) { + int err; static struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} }; - int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); + + err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files); if (!err) sb->s_op = &s_ops; return err; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 5027a3e14922..afdf4e3cafc2 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -24,6 +24,16 @@ static int load_script(struct linux_binprm *bprm) if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) return -ENOEXEC; + + /* + * If the script filename will be inaccessible after exec, typically + * because it is a "/dev/fd/<fd>/.." path against an O_CLOEXEC fd, give + * up now (on the assumption that the interpreter will want to load + * this file). + */ + if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) + return -ENOENT; + /* * This section does the #! interpretation. * Sorta complicated, but hopefully it will work. -TYT diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c deleted file mode 100644 index 4e00ed68d4a6..000000000000 --- a/fs/binfmt_som.c +++ /dev/null @@ -1,299 +0,0 @@ -/* - * linux/fs/binfmt_som.c - * - * These are the functions used to load SOM format executables as used - * by HP-UX. - * - * Copyright 1999 Matthew Wilcox <willy@bofh.ai> - * based on binfmt_elf which is - * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). - */ - -#include <linux/module.h> - -#include <linux/fs.h> -#include <linux/stat.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/binfmts.h> -#include <linux/som.h> -#include <linux/string.h> -#include <linux/file.h> -#include <linux/fcntl.h> -#include <linux/ptrace.h> -#include <linux/slab.h> -#include <linux/shm.h> -#include <linux/personality.h> -#include <linux/init.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> - - -#include <linux/elf.h> - -static int load_som_binary(struct linux_binprm * bprm); -static int load_som_library(struct file *); - -/* - * If we don't support core dumping, then supply a NULL so we - * don't even try. - */ -#if 0 -static int som_core_dump(struct coredump_params *cprm); -#else -#define som_core_dump NULL -#endif - -#define SOM_PAGESTART(_v) ((_v) & ~(unsigned long)(SOM_PAGESIZE-1)) -#define SOM_PAGEOFFSET(_v) ((_v) & (SOM_PAGESIZE-1)) -#define SOM_PAGEALIGN(_v) (((_v) + SOM_PAGESIZE - 1) & ~(SOM_PAGESIZE - 1)) - -static struct linux_binfmt som_format = { - .module = THIS_MODULE, - .load_binary = load_som_binary, - .load_shlib = load_som_library, - .core_dump = som_core_dump, - .min_coredump = SOM_PAGESIZE -}; - -/* - * create_som_tables() parses the env- and arg-strings in new user - * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. - */ -static void create_som_tables(struct linux_binprm *bprm) -{ - char **argv, **envp; - int argc = bprm->argc; - int envc = bprm->envc; - unsigned long p; - unsigned long *sp; - - /* Word-align the stack pointer */ - sp = (unsigned long *)((bprm->p + 3) & ~3); - - envp = (char **) sp; - sp += envc + 1; - argv = (char **) sp; - sp += argc + 1; - - __put_user((unsigned long) envp,++sp); - __put_user((unsigned long) argv,++sp); - - __put_user(argc, ++sp); - - bprm->p = (unsigned long) sp; - - p = current->mm->arg_start; - while (argc-- > 0) { - __put_user((char *)p,argv++); - p += strlen_user((char *)p); - } - __put_user(NULL, argv); - current->mm->arg_end = current->mm->env_start = p; - while (envc-- > 0) { - __put_user((char *)p,envp++); - p += strlen_user((char *)p); - } - __put_user(NULL, envp); - current->mm->env_end = p; -} - -static int check_som_header(struct som_hdr *som_ex) -{ - int *buf = (int *)som_ex; - int i, ck; - - if (som_ex->system_id != SOM_SID_PARISC_1_0 && - som_ex->system_id != SOM_SID_PARISC_1_1 && - som_ex->system_id != SOM_SID_PARISC_2_0) - return -ENOEXEC; - - if (som_ex->a_magic != SOM_EXEC_NONSHARE && - som_ex->a_magic != SOM_EXEC_SHARE && - som_ex->a_magic != SOM_EXEC_DEMAND) - return -ENOEXEC; - - if (som_ex->version_id != SOM_ID_OLD && - som_ex->version_id != SOM_ID_NEW) - return -ENOEXEC; - - ck = 0; - for (i=0; i<32; i++) - ck ^= buf[i]; - if (ck != 0) - return -ENOEXEC; - - return 0; -} - -static int map_som_binary(struct file *file, - const struct som_exec_auxhdr *hpuxhdr) -{ - unsigned long code_start, code_size, data_start, data_size; - unsigned long bss_start, som_brk; - int retval; - int prot = PROT_READ | PROT_EXEC; - int flags = MAP_FIXED|MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE; - - mm_segment_t old_fs = get_fs(); - set_fs(get_ds()); - - code_start = SOM_PAGESTART(hpuxhdr->exec_tmem); - code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize); - current->mm->start_code = code_start; - current->mm->end_code = code_start + code_size; - retval = vm_mmap(file, code_start, code_size, prot, - flags, SOM_PAGESTART(hpuxhdr->exec_tfile)); - if (retval < 0 && retval > -1024) - goto out; - - data_start = SOM_PAGESTART(hpuxhdr->exec_dmem); - data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize); - current->mm->start_data = data_start; - current->mm->end_data = bss_start = data_start + data_size; - retval = vm_mmap(file, data_start, data_size, - prot | PROT_WRITE, flags, - SOM_PAGESTART(hpuxhdr->exec_dfile)); - if (retval < 0 && retval > -1024) - goto out; - - som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize); - current->mm->start_brk = current->mm->brk = som_brk; - retval = vm_mmap(NULL, bss_start, som_brk - bss_start, - prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0); - if (retval > 0 || retval < -1024) - retval = 0; -out: - set_fs(old_fs); - return retval; -} - - -/* - * These are the functions used to load SOM executables and shared - * libraries. There is no binary dependent code anywhere else. - */ - -static int -load_som_binary(struct linux_binprm * bprm) -{ - int retval; - unsigned int size; - unsigned long som_entry; - struct som_hdr *som_ex; - struct som_exec_auxhdr *hpuxhdr; - struct pt_regs *regs = current_pt_regs(); - - /* Get the exec-header */ - som_ex = (struct som_hdr *) bprm->buf; - - retval = check_som_header(som_ex); - if (retval != 0) - goto out; - - /* Now read in the auxiliary header information */ - - retval = -ENOMEM; - size = som_ex->aux_header_size; - if (size > SOM_PAGESIZE) - goto out; - hpuxhdr = kmalloc(size, GFP_KERNEL); - if (!hpuxhdr) - goto out; - - retval = kernel_read(bprm->file, som_ex->aux_header_location, - (char *) hpuxhdr, size); - if (retval != size) { - if (retval >= 0) - retval = -EIO; - goto out_free; - } - - /* Flush all traces of the currently running executable */ - retval = flush_old_exec(bprm); - if (retval) - goto out_free; - - /* OK, This is the point of no return */ - current->personality = PER_HPUX; - setup_new_exec(bprm); - - /* Set the task size for HP-UX processes such that - * the gateway page is outside the address space. - * This can be fixed later, but for now, this is much - * easier. - */ - - current->thread.task_size = 0xc0000000; - - /* Set map base to allow enough room for hp-ux heap growth */ - - current->thread.map_base = 0x80000000; - - retval = map_som_binary(bprm->file, hpuxhdr); - if (retval < 0) - goto out_free; - - som_entry = hpuxhdr->exec_entry; - kfree(hpuxhdr); - - set_binfmt(&som_format); - install_exec_creds(bprm); - setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - - create_som_tables(bprm); - - current->mm->start_stack = bprm->p; - -#if 0 - printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); - printk("(end_code) %08lx\n" , (unsigned long) current->mm->end_code); - printk("(start_code) %08lx\n" , (unsigned long) current->mm->start_code); - printk("(end_data) %08lx\n" , (unsigned long) current->mm->end_data); - printk("(start_stack) %08lx\n" , (unsigned long) current->mm->start_stack); - printk("(brk) %08lx\n" , (unsigned long) current->mm->brk); -#endif - - map_hpux_gateway_page(current,current->mm); - - start_thread_som(regs, som_entry, bprm->p); - return 0; - - /* error cleanup */ -out_free: - kfree(hpuxhdr); -out: - return retval; -} - -static int load_som_library(struct file *f) -{ -/* No lib support in SOM yet. gizza chance.. */ - return -ENOEXEC; -} - /* Install the SOM loader. - * N.B. We *rely* on the table being the right size with the - * right number of free slots... - */ - -static int __init init_som_binfmt(void) -{ - register_binfmt(&som_format); - return 0; -} - -static void __exit exit_som_binfmt(void) -{ - /* Remove the SOM loader. */ - unregister_binfmt(&som_format); -} - -core_initcall(init_som_binfmt); -module_exit(exit_som_binfmt); - -MODULE_LICENSE("GPL"); diff --git a/fs/block_dev.c b/fs/block_dev.c index 1d9c9f3754f8..975266be67d3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -49,23 +49,15 @@ inline struct block_device *I_BDEV(struct inode *inode) } EXPORT_SYMBOL(I_BDEV); -/* - * Move the inode from its current bdi to a new bdi. Make sure the inode - * is clean before moving so that it doesn't linger on the old bdi. - */ -static void bdev_inode_switch_bdi(struct inode *inode, - struct backing_dev_info *dst) +static void bdev_write_inode(struct inode *inode) { - while (true) { - spin_lock(&inode->i_lock); - if (!(inode->i_state & I_DIRTY)) { - inode->i_data.backing_dev_info = dst; - spin_unlock(&inode->i_lock); - return; - } + spin_lock(&inode->i_lock); + while (inode->i_state & I_DIRTY) { spin_unlock(&inode->i_lock); WARN_ON_ONCE(write_inode_now(inode, true)); + spin_lock(&inode->i_lock); } + spin_unlock(&inode->i_lock); } /* Kill _all_ buffers and pagecache , dirty or not.. */ @@ -235,7 +227,10 @@ struct super_block *freeze_bdev(struct block_device *bdev) sb = get_active_super(bdev); if (!sb) goto out; - error = freeze_super(sb); + if (sb->s_op->freeze_super) + error = sb->s_op->freeze_super(sb); + else + error = freeze_super(sb); if (error) { deactivate_super(sb); bdev->bd_fsfreeze_count--; @@ -272,7 +267,10 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) if (!sb) goto out; - error = thaw_super(sb); + if (sb->s_op->thaw_super) + error = sb->s_op->thaw_super(sb); + else + error = thaw_super(sb); if (error) { bdev->bd_fsfreeze_count++; mutex_unlock(&bdev->bd_fsfreeze_mutex); @@ -423,6 +421,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); +/** + * bdev_direct_access() - Get the address for directly-accessibly memory + * @bdev: The device containing the memory + * @sector: The offset within the device + * @addr: Where to put the address of the memory + * @pfn: The Page Frame Number for the memory + * @size: The number of bytes requested + * + * If a block device is made up of directly addressable memory, this function + * will tell the caller the PFN and the address of the memory. The address + * may be directly dereferenced within the kernel without the need to call + * ioremap(), kmap() or similar. The PFN is suitable for inserting into + * page tables. + * + * Return: negative errno if an error occurs, otherwise the number of bytes + * accessible at this address. + */ +long bdev_direct_access(struct block_device *bdev, sector_t sector, + void **addr, unsigned long *pfn, long size) +{ + long avail; + const struct block_device_operations *ops = bdev->bd_disk->fops; + + if (size < 0) + return size; + if (!ops->direct_access) + return -EOPNOTSUPP; + if ((sector + DIV_ROUND_UP(size, 512)) > + part_nr_sects_read(bdev->bd_part)) + return -ERANGE; + sector += get_start_sect(bdev); + if (sector % (PAGE_SIZE / 512)) + return -EINVAL; + avail = ops->direct_access(bdev, sector, addr, pfn, size); + if (!avail) + return -ERANGE; + return min(avail, size); +} +EXPORT_SYMBOL_GPL(bdev_direct_access); + /* * pseudo-fs */ @@ -578,7 +616,6 @@ struct block_device *bdget(dev_t dev) inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); - inode->i_data.backing_dev_info = &default_backing_dev_info; spin_lock(&bdev_lock); list_add(&bdev->bd_list, &all_bdevs); spin_unlock(&bdev_lock); @@ -1139,8 +1176,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { - struct backing_dev_info *bdi; - ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); if (!bdev->bd_part) @@ -1166,11 +1201,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) { + if (!ret) bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - bdi = blk_get_backing_dev_info(bdev); - bdev_inode_switch_bdi(bdev->bd_inode, bdi); - } /* * If the device is invalidated, rescan partition @@ -1197,8 +1229,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_clear; bdev->bd_contains = whole; - bdev_inode_switch_bdi(bdev->bd_inode, - whole->bd_inode->i_data.backing_dev_info); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1238,7 +1268,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; - bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1458,11 +1487,11 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); - /* ->release can cause the old bdi to disappear, - * so must switch it out first + /* + * ->release can cause the queue to disappear, so flush all + * dirty data before. */ - bdev_inode_switch_bdi(bdev->bd_inode, - &default_backing_dev_info); + bdev_write_inode(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index a66768ebc8d1..80e9c18ea64f 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -8,6 +8,7 @@ config BTRFS_FS select LZO_DECOMPRESS select RAID6_PQ select XOR_BLOCKS + select SRCU help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 2d3e32ebfd15..f55721ff9385 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1246,25 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, return ret; } -/* - * this makes the path point to (inum INODE_ITEM ioff) - */ -int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path) -{ - struct btrfs_key key; - return btrfs_find_item(fs_root, path, inum, ioff, - BTRFS_INODE_ITEM_KEY, &key); -} - -static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path, - struct btrfs_key *found_key) -{ - return btrfs_find_item(fs_root, path, inum, ioff, - BTRFS_INODE_REF_KEY, found_key); -} - int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, @@ -1374,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } - ret = inode_ref_info(parent, 0, fs_root, path, &found_key); + ret = btrfs_find_item(fs_root, path, parent, 0, + BTRFS_INODE_REF_KEY, &found_key); if (ret > 0) ret = -ENOENT; if (ret) @@ -1552,7 +1534,6 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, { int ret; int type; - struct btrfs_tree_block_info *info; struct btrfs_extent_inline_ref *eiref; if (*ptr == (unsigned long)-1) @@ -1573,9 +1554,17 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, } /* we can treat both ref types equally here */ - info = (struct btrfs_tree_block_info *)(ei + 1); *out_root = btrfs_extent_inline_ref_offset(eb, eiref); - *out_level = btrfs_tree_block_level(eb, info); + + if (key->type == BTRFS_EXTENT_ITEM_KEY) { + struct btrfs_tree_block_info *info; + + info = (struct btrfs_tree_block_info *)(ei + 1); + *out_level = btrfs_tree_block_level(eb, info); + } else { + ASSERT(key->type == BTRFS_METADATA_ITEM_KEY); + *out_level = (u8)key->offset; + } if (ret == 1) *ptr = (unsigned long)-1; @@ -1720,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { - ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, - &found_key); + ret = btrfs_find_item(fs_root, path, inum, + parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY, + &found_key); + if (ret < 0) break; if (ret) { diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2a1ac6bfc724..9c41fbac3009 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -32,9 +32,6 @@ struct inode_fs_paths { typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); -int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root, - struct btrfs_path *path); - int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4aadadcfab20..de5e4f2adfea 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -185,6 +185,9 @@ struct btrfs_inode { struct btrfs_delayed_node *delayed_node; + /* File creation time. */ + struct timespec i_otime; + struct inode vfs_inode; }; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index cb7f3fe9c9f6..d897ef803b3b 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -94,6 +94,7 @@ #include <linux/mutex.h> #include <linux/genhd.h> #include <linux/blkdev.h> +#include <linux/vmalloc.h> #include "ctree.h" #include "disk-io.h" #include "hash.h" @@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state, static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfsic_block_data_ctx *block_ctx_out, int mirror_num); -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out); static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); @@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block( l = NULL; next_block->generation = BTRFSIC_GENERATION_UNKNOWN; } else { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch (!= stored %llu).\n", - next_bytenr, next_block_ctx->dev->name, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, next_block), - next_block->logical_bytenr); - } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Referenced block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - next_bytenr, next_block_ctx->dev->name, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, next_block)); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { + if (next_block->logical_bytenr != next_bytenr && + !(!next_block->is_metadata && + 0 == next_block->logical_bytenr)) + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, + next_block), + next_block->logical_bytenr); + else + printk(KERN_INFO + "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n", + next_bytenr, next_block_ctx->dev->name, + next_block_ctx->dev_bytenr, *mirror_nump, + btrfsic_get_block_type(state, + next_block)); + } next_block->logical_bytenr = next_bytenr; next_block->mirror_num = *mirror_nump; @@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data( return -1; } if (!block_was_created) { - if (next_block->logical_bytenr != next_bytenr && + if ((state->print_mask & + BTRFSIC_PRINT_MASK_VERBOSE) && + next_block->logical_bytenr != next_bytenr && !(!next_block->is_metadata && 0 == next_block->logical_bytenr)) { printk(KERN_INFO @@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, return ret; } -static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr, - u32 len, struct block_device *bdev, - struct btrfsic_block_data_ctx *block_ctx_out) -{ - block_ctx_out->dev = btrfsic_dev_state_lookup(bdev); - block_ctx_out->dev_bytenr = bytenr; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - if (NULL != block_ctx_out->dev) { - return 0; - } else { - printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n"); - return -ENXIO; - } -} - static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) { if (block_ctx->mem_to_free) { @@ -1901,25 +1883,26 @@ again: dev_state, dev_bytenr); } - if (block->logical_bytenr != bytenr && - !(!block->is_metadata && - block->logical_bytenr == 0)) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c," - " bytenr mismatch" - " (!= stored %llu).\n", - bytenr, dev_state->name, dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block), - block->logical_bytenr); - else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - printk(KERN_INFO - "Written block @%llu (%s/%llu/%d)" - " found in hash table, %c.\n", - bytenr, dev_state->name, dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); + if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { + if (block->logical_bytenr != bytenr && + !(!block->is_metadata && + block->logical_bytenr == 0)) + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", + bytenr, dev_state->name, + dev_bytenr, + block->mirror_num, + btrfsic_get_block_type(state, + block), + block->logical_bytenr); + else + printk(KERN_INFO + "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n", + bytenr, dev_state->name, + dev_bytenr, block->mirror_num, + btrfsic_get_block_type(state, + block)); + } block->logical_bytenr = bytenr; } else { if (num_pages * PAGE_CACHE_SIZE < @@ -2002,24 +1985,13 @@ again: } } - if (block->is_superblock) - ret = btrfsic_map_superblock(state, bytenr, - processed_len, - bdev, &block_ctx); - else - ret = btrfsic_map_block(state, bytenr, processed_len, - &block_ctx, 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", bytenr); - goto continue_loop; - } - block_ctx.datav = mapped_datav; - /* the following is required in case of writes to mirrors, - * use the same that was used for the lookup */ block_ctx.dev = dev_state; block_ctx.dev_bytenr = dev_bytenr; + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.pagev = NULL; + block_ctx.mem_to_free = NULL; + block_ctx.datav = mapped_datav; if (is_metadata || state->include_extent_data) { block->never_written = 0; @@ -2133,10 +2105,6 @@ again: /* this is getting ugly for the * include_extent_data case... */ bytenr = 0; /* unknown */ - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.mem_to_free = NULL; - block_ctx.pagev = NULL; } else { processed_len = state->metablock_size; bytenr = btrfs_stack_header_bytenr( @@ -2149,22 +2117,15 @@ again: "Written block @%llu (%s/%llu/?)" " !found in hash table, M.\n", bytenr, dev_state->name, dev_bytenr); - - ret = btrfsic_map_block(state, bytenr, processed_len, - &block_ctx, 0); - if (ret) { - printk(KERN_INFO - "btrfsic: btrfsic_map_block(root @%llu)" - " failed!\n", - dev_bytenr); - goto continue_loop; - } } - block_ctx.datav = mapped_datav; - /* the following is required in case of writes to mirrors, - * use the same that was used for the lookup */ + block_ctx.dev = dev_state; block_ctx.dev_bytenr = dev_bytenr; + block_ctx.start = bytenr; + block_ctx.len = processed_len; + block_ctx.pagev = NULL; + block_ctx.mem_to_free = NULL; + block_ctx.datav = mapped_datav; block = btrfsic_block_alloc(); if (NULL == block) { @@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root, root->sectorsize, PAGE_CACHE_SIZE); return -1; } - state = kzalloc(sizeof(*state), GFP_NOFS); - if (NULL == state) { - printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n"); - return -1; + state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!state) { + state = vzalloc(sizeof(*state)); + if (!state) { + printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n"); + return -1; + } } if (!btrfsic_is_initialized) { @@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root, mutex_unlock(&btrfsic_mutex); - kfree(state); + if (is_vmalloc_addr(state)) + vfree(state); + else + kfree(state); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d3220d31d3cb..e9df8862012c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -224,16 +224,19 @@ out: * Clear the writeback bits on all of the file * pages for a compressed write */ -static noinline void end_compressed_writeback(struct inode *inode, u64 start, - unsigned long ram_size) +static noinline void end_compressed_writeback(struct inode *inode, + const struct compressed_bio *cb) { - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + unsigned long index = cb->start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT; struct page *pages[16]; unsigned long nr_pages = end_index - index + 1; int i; int ret; + if (cb->errors) + mapping_set_error(inode->i_mapping, -EIO); + while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, min_t(unsigned long, @@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start, continue; } for (i = 0; i < ret; i++) { + if (cb->errors) + SetPageError(pages[i]); end_page_writeback(pages[i]); page_cache_release(pages[i]); } @@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err) tree->ops->writepage_end_io_hook(cb->compressed_pages[0], cb->start, cb->start + cb->len - 1, - NULL, 1); + NULL, + err ? 0 : 1); cb->compressed_pages[0]->mapping = NULL; - end_compressed_writeback(inode, cb->start, cb->len); + end_compressed_writeback(inode, cb); /* note, our inode could be gone now */ /* @@ -1011,8 +1017,6 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, bytes = min(bytes, working_bytes); kaddr = kmap_atomic(page_out); memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); - if (*pg_index == (vcnt - 1) && *pg_offset == 0) - memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes); kunmap_atomic(kaddr); flush_dcache_page(page_out); @@ -1054,3 +1058,34 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, return 1; } + +/* + * When uncompressing data, we need to make sure and zero any parts of + * the biovec that were not filled in by the decompression code. pg_index + * and pg_offset indicate the last page and the last offset of that page + * that have been filled in. This will zero everything remaining in the + * biovec. + */ +void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, + unsigned long pg_index, + unsigned long pg_offset) +{ + while (pg_index < vcnt) { + struct page *page = bvec[pg_index].bv_page; + unsigned long off = bvec[pg_index].bv_offset; + unsigned long len = bvec[pg_index].bv_len; + + if (pg_offset < off) + pg_offset = off; + if (pg_offset < off + len) { + unsigned long bytes = off + len - pg_offset; + char *kaddr; + + kaddr = kmap_atomic(page); + memset(kaddr + pg_offset, 0, bytes); + kunmap_atomic(kaddr); + } + pg_index++; + pg_offset = 0; + } +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 0c803b4fbf93..d181f70caae0 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -45,7 +45,9 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long nr_pages); int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); - +void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, + unsigned long pg_index, + unsigned long pg_offset); struct btrfs_compress_op { struct list_head *(*alloc_workspace)(void); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 19bc6162fb8e..6d67f32e648d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -80,13 +80,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, { int i; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* lockdep really cares that we take all of these spinlocks - * in the right order. If any of the locks in the path are not - * currently blocking, it is going to complain. So, make really - * really sure by forcing the path to blocking before we clear - * the path blocking. - */ if (held) { btrfs_set_lock_blocking_rw(held, held_rw); if (held_rw == BTRFS_WRITE_LOCK) @@ -95,7 +88,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, held_rw = BTRFS_READ_LOCK_BLOCKING; } btrfs_set_path_blocking(p); -#endif for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { if (p->nodes[i] && p->locks[i]) { @@ -107,10 +99,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, } } -#ifdef CONFIG_DEBUG_LOCK_ALLOC if (held) btrfs_clear_lock_blocking_rw(held, held_rw); -#endif } /* this also releases the path */ @@ -223,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) */ static void add_root_to_dirty_list(struct btrfs_root *root) { + if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || + !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) + return; + spin_lock(&root->fs_info->trans_lock); - if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && - list_empty(&root->dirty_list)) { - list_add(&root->dirty_list, - &root->fs_info->dirty_cowonly_roots); + if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { + /* Want the extent tree to be the last on the list */ + if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID) + list_move_tail(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + else + list_move(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); } spin_unlock(&root->fs_info->trans_lock); } @@ -1373,8 +1371,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(eb->start, - fs_info->tree_root->nodesize); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); if (!eb_rewin) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1454,7 +1451,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } else if (old_root) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(logical, root->nodesize); + eb = alloc_dummy_extent_buffer(root->fs_info, logical); } else { btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); @@ -1648,14 +1645,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, parent_nritems = btrfs_header_nritems(parent); blocksize = root->nodesize; - end_slot = parent_nritems; + end_slot = parent_nritems - 1; - if (parent_nritems == 1) + if (parent_nritems <= 1) return 0; btrfs_set_lock_blocking(parent); - for (i = start_slot; i < end_slot; i++) { + for (i = start_slot; i <= end_slot; i++) { int close = 1; btrfs_node_key(parent, &disk_key, i); @@ -1672,7 +1669,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, other = btrfs_node_blockptr(parent, i - 1); close = close_blocks(blocknr, other, blocksize); } - if (!close && i < end_slot - 2) { + if (!close && i < end_slot) { other = btrfs_node_blockptr(parent, i + 1); close = close_blocks(blocknr, other, blocksize); } @@ -2292,7 +2289,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - readahead_tree_block(root, search, blocksize); + readahead_tree_block(root, search); nread += blocksize; } nscan++; @@ -2311,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root, u64 gen; u64 block1 = 0; u64 block2 = 0; - int blocksize; parent = path->nodes[level + 1]; if (!parent) @@ -2319,7 +2315,6 @@ static noinline void reada_for_balance(struct btrfs_root *root, nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - blocksize = root->nodesize; if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); @@ -2344,9 +2339,9 @@ static noinline void reada_for_balance(struct btrfs_root *root, } if (block1) - readahead_tree_block(root, block1, blocksize); + readahead_tree_block(root, block1); if (block2) - readahead_tree_block(root, block2, blocksize); + readahead_tree_block(root, block2); } @@ -2619,32 +2614,24 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key, return 0; } -int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, +int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 iobjectid, u64 ioff, u8 key_type, struct btrfs_key *found_key) { int ret; struct btrfs_key key; struct extent_buffer *eb; - struct btrfs_path *path; + + ASSERT(path); + ASSERT(found_key); key.type = key_type; key.objectid = iobjectid; key.offset = ioff; - if (found_path == NULL) { - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - } else - path = found_path; - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); - if ((ret < 0) || (found_key == NULL)) { - if (path != found_path) - btrfs_free_path(path); + if (ret < 0) return ret; - } eb = path->nodes[0]; if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { @@ -2893,7 +2880,7 @@ cow_done: } p->locks[level] = BTRFS_WRITE_LOCK; } else { - err = btrfs_try_tree_read_lock(b); + err = btrfs_tree_read_lock_atomic(b); if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); @@ -2939,7 +2926,7 @@ done: */ if (!p->leave_spinning) btrfs_set_path_blocking(p); - if (ret < 0) + if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); return ret; } @@ -3025,7 +3012,7 @@ again: } level = btrfs_header_level(b); - err = btrfs_try_tree_read_lock(b); + err = btrfs_tree_read_lock_atomic(b); if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); @@ -3393,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); extent_buffer_get(c); path->nodes[level] = c; - path->locks[level] = BTRFS_WRITE_LOCK; + path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; path->slots[level] = 0; return 0; } @@ -4366,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, path->search_for_split = 1; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); path->search_for_split = 0; + if (ret > 0) + ret = -EAGAIN; if (ret < 0) goto err; ret = -EAGAIN; leaf = path->nodes[0]; - /* if our item isn't there or got smaller, return now */ - if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + /* if our item isn't there, return now */ + if (item_size != btrfs_item_size_nr(leaf, path->slots[0])) goto err; /* the leaf has changed, it now has room. return now */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fe69edda11fb..f9c89cae39ee 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) +#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. @@ -607,6 +609,7 @@ struct btrfs_path { unsigned int leave_spinning:1; unsigned int search_commit_root:1; unsigned int need_commit_sem:1; + unsigned int skip_release_on_error:1; }; /* @@ -1019,6 +1022,9 @@ enum btrfs_raid_types { BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID10) +#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ + BTRFS_BLOCK_GROUP_RAID6) + /* * We need a bit for restriper to be able to tell when chunks of type * SINGLE are available. This "extended" profile format is used in @@ -1170,6 +1176,8 @@ struct btrfs_space_info { struct percpu_counter total_bytes_pinned; struct list_head list; + /* Protected by the spinlock 'lock'. */ + struct list_head ro_bgs; struct rw_semaphore groups_sem; /* for block groups in our same type */ @@ -1236,7 +1244,6 @@ enum btrfs_disk_cache_state { BTRFS_DC_ERROR = 1, BTRFS_DC_CLEAR = 2, BTRFS_DC_SETUP = 3, - BTRFS_DC_NEED_WRITE = 4, }; struct btrfs_caching_control { @@ -1274,8 +1281,9 @@ struct btrfs_block_group_cache { unsigned long full_stripe_len; unsigned int ro:1; - unsigned int dirty:1; unsigned int iref:1; + unsigned int has_caching_ctl:1; + unsigned int removed:1; int disk_cache_state; @@ -1305,6 +1313,14 @@ struct btrfs_block_group_cache { /* For delayed block group creation or deletion of empty block groups */ struct list_head bg_list; + + /* For read-only block groups */ + struct list_head ro_list; + + atomic_t trimming; + + /* For dirty block groups */ + struct list_head dirty_list; }; /* delayed seq elem */ @@ -1402,6 +1418,11 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; unsigned long mount_opt; + /* + * Track requests for actions that need to be done during transaction + * commit (like for some mount options). + */ + unsigned long pending_changes; unsigned long compress_type:4; int commit_interval; /* @@ -1726,9 +1747,16 @@ struct btrfs_fs_info { spinlock_t unused_bgs_lock; struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; /* For btrfs to record security options */ struct security_mnt_opts security_opts; + + /* + * Chunks that can't be freed yet (under a trim/discard operation) + * and will be latter freed. Protected by fs_info->chunk_mutex. + */ + struct list_head pinned_chunks; }; struct btrfs_subvolume_writers { @@ -1755,6 +1783,7 @@ struct btrfs_subvolume_writers { #define BTRFS_ROOT_DEFRAG_RUNNING 6 #define BTRFS_ROOT_FORCE_COW 7 #define BTRFS_ROOT_MULTI_LOG_TASKS 8 +#define BTRFS_ROOT_DIRTY 9 /* * in ram representation of the tree. extent_root is used for all allocations @@ -1773,8 +1802,6 @@ struct btrfs_root { struct btrfs_fs_info *fs_info; struct extent_io_tree dirty_log_pages; - struct kobject root_kobj; - struct completion kobj_unregister; struct mutex objectid_mutex; spinlock_t accounting_lock; @@ -2093,7 +2120,6 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) -#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2103,6 +2129,7 @@ struct btrfs_ioctl_defrag_range_args { #define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) #define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ BTRFS_MOUNT_##opt) + #define btrfs_set_and_info(root, opt, fmt, args...) \ { \ if (!btrfs_test_opt(root, opt)) \ @@ -2118,6 +2145,49 @@ struct btrfs_ioctl_defrag_range_args { } /* + * Requests for changes that need to be done during transaction commit. + * + * Internal mount options that are used for special handling of the real + * mount options (eg. cannot be set during remount and have to be set during + * transaction commit) + */ + +#define BTRFS_PENDING_SET_INODE_MAP_CACHE (0) +#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE (1) +#define BTRFS_PENDING_COMMIT (2) + +#define btrfs_test_pending(info, opt) \ + test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) +#define btrfs_set_pending(info, opt) \ + set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) +#define btrfs_clear_pending(info, opt) \ + clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) + +/* + * Helpers for setting pending mount option changes. + * + * Expects corresponding macros + * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name + */ +#define btrfs_set_pending_and_info(info, opt, fmt, args...) \ +do { \ + if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ + btrfs_info((info), fmt, ##args); \ + btrfs_set_pending((info), SET_##opt); \ + btrfs_clear_pending((info), CLEAR_##opt); \ + } \ +} while(0) + +#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ +do { \ + if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ + btrfs_info((info), fmt, ##args); \ + btrfs_set_pending((info), CLEAR_##opt); \ + btrfs_clear_pending((info), SET_##opt); \ + } \ +} while(0) + +/* * Inode flags */ #define BTRFS_INODE_NODATASUM (1 << 0) @@ -2401,31 +2471,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); - -static inline struct btrfs_timespec * -btrfs_inode_atime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, atime); - return (struct btrfs_timespec *)ptr; -} - -static inline struct btrfs_timespec * -btrfs_inode_mtime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, mtime); - return (struct btrfs_timespec *)ptr; -} - -static inline struct btrfs_timespec * -btrfs_inode_ctime(struct btrfs_inode_item *inode_item) -{ - unsigned long ptr = (unsigned long)inode_item; - ptr += offsetof(struct btrfs_inode_item, ctime); - return (struct btrfs_timespec *)ptr; -} - BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); @@ -3342,6 +3387,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_root *root); @@ -3351,7 +3398,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 type, u64 chunk_objectid, u64 chunk_offset, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start); + struct btrfs_root *root, u64 group_start, + struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3417,8 +3465,8 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end); -int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes); +int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 type); int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); @@ -3427,8 +3475,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); -int btrfs_start_nocow_write(struct btrfs_root *root); -void btrfs_end_nocow_write(struct btrfs_root *root); +int btrfs_start_write_no_snapshoting(struct btrfs_root *root); +void btrfs_end_write_no_snapshoting(struct btrfs_root *root); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3686,6 +3734,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, int verify_dir_item(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_dir_item *dir_item); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, + int name_len); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, @@ -3857,7 +3909,11 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); +int btrfs_inode_check_errors(struct inode *inode); extern const struct dentry_operations btrfs_dentry_operations; +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode); +#endif /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3901,6 +3957,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -4097,7 +4154,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); + +static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + btrfs_bio_counter_sub(fs_info, 1); +} /* reada.c */ struct reada_control { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 054577bddaf2..82f0c7c95474 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1755,27 +1755,31 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); btrfs_set_stack_inode_block_group(inode_item, 0); - btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->atime, inode->i_atime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->atime, inode->i_atime.tv_nsec); - btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->mtime, inode->i_mtime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->mtime, inode->i_mtime.tv_nsec); - btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item), + btrfs_set_stack_timespec_sec(&inode_item->ctime, inode->i_ctime.tv_sec); - btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item), + btrfs_set_stack_timespec_nsec(&inode_item->ctime, inode->i_ctime.tv_nsec); + + btrfs_set_stack_timespec_sec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, + BTRFS_I(inode)->i_otime.tv_nsec); } int btrfs_fill_inode(struct inode *inode, u32 *rdev) { struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; - struct btrfs_timespec *tspec; delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) @@ -1802,17 +1806,19 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) *rdev = btrfs_stack_inode_rdev(inode_item); BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); - tspec = btrfs_inode_atime(inode_item); - inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); + + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); - tspec = btrfs_inode_mtime(inode_item); - inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); - tspec = btrfs_inode_ctime(inode_item); - inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); - inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_stack_timespec_sec(&inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_stack_timespec_nsec(&inode_item->otime); inode->i_generation = BTRFS_I(inode)->generation; BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1857,6 +1863,14 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode) { struct btrfs_delayed_node *delayed_node; + /* + * we don't do delayed inode updates during log recovery because it + * leads to enospc problems. This means we also can't do + * delayed inode refs + */ + if (BTRFS_I(inode)->root->fs_info->log_root_recovering) + return -EAGAIN; + delayed_node = btrfs_get_or_create_delayed_node(inode); if (IS_ERR(delayed_node)) return PTR_ERR(delayed_node); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 6f662b34ba0e..5ec03d999c37 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, struct btrfs_device *tgt_device = NULL; struct btrfs_device *src_device = NULL; - if (btrfs_fs_incompat(fs_info, RAID56)) { - btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6"); - return -EOPNOTSUPP; - } - switch (args->start.cont_reading_from_srcdev_mode) { case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS: case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID: @@ -422,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root, &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(root->fs_info, ret); - WARN_ON(ret); + /* don't warn if EINPROGRESS, someone else might be running scrub */ + if (ret == -EINPROGRESS) { + args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; + ret = 0; + } else { + WARN_ON(ret); + } - return 0; + return ret; leave: dev_replace->srcdev = NULL; @@ -439,18 +440,9 @@ leave: */ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) { - s64 writers; - DEFINE_WAIT(wait); - set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - do { - prepare_to_wait(&fs_info->replace_wait, &wait, - TASK_UNINTERRUPTIBLE); - writers = percpu_counter_sum(&fs_info->bio_counter); - if (writers) - schedule(); - finish_wait(&fs_info->replace_wait, &wait); - } while (writers); + wait_event(fs_info->replace_wait, !percpu_counter_sum( + &fs_info->bio_counter)); } /* @@ -542,7 +534,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); - return 0; + return scrub_ret; } printk_in_rcu(KERN_INFO @@ -571,15 +563,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); fs_info->fs_devices->rw_devices++; - /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info, src_device); - btrfs_kobj_add_device(fs_info, tgt_device); - btrfs_dev_replace_unlock(dev_replace); btrfs_rm_dev_replace_blocked(fs_info); - btrfs_rm_dev_replace_srcdev(fs_info, src_device); + btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device); btrfs_rm_dev_replace_unblocked(fs_info); @@ -594,6 +582,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + /* replace the sysfs entry */ + btrfs_kobj_rm_device(fs_info, src_device); + btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); + /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); if (!IS_ERR(trans)) @@ -920,9 +913,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info) percpu_counter_inc(&fs_info->bio_counter); } -void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) { - percpu_counter_dec(&fs_info->bio_counter); + percpu_counter_sub(&fs_info->bio_counter, amount); if (waitqueue_active(&fs_info->replace_wait)) wake_up(&fs_info->replace_wait); @@ -930,15 +923,15 @@ void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) { - DEFINE_WAIT(wait); -again: - percpu_counter_inc(&fs_info->bio_counter); - if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { + while (1) { + percpu_counter_inc(&fs_info->bio_counter); + if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING, + &fs_info->fs_state))) + break; + btrfs_bio_counter_dec(fs_info); wait_event(fs_info->replace_wait, !test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)); - goto again; } - } diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index fc8df866e919..1752625fb4dd 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -21,10 +21,6 @@ #include "hash.h" #include "transaction.h" -static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len); - /* * insert a name into a directory, doing overflow properly if there is a hash * collision. data_size indicates how big the item inserted should be. On @@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. */ -static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, - struct btrfs_path *path, - const char *name, int name_len) +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) { struct btrfs_dir_item *dir_item; unsigned long name_ptr; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1bf9f897065d..639f2663ed3f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_INFO + printk_ratelimited(KERN_WARNING "BTRFS: %s checksum verify failed on %llu wanted %X found %X " "level %d\n", root->fs_info->sb->s_id, buf->start, @@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", + printk_ratelimited(KERN_ERR + "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", eb->fs_info->sb->s_id, eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; @@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start " + printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start " "%llu %llu\n", eb->fs_info->sb->s_id, found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n", + printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", eb->fs_info->sb->s_id, eb->start); ret = -EIO; goto err; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_info(root->fs_info, "bad tree block level %d", + btrfs_err(root->fs_info, "bad tree block level %d", (int)btrfs_header_level(eb)); ret = -EIO; goto err; @@ -1073,12 +1074,12 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) +void readahead_tree_block(struct btrfs_root *root, u64 bytenr) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, @@ -1086,7 +1087,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) free_extent_buffer(buf); } -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int mirror_num, struct extent_buffer **eb) { struct extent_buffer *buf = NULL; @@ -1094,7 +1095,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return 0; @@ -1125,12 +1126,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr) { if (btrfs_test_is_dummy_root(root)) - return alloc_test_extent_buffer(root->fs_info, bytenr, - blocksize); - return alloc_extent_buffer(root->fs_info, bytenr, blocksize); + return alloc_test_extent_buffer(root->fs_info, bytenr); + return alloc_extent_buffer(root->fs_info, bytenr); } @@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return NULL; @@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); - memset(&root->root_kobj, 0, sizeof(root->root_kobj)); if (fs_info) root->defrag_trans_start = fs_info->generation; else root->defrag_trans_start = 0; - init_completion(&root->kobj_unregister); root->root_key.objectid = objectid; root->anon_dev = 0; @@ -1630,6 +1628,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, bool check_ref) { struct btrfs_root *root; + struct btrfs_path *path; + struct btrfs_key key; int ret; if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) @@ -1669,8 +1669,17 @@ again: if (ret) goto fail; - ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, - location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto fail; + } + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); + btrfs_free_path(path); if (ret < 0) goto fail; if (ret == 0) @@ -1715,12 +1724,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; - bdi->capabilities = BDI_CAP_MAP_COPY; - err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY); + err = bdi_setup_and_register(bdi, "btrfs"); if (err) return err; - bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; return 0; @@ -2233,6 +2241,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); + mutex_init(&fs_info->unused_bg_unpin_mutex); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); @@ -2319,7 +2328,6 @@ int open_ctree(struct super_block *sb, */ fs_info->btree_inode->i_size = OFFSET_MAX; fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, @@ -2384,6 +2392,8 @@ int open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); + INIT_LIST_HEAD(&fs_info->pinned_chunks); + ret = btrfs_alloc_stripe_hash_table(fs_info); if (ret) { err = ret; @@ -2496,7 +2506,7 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - printk(KERN_ERR "BTRFS: has skinny extents\n"); + printk(KERN_INFO "BTRFS: has skinny extents\n"); /* * flag our filesystem as having big metadata blocks if @@ -2520,7 +2530,7 @@ int open_ctree(struct super_block *sb, */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && (sectorsize != nodesize)) { - printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " + printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); goto fail_alloc; @@ -2628,12 +2638,12 @@ int open_ctree(struct super_block *sb, sb->s_blocksize_bits = blksize_bits(sectorsize); if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); + printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); goto fail_sb_buffer; } if (sectorsize != PAGE_SIZE) { - printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) " + printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } @@ -2642,7 +2652,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); if (ret) { - printk(KERN_WARNING "BTRFS: failed to read the system " + printk(KERN_ERR "BTRFS: failed to read the system " "array on %s\n", sb->s_id); goto fail_sb_buffer; } @@ -2657,7 +2667,7 @@ int open_ctree(struct super_block *sb, generation); if (!chunk_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { - printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", + printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2669,7 +2679,7 @@ int open_ctree(struct super_block *sb, ret = btrfs_read_chunk_tree(chunk_root); if (ret) { - printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", + printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2681,7 +2691,7 @@ int open_ctree(struct super_block *sb, btrfs_close_extra_devices(fs_info, fs_devices, 0); if (!fs_devices->latest_bdev) { - printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", + printk(KERN_ERR "BTRFS: failed to read devices on %s\n", sb->s_id); goto fail_tree_roots; } @@ -2765,7 +2775,7 @@ retry_root_backup: ret = btrfs_recover_balance(fs_info); if (ret) { - printk(KERN_WARNING "BTRFS: failed to recover balance\n"); + printk(KERN_ERR "BTRFS: failed to recover balance\n"); goto fail_block_groups; } @@ -2830,9 +2840,11 @@ retry_root_backup: btrfs_set_opt(fs_info->mount_opt, SSD); } - /* Set the real inode map cache flag */ - if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE)) - btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE); + /* + * Mount does not set all options immediatelly, we can do it now and do + * not have to wait for transaction commit + */ + btrfs_apply_pending_changes(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) { @@ -3713,6 +3725,17 @@ void close_ctree(struct btrfs_root *root) btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; + + lock_chunks(root); + while (!list_empty(&fs_info->pinned_chunks)) { + struct extent_map *em; + + em = list_first_entry(&fs_info->pinned_chunks, + struct extent_map, list); + list_del_init(&em->list); + free_extent_map(em); + } + unlock_chunks(root); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -3839,14 +3862,29 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, */ if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - sb->root); + btrfs_super_root(sb)); if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - sb->chunk_root); + printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", + btrfs_super_chunk_root(sb)); if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", btrfs_super_log_root(sb)); + /* + * Check the lower bound, the alignment and other constraints are + * checked later. + */ + if (btrfs_super_nodesize(sb) < 4096) { + printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", + btrfs_super_nodesize(sb)); + ret = -EINVAL; + } + if (btrfs_super_sectorsize(sb) < 4096) { + printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", + btrfs_super_sectorsize(sb)); + ret = -EINVAL; + } + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", fs_info->fsid, sb->dev_item.fsid); @@ -3860,6 +3898,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", btrfs_super_num_devices(sb)); + if (btrfs_super_num_devices(sb) == 0) { + printk(KERN_ERR "BTRFS: number of devices is 0\n"); + ret = -EINVAL; + } if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", @@ -3868,6 +3910,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } /* + * Obvious sys_chunk_array corruptions, it must hold at least one key + * and one chunk + */ + if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n", + btrfs_super_sys_array_size(sb), + BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); + ret = -EINVAL; + } + if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)) { + printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n", + btrfs_super_sys_array_size(sb), + sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk)); + ret = -EINVAL; + } + + /* * The generation is a global counter, we'll trust it more than the others * but it's still possible that it's the one that's wrong. */ @@ -4106,12 +4167,6 @@ again: if (ret) break; - /* opt_discard */ - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_error_discard_extent(root, start, - end + 1 - start, - NULL); - clear_extent_dirty(unpin, start, end, GFP_NOFS); btrfs_error_unpin_extent_range(root, start, end); cond_resched(); @@ -4129,6 +4184,25 @@ again: return 0; } +static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { @@ -4140,6 +4214,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); + btrfs_free_pending_ordered(cur_trans, root->fs_info); btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 414651821fb3..27d44c0fd236 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -46,11 +46,11 @@ struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u64 parent_transid); -void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, +void readahead_tree_block(struct btrfs_root *root, u64 bytenr); +int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr); void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); int open_ctree(struct super_block *sb, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 47c1ba141082..8b353ad02f03 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -74,8 +74,9 @@ enum { RESERVE_ALLOC_NO_ACCOUNT = 2, }; -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -315,12 +316,6 @@ get_caching_control(struct btrfs_block_group_cache *cache) struct btrfs_caching_control *ctl; spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_STARTED) { - spin_unlock(&cache->lock); - return NULL; - } - - /* We're loading it the fast way, so we don't have a caching_ctl. */ if (!cache->caching_ctl) { spin_unlock(&cache->lock); return NULL; @@ -594,6 +589,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, spin_unlock(&cache->lock); if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { + mutex_lock(&caching_ctl->mutex); ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); @@ -601,15 +597,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; + caching_ctl->progress = (u64)-1; } else { if (load_cache_only) { cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_NO; } else { cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; } } spin_unlock(&cache->lock); + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); if (ret == 1) { put_caching_control(caching_ctl); @@ -627,6 +627,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, cache->cached = BTRFS_CACHE_NO; } else { cache->cached = BTRFS_CACHE_STARTED; + cache->has_caching_ctl = 1; } spin_unlock(&cache->lock); wake_up(&caching_ctl->wait); @@ -1889,8 +1890,8 @@ static int btrfs_issue_discard(struct block_device *bdev, return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0); } -static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) +int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *actual_bytes) { int ret; u64 discarded_bytes = 0; @@ -1925,7 +1926,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, */ ret = 0; } - kfree(bbio); + btrfs_put_bbio(bbio); } if (actual_bytes) @@ -2768,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int ret; int run_all = count == (unsigned long)-1; - int run_most = 0; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2778,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, root = root->fs_info->tree_root; delayed_refs = &trans->transaction->delayed_refs; - if (count == 0) { + if (count == 0) count = atomic_read(&delayed_refs->num_entries) * 2; - run_most = 1; - } again: #ifdef SCRAMBLE_DELAYED_REFS @@ -3139,9 +3137,11 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); - if (ret < 0) + if (ret) { + if (ret > 0) + ret = -ENOENT; goto fail; - BUG_ON(ret); /* Corruption */ + } leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3149,11 +3149,9 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); fail: - if (ret) { + if (ret) btrfs_abort_transaction(trans, root, ret); - return ret; - } - return 0; + return ret; } @@ -3162,7 +3160,19 @@ next_block_group(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + + /* If our block group was removed, we need a full search. */ + if (RB_EMPTY_NODE(&cache->cache_node)) { + const u64 next_bytenr = cache->key.objectid + cache->key.offset; + + spin_unlock(&root->fs_info->block_group_cache_lock); + btrfs_put_block_group(cache); + cache = btrfs_lookup_first_block_group(root->fs_info, + next_bytenr); + return cache; + } node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); if (node) { @@ -3198,6 +3208,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, return 0; } + if (trans->aborted) + return 0; again: inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -3233,6 +3245,20 @@ again: */ BTRFS_I(inode)->generation = 0; ret = btrfs_update_inode(trans, root, inode); + if (ret) { + /* + * So theoretically we could recover from this, simply set the + * super cache generation to 0 so we know to invalidate the + * cache, but then we'd have to keep track of the block groups + * that fail this way so we know we _have_ to reset this cache + * before the next commit or risk reading stale cache. So to + * limit our exposure to horrible edge cases lets just abort the + * transaction, this only happens in really bad situations + * anyway. + */ + btrfs_abort_transaction(trans, root, ret); + goto out_put; + } WARN_ON(ret); if (i_size_read(inode) > 0) { @@ -3299,124 +3325,72 @@ out: return ret; } -int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - struct btrfs_block_group_cache *cache; - int err = 0; + struct btrfs_block_group_cache *cache, *tmp; + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_path *path; - u64 last = 0; + + if (list_empty(&cur_trans->dirty_bgs) || + !btrfs_test_opt(root, SPACE_CACHE)) + return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; -again: - while (1) { - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - err = cache_save_setup(cache, trans, path); - last = cache->key.objectid + cache->key.offset; - btrfs_put_block_group(cache); + /* Could add new block groups, use _safe just in case */ + list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs, + dirty_list) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); } - while (1) { - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } - - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - if (cache->disk_cache_state == BTRFS_DC_CLEAR) { - btrfs_put_block_group(cache); - goto again; - } - - if (cache->dirty) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } - - if (cache->disk_cache_state == BTRFS_DC_SETUP) - cache->disk_cache_state = BTRFS_DC_NEED_WRITE; - cache->dirty = 0; - last = cache->key.objectid + cache->key.offset; - - err = write_one_cache_group(trans, root, path, cache); - btrfs_put_block_group(cache); - if (err) /* File system offline */ - goto out; - } + btrfs_free_path(path); + return 0; +} - while (1) { - /* - * I don't think this is needed since we're just marking our - * preallocated extent as written, but just in case it can't - * hurt. - */ - if (last == 0) { - err = btrfs_run_delayed_refs(trans, root, - (unsigned long)-1); - if (err) /* File system offline */ - goto out; - } +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + struct btrfs_path *path; - cache = btrfs_lookup_first_block_group(root->fs_info, last); - while (cache) { - /* - * Really this shouldn't happen, but it could if we - * couldn't write the entire preallocated extent and - * splitting the extent resulted in a new block. - */ - if (cache->dirty) { - btrfs_put_block_group(cache); - goto again; - } - if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - break; - cache = next_block_group(root, cache); - } - if (!cache) { - if (last == 0) - break; - last = 0; - continue; - } + if (list_empty(&cur_trans->dirty_bgs)) + return 0; - err = btrfs_write_out_cache(root, trans, cache, path); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; - /* - * If we didn't have an error then the cache state is still - * NEED_WRITE, so we can set it to WRITTEN. - */ - if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE) - cache->disk_cache_state = BTRFS_DC_WRITTEN; - last = cache->key.objectid + cache->key.offset; + /* + * We don't need the lock here since we are protected by the transaction + * commit. We want to do the cache_save_setup first and then run the + * delayed refs to make sure we have the best chance at doing this all + * in one shot. + */ + while (!list_empty(&cur_trans->dirty_bgs)) { + cache = list_first_entry(&cur_trans->dirty_bgs, + struct btrfs_block_group_cache, + dirty_list); + list_del_init(&cache->dirty_list); + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + cache_save_setup(cache, trans, path); + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, + (unsigned long) -1); + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) + btrfs_write_out_cache(root, trans, cache, path); + if (!ret) + ret = write_one_cache_group(trans, root, path, cache); btrfs_put_block_group(cache); } -out: btrfs_free_path(path); - return err; + return ret; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -3504,6 +3478,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + INIT_LIST_HEAD(&found->ro_bgs); ret = kobject_init_and_add(&found->kobj, &space_info_ktype, info->space_info_kobj, "%s", @@ -5030,19 +5005,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, /** * drop_outstanding_extent - drop an outstanding extent * @inode: the inode we're dropping the extent for + * @num_bytes: the number of bytes we're relaseing. * * This is called when we are freeing up an outstanding extent, either called * after an error or after an extent is written. This will return the number of * reserved extents that need to be freed. This must be called with * BTRFS_I(inode)->lock held. */ -static unsigned drop_outstanding_extent(struct inode *inode) +static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes) { unsigned drop_inode_space = 0; unsigned dropped_extents = 0; + unsigned num_extents = 0; - BUG_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; + num_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + ASSERT(num_extents); + ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents); + BTRFS_I(inode)->outstanding_extents -= num_extents; if (BTRFS_I(inode)->outstanding_extents == 0 && test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, @@ -5155,7 +5136,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; + nr_extents = (unsigned)div64_u64(num_bytes + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + BTRFS_I(inode)->outstanding_extents += nr_extents; + nr_extents = 0; if (BTRFS_I(inode)->outstanding_extents > BTRFS_I(inode)->reserved_extents) @@ -5213,7 +5198,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) out_fail: spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); /* * If the inodes csum_bytes is the same as the original * csum_bytes then we know we haven't raced with any free()ers @@ -5292,7 +5277,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) num_bytes = ALIGN(num_bytes, root->sectorsize); spin_lock(&BTRFS_I(inode)->lock); - dropped = drop_outstanding_extent(inode); + dropped = drop_outstanding_extent(inode, num_bytes); if (num_bytes) to_free = calc_csum_metadata_size(inode, num_bytes, 0); @@ -5300,6 +5285,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) if (dropped > 0) to_free += btrfs_calc_trans_metadata_size(root, dropped); + if (btrfs_test_is_dummy_root(root)) + return; + trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); if (root->fs_info->quota_enabled) { @@ -5362,8 +5350,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) btrfs_free_reserved_data_space(inode, num_bytes); } -static int update_block_group(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc) +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; struct btrfs_fs_info *info = root->fs_info; @@ -5401,6 +5390,14 @@ static int update_block_group(struct btrfs_root *root, if (!alloc && cache->cached == BTRFS_CACHE_NO) cache_block_group(cache, 1); + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -5411,7 +5408,6 @@ static int update_block_group(struct btrfs_root *root, cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; - cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { @@ -5425,7 +5421,17 @@ static int update_block_group(struct btrfs_root *root, spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); /* * No longer have used bytes in this block group, queue * it for deletion. @@ -5439,17 +5445,6 @@ static int update_block_group(struct btrfs_root *root, } spin_unlock(&info->unused_bgs_lock); } - btrfs_set_block_group_used(&cache->item, old_val); - cache->pinned += num_bytes; - cache->space_info->bytes_pinned += num_bytes; - cache->space_info->bytes_used -= num_bytes; - cache->space_info->disk_used -= num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - - set_extent_dirty(info->pinned_extents, - bytenr, bytenr + num_bytes - 1, - GFP_NOFS | __GFP_NOFAIL); } btrfs_put_block_group(cache); total -= num_bytes; @@ -5715,7 +5710,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } -static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, + const bool return_free_space) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; @@ -5739,7 +5735,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) if (start < cache->last_byte_to_unpin) { len = min(len, cache->last_byte_to_unpin - start); - btrfs_add_free_space(cache, start, len); + if (return_free_space) + btrfs_add_free_space(cache, start, len); } start += len; @@ -5793,17 +5790,21 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, unpin = &fs_info->freed_extents[0]; while (1) { + mutex_lock(&fs_info->unused_bg_unpin_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, NULL); - if (ret) + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; + } if (btrfs_test_opt(root, DISCARD)) ret = btrfs_discard_extent(root, start, end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, GFP_NOFS); - unpin_extent_range(root, start, end); + unpin_extent_range(root, start, end, true); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } @@ -6089,7 +6090,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = update_block_group(root, bytenr, num_bytes, 0); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; @@ -6191,7 +6192,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { - struct btrfs_block_group_cache *cache = NULL; int pin = 1; int ret; @@ -6207,17 +6207,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (!last_ref) return; - cache = btrfs_lookup_block_group(root->fs_info, buf->start); - if (btrfs_header_generation(buf) == trans->transid) { + struct btrfs_block_group_cache *cache; + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { ret = check_ref_cleanup(trans, root, buf->start); if (!ret) goto out; } + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { pin_down_extent(root, cache, buf->start, buf->len, 1); + btrfs_put_block_group(cache); goto out; } @@ -6225,6 +6228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_add_free_space(cache, buf->start, buf->len); btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); + btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, buf->start, buf->len); pin = 0; } @@ -6239,7 +6243,6 @@ out: * anymore. */ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - btrfs_put_block_group(cache); } /* Can return -ENOMEM */ @@ -7049,7 +7052,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(root, ins->objectid, ins->offset, 1); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -7138,7 +7141,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; } - ret = update_block_group(root, ins->objectid, root->nodesize, 1); + ret = update_block_group(trans, root, ins->objectid, root->nodesize, + 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -7203,11 +7207,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize, int level) + u64 bytenr, int level) { struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); @@ -7326,7 +7330,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, if (btrfs_test_is_dummy_root(root)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - blocksize, level); + level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -7343,8 +7347,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - buf = btrfs_init_new_buffer(trans, root, ins.objectid, - blocksize, level); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); BUG_ON(IS_ERR(buf)); /* -ENOMEM */ if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { @@ -7473,7 +7476,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - readahead_tree_block(root, bytenr, blocksize); + readahead_tree_block(root, bytenr); nread++; } wc->reada_slot = slot; @@ -7814,7 +7817,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, next = btrfs_find_tree_block(root, bytenr); if (!next) { - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, @@ -8511,6 +8514,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) min_allocable_bytes <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; cache->ro = 1; + list_add_tail(&cache->ro_list, &sinfo->ro_bgs); ret = 0; } out: @@ -8533,14 +8537,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); - alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, alloc_flags, - CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - } - ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8551,6 +8547,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, goto out; ret = set_block_group_ro(cache, 0); out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache->flags); + check_system_chunk(trans, root, alloc_flags); + } + btrfs_end_transaction(trans, root); return ret; } @@ -8565,15 +8566,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, /* * helper to account the unused space of all the readonly block group in the - * list. takes mirrors into account. + * space_info. takes mirrors into account. */ -static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) { struct btrfs_block_group_cache *block_group; u64 free_bytes = 0; int factor; - list_for_each_entry(block_group, groups_list, list) { + /* It's df, we don't care if it's racey */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { spin_lock(&block_group->lock); if (!block_group->ro) { @@ -8594,26 +8600,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) spin_unlock(&block_group->lock); } - - return free_bytes; -} - -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. - */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - int i; - u64 free_bytes = 0; - - spin_lock(&sinfo->lock); - - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) - if (!list_empty(&sinfo->block_groups[i])) - free_bytes += __btrfs_get_ro_block_group_free_space( - &sinfo->block_groups[i]); - spin_unlock(&sinfo->lock); return free_bytes; @@ -8633,6 +8619,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root, cache->bytes_super - btrfs_block_group_used(&cache->item); sinfo->bytes_readonly -= num_bytes; cache->ro = 0; + list_del_init(&cache->ro_list); spin_unlock(&cache->lock); spin_unlock(&sinfo->lock); } @@ -8873,6 +8860,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) cache_node); rb_erase(&block_group->cache_node, &info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); spin_unlock(&info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); @@ -9002,7 +8990,10 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); + INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->dirty_list); btrfs_init_free_space_ctl(cache); + atomic_set(&cache->trimming, 0); return cache; } @@ -9064,9 +9055,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - cache->disk_cache_state = BTRFS_DC_CLEAR; if (btrfs_test_opt(root, SPACE_CACHE)) - cache->dirty = 1; + cache->disk_cache_state = BTRFS_DC_CLEAR; } read_extent_buffer(leaf, &cache->item, @@ -9129,6 +9119,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) spin_lock(&info->block_group_cache_lock); rb_erase(&cache->cache_node, &info->block_group_cache_tree); + RB_CLEAR_NODE(&cache->cache_node); spin_unlock(&info->block_group_cache_lock); btrfs_put_block_group(cache); goto error; @@ -9195,9 +9186,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, int ret = 0; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { - list_del_init(&block_group->bg_list); if (ret) - continue; + goto next; spin_lock(&block_group->lock); memcpy(&item, &block_group->item, sizeof(item)); @@ -9212,6 +9202,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, key.objectid, key.offset); if (ret) btrfs_abort_transaction(trans, extent_root, ret); +next: + list_del_init(&block_group->bg_list); } } @@ -9269,6 +9261,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&cache->cache_node, &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&cache->cache_node); spin_unlock(&root->fs_info->block_group_cache_lock); btrfs_put_block_group(cache); return ret; @@ -9304,7 +9297,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start) + struct btrfs_root *root, u64 group_start, + struct extent_map *em) { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; @@ -9316,6 +9310,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int ret; int index; int factor; + struct btrfs_caching_control *caching_ctl = NULL; + bool remove_em; root = root->fs_info->extent_root; @@ -9400,6 +9396,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); if (root->fs_info->first_logical_byte == block_group->key.objectid) root->fs_info->first_logical_byte = (u64)-1; @@ -9422,12 +9419,44 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, kobject_put(kobj); } + if (block_group->has_caching_ctl) + caching_ctl = get_caching_control(block_group); if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); + if (block_group->has_caching_ctl) { + down_write(&root->fs_info->commit_root_sem); + if (!caching_ctl) { + struct btrfs_caching_control *ctl; + + list_for_each_entry(ctl, + &root->fs_info->caching_block_groups, list) + if (ctl->block_group == block_group) { + caching_ctl = ctl; + atomic_inc(&caching_ctl->count); + break; + } + } + if (caching_ctl) + list_del_init(&caching_ctl->list); + up_write(&root->fs_info->commit_root_sem); + if (caching_ctl) { + /* Once for the caching bgs list and once for us. */ + put_caching_control(caching_ctl); + put_caching_control(caching_ctl); + } + } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); + list_del_init(&block_group->ro_list); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; block_group->space_info->disk_total -= block_group->key.offset * factor; @@ -9435,6 +9464,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); + lock_chunks(root); + if (!list_empty(&em->list)) { + /* We're in the transaction->pending_chunks list. */ + free_extent_map(em); + } + spin_lock(&block_group->lock); + block_group->removed = 1; + /* + * At this point trimming can't start on this block group, because we + * removed the block group from the tree fs_info->block_group_cache_tree + * so no one can't find it anymore and even if someone already got this + * block group before we removed it from the rbtree, they have already + * incremented block_group->trimming - if they didn't, they won't find + * any free space entries because we already removed them all when we + * called btrfs_remove_free_space_cache(). + * + * And we must not remove the extent map from the fs_info->mapping_tree + * to prevent the same logical address range and physical device space + * ranges from being reused for a new block group. This is because our + * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is + * completely transactionless, so while it is trimming a range the + * currently running transaction might finish and a new one start, + * allowing for new block groups to be created that can reuse the same + * physical device locations unless we take this special care. + */ + remove_em = (atomic_read(&block_group->trimming) == 0); + /* + * Make sure a trimmer task always sees the em in the pinned_chunks list + * if it sees block_group->removed == 1 (needs to lock block_group->lock + * before checking block_group->removed). + */ + if (!remove_em) { + /* + * Our em might be in trans->transaction->pending_chunks which + * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), + * and so is the fs_info->pinned_chunks list. + * + * So at this point we must be holding the chunk_mutex to avoid + * any races with chunk allocation (more specifically at + * volumes.c:contains_pending_extent()), to ensure it always + * sees the em, either in the pending_chunks list or in the + * pinned_chunks list. + */ + list_move_tail(&em->list, &root->fs_info->pinned_chunks); + } + spin_unlock(&block_group->lock); + + if (remove_em) { + struct extent_map_tree *em_tree; + + em_tree = &root->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + /* + * The em might be in the pending_chunks list, so make sure the + * chunk mutex is locked, since remove_extent_mapping() will + * delete us from that list. + */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + /* once for the tree */ + free_extent_map(em); + } + + unlock_chunks(root); + btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -9510,7 +9604,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. */ - trans = btrfs_join_transaction(root); + /* 1 for btrfs_orphan_reserve_metadata() */ + trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_set_block_group_rw(root, block_group); ret = PTR_ERR(trans); @@ -9523,10 +9618,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ start = block_group->key.objectid; end = start + block_group->key.offset - 1; - clear_extent_bits(&fs_info->freed_extents[0], start, end, + /* + * Hold the unused_bg_unpin_mutex lock to avoid racing with + * btrfs_finish_extent_commit(). If we are at transaction N, + * another task might be running finish_extent_commit() for the + * previous transaction N - 1, and have seen a range belonging + * to the block group in freed_extents[] before we were able to + * clear the whole block group range from freed_extents[]. This + * means that task can lookup for the block group after we + * unpinned it from freed_extents[] and removed it, leading to + * a BUG_ON() at btrfs_unpin_extent_range(). + */ + mutex_lock(&fs_info->unused_bg_unpin_mutex); + ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, EXTENT_DIRTY, GFP_NOFS); - clear_extent_bits(&fs_info->freed_extents[1], start, end, + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } + ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY, GFP_NOFS); + if (ret) { + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_set_block_group_rw(root, block_group); + goto end_trans; + } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* Reset pinned so btrfs_put_block_group doesn't complain */ block_group->pinned = 0; @@ -9537,6 +9655,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) */ ret = btrfs_remove_chunk(trans, root, block_group->key.objectid); +end_trans: btrfs_end_transaction(trans, root); next: btrfs_put_block_group(block_group); @@ -9585,13 +9704,7 @@ out: int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { - return unpin_extent_range(root, start, end); -} - -int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *actual_bytes) -{ - return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes); + return unpin_extent_range(root, start, end, false); } int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) @@ -9657,12 +9770,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) } /* - * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), - * they are used to prevent the some tasks writing data into the page cache - * by nocow before the subvolume is snapshoted, but flush the data into - * the disk after the snapshot creation. + * btrfs_{start,end}_write_no_snapshoting() are similar to + * mnt_{want,drop}_write(), they are used to prevent some tasks from writing + * data into the page cache through nocow before the subvolume is snapshoted, + * but flush the data into disk after the snapshot creation, or to prevent + * operations while snapshoting is ongoing and that cause the snapshot to be + * inconsistent (writes followed by expanding truncates for example). */ -void btrfs_end_nocow_write(struct btrfs_root *root) +void btrfs_end_write_no_snapshoting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* @@ -9674,7 +9789,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root) wake_up(&root->subv_writers->wait); } -int btrfs_start_nocow_write(struct btrfs_root *root) +int btrfs_start_write_no_snapshoting(struct btrfs_root *root) { if (atomic_read(&root->will_be_snapshoted)) return 0; @@ -9685,7 +9800,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root) */ smp_mb(); if (atomic_read(&root->will_be_snapshoted)) { - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); return 0; } return 1; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bf3f424e0013..d688cfe5d496 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void) while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); - pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n", + pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), atomic_read(&state->refs)); @@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned long *bits) + struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->set_bit_hook) tree->ops->set_bit_hook(tree->mapping->host, state, bits); } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, unsigned long *bits) + struct extent_state *state, unsigned *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned long *bits); + struct extent_state *state, unsigned *bits); /* * insert an extent_state struct into the tree. 'bits' are set on the @@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned long *bits) + unsigned *bits) { struct rb_node *node; @@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned long *bits, int wake) + unsigned *bits, int wake) { struct extent_state *next; - unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; + unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * This takes the tree lock, and returns 0 on success and < 0 on error. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int wake, int delete, + unsigned bits, int wake, int delete, struct extent_state **cached_state, gfp_t mask) { @@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, clear = 1; again: if (!prealloc && (mask & __GFP_WAIT)) { + /* + * Don't care for allocation failure here because we might end + * up not needing the pre-allocated extent state at all, which + * is the case if we only have in the tree extent states that + * cover our input range and don't cover too any other range. + * If we end up needing a new extent state we allocate it later. + */ prealloc = alloc_extent_state(mask); - if (!prealloc) - return -ENOMEM; } spin_lock(&tree->lock); @@ -784,9 +789,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned long *bits) + unsigned *bits) { - unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; + unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; set_state_cb(tree, state, bits); if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { @@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree, state->state |= bits_to_set; } -static void cache_state(struct extent_state *state, - struct extent_state **cached_ptr) +static void cache_state_if_flags(struct extent_state *state, + struct extent_state **cached_ptr, + unsigned flags) { if (cached_ptr && !(*cached_ptr)) { - if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + if (!flags || (state->state & flags)) { *cached_ptr = state; atomic_inc(&state->refs); } } } +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + return cache_state_if_flags(state, cached_ptr, + EXTENT_IOBITS | EXTENT_BOUNDARY); +} + /* * set some bits on a range in the tree. This may require allocations or * sleeping, so the gfp mask is used to indicate what is allowed. @@ -820,7 +833,7 @@ static void cache_state(struct extent_state *state, static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long exclusive_bits, + unsigned bits, unsigned exclusive_bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask) { @@ -1021,7 +1034,7 @@ search_again: } int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, u64 * failed_start, + unsigned bits, u64 * failed_start, struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, @@ -1047,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * boundary bits like LOCK. */ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long clear_bits, + unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; @@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int err = 0; u64 last_start; u64 last_end; + bool first_iteration = true; btrfs_debug_check_extent_io_range(tree, start, end); again: if (!prealloc && (mask & __GFP_WAIT)) { + /* + * Best effort, don't worry if extent state allocation fails + * here for the first iteration. We might have a cached state + * that matches exactly the target range, in which case no + * extent state allocations are needed. We'll only know this + * after locking the tree. + */ prealloc = alloc_extent_state(mask); - if (!prealloc) + if (!prealloc && !first_iteration) return -ENOMEM; } @@ -1234,6 +1255,7 @@ search_again: spin_unlock(&tree->lock); if (mask & __GFP_WAIT) cond_resched(); + first_iteration = false; goto again; } @@ -1246,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask) + unsigned bits, gfp_t mask) { return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask) + unsigned bits, gfp_t mask) { return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } @@ -1308,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, struct extent_state **cached_state) + unsigned bits, struct extent_state **cached_state) { int err; u64 failed_start; + while (1) { err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, EXTENT_LOCKED, &failed_start, @@ -1385,8 +1408,8 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) while (index <= end_index) { page = find_get_page(inode->i_mapping, index); BUG_ON(!page); /* Pages should be in the extent_io_tree */ - account_page_redirty(page); __set_page_dirty_nobuffers(page); + account_page_redirty(page); page_cache_release(page); index++; } @@ -1418,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) */ static struct extent_state * find_first_extent_bit_state(struct extent_io_tree *tree, - u64 start, unsigned long bits) + u64 start, unsigned bits) { struct rb_node *node; struct extent_state *state; @@ -1452,7 +1475,7 @@ out: * If nothing was found, 1 is returned. If found something, return 0. */ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned long bits, + u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state) { struct extent_state *state; @@ -1482,7 +1505,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, state = find_first_extent_bit_state(tree, start, bits); got_it: if (state) { - cache_state(state, cached_state); + cache_state_if_flags(state, cached_state, 0); *start_ret = state->start; *end_ret = state->end; ret = 0; @@ -1731,7 +1754,7 @@ out_failed: int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned long clear_bits, + unsigned clear_bits, unsigned long page_ops) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -1746,6 +1769,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, if (page_ops == 0) return 0; + if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) + mapping_set_error(inode->i_mapping, -EIO); + while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, min_t(unsigned long, @@ -1763,6 +1789,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, clear_page_dirty_for_io(pages[i]); if (page_ops & PAGE_SET_WRITEBACK) set_page_writeback(pages[i]); + if (page_ops & PAGE_SET_ERROR) + SetPageError(pages[i]); if (page_ops & PAGE_END_WRITEBACK) end_page_writeback(pages[i]); if (page_ops & PAGE_UNLOCK) @@ -1783,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits, int contig) + unsigned bits, int contig) { struct rb_node *node; struct extent_state *state; @@ -1901,7 +1929,7 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int filled, struct extent_state *cached) + unsigned bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; @@ -2030,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, sector = bbio->stripes[mirror_num-1].physical >> 9; bio->bi_iter.bi_sector = sector; dev = bbio->stripes[mirror_num-1].dev; - kfree(bbio); + btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { bio_put(bio); return -EIO; @@ -2163,7 +2191,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) next = next_state(state); - failrec = (struct io_failure_record *)state->private; + failrec = (struct io_failure_record *)(unsigned long)state->private; free_extent_state(state); kfree(failrec); @@ -2789,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, bio_add_page(bio, page, page_size, offset) < page_size) { ret = submit_one_bio(rw, bio, mirror_num, prev_bio_flags); - if (ret < 0) + if (ret < 0) { + *bio_ret = NULL; return ret; + } bio = NULL; } else { return 0; @@ -3212,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, page, &delalloc_start, &delalloc_end, - 128 * 1024 * 1024); + BTRFS_MAX_EXTENT_SIZE); if (nr_delalloc == 0) { delalloc_start = delalloc_end + 1; continue; @@ -4571,11 +4601,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) static struct extent_buffer * __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len, gfp_t mask) + unsigned long len) { struct extent_buffer *eb = NULL; - eb = kmem_cache_zalloc(extent_buffer_cache, mask); + eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS); if (eb == NULL) return NULL; eb->start = start; @@ -4616,7 +4646,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) struct extent_buffer *new; unsigned long num_pages = num_extent_pages(src->start, src->len); - new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); + new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) return NULL; @@ -4639,13 +4669,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) return new; } -struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - unsigned long num_pages = num_extent_pages(0, len); + unsigned long len; + unsigned long num_pages; unsigned long i; - eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); + if (!fs_info) { + /* + * Called only from tests that don't always have a fs_info + * available, but we know that nodesize is 4096 + */ + len = 4096; + } else { + len = fs_info->tree_root->nodesize; + } + num_pages = num_extent_pages(0, len); + + eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; @@ -4735,7 +4778,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) + u64 start) { struct extent_buffer *eb, *exists = NULL; int ret; @@ -4743,7 +4786,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = alloc_dummy_extent_buffer(start, len); + eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) return NULL; eb->fs_info = fs_info; @@ -4781,8 +4824,9 @@ free_eb: #endif struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) + u64 start) { + unsigned long len = fs_info->tree_root->nodesize; unsigned long num_pages = num_extent_pages(start, len); unsigned long i; unsigned long index = start >> PAGE_CACHE_SHIFT; @@ -4797,7 +4841,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); + eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; @@ -4924,6 +4968,12 @@ static int release_extent_buffer(struct extent_buffer *eb) /* Should be safe to release our pages at this point */ btrfs_release_extent_buffer_page(eb); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) { + __free_extent_buffer(eb); + return 1; + } +#endif call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return 1; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6d4b938be986..695b0ccfb755 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -4,22 +4,22 @@ #include <linux/rbtree.h> /* bits for the extent state */ -#define EXTENT_DIRTY 1 -#define EXTENT_WRITEBACK (1 << 1) -#define EXTENT_UPTODATE (1 << 2) -#define EXTENT_LOCKED (1 << 3) -#define EXTENT_NEW (1 << 4) -#define EXTENT_DELALLOC (1 << 5) -#define EXTENT_DEFRAG (1 << 6) -#define EXTENT_BOUNDARY (1 << 9) -#define EXTENT_NODATASUM (1 << 10) -#define EXTENT_DO_ACCOUNTING (1 << 11) -#define EXTENT_FIRST_DELALLOC (1 << 12) -#define EXTENT_NEED_WAIT (1 << 13) -#define EXTENT_DAMAGED (1 << 14) -#define EXTENT_NORESERVE (1 << 15) -#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) -#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) +#define EXTENT_DIRTY (1U << 0) +#define EXTENT_WRITEBACK (1U << 1) +#define EXTENT_UPTODATE (1U << 2) +#define EXTENT_LOCKED (1U << 3) +#define EXTENT_NEW (1U << 4) +#define EXTENT_DELALLOC (1U << 5) +#define EXTENT_DEFRAG (1U << 6) +#define EXTENT_BOUNDARY (1U << 9) +#define EXTENT_NODATASUM (1U << 10) +#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_FIRST_DELALLOC (1U << 12) +#define EXTENT_NEED_WAIT (1U << 13) +#define EXTENT_DAMAGED (1U << 14) +#define EXTENT_NORESERVE (1U << 15) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* * flags for bio submission. The high bits indicate the compression @@ -49,6 +49,7 @@ #define PAGE_SET_WRITEBACK (1 << 2) #define PAGE_END_WRITEBACK (1 << 3) #define PAGE_SET_PRIVATE2 (1 << 4) +#define PAGE_SET_ERROR (1 << 5) /* * page->private values. Every page that is controlled by the extent @@ -80,9 +81,9 @@ struct extent_io_ops { int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long *bits); + unsigned *bits); void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long *bits); + unsigned *bits); void (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); @@ -107,7 +108,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; atomic_t refs; - unsigned long state; + unsigned state; /* for use by the FS */ u64 private; @@ -187,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, int try_release_extent_buffer(struct page *page); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, struct extent_state **cached); + unsigned bits, struct extent_state **cached); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached, gfp_t mask); @@ -201,21 +202,21 @@ void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits, int contig); + u64 max_bytes, unsigned bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int filled, + unsigned bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask); + unsigned bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, int wake, int delete, + unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, gfp_t mask); + unsigned bits, gfp_t mask); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, u64 *failed_start, + unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); @@ -228,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned long bits, unsigned long clear_bits, + unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask); int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask); int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, unsigned long bits, + u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state); int extent_invalidatepage(struct extent_io_tree *tree, struct page *page, unsigned long offset); @@ -261,8 +262,9 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); -struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); + u64 start); +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -321,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, - unsigned long bits_to_clear, + unsigned bits_to_clear, unsigned long page_ops); struct bio * btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, @@ -376,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, u64 *end, u64 max_bytes); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); + u64 start); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 225302b39afb..6a98bddd8f33 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) - list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a18ceabd99a8..30982bbd31c3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, u64 num_bytes; int ret; - ret = btrfs_start_nocow_write(root); + ret = btrfs_start_write_no_snapshoting(root); if (!ret) return -ENOSPC; @@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); @@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, btrfs_free_reserved_data_space(inode, reserve_bytes); else - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); break; } @@ -1632,7 +1632,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); if (only_release_metadata && copied > 0) { u64 lockstart = round_down(pos, root->sectorsize); @@ -1661,7 +1661,7 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { btrfs_delalloc_release_space(inode, release_bytes); @@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t pos) { struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); ssize_t written; ssize_t written_buffered; loff_t endbyte; @@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, err = written_buffered; goto out; } + /* + * Ensure all data is persisted. We want the next direct IO read to be + * able to read what was just written. + */ endbyte = pos + written_buffered - 1; - err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); + err = btrfs_fdatawrite_range(inode, pos, endbyte); + if (err) + goto out; + err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); if (err) goto out; written += written_buffered; @@ -1738,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, mutex_lock(&inode->i_mutex); - current->backing_dev_info = inode->i_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) { mutex_unlock(&inode->i_mutex); @@ -1803,22 +1811,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, mutex_unlock(&inode->i_mutex); /* - * we want to make sure fsync finds this change - * but we haven't joined a transaction running right now. - * - * Later on, someone is sure to update the inode and get the - * real transid recorded. - * - * We set last_trans now to the fs_info generation + 1, - * this will either be one more than the running transaction - * or the generation used for the next transaction if there isn't - * one running right now. - * * We also have to set last_sub_trans to the current log transid, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occured. */ - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; BTRFS_I(inode)->last_sub_trans = root->log_transid; if (num_written > 0) { err = generic_write_sync(file, pos, num_written); @@ -1854,10 +1850,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) int ret; atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + ret = btrfs_fdatawrite_range(inode, start, end); atomic_dec(&BTRFS_I(inode)->sync_writers); return ret; @@ -1954,25 +1947,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * check the transaction that last modified this inode - * and see if its already been committed - */ - if (!BTRFS_I(inode)->last_trans) { - mutex_unlock(&inode->i_mutex); - goto out; - } - - /* - * if the last transaction that changed this file was before - * the current transaction, we can bail out now without any - * syncing + * If the last transaction that changed this file was before the current + * transaction and we have the full sync flag set in our inode, we can + * bail out now without any syncing. + * + * Note that we can't bail out if the full sync flag isn't set. This is + * because when the full sync flag is set we start all ordered extents + * and wait for them to fully complete - when they complete they update + * the inode's last_trans field through: + * + * btrfs_finish_ordered_io() -> + * btrfs_update_inode_fallback() -> + * btrfs_update_inode() -> + * btrfs_set_inode_last_trans() + * + * So we are sure that last_trans is up to date and can do this check to + * bail out safely. For the fast path, when the full sync flag is not + * set in our inode, we can not do it because we start only our ordered + * extents and don't wait for them to complete (that is when + * btrfs_finish_ordered_io runs), so here at this point their last_trans + * value might be less than or equals to fs_info->last_trans_committed, + * and setting a speculative last_trans for an inode when a buffered + * write is made (such as fs_info->generation + 1 for example) would not + * be reliable since after setting the value and before fsync is called + * any number of transactions can start and commit (transaction kthread + * commits the current transaction periodically), and a transaction + * commit does not start nor waits for ordered extents to complete. */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed) { - BTRFS_I(inode)->last_trans = 0; - + (full_sync && BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed)) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2076,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) @@ -2271,6 +2275,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); u64 ino_size; + bool truncated_page = false; + bool updated_inode = false; ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -2302,13 +2308,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * entire page. */ if (same_page && len < PAGE_CACHE_SIZE) { - if (offset < ino_size) + if (offset < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, offset, len, 0); + } else { + ret = 0; + } goto out_only_mutex; } /* zero back part of the first page */ if (offset < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { mutex_unlock(&inode->i_mutex); @@ -2344,6 +2355,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (!ret) { /* zero the front end of the last page */ if (tail_start + tail_len < ino_size) { + truncated_page = true; ret = btrfs_truncate_page(inode, tail_start + tail_len, 0, 1); if (ret) @@ -2353,8 +2365,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) } if (lockend < lockstart) { - mutex_unlock(&inode->i_mutex); - return 0; + ret = 0; + goto out_only_mutex; } while (1) { @@ -2502,6 +2514,7 @@ out_trans: trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); + updated_inode = true; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); out_free: @@ -2511,6 +2524,22 @@ out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); out_only_mutex: + if (!updated_inode && truncated_page && !ret && !err) { + /* + * If we only end up zeroing part of a page, we still need to + * update the inode item, so that all the time fields are + * updated as well as the necessary btrfs inode in memory fields + * for detecting, at fsync time, if the inode isn't yet in the + * log tree or it's there but not up to date. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + } else { + err = btrfs_update_inode(trans, root, inode); + ret = btrfs_end_transaction(trans, root); + } + } mutex_unlock(&inode->i_mutex); if (ret && !err) err = ret; @@ -2810,3 +2839,29 @@ int btrfs_auto_defrag_init(void) return 0; } + +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + /* + * So with compression we will find and lock a dirty page and clear the + * first one as dirty, setup an async extent, and immediately return + * with the entire range locked but with nobody actually marked with + * writeback. So we can't just filemap_write_and_wait_range() and + * expect it to work since it will just kick off a thread to do the + * actual work. So we need to call filemap_fdatawrite_range _again_ + * since it will wait on the page lock, which won't be unlocked until + * after the pages have been marked as writeback and so we're good to go + * from there. We have to do this otherwise we'll miss the ordered + * extents and that results in badness. Please Josef, do not think you + * know better and pull this out at some point in the future, it is + * right and you are wrong. + */ + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + + return ret; +} diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 33848196550e..a71978578fa7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -27,10 +27,17 @@ #include "disk-io.h" #include "extent_io.h" #include "inode-map.h" +#include "volumes.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) +struct btrfs_trim_range { + u64 start; + u64 bytes; + struct list_head list; +}; + static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, @@ -644,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct io_ctl io_ctl; struct btrfs_key key; struct btrfs_free_space *e, *n; - struct list_head bitmaps; + LIST_HEAD(bitmaps); u64 num_entries; u64 num_bitmaps; u64 generation; u8 type; int ret = 0; - INIT_LIST_HEAD(&bitmaps); - /* Nothing in the space cache, goodbye */ if (!i_size_read(inode)) return 0; @@ -881,6 +886,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, int ret; struct btrfs_free_cluster *cluster = NULL; struct rb_node *node = rb_first(&ctl->free_space_offset); + struct btrfs_trim_range *trim_entry; /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) { @@ -916,6 +922,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, cluster = NULL; } } + + /* + * Make sure we don't miss any range that was removed from our rbtree + * because trimming is running. Otherwise after a umount+mount (or crash + * after committing the transaction) we would leak free space and get + * an inconsistent free space cache report from fsck. + */ + list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { + ret = io_ctl_add_entry(io_ctl, trim_entry->start, + trim_entry->bytes, NULL); + if (ret) + goto fail; + *entries += 1; + } + return 0; fail: return -ENOSPC; @@ -1135,12 +1156,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_set_generation(&io_ctl, trans->transid); + mutex_lock(&ctl->cache_writeout_mutex); /* Write out the extent entries in the free space cache */ ret = write_cache_extent_entries(&io_ctl, ctl, block_group, &entries, &bitmaps, &bitmap_list); - if (ret) + if (ret) { + mutex_unlock(&ctl->cache_writeout_mutex); goto out_nospc; + } /* * Some spaces that are freed in the current transaction are pinned, @@ -1148,11 +1172,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * committed, we shouldn't lose them. */ ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); - if (ret) + if (ret) { + mutex_unlock(&ctl->cache_writeout_mutex); goto out_nospc; + } - /* At last, we write out all the bitmaps. */ + /* + * At last, we write out all the bitmaps and keep cache_writeout_mutex + * locked while doing it because a concurrent trim can be manipulating + * or freeing the bitmap. + */ ret = write_bitmap_entries(&io_ctl, &bitmap_list); + mutex_unlock(&ctl->cache_writeout_mutex); if (ret) goto out_nospc; @@ -1210,6 +1241,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; + enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN; root = root->fs_info->tree_root; @@ -1233,9 +1265,7 @@ int btrfs_write_out_cache(struct btrfs_root *root, ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, path, block_group->key.objectid); if (ret) { - spin_lock(&block_group->lock); - block_group->disk_cache_state = BTRFS_DC_ERROR; - spin_unlock(&block_group->lock); + dcs = BTRFS_DC_ERROR; ret = 0; #ifdef DEBUG btrfs_err(root->fs_info, @@ -1244,6 +1274,9 @@ int btrfs_write_out_cache(struct btrfs_root *root, #endif } + spin_lock(&block_group->lock); + block_group->disk_cache_state = dcs; + spin_unlock(&block_group->lock); iput(inode); return ret; } @@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) ctl->start = block_group->key.objectid; ctl->private = block_group; ctl->op = &free_space_op; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); /* * we only want to have 32k of ram per block group for keeping @@ -2868,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root, trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, min_bytes); - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes + empty_size, cont1_bytes, min_bytes); @@ -2911,10 +2945,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) static int do_trimming(struct btrfs_block_group_cache *block_group, u64 *total_trimmed, u64 start, u64 bytes, - u64 reserved_start, u64 reserved_bytes) + u64 reserved_start, u64 reserved_bytes, + struct btrfs_trim_range *trim_entry) { struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; int update = 0; u64 trimmed = 0; @@ -2929,12 +2965,15 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); - ret = btrfs_error_discard_extent(fs_info->extent_root, - start, bytes, &trimmed); + ret = btrfs_discard_extent(fs_info->extent_root, + start, bytes, &trimmed); if (!ret) *total_trimmed += trimmed; + mutex_lock(&ctl->cache_writeout_mutex); btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + list_del(&trim_entry->list); + mutex_unlock(&ctl->cache_writeout_mutex); if (update) { spin_lock(&space_info->lock); @@ -2962,16 +3001,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, u64 bytes; while (start < end) { + struct btrfs_trim_range trim_entry; + + mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, start, 0, 1); if (!entry) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } @@ -2980,6 +3024,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, node = rb_next(&entry->offset_index); if (!node) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); goto out; } entry = rb_entry(node, struct btrfs_free_space, @@ -2988,6 +3033,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, if (entry->offset >= end) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } @@ -2997,6 +3043,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, bytes = min(extent_start + extent_bytes, end) - start; if (bytes < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); goto next; } @@ -3004,9 +3051,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, kmem_cache_free(btrfs_free_space_cachep, entry); spin_unlock(&ctl->tree_lock); + trim_entry.start = extent_start; + trim_entry.bytes = extent_bytes; + list_add_tail(&trim_entry.list, &ctl->trimming_ranges); + mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - extent_start, extent_bytes); + extent_start, extent_bytes, &trim_entry); if (ret) break; next: @@ -3035,17 +3086,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, while (offset < end) { bool next_bitmap = false; + struct btrfs_trim_range trim_entry; + mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, offset, 1, 0); if (!entry) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } @@ -3054,6 +3109,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, ret2 = search_bitmap(ctl, entry, &start, &bytes); if (ret2 || start >= end) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } @@ -3061,6 +3117,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, bytes = min(bytes, end - start); if (bytes < minlen) { spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); goto next; } @@ -3069,9 +3126,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, free_bitmap(ctl, entry); spin_unlock(&ctl->tree_lock); + trim_entry.start = start; + trim_entry.bytes = bytes; + list_add_tail(&trim_entry.list, &ctl->trimming_ranges); + mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - start, bytes); + start, bytes, &trim_entry); if (ret) break; next: @@ -3101,11 +3162,54 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, *trimmed = 0; + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + atomic_inc(&block_group->trimming); + spin_unlock(&block_group->lock); + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); if (ret) - return ret; + goto out; ret = trim_bitmaps(block_group, trimmed, start, end, minlen); +out: + spin_lock(&block_group->lock); + if (atomic_dec_and_test(&block_group->trimming) && + block_group->removed) { + struct extent_map_tree *em_tree; + struct extent_map *em; + + spin_unlock(&block_group->lock); + + lock_chunks(block_group->fs_info->chunk_root); + em_tree = &block_group->fs_info->mapping_tree.map_tree; + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, block_group->key.objectid, + 1); + BUG_ON(!em); /* logic error, can't happen */ + /* + * remove_extent_mapping() will delete us from the pinned_chunks + * list, which is protected by the chunk mutex. + */ + remove_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + unlock_chunks(block_group->fs_info->chunk_root); + + /* once for us and once for the tree */ + free_extent_map(em); + free_extent_map(em); + + /* + * We've left one free space entry and other tasks trimming + * this block group have left 1 entry each one. Free them. + */ + __btrfs_remove_free_space_cache(block_group->free_space_ctl); + } else { + spin_unlock(&block_group->lock); + } return ret; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 0cf4977ef70d..88b2238a0aed 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -38,6 +38,8 @@ struct btrfs_free_space_ctl { u64 start; struct btrfs_free_space_op *op; void *private; + struct mutex cache_writeout_mutex; + struct list_head trimming_ranges; }; struct btrfs_free_space_op { diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 8ffa4783cbf4..265e03c73f4d 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + path->skip_release_on_error = 1; ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { @@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); ret = 0; } else if (ret < 0) { - if (ret == -EOVERFLOW) - ret = -EMLINK; + if (ret == -EOVERFLOW) { + if (find_name_in_backref(path, name, name_len, &ref)) + ret = -EEXIST; + else + ret = -EMLINK; + } goto out; } else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 83d646bd2e4b..74faea3a516e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root) root->root_key.objectid); if (IS_ERR(tsk)) { btrfs_warn(root->fs_info, "failed to start inode caching task"); - btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE, "disabling inode map caching"); } } @@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root) ctl->start = 0; ctl->private = NULL; ctl->op = &free_ino_op; + INIT_LIST_HEAD(&ctl->trimming_ranges); + mutex_init(&ctl->cache_writeout_mutex); /* * Initially we allow to use 16K of ram to cache chunks of diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d23362f4464e..d2e732d7af52 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -108,6 +108,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, static int btrfs_dirty_inode(struct inode *inode); +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +void btrfs_test_inode_set_ops(struct inode *inode) +{ + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; +} +#endif + static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -382,7 +389,7 @@ static inline int inode_need_compress(struct inode *inode) * are written in the same order that the flusher thread sent them * down. */ -static noinline int compress_file_range(struct inode *inode, +static noinline void compress_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, struct async_cow *async_cow, @@ -411,14 +418,6 @@ static noinline int compress_file_range(struct inode *inode, (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); - /* - * skip compression for a small file range(<=blocksize) that - * isn't an inline extent, since it dosen't save disk space at all. - */ - if ((end - start + 1) <= blocksize && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) - goto cleanup_and_bail_uncompressed; - actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; @@ -440,6 +439,14 @@ again: total_compressed = actual_end - start; + /* + * skip compression for a small file range(<=blocksize) that + * isn't an inline extent, since it dosen't save disk space at all. + */ + if (total_compressed <= blocksize && + (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + goto cleanup_and_bail_uncompressed; + /* we want to make sure that amount of ram required to uncompress * an extent is reasonable, so we limit the total size in ram * of a compressed extent to 128k. This is a crucial number @@ -527,7 +534,10 @@ cont: if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DEFRAG; + unsigned long page_error_op; + clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; + page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; /* * inline extent creation worked or returned error, @@ -538,6 +548,7 @@ cont: clear_flags, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + page_error_op | PAGE_END_WRITEBACK); goto free_pages_out; } @@ -620,8 +631,7 @@ cleanup_and_bail_uncompressed: *num_added += 1; } -out: - return ret; + return; free_pages_out: for (i = 0; i < nr_pages_ret; i++) { @@ -629,8 +639,22 @@ free_pages_out: page_cache_release(pages[i]); } kfree(pages); +} - goto out; +static void free_async_extent_pages(struct async_extent *async_extent) +{ + int i; + + if (!async_extent->pages) + return; + + for (i = 0; i < async_extent->nr_pages; i++) { + WARN_ON(async_extent->pages[i]->mapping); + page_cache_release(async_extent->pages[i]); + } + kfree(async_extent->pages); + async_extent->nr_pages = 0; + async_extent->pages = NULL; } /* @@ -639,7 +663,7 @@ free_pages_out: * queued. We walk all the async extents created by compress_file_range * and send them down to the disk. */ -static noinline int submit_compressed_extents(struct inode *inode, +static noinline void submit_compressed_extents(struct inode *inode, struct async_cow *async_cow) { struct async_extent *async_extent; @@ -651,9 +675,6 @@ static noinline int submit_compressed_extents(struct inode *inode, struct extent_io_tree *io_tree; int ret = 0; - if (list_empty(&async_cow->extents)) - return 0; - again: while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, @@ -709,15 +730,7 @@ retry: async_extent->compressed_size, 0, alloc_hint, &ins, 1, 1); if (ret) { - int i; - - for (i = 0; i < async_extent->nr_pages; i++) { - WARN_ON(async_extent->pages[i]->mapping); - page_cache_release(async_extent->pages[i]); - } - kfree(async_extent->pages); - async_extent->nr_pages = 0; - async_extent->pages = NULL; + free_async_extent_pages(async_extent); if (ret == -ENOSPC) { unlock_extent(io_tree, async_extent->start, @@ -814,15 +827,26 @@ retry: ins.objectid, ins.offset, async_extent->pages, async_extent->nr_pages); + if (ret) { + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct page *p = async_extent->pages[0]; + const u64 start = async_extent->start; + const u64 end = start + async_extent->ram_size - 1; + + p->mapping = inode->i_mapping; + tree->ops->writepage_end_io_hook(p, start, end, + NULL, 0); + p->mapping = NULL; + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, + PAGE_END_WRITEBACK | + PAGE_SET_ERROR); + free_async_extent_pages(async_extent); + } alloc_hint = ins.objectid + ins.offset; kfree(async_extent); - if (ret) - goto out; cond_resched(); } - ret = 0; -out: - return ret; + return; out_free_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_free: @@ -832,7 +856,9 @@ out_free: NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | + PAGE_SET_ERROR); + free_async_extent_pages(async_extent); kfree(async_extent); goto again; } @@ -1318,7 +1344,7 @@ next_slot: * we fall into common COW way. */ if (!nolock) { - err = btrfs_start_nocow_write(root); + err = btrfs_start_write_no_snapshoting(root); if (!err) goto out_check; } @@ -1342,7 +1368,7 @@ out_check: if (extent_end <= start) { path->slots[0]++; if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto next_slot; } if (!nocow) { @@ -1362,7 +1388,7 @@ out_check: page_started, nr_written, 1); if (ret) { if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto error; } cow_start = (u64)-1; @@ -1413,7 +1439,7 @@ out_check: num_bytes); if (ret) { if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); goto error; } } @@ -1424,7 +1450,7 @@ out_check: EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_SET_PRIVATE2); if (!nolock && nocow) - btrfs_end_nocow_write(root); + btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; if (cur_offset > end) break; @@ -1511,10 +1537,32 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, static void btrfs_split_extent_hook(struct inode *inode, struct extent_state *orig, u64 split) { + u64 size; + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; + size = orig->end - orig->start + 1; + if (size > BTRFS_MAX_EXTENT_SIZE) { + u64 num_extents; + u64 new_size; + + /* + * See the explanation in btrfs_merge_extent_hook, the same + * applies here, just in reverse. + */ + new_size = orig->end - split + 1; + num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + new_size = split - orig->start; + num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) >= num_extents) + return; + } + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); @@ -1530,10 +1578,55 @@ static void btrfs_merge_extent_hook(struct inode *inode, struct extent_state *new, struct extent_state *other) { + u64 new_size, old_size; + u64 num_extents; + /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; + if (new->start > other->start) + new_size = new->end - other->start + 1; + else + new_size = other->end - new->start + 1; + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->lock); + return; + } + + /* + * We have to add up either side to figure out how many extents were + * accounted for before we merged into one big extent. If the number of + * extents we accounted for is <= the amount we need for the new range + * then we can return, otherwise drop. Think of it like this + * + * [ 4k][MAX_SIZE] + * + * So we've grown the extent by a MAX_SIZE extent, this would mean we + * need 2 outstanding extents, on one side we have 1 and the other side + * we have 1 so they are == and we can return. But in this case + * + * [MAX_SIZE+4k][MAX_SIZE+4k] + * + * Each range on their own accounts for 2 extents, but merged together + * they are only 3 extents worth of accounting, so we need to drop in + * this case. + */ + old_size = other->end - other->start + 1; + num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + old_size = new->end - new->start + 1; + num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + + if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE) >= num_extents) + return; + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents--; spin_unlock(&BTRFS_I(inode)->lock); @@ -1585,7 +1678,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * have pending delalloc work to be done. */ static void btrfs_set_bit_hook(struct inode *inode, - struct extent_state *state, unsigned long *bits) + struct extent_state *state, unsigned *bits) { if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) @@ -1608,6 +1701,10 @@ static void btrfs_set_bit_hook(struct inode *inode, spin_unlock(&BTRFS_I(inode)->lock); } + /* For sanity tests */ + if (btrfs_test_is_dummy_root(root)) + return; + __percpu_counter_add(&root->fs_info->delalloc_bytes, len, root->fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); @@ -1626,9 +1723,11 @@ static void btrfs_set_bit_hook(struct inode *inode, */ static void btrfs_clear_bit_hook(struct inode *inode, struct extent_state *state, - unsigned long *bits) + unsigned *bits) { u64 len = state->end + 1 - state->start; + u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1, + BTRFS_MAX_EXTENT_SIZE); spin_lock(&BTRFS_I(inode)->lock); if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) @@ -1648,7 +1747,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, *bits &= ~EXTENT_FIRST_DELALLOC; } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents--; + BTRFS_I(inode)->outstanding_extents -= num_extents; spin_unlock(&BTRFS_I(inode)->lock); } @@ -1661,6 +1760,10 @@ static void btrfs_clear_bit_hook(struct inode *inode, root != root->fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); + /* For sanity tests. */ + if (btrfs_test_is_dummy_root(root)) + return; + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) btrfs_free_reserved_data_space(inode, len); @@ -2926,7 +3029,7 @@ static int __readpage_endio_check(struct inode *inode, return 0; zeroit: if (__ratelimit(&_rs)) - btrfs_info(BTRFS_I(inode)->root->fs_info, + btrfs_warn(BTRFS_I(inode)->root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, csum_expected); memset(kaddr + pgoff, 1, len); @@ -3388,7 +3491,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) out: if (ret) - btrfs_crit(root->fs_info, + btrfs_err(root->fs_info, "could not do orphan cleanup %d", ret); btrfs_free_path(path); return ret; @@ -3471,7 +3574,6 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; unsigned long ptr; @@ -3508,17 +3610,19 @@ static void btrfs_read_locked_inode(struct inode *inode) i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); - tspec = btrfs_inode_atime(inode_item); - inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); - tspec = btrfs_inode_mtime(inode_item); - inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); - tspec = btrfs_inode_ctime(inode_item); - inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); + + BTRFS_I(inode)->i_otime.tv_sec = + btrfs_timespec_sec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime.tv_nsec = + btrfs_timespec_nsec(leaf, &inode_item->otime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); @@ -3589,7 +3693,6 @@ cache_acl: switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; @@ -3604,7 +3707,6 @@ cache_acl: case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; break; default: inode->i_op = &btrfs_special_inode_operations; @@ -3639,21 +3741,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_sec(leaf, &item->atime, inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_nsec(leaf, &item->atime, inode->i_atime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_sec(leaf, &item->mtime, inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_nsec(leaf, &item->mtime, inode->i_mtime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_sec(leaf, &item->ctime, inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_nsec(leaf, &item->ctime, inode->i_ctime.tv_nsec, &token); + btrfs_set_token_timespec_sec(leaf, &item->otime, + BTRFS_I(inode)->i_otime.tv_sec, &token); + btrfs_set_token_timespec_nsec(leaf, &item->otime, + BTRFS_I(inode)->i_otime.tv_nsec, &token); + btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), &token); btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, @@ -4580,6 +4687,26 @@ next: return err; } +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +static void wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} + static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4604,17 +4731,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize > oldsize) { truncate_pagecache(inode, newsize); + /* + * Don't do an expanding truncate while snapshoting is ongoing. + * This is to ensure the snapshot captures a fully consistent + * state of this file - if the snapshot captures this expanding + * truncation, it must capture all writes that happened before + * this truncation. + */ + wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); - if (ret) + if (ret) { + btrfs_end_write_no_snapshoting(root); return ret; + } trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + btrfs_end_write_no_snapshoting(root); return PTR_ERR(trans); + } i_size_write(inode, newsize); btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); ret = btrfs_update_inode(trans, root, inode); + btrfs_end_write_no_snapshoting(root); btrfs_end_transaction(trans, root); } else { @@ -4957,6 +5097,7 @@ static int fixup_tree_root_location(struct btrfs_root *root, struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; + struct btrfs_key key; int ret; int err = 0; @@ -4967,9 +5108,12 @@ static int fixup_tree_root_location(struct btrfs_root *root, } err = -ENOENT; - ret = btrfs_find_item(root->fs_info->tree_root, path, - BTRFS_I(dir)->root->root_key.objectid, - location->objectid, BTRFS_ROOT_REF_KEY, NULL); + key.objectid = BTRFS_I(dir)->root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = location->objectid; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path, + 0, 0); if (ret) { if (ret < 0) err = ret; @@ -5208,7 +5352,10 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = CURRENT_TIME; + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; return inode; } @@ -5303,7 +5450,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_CAST(inode); } - return d_materialise_unique(dentry, inode); + return d_splice_alias(inode, dentry); } unsigned char btrfs_filetype_table[] = { @@ -5776,7 +5923,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + inode->i_mtime = CURRENT_TIME; + inode->i_atime = inode->i_mtime; + inode->i_ctime = inode->i_mtime; + BTRFS_I(inode)->i_otime = inode->i_mtime; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, @@ -6036,7 +6188,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -6203,8 +6354,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) out_fail: btrfs_end_transaction(trans, root); - if (drop_on_err) + if (drop_on_err) { + inode_dec_link_count(inode); iput(inode); + } btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; @@ -7000,9 +7153,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_put_ordered_extent(ordered); } else { /* Screw you mmap */ - ret = filemap_write_and_wait_range(inode->i_mapping, - lockstart, - lockend); + ret = btrfs_fdatawrite_range(inode, lockstart, lockend); + if (ret) + break; + ret = filemap_fdatawait_range(inode->i_mapping, + lockstart, + lockend); if (ret) break; @@ -7080,17 +7236,28 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; u64 len = bh_result->b_size; + u64 *outstanding_extents = NULL; int unlock_bits = EXTENT_LOCKED; int ret = 0; if (create) - unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; + unlock_bits |= EXTENT_DIRTY; else len = min_t(u64, len, root->sectorsize); lockstart = start; lockend = start + len - 1; + if (current->journal_info) { + /* + * Need to pull our outstanding extents and set journal_info to NULL so + * that anything that needs to check if there's a transction doesn't get + * confused. + */ + outstanding_extents = current->journal_info; + current->journal_info = NULL; + } + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. @@ -7151,7 +7318,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { int type; - int ret; u64 block_start, orig_start, orig_block_len, ram_bytes; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) @@ -7215,14 +7381,21 @@ unlock: if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->lock); + /* + * If we have an outstanding_extents count still set then we're + * within our reservation, otherwise we need to adjust our inode + * counter appropriately. + */ + if (*outstanding_extents) { + (*outstanding_extents)--; + } else { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->lock); + } - ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockstart + len - 1, EXTENT_DELALLOC, NULL, - &cached_state, GFP_NOFS); - BUG_ON(ret); + current->journal_info = outstanding_extents; + btrfs_free_reserved_data_space(inode, len); } /* @@ -7245,6 +7418,8 @@ unlock: unlock_err: clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, unlock_bits, 1, 0, &cached_state, GFP_NOFS); + if (outstanding_extents) + current->journal_info = outstanding_extents; return ret; } @@ -7751,8 +7926,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, } /* async crcs make it difficult to collect full stripe writes. */ - if (btrfs_get_alloc_profile(root, 1) & - (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) + if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK) async_submit = 0; else async_submit = 1; @@ -7945,6 +8119,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + u64 outstanding_extents = 0; size_t count = 0; int flags = 0; bool wakeup = true; @@ -7982,6 +8157,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) goto out; + outstanding_extents = div64_u64(count + + BTRFS_MAX_EXTENT_SIZE - 1, + BTRFS_MAX_EXTENT_SIZE); + + /* + * We need to know how many extents we reserved so that we can + * do the accounting properly if we go over the number we + * originally calculated. Abuse current->journal_info for this. + */ + current->journal_info = &outstanding_extents; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_done(inode); @@ -7994,13 +8179,12 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, iter, offset, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (rw & WRITE) { + current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) btrfs_delalloc_release_space(inode, count); else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); - else - btrfs_delalloc_release_metadata(inode, 0); } out: if (wakeup) @@ -8521,6 +8705,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delayed_node = NULL; + ei->i_otime.tv_sec = 0; + ei->i_otime.tv_nsec = 0; + inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(&ei->io_tree, &inode->i_data); @@ -9146,7 +9333,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); @@ -9190,7 +9376,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); @@ -9402,7 +9587,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); @@ -9442,6 +9626,21 @@ out_inode: } +/* Inspired by filemap_check_errors() */ +int btrfs_inode_check_errors(struct inode *inode) +{ + int ret = 0; + + if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) && + test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags)) + ret = -ENOSPC; + if (test_bit(AS_EIO, &inode->i_mapping->flags) && + test_and_clear_bit(AS_EIO, &inode->i_mapping->flags)) + ret = -EIO; + + return ret; +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4399f0c3a4ce..74609b931ba5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -617,7 +617,7 @@ fail: return ret; } -static void btrfs_wait_nocow_write(struct btrfs_root *root) +static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) { s64 writers; DEFINE_WAIT(wait); @@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, atomic_inc(&root->will_be_snapshoted); smp_mb__after_atomic(); - btrfs_wait_nocow_write(root); + btrfs_wait_for_no_snapshoting_writes(root); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) @@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - /* - * If orphan cleanup did remove any orphans, it means the tree was - * modified and therefore the commit root is not the same as the - * current root anymore. This is a problem, because send uses the - * commit root and therefore can see inode items that don't exist - * in the current root anymore, and for example make calls to - * btrfs_iget, which will do tree lookups based on the current root - * and not on the commit root. Those lookups will fail, returning a - * -ESTALE error, and making send fail with that error. So make sure - * a send does not see any orphans we have just removed, and that it - * will see the same inodes regardless of whether a transaction - * commit happened before it started (meaning that the commit root - * will be the same as the current root) or not. - */ - if (readonly && pending_snapshot->snap->node != - pending_snapshot->snap->commit_root) { - trans = btrfs_join_transaction(pending_snapshot->snap); - if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto fail; - } - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, - pending_snapshot->snap); - if (ret) - goto fail; - } - } - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -761,7 +732,8 @@ fail: free: kfree(pending_snapshot); out: - atomic_dec(&root->will_be_snapshoted); + if (atomic_dec_and_test(&root->will_be_snapshoted)) + wake_up_atomic_t(&root->will_be_snapshoted); return ret; } @@ -804,11 +776,11 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) return -EPERM; if (isdir) { - if (!S_ISDIR(victim->d_inode->i_mode)) + if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; - } else if (S_ISDIR(victim->d_inode->i_mode)) + } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; @@ -5296,7 +5268,7 @@ long btrfs_ioctl(struct file *file, unsigned int ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); if (ret) return ret; - ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); + ret = btrfs_sync_fs(file_inode(file)->i_sb, 1); /* * The transaction thread may want to do more work, * namely it pokes the cleaner ktread that will start diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5665d2149249..f8229ef1b46d 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -128,6 +128,26 @@ again: } /* + * take a spinning read lock. + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers + */ +int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) +{ + if (atomic_read(&eb->blocking_writers)) + return 0; + + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + return 0; + } + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); + return 1; +} + +/* * returns 1 if we get the read lock and 0 if we don't * this won't wait for blocking writers */ @@ -158,9 +178,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) atomic_read(&eb->blocking_readers)) return 0; - if (!write_trylock(&eb->lock)) - return 0; - + write_lock(&eb->lock); if (atomic_read(&eb->blocking_writers) || atomic_read(&eb->blocking_readers)) { write_unlock(&eb->lock); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index b81e0e9a4894..c44a9d5f5362 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -35,6 +35,8 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); void btrfs_assert_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); +int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); + static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) { diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 78285f30909e..617553cdb7d3 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -373,6 +373,8 @@ cont: } done: kunmap(pages_in[page_in_index]); + if (!ret) + btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset); return ret; } @@ -410,10 +412,23 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in, goto out; } + /* + * the caller is already checking against PAGE_SIZE, but lets + * move this check closer to the memcpy/memset + */ + destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes = min_t(unsigned long, destlen, out_len - start_byte); kaddr = kmap_atomic(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); + + /* + * btrfs_getblock is doing a zero on the tail of the page too, + * but this will cover anything missing from the decompressed + * data. + */ + if (bytes < destlen) + memset(kaddr+bytes, 0, destlen-bytes); kunmap_atomic(kaddr); out: return ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ac734ec4cc20..157cc54fc634 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); INIT_LIST_HEAD(&entry->log_list); + INIT_LIST_HEAD(&entry->trans_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -431,19 +432,29 @@ out: /* Needs to either be called under a log transaction or the log_mutex */ void btrfs_get_logged_extents(struct inode *inode, - struct list_head *logged_list) + struct list_head *logged_list, + const loff_t start, + const loff_t end) { struct btrfs_ordered_inode_tree *tree; struct btrfs_ordered_extent *ordered; struct rb_node *n; + struct rb_node *prev; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); - for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + n = __tree_search(&tree->tree, end, &prev); + if (!n) + n = prev; + for (; n; n = rb_prev(n)) { ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); - if (!list_empty(&ordered->log_list)) + if (ordered->file_offset > end) continue; - list_add_tail(&ordered->log_list, logged_list); + if (entry_end(ordered) <= start) + break; + if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + list_add(&ordered->log_list, logged_list); atomic_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); @@ -472,7 +483,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list, spin_unlock_irq(&log->log_extents_lock[index]); } -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid) { struct btrfs_ordered_extent *ordered; int index = transid % 2; @@ -497,7 +509,7 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - btrfs_put_ordered_extent(ordered); + list_add_tail(&ordered->trans_list, &trans->ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -725,30 +737,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) /* start IO across the range first to instantiate any delalloc * extents */ - ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); + ret = btrfs_fdatawrite_range(inode, start, orig_end); if (ret) return ret; - /* - * So with compression we will find and lock a dirty page and clear the - * first one as dirty, setup an async extent, and immediately return - * with the entire range locked but with nobody actually marked with - * writeback. So we can't just filemap_write_and_wait_range() and - * expect it to work since it will just kick off a thread to do the - * actual work. So we need to call filemap_fdatawrite_range _again_ - * since it will wait on the page lock, which won't be unlocked until - * after the pages have been marked as writeback and so we're good to go - * from there. We have to do this otherwise we'll miss the ordered - * extents and that results in badness. Please Josef, do not think you - * know better and pull this out at some point in the future, it is - * right and you are wrong. - */ - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - orig_end); - if (ret) - return ret; - } + ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); if (ret) return ret; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d81a274d621e..e96cd4ccd805 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -71,6 +71,8 @@ struct btrfs_ordered_sum { ordered extent */ #define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ +#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent + * in the logging code. */ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -121,6 +123,9 @@ struct btrfs_ordered_extent { /* If we need to wait on this to be done */ struct list_head log_list; + /* If the transaction needs to wait on this ordered extent */ + struct list_head trans_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); void btrfs_get_logged_extents(struct inode *inode, - struct list_head *logged_list); + struct list_head *logged_list, + const loff_t start, + const loff_t end); void btrfs_put_logged_extents(struct list_head *logged_list); void btrfs_submit_logged_extents(struct list_head *logged_list, struct btrfs_root *log); -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void ordered_data_exit(void); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 48b60dbf807f..058c79eecbfb 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1259,7 +1259,7 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1, if (oper1->seq < oper2->seq) return -1; if (oper1->seq > oper2->seq) - return -1; + return 1; if (oper1->ref_root < oper2->ref_root) return -1; if (oper1->ref_root > oper2->ref_root) @@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup = u64_to_ptr(unode->aux); qgroup->rfer += sign * oper->num_bytes; qgroup->rfer_cmpr += sign * oper->num_bytes; + WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); qgroup->excl += sign * oper->num_bytes; - if (sign < 0) - WARN_ON(qgroup->excl < oper->num_bytes); qgroup->excl_cmpr += sign * oper->num_bytes; qgroup_dirty(fs_info, qgroup); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6a41631cb959..5264858ed768 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -58,20 +58,18 @@ */ #define RBIO_CACHE_READY_BIT 3 - #define RBIO_CACHE_SIZE 1024 +enum btrfs_rbio_ops { + BTRFS_RBIO_WRITE = 0, + BTRFS_RBIO_READ_REBUILD = 1, + BTRFS_RBIO_PARITY_SCRUB = 2, +}; + struct btrfs_raid_bio { struct btrfs_fs_info *fs_info; struct btrfs_bio *bbio; - /* - * logical block numbers for the start of each stripe - * The last one or two are p/q. These are sorted, - * so raid_map[0] is the start of our full stripe - */ - u64 *raid_map; - /* while we're doing rmw on a stripe * we put it into a hash table so we can * lock the stripe and merge more rbios @@ -117,13 +115,16 @@ struct btrfs_raid_bio { /* number of data stripes (no p/q) */ int nr_data; + int real_stripes; + + int stripe_npages; /* * set if we're doing a parity rebuild * for a read from higher up, which is handled * differently from a parity rebuild as part of * rmw */ - int read_rebuild; + enum btrfs_rbio_ops operation; /* first bad stripe */ int faila; @@ -131,6 +132,7 @@ struct btrfs_raid_bio { /* second bad stripe (for raid6 use) */ int failb; + int scrubp; /* * number of pages needed to represent the full * stripe @@ -144,8 +146,13 @@ struct btrfs_raid_bio { */ int bio_list_bytes; + int generic_bio_cnt; + atomic_t refs; + atomic_t stripes_pending; + + atomic_t error; /* * these are two arrays of pointers. We allocate the * rbio big enough to hold them both and setup their @@ -162,6 +169,11 @@ struct btrfs_raid_bio { * here for faster lookup */ struct page **bio_pages; + + /* + * bitmap to record which horizontal stripe has data + */ + unsigned long *dbitmap; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); @@ -176,6 +188,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); +static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, + int need_check); +static void async_scrub_parity(struct btrfs_raid_bio *rbio); + /* * the stripe hash table is used for locking, and to collect * bios in hopes of making a full stripe @@ -271,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { - u64 num = rbio->raid_map[0]; + u64 num = rbio->bbio->raid_map[0]; /* * we shift down quite a bit. We're using byte @@ -324,6 +340,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest, { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; + dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -573,15 +590,24 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->raid_map[0] != - cur->raid_map[0]) + if (last->bbio->raid_map[0] != + cur->bbio->raid_map[0]) return 0; - /* reads can't merge with writes */ - if (last->read_rebuild != - cur->read_rebuild) { + /* we can't merge with different operations */ + if (last->operation != cur->operation) + return 0; + /* + * We've need read the full stripe from the drive. + * check and repair the parity and write the new results. + * + * We're not allowed to add any new bios to the + * bio list here, anyone else that wants to + * change this stripe needs to do their own rmw. + */ + if (last->operation == BTRFS_RBIO_PARITY_SCRUB || + cur->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - } return 1; } @@ -601,7 +627,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) */ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) { - if (rbio->nr_data + 1 == rbio->bbio->num_stripes) + if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; index += ((rbio->nr_data + 1) * rbio->stripe_len) >> @@ -647,7 +673,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { walk++; - if (cur->raid_map[0] == rbio->raid_map[0]) { + if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) { spin_lock(&cur->bio_list_lock); /* can we steal this cached rbio's pages? */ @@ -772,11 +798,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); - if (next->read_rebuild) + if (next->operation == BTRFS_RBIO_READ_REBUILD) async_read_rebuild(next); - else { + else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); async_rmw_stripe(next); + } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { + steal_rbio(rbio, next); + async_scrub_parity(next); } goto done_nolock; @@ -814,8 +843,8 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) rbio->stripe_pages[i] = NULL; } } - kfree(rbio->raid_map); - kfree(rbio->bbio); + + btrfs_put_bbio(rbio->bbio); kfree(rbio); } @@ -833,6 +862,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *next; + + if (rbio->generic_bio_cnt) + btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); + free_raid_bio(rbio); while (cur) { @@ -858,13 +891,13 @@ static void raid_write_end_io(struct bio *bio, int err) bio_put(bio); - if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + if (!atomic_dec_and_test(&rbio->stripes_pending)) return; err = 0; /* OK, we have read all the stripes we need to. */ - if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) err = -EIO; rbio_orig_end_io(rbio, err, 0); @@ -920,21 +953,20 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len) + struct btrfs_bio *bbio, u64 stripe_len) { struct btrfs_raid_bio *rbio; int nr_data = 0; - int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); + int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + int num_pages = rbio_nr_pages(stripe_len, real_stripes); + int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; - rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, + rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + + DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), GFP_NOFS); - if (!rbio) { - kfree(raid_map); - kfree(bbio); + if (!rbio) return ERR_PTR(-ENOMEM); - } bio_list_init(&rbio->bio_list); INIT_LIST_HEAD(&rbio->plug_list); @@ -942,13 +974,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bbio = bbio; - rbio->raid_map = raid_map; rbio->fs_info = root->fs_info; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; + rbio->real_stripes = real_stripes; + rbio->stripe_npages = stripe_npages; rbio->faila = -1; rbio->failb = -1; atomic_set(&rbio->refs, 1); + atomic_set(&rbio->error, 0); + atomic_set(&rbio->stripes_pending, 0); /* * the stripe_pages and bio_pages array point to the extra @@ -957,11 +992,14 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, p = rbio + 1; rbio->stripe_pages = p; rbio->bio_pages = p + sizeof(struct page *) * num_pages; + rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; - if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) - nr_data = bbio->num_stripes - 2; + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + nr_data = real_stripes - 1; + else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + nr_data = real_stripes - 2; else - nr_data = bbio->num_stripes - 1; + BUG(); rbio->nr_data = nr_data; return rbio; @@ -1073,7 +1111,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) { if (rbio->faila >= 0 || rbio->failb >= 0) { - BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); + BUG_ON(rbio->faila == rbio->real_stripes - 1); __raid56_parity_recover(rbio); } else { finish_rmw(rbio); @@ -1112,7 +1150,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_lock_irq(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) { start = (u64)bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->raid_map[0]; + stripe_offset = start - rbio->bbio->raid_map[0]; page_index = stripe_offset >> PAGE_CACHE_SHIFT; for (i = 0; i < bio->bi_vcnt; i++) { @@ -1134,7 +1172,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_bio *bbio = rbio->bbio; - void *pointers[bbio->num_stripes]; + void *pointers[rbio->real_stripes]; int stripe_len = rbio->stripe_len; int nr_data = rbio->nr_data; int stripe; @@ -1148,11 +1186,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) bio_list_init(&bio_list); - if (bbio->num_stripes - rbio->nr_data == 1) { - p_stripe = bbio->num_stripes - 1; - } else if (bbio->num_stripes - rbio->nr_data == 2) { - p_stripe = bbio->num_stripes - 2; - q_stripe = bbio->num_stripes - 1; + if (rbio->real_stripes - rbio->nr_data == 1) { + p_stripe = rbio->real_stripes - 1; + } else if (rbio->real_stripes - rbio->nr_data == 2) { + p_stripe = rbio->real_stripes - 2; + q_stripe = rbio->real_stripes - 1; } else { BUG(); } @@ -1169,7 +1207,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock_irq(&rbio->bio_list_lock); - atomic_set(&rbio->bbio->error, 0); + atomic_set(&rbio->error, 0); /* * now that we've set rmw_locked, run through the @@ -1209,7 +1247,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) SetPageUptodate(p); pointers[stripe++] = kmap(p); - raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, + raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, pointers); } else { /* raid5 */ @@ -1218,7 +1256,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } - for (stripe = 0; stripe < bbio->num_stripes; stripe++) + for (stripe = 0; stripe < rbio->real_stripes; stripe++) kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); } @@ -1227,7 +1265,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ - for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { @@ -1245,8 +1283,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } } - atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); - BUG_ON(atomic_read(&bbio->stripes_pending) == 0); + if (likely(!bbio->num_tgtdevs)) + goto write_data; + + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + if (!bbio->tgtdev_map[stripe]) + continue; + + for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + struct page *page; + if (stripe < rbio->nr_data) { + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (!page) + continue; + } else { + page = rbio_stripe_page(rbio, stripe, pagenr); + } + + ret = rbio_add_io_page(rbio, &bio_list, page, + rbio->bbio->tgtdev_map[stripe], + pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + +write_data: + atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); + BUG_ON(atomic_read(&rbio->stripes_pending) == 0); while (1) { bio = bio_list_pop(&bio_list); @@ -1283,7 +1347,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, stripe = &rbio->bbio->stripes[i]; stripe_start = stripe->physical; if (physical >= stripe_start && - physical < stripe_start + rbio->stripe_len) { + physical < stripe_start + rbio->stripe_len && + bio->bi_bdev == stripe->dev->bdev) { return i; } } @@ -1305,7 +1370,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, logical <<= 9; for (i = 0; i < rbio->nr_data; i++) { - stripe_start = rbio->raid_map[i]; + stripe_start = rbio->bbio->raid_map[i]; if (logical >= stripe_start && logical < stripe_start + rbio->stripe_len) { return i; @@ -1331,11 +1396,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) if (rbio->faila == -1) { /* first failure on this rbio */ rbio->faila = failed; - atomic_inc(&rbio->bbio->error); + atomic_inc(&rbio->error); } else if (rbio->failb == -1) { /* second failure on this rbio */ rbio->failb = failed; - atomic_inc(&rbio->bbio->error); + atomic_inc(&rbio->error); } else { ret = -EIO; } @@ -1394,11 +1459,11 @@ static void raid_rmw_end_io(struct bio *bio, int err) bio_put(bio); - if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + if (!atomic_dec_and_test(&rbio->stripes_pending)) return; err = 0; - if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) goto cleanup; /* @@ -1439,7 +1504,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio) static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) { int bios_to_read = 0; - struct btrfs_bio *bbio = rbio->bbio; struct bio_list bio_list; int ret; int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); @@ -1455,7 +1519,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - atomic_set(&rbio->bbio->error, 0); + atomic_set(&rbio->error, 0); /* * build a list of bios to read all the missing parts of this * stripe @@ -1503,7 +1567,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * the bbio may be freed once we submit the last bio. Make sure * not to touch it after that */ - atomic_set(&bbio->stripes_pending, bios_to_read); + atomic_set(&rbio->stripes_pending, bios_to_read); while (1) { bio = bio_list_pop(&bio_list); if (!bio) @@ -1680,25 +1744,35 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) * our main entry point for writes from the rest of the FS. */ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len) + struct btrfs_bio *bbio, u64 stripe_len) { struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; + int ret; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); - if (IS_ERR(rbio)) + rbio = alloc_rbio(root, bbio, stripe_len); + if (IS_ERR(rbio)) { + btrfs_put_bbio(bbio); return PTR_ERR(rbio); + } bio_list_add(&rbio->bio_list, bio); rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio->operation = BTRFS_RBIO_WRITE; + + btrfs_bio_counter_inc_noblocked(root->fs_info); + rbio->generic_bio_cnt = 1; /* * don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) - return full_stripe_write(rbio); + if (rbio_is_full(rbio)) { + ret = full_stripe_write(rbio); + if (ret) + btrfs_bio_counter_dec(root->fs_info); + return ret; + } cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, sizeof(*plug)); @@ -1709,10 +1783,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); + ret = 0; } else { - return __raid56_parity_write(rbio); + ret = __raid56_parity_write(rbio); + if (ret) + btrfs_bio_counter_dec(root->fs_info); } - return 0; + return ret; } /* @@ -1730,7 +1807,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int err; int i; - pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), + pointers = kzalloc(rbio->real_stripes * sizeof(void *), GFP_NOFS); if (!pointers) { err = -ENOMEM; @@ -1740,7 +1817,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) faila = rbio->faila; failb = rbio->failb; - if (rbio->read_rebuild) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { spin_lock_irq(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock_irq(&rbio->bio_list_lock); @@ -1749,15 +1826,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); for (pagenr = 0; pagenr < nr_pages; pagenr++) { + /* + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(pagenr, rbio->dbitmap)) + continue; + /* setup our array of pointers with pages * from each stripe */ - for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* * if we're rebuilding a read, we have to use * pages from the bio list */ - if (rbio->read_rebuild && + if (rbio->operation == BTRFS_RBIO_READ_REBUILD && (stripe == faila || stripe == failb)) { page = page_in_rbio(rbio, stripe, pagenr, 0); } else { @@ -1767,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* all raid6 handling here */ - if (rbio->raid_map[rbio->bbio->num_stripes - 1] == - RAID6_Q_STRIPE) { - + if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { /* * single failure, rebuild from parity raid5 * style @@ -1804,8 +1887,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * here due to a crc mismatch and we can't give them the * data they want */ - if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->raid_map[faila] == RAID5_P_STRIPE) { + if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bbio->raid_map[faila] == + RAID5_P_STRIPE) { err = -EIO; goto cleanup; } @@ -1816,11 +1900,11 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) goto pstripe; } - if (rbio->raid_map[failb] == RAID5_P_STRIPE) { - raid6_datap_recov(rbio->bbio->num_stripes, + if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { + raid6_datap_recov(rbio->real_stripes, PAGE_SIZE, faila, pointers); } else { - raid6_2data_recov(rbio->bbio->num_stripes, + raid6_2data_recov(rbio->real_stripes, PAGE_SIZE, faila, failb, pointers); } @@ -1850,7 +1934,7 @@ pstripe: * know they can be trusted. If this was a read reconstruction, * other endio functions will fiddle the uptodate bits */ - if (!rbio->read_rebuild) { + if (rbio->operation == BTRFS_RBIO_WRITE) { for (i = 0; i < nr_pages; i++) { if (faila != -1) { page = rbio_stripe_page(rbio, faila, i); @@ -1862,12 +1946,12 @@ pstripe: } } } - for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* * if we're rebuilding a read, we have to use * pages from the bio list */ - if (rbio->read_rebuild && + if (rbio->operation == BTRFS_RBIO_READ_REBUILD && (stripe == faila || stripe == failb)) { page = page_in_rbio(rbio, stripe, pagenr, 0); } else { @@ -1882,8 +1966,7 @@ cleanup: kfree(pointers); cleanup_io: - - if (rbio->read_rebuild) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { if (err == 0) cache_rbio_pages(rbio); else @@ -1893,7 +1976,13 @@ cleanup_io: } else if (err == 0) { rbio->faila = -1; rbio->failb = -1; - finish_rmw(rbio); + + if (rbio->operation == BTRFS_RBIO_WRITE) + finish_rmw(rbio); + else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) + finish_parity_scrub(rbio, 0); + else + BUG(); } else { rbio_orig_end_io(rbio, err, 0); } @@ -1917,10 +2006,10 @@ static void raid_recover_end_io(struct bio *bio, int err) set_bio_pages_uptodate(bio); bio_put(bio); - if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) + if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) rbio_orig_end_io(rbio, -EIO, 0); else __raid_recover_end_io(rbio); @@ -1937,7 +2026,6 @@ static void raid_recover_end_io(struct bio *bio, int err) static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) { int bios_to_read = 0; - struct btrfs_bio *bbio = rbio->bbio; struct bio_list bio_list; int ret; int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); @@ -1951,16 +2039,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) if (ret) goto cleanup; - atomic_set(&rbio->bbio->error, 0); + atomic_set(&rbio->error, 0); /* * read everything that hasn't failed. Thanks to the * stripe cache, it is possible that some or all of these * pages are going to be uptodate. */ - for (stripe = 0; stripe < bbio->num_stripes; stripe++) { + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { if (rbio->faila == stripe || rbio->failb == stripe) { - atomic_inc(&rbio->bbio->error); + atomic_inc(&rbio->error); continue; } @@ -1990,7 +2078,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * were up to date, or we might have no bios to read because * the devices were gone. */ - if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { + if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { __raid_recover_end_io(rbio); goto out; } else { @@ -2002,7 +2090,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * the bbio may be freed once we submit the last bio. Make sure * not to touch it after that */ - atomic_set(&bbio->stripes_pending, bios_to_read); + atomic_set(&rbio->stripes_pending, bios_to_read); while (1) { bio = bio_list_pop(&bio_list); if (!bio) @@ -2021,7 +2109,7 @@ out: return 0; cleanup: - if (rbio->read_rebuild) + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) rbio_orig_end_io(rbio, -EIO, 0); return -EIO; } @@ -2033,35 +2121,45 @@ cleanup: * of the drive. */ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num) + struct btrfs_bio *bbio, u64 stripe_len, + int mirror_num, int generic_io) { struct btrfs_raid_bio *rbio; int ret; - rbio = alloc_rbio(root, bbio, raid_map, stripe_len); - if (IS_ERR(rbio)) + rbio = alloc_rbio(root, bbio, stripe_len); + if (IS_ERR(rbio)) { + if (generic_io) + btrfs_put_bbio(bbio); return PTR_ERR(rbio); + } - rbio->read_rebuild = 1; + rbio->operation = BTRFS_RBIO_READ_REBUILD; bio_list_add(&rbio->bio_list, bio); rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { BUG(); - kfree(raid_map); - kfree(bbio); + if (generic_io) + btrfs_put_bbio(bbio); kfree(rbio); return -EIO; } + if (generic_io) { + btrfs_bio_counter_inc_noblocked(root->fs_info); + rbio->generic_bio_cnt = 1; + } else { + btrfs_get_bbio(bbio); + } + /* * reconstruct from the q stripe if they are * asking for mirror 3 */ if (mirror_num == 3) - rbio->failb = bbio->num_stripes - 2; + rbio->failb = rbio->real_stripes - 2; ret = lock_stripe_add(rbio); @@ -2098,3 +2196,483 @@ static void read_rebuild_work(struct btrfs_work *work) rbio = container_of(work, struct btrfs_raid_bio, work); __raid56_parity_recover(rbio); } + +/* + * The following code is used to scrub/replace the parity stripe + * + * Note: We need make sure all the pages that add into the scrub/replace + * raid bio are correct and not be changed during the scrub/replace. That + * is those pages just hold metadata or file data with checksum. + */ + +struct btrfs_raid_bio * +raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 stripe_len, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors) +{ + struct btrfs_raid_bio *rbio; + int i; + + rbio = alloc_rbio(root, bbio, stripe_len); + if (IS_ERR(rbio)) + return NULL; + bio_list_add(&rbio->bio_list, bio); + /* + * This is a special bio which is used to hold the completion handler + * and make the scrub rbio is similar to the other types + */ + ASSERT(!bio->bi_iter.bi_size); + rbio->operation = BTRFS_RBIO_PARITY_SCRUB; + + for (i = 0; i < rbio->real_stripes; i++) { + if (bbio->stripes[i].dev == scrub_dev) { + rbio->scrubp = i; + break; + } + } + + /* Now we just support the sectorsize equals to page size */ + ASSERT(root->sectorsize == PAGE_SIZE); + ASSERT(rbio->stripe_npages == stripe_nsectors); + bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + + return rbio; +} + +void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, + struct page *page, u64 logical) +{ + int stripe_offset; + int index; + + ASSERT(logical >= rbio->bbio->raid_map[0]); + ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + + rbio->stripe_len * rbio->nr_data); + stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); + index = stripe_offset >> PAGE_CACHE_SHIFT; + rbio->bio_pages[index] = page; +} + +/* + * We just scrub the parity that we have correct data on the same horizontal, + * so we needn't allocate all pages for all the stripes. + */ +static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) +{ + int i; + int bit; + int index; + struct page *page; + + for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { + for (i = 0; i < rbio->real_stripes; i++) { + index = i * rbio->stripe_npages + bit; + if (rbio->stripe_pages[index]) + continue; + + page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; + ClearPageUptodate(page); + } + } + return 0; +} + +/* + * end io function used by finish_rmw. When we finally + * get here, we've written a full stripe + */ +static void raid_write_parity_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + err = 0; + + if (atomic_read(&rbio->error)) + err = -EIO; + + rbio_orig_end_io(rbio, err, 0); +} + +static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, + int need_check) +{ + struct btrfs_bio *bbio = rbio->bbio; + void *pointers[rbio->real_stripes]; + DECLARE_BITMAP(pbitmap, rbio->stripe_npages); + int nr_data = rbio->nr_data; + int stripe; + int pagenr; + int p_stripe = -1; + int q_stripe = -1; + struct page *p_page = NULL; + struct page *q_page = NULL; + struct bio_list bio_list; + struct bio *bio; + int is_replace = 0; + int ret; + + bio_list_init(&bio_list); + + if (rbio->real_stripes - rbio->nr_data == 1) { + p_stripe = rbio->real_stripes - 1; + } else if (rbio->real_stripes - rbio->nr_data == 2) { + p_stripe = rbio->real_stripes - 2; + q_stripe = rbio->real_stripes - 1; + } else { + BUG(); + } + + if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { + is_replace = 1; + bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); + } + + /* + * Because the higher layers(scrubber) are unlikely to + * use this area of the disk again soon, so don't cache + * it. + */ + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + if (!need_check) + goto writeback; + + p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!p_page) + goto cleanup; + SetPageUptodate(p_page); + + if (q_stripe != -1) { + q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!q_page) { + __free_page(p_page); + goto cleanup; + } + SetPageUptodate(q_page); + } + + atomic_set(&rbio->error, 0); + + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *p; + void *parity; + /* first collect one page from each data stripe */ + for (stripe = 0; stripe < nr_data; stripe++) { + p = page_in_rbio(rbio, stripe, pagenr, 0); + pointers[stripe] = kmap(p); + } + + /* then add the parity stripe */ + pointers[stripe++] = kmap(p_page); + + if (q_stripe != -1) { + + /* + * raid6, add the qstripe and call the + * library function to fill in our p/q + */ + pointers[stripe++] = kmap(q_page); + + raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + pointers); + } else { + /* raid5 */ + memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); + run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); + } + + /* Check scrubbing pairty and repair it */ + p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + parity = kmap(p); + if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE)) + memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE); + else + /* Parity is right, needn't writeback */ + bitmap_clear(rbio->dbitmap, pagenr, 1); + kunmap(p); + + for (stripe = 0; stripe < rbio->real_stripes; stripe++) + kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + } + + __free_page(p_page); + if (q_page) + __free_page(q_page); + +writeback: + /* + * time to start writing. Make bios for everything from the + * higher layers (the bio_list in our rbio) and our p/q. Ignore + * everything else. + */ + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *page; + + page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + ret = rbio_add_io_page(rbio, &bio_list, + page, rbio->scrubp, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + + if (!is_replace) + goto submit_write; + + for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { + struct page *page; + + page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); + ret = rbio_add_io_page(rbio, &bio_list, page, + bbio->tgtdev_map[rbio->scrubp], + pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + +submit_write: + nr_data = bio_list_size(&bio_list); + if (!nr_data) { + /* Every parity is right */ + rbio_orig_end_io(rbio, 0, 0); + return; + } + + atomic_set(&rbio->stripes_pending, nr_data); + + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid_write_parity_end_io; + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(WRITE, bio); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) +{ + if (stripe >= 0 && stripe < rbio->nr_data) + return 1; + return 0; +} + +/* + * While we're doing the parity check and repair, we could have errors + * in reading pages off the disk. This checks for errors and if we're + * not able to read the page it'll trigger parity reconstruction. The + * parity scrub will be finished after we've reconstructed the failed + * stripes + */ +static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) +{ + if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + goto cleanup; + + if (rbio->faila >= 0 || rbio->failb >= 0) { + int dfail = 0, failp = -1; + + if (is_data_stripe(rbio, rbio->faila)) + dfail++; + else if (is_parity_stripe(rbio->faila)) + failp = rbio->faila; + + if (is_data_stripe(rbio, rbio->failb)) + dfail++; + else if (is_parity_stripe(rbio->failb)) + failp = rbio->failb; + + /* + * Because we can not use a scrubbing parity to repair + * the data, so the capability of the repair is declined. + * (In the case of RAID5, we can not repair anything) + */ + if (dfail > rbio->bbio->max_errors - 1) + goto cleanup; + + /* + * If all data is good, only parity is correctly, just + * repair the parity. + */ + if (dfail == 0) { + finish_parity_scrub(rbio, 0); + return; + } + + /* + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity + * is scrubbing parity, luckly, use the other one to repair + * the data, or we can not repair the data stripe. + */ + if (failp != rbio->scrubp) + goto cleanup; + + __raid_recover_end_io(rbio); + } else { + finish_parity_scrub(rbio, 1); + } + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); +} + +/* + * end io for the read phase of the rmw cycle. All the bios here are physical + * stripe bios we've read from the disk so we can recalculate the parity of the + * stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid56_parity_scrub_end_io(struct bio *bio, int err) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (err) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(bio); + + bio_put(bio); + + if (!atomic_dec_and_test(&rbio->stripes_pending)) + return; + + /* + * this will normally call finish_rmw to start our write + * but if there are any failed stripes we'll reconstruct + * from parity first + */ + validate_rbio_for_parity_scrub(rbio); +} + +static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +{ + int bios_to_read = 0; + struct bio_list bio_list; + int ret; + int pagenr; + int stripe; + struct bio *bio; + + ret = alloc_rbio_essential_pages(rbio); + if (ret) + goto cleanup; + + bio_list_init(&bio_list); + + atomic_set(&rbio->error, 0); + /* + * build a list of bios to read all the missing parts of this + * stripe + */ + for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { + struct page *page; + /* + * we want to find all the pages missing from + * the rbio and read them from the disk. If + * page_in_rbio finds a page in the bio list + * we don't need to read it off the stripe. + */ + page = page_in_rbio(rbio, stripe, pagenr, 1); + if (page) + continue; + + page = rbio_stripe_page(rbio, stripe, pagenr); + /* + * the bio cache may have handed us an uptodate + * page. If so, be happy and use it + */ + if (PageUptodate(page)) + continue; + + ret = rbio_add_io_page(rbio, &bio_list, page, + stripe, pagenr, rbio->stripe_len); + if (ret) + goto cleanup; + } + } + + bios_to_read = bio_list_size(&bio_list); + if (!bios_to_read) { + /* + * this can happen if others have merged with + * us, it means there is nothing left to read. + * But if there are missing devices it may not be + * safe to do the full stripe write yet. + */ + goto finish; + } + + /* + * the bbio may be freed once we submit the last bio. Make sure + * not to touch it after that + */ + atomic_set(&rbio->stripes_pending, bios_to_read); + while (1) { + bio = bio_list_pop(&bio_list); + if (!bio) + break; + + bio->bi_private = rbio; + bio->bi_end_io = raid56_parity_scrub_end_io; + + btrfs_bio_wq_end_io(rbio->fs_info, bio, + BTRFS_WQ_ENDIO_RAID56); + + BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); + submit_bio(READ, bio); + } + /* the actual write will happen once the reads are done */ + return; + +cleanup: + rbio_orig_end_io(rbio, -EIO, 0); + return; + +finish: + validate_rbio_for_parity_scrub(rbio); +} + +static void scrub_parity_work(struct btrfs_work *work) +{ + struct btrfs_raid_bio *rbio; + + rbio = container_of(work, struct btrfs_raid_bio, work); + raid56_parity_scrub_stripe(rbio); +} + +static void async_scrub_parity(struct btrfs_raid_bio *rbio) +{ + btrfs_init_work(&rbio->work, btrfs_rmw_helper, + scrub_parity_work, NULL, NULL); + + btrfs_queue_work(rbio->fs_info->rmw_workers, + &rbio->work); +} + +void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) +{ + if (!lock_stripe_add(rbio)) + async_scrub_parity(rbio); +} diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index ea5d73bfdfbe..2b5d7977d83b 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -39,12 +39,23 @@ static inline int nr_data_stripes(struct map_lookup *map) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) +struct btrfs_raid_bio; +struct btrfs_device; + int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len, int mirror_num); + struct btrfs_bio *bbio, u64 stripe_len, + int mirror_num, int generic_io); int raid56_parity_write(struct btrfs_root *root, struct bio *bio, - struct btrfs_bio *bbio, u64 *raid_map, - u64 stripe_len); + struct btrfs_bio *bbio, u64 stripe_len); + +struct btrfs_raid_bio * +raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, + struct btrfs_bio *bbio, u64 stripe_len, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors); +void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, + struct page *page, u64 logical); +void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index b63ae20618fb..0e7beea92b4c 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -66,7 +66,6 @@ struct reada_extctl { struct reada_extent { u64 logical; struct btrfs_key top; - u32 blocksize; int err; struct list_head extctl; int refcnt; @@ -349,7 +348,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, blocksize = root->nodesize; re->logical = logical; - re->blocksize = blocksize; re->top = *top; INIT_LIST_HEAD(&re->extctl); spin_lock_init(&re->lock); @@ -463,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace); - kfree(bbio); + btrfs_put_bbio(bbio); return re; error: @@ -488,7 +486,7 @@ error: kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - kfree(bbio); + btrfs_put_bbio(bbio); kfree(re); return re_exist; } @@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, int mirror_num = 0; struct extent_buffer *eb = NULL; u64 logical; - u32 blocksize; int ret; int i; int need_kick = 0; @@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); return 0; } - dev->reada_next = re->logical + re->blocksize; + dev->reada_next = re->logical + fs_info->tree_root->nodesize; re->refcnt++; spin_unlock(&fs_info->reada_lock); @@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, } } logical = re->logical; - blocksize = re->blocksize; spin_lock(&re->lock); if (re->scheduled_for == NULL) { @@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, return 0; atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, - mirror_num, &eb); + ret = reada_tree_block_flagged(fs_info->extent_root, logical, + mirror_num, &eb); if (ret) __readahead_hook(fs_info->extent_root, NULL, logical, ret); else if (eb) @@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) break; printk(KERN_DEBUG " re: logical %llu size %u empty %d for %lld", - re->logical, re->blocksize, + re->logical, fs_info->tree_root->nodesize, list_empty(&re->extctl), re->scheduled_for ? re->scheduled_for->devid : -1); @@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) } printk(KERN_DEBUG "re: logical %llu size %u list empty %d for %lld", - re->logical, re->blocksize, list_empty(&re->extctl), + re->logical, fs_info->tree_root->nodesize, + list_empty(&re->extctl), re->scheduled_for ? re->scheduled_for->devid : -1); for (i = 0; i < re->nzones; ++i) { printk(KERN_CONT " zone %llu-%llu devs", diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 74257d6436ad..d83085381bcc 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc, } } -static int tree_block_processed(u64 bytenr, u32 blocksize, - struct reloc_control *rc) +static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { + u32 blocksize = rc->extent_root->nodesize; + if (test_range_bit(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) return 1; @@ -2965,8 +2966,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); if (!block->key_ready) - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid); + readahead_tree_block(rc->extent_root, block->bytenr); rb_node = rb_next(rb_node); } @@ -3353,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc, bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, SKINNY_METADATA); - if (tree_block_processed(bytenr, blocksize, rc)) + if (tree_block_processed(bytenr, rc)) return 0; if (tree_search(blocks, bytenr)) @@ -3611,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc, if (added) goto next; - if (!tree_block_processed(leaf->start, leaf->len, rc)) { + if (!tree_block_processed(leaf->start, rc)) { block = kmalloc(sizeof(*block), GFP_NOFS); if (!block) { err = -ENOMEM; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index efa083113827..ec57687c9a4d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -63,22 +63,31 @@ struct scrub_ctx; */ #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ +struct scrub_recover { + atomic_t refs; + struct btrfs_bio *bbio; + u64 map_length; +}; + struct scrub_page { struct scrub_block *sblock; struct page *page; struct btrfs_device *dev; + struct list_head list; u64 flags; /* extent flags */ u64 generation; u64 logical; u64 physical; u64 physical_for_dev_replace; - atomic_t ref_count; + atomic_t refs; struct { unsigned int mirror_num:8; unsigned int have_csum:1; unsigned int io_error:1; }; u8 csum[BTRFS_CSUM_SIZE]; + + struct scrub_recover *recover; }; struct scrub_bio { @@ -103,16 +112,54 @@ struct scrub_block { struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; - atomic_t ref_count; /* free mem on transition to zero */ + atomic_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; + struct scrub_parity *sparity; struct { unsigned int header_error:1; unsigned int checksum_error:1; unsigned int no_io_error_seen:1; unsigned int generation_error:1; /* also sets header_error */ + + /* The following is for the data used to check parity */ + /* It is for the data with checksum */ + unsigned int data_corrected:1; }; }; +/* Used for the chunks with parity stripe such RAID5/6 */ +struct scrub_parity { + struct scrub_ctx *sctx; + + struct btrfs_device *scrub_dev; + + u64 logic_start; + + u64 logic_end; + + int nsectors; + + int stripe_len; + + atomic_t refs; + + struct list_head spages; + + /* Work of parity check and repair */ + struct btrfs_work work; + + /* Mark the parity blocks which have data */ + unsigned long *dbitmap; + + /* + * Mark the parity blocks which have data, but errors happen when + * read data or check data + */ + unsigned long *ebitmap; + + unsigned long bitmap[0]; +}; + struct scrub_wr_ctx { struct scrub_bio *wr_curr_bio; struct btrfs_device *tgtdev; @@ -146,6 +193,15 @@ struct scrub_ctx { */ struct btrfs_scrub_progress stat; spinlock_t stat_lock; + + /* + * Use a ref counter to avoid use-after-free issues. Scrub workers + * decrement bios_in_flight and workers_pending and then do a wakeup + * on the list_wait wait queue. We must ensure the main scrub task + * doesn't free the scrub context before or while the workers are + * doing the wakeup() call. + */ + atomic_t refs; }; struct scrub_fixup_nodatasum { @@ -188,23 +244,19 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); -static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_fs_info *fs_info, - struct scrub_block *original_sblock, - u64 length, u64 logical, +static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, u8 *csum, u64 generation, - u16 csum_size); + u16 csum_size, int retry_failed_mirror); static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, const u8 *csum, u64 generation, u16 csum_size); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write); + struct scrub_block *sblock_good); static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, int page_num, int force_write); @@ -218,6 +270,8 @@ static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); static void scrub_page_get(struct scrub_page *spage); static void scrub_page_put(struct scrub_page *spage); +static void scrub_parity_get(struct scrub_parity *sparity); +static void scrub_parity_put(struct scrub_parity *sparity); static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -252,10 +306,12 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, static void copy_nocow_pages_worker(struct btrfs_work *work); static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); +static void scrub_put_ctx(struct scrub_ctx *sctx); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { + atomic_inc(&sctx->refs); atomic_inc(&sctx->bios_in_flight); } @@ -263,6 +319,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx) { atomic_dec(&sctx->bios_in_flight); wake_up(&sctx->list_wait); + scrub_put_ctx(sctx); } static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) @@ -296,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + atomic_inc(&sctx->refs); /* * increment scrubs_running to prevent cancel requests from * completing as long as a worker is running. we must also @@ -338,6 +396,7 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) atomic_dec(&sctx->workers_pending); wake_up(&fs_info->scrub_pause_wait); wake_up(&sctx->list_wait); + scrub_put_ctx(sctx); } static void scrub_free_csums(struct scrub_ctx *sctx) @@ -383,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) kfree(sctx); } +static void scrub_put_ctx(struct scrub_ctx *sctx) +{ + if (atomic_dec_and_test(&sctx->refs)) + scrub_free_ctx(sctx); +} + static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { @@ -407,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx = kzalloc(sizeof(*sctx), GFP_NOFS); if (!sctx) goto nomem; + atomic_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = pages_per_rd_bio; sctx->curr = -1; @@ -470,6 +536,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; struct btrfs_key root_key; + struct btrfs_key key; root_key.objectid = root; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -480,7 +547,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, goto err; } - ret = inode_item_info(inum, 0, local_root, swarn->path); + /* + * this makes the path point to (inum INODE_ITEM ioff) + */ + key.objectid = inum; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0); if (ret) { btrfs_release_path(swarn->path); goto err; @@ -790,6 +864,19 @@ out: scrub_pending_trans_workers_dec(sctx); } +static inline void scrub_get_recover(struct scrub_recover *recover) +{ + atomic_inc(&recover->refs); +} + +static inline void scrub_put_recover(struct scrub_recover *recover) +{ + if (atomic_dec_and_test(&recover->refs)) { + btrfs_put_bbio(recover->bbio); + kfree(recover); + } +} + /* * scrub_handle_errored_block gets called when either verification of the * pages failed or the bio failed to read, e.g. with EIO. In the latter @@ -891,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) } /* setup the context, map the logical blocks and alloc the pages */ - ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, - logical, sblocks_for_recheck); + ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; @@ -906,7 +992,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) /* build and submit the bios for the failed mirror, check checksums */ scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, - csum, generation, sctx->csum_size); + csum, generation, sctx->csum_size, 1); if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { @@ -920,6 +1006,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) */ spin_lock(&sctx->stat_lock); sctx->stat.unverified_errors++; + sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); if (sctx->is_dev_replace) @@ -965,9 +1052,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!is_metadata && !have_csum) { struct scrub_fixup_nodatasum *fixup_nodatasum; -nodatasum_case: WARN_ON(sctx->is_dev_replace); +nodatasum_case: + /* * !is_metadata and !have_csum, this means that the data * might not be COW'ed, that it might be modified @@ -1019,83 +1107,27 @@ nodatasum_case: /* build and submit the bios, check checksums */ scrub_recheck_block(fs_info, sblock_other, is_metadata, have_csum, csum, generation, - sctx->csum_size); + sctx->csum_size, 0); if (!sblock_other->header_error && !sblock_other->checksum_error && sblock_other->no_io_error_seen) { if (sctx->is_dev_replace) { scrub_write_block_to_dev_replace(sblock_other); + goto corrected_error; } else { - int force_write = is_metadata || have_csum; - ret = scrub_repair_block_from_good_copy( - sblock_bad, sblock_other, - force_write); + sblock_bad, sblock_other); + if (!ret) + goto corrected_error; } - if (0 == ret) - goto corrected_error; } } - /* - * for dev_replace, pick good pages and write to the target device. - */ - if (sctx->is_dev_replace) { - success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; - page_num++) { - int sub_success; - - sub_success = 0; - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = - sblocks_for_recheck + mirror_index; - struct scrub_page *page_other = - sblock_other->pagev[page_num]; - - if (!page_other->io_error) { - ret = scrub_write_page_to_dev_replace( - sblock_other, page_num); - if (ret == 0) { - /* succeeded for this page */ - sub_success = 1; - break; - } else { - btrfs_dev_replace_stats_inc( - &sctx->dev_root-> - fs_info->dev_replace. - num_write_errors); - } - } - } - - if (!sub_success) { - /* - * did not find a mirror to fetch the page - * from. scrub_write_page_to_dev_replace() - * handles this case (page->io_error), by - * filling the block with zeros before - * submitting the write request - */ - success = 0; - ret = scrub_write_page_to_dev_replace( - sblock_bad, page_num); - if (ret) - btrfs_dev_replace_stats_inc( - &sctx->dev_root->fs_info-> - dev_replace.num_write_errors); - } - } - - goto out; - } + if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace) + goto did_not_correct_error; /* - * for regular scrub, repair those pages that are errored. * In case of I/O errors in the area that is supposed to be * repaired, continue by picking good copies of those pages. * Select the good pages from mirrors to rewrite bad pages from @@ -1119,44 +1151,64 @@ nodatasum_case: * mirror, even if other 512 byte sectors in the same PAGE_SIZE * area are unreadable. */ - - /* can only fix I/O errors from here on */ - if (sblock_bad->no_io_error_seen) - goto did_not_correct_error; - success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + for (page_num = 0; page_num < sblock_bad->page_count; + page_num++) { struct scrub_page *page_bad = sblock_bad->pagev[page_num]; + struct scrub_block *sblock_other = NULL; - if (!page_bad->io_error) + /* skip no-io-error page in scrub */ + if (!page_bad->io_error && !sctx->is_dev_replace) continue; - for (mirror_index = 0; - mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; - mirror_index++) { - struct scrub_block *sblock_other = sblocks_for_recheck + - mirror_index; - struct scrub_page *page_other = sblock_other->pagev[ - page_num]; - - if (!page_other->io_error) { - ret = scrub_repair_page_from_good_copy( - sblock_bad, sblock_other, page_num, 0); - if (0 == ret) { - page_bad->io_error = 0; - break; /* succeeded for this page */ + /* try to find no-io-error page in mirrors */ + if (page_bad->io_error) { + for (mirror_index = 0; + mirror_index < BTRFS_MAX_MIRRORS && + sblocks_for_recheck[mirror_index].page_count > 0; + mirror_index++) { + if (!sblocks_for_recheck[mirror_index]. + pagev[page_num]->io_error) { + sblock_other = sblocks_for_recheck + + mirror_index; + break; } } + if (!sblock_other) + success = 0; } - if (page_bad->io_error) { - /* did not find a mirror to copy the page from */ - success = 0; + if (sctx->is_dev_replace) { + /* + * did not find a mirror to fetch the page + * from. scrub_write_page_to_dev_replace() + * handles this case (page->io_error), by + * filling the block with zeros before + * submitting the write request + */ + if (!sblock_other) + sblock_other = sblock_bad; + + if (scrub_write_page_to_dev_replace(sblock_other, + page_num) != 0) { + btrfs_dev_replace_stats_inc( + &sctx->dev_root-> + fs_info->dev_replace. + num_write_errors); + success = 0; + } + } else if (sblock_other) { + ret = scrub_repair_page_from_good_copy(sblock_bad, + sblock_other, + page_num, 0); + if (0 == ret) + page_bad->io_error = 0; + else + success = 0; } } - if (success) { + if (success && !sctx->is_dev_replace) { if (is_metadata || have_csum) { /* * need to verify the checksum now that all @@ -1169,7 +1221,7 @@ nodatasum_case: */ scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, csum, - generation, sctx->csum_size); + generation, sctx->csum_size, 1); if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) @@ -1180,6 +1232,7 @@ nodatasum_case: corrected_error: spin_lock(&sctx->stat_lock); sctx->stat.corrected_errors++; + sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); printk_ratelimited_in_rcu(KERN_ERR "BTRFS: fixed up error at logical %llu on dev %s\n", @@ -1201,11 +1254,18 @@ out: mirror_index++) { struct scrub_block *sblock = sblocks_for_recheck + mirror_index; + struct scrub_recover *recover; int page_index; for (page_index = 0; page_index < sblock->page_count; page_index++) { sblock->pagev[page_index]->sblock = NULL; + recover = sblock->pagev[page_index]->recover; + if (recover) { + scrub_put_recover(recover); + sblock->pagev[page_index]->recover = + NULL; + } scrub_page_put(sblock->pagev[page_index]); } } @@ -1215,48 +1275,105 @@ out: return 0; } -static int scrub_setup_recheck_block(struct scrub_ctx *sctx, - struct btrfs_fs_info *fs_info, - struct scrub_block *original_sblock, - u64 length, u64 logical, +static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio) +{ + if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + return 2; + else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + return 3; + else + return (int)bbio->num_stripes; +} + +static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, + u64 *raid_map, + u64 mapped_length, + int nstripes, int mirror, + int *stripe_index, + u64 *stripe_offset) +{ + int i; + + if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* RAID5/6 */ + for (i = 0; i < nstripes; i++) { + if (raid_map[i] == RAID6_Q_STRIPE || + raid_map[i] == RAID5_P_STRIPE) + continue; + + if (logical >= raid_map[i] && + logical < raid_map[i] + mapped_length) + break; + } + + *stripe_index = i; + *stripe_offset = logical - raid_map[i]; + } else { + /* The other RAID type */ + *stripe_index = mirror; + *stripe_offset = 0; + } +} + +static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck) { - int page_index; + struct scrub_ctx *sctx = original_sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + u64 length = original_sblock->page_count * PAGE_SIZE; + u64 logical = original_sblock->pagev[0]->logical; + struct scrub_recover *recover; + struct btrfs_bio *bbio; + u64 sublen; + u64 mapped_length; + u64 stripe_offset; + int stripe_index; + int page_index = 0; int mirror_index; + int nmirrors; int ret; /* - * note: the two members ref_count and outstanding_pages + * note: the two members refs and outstanding_pages * are not used (and not set) in the blocks that are used for * the recheck procedure */ - page_index = 0; while (length > 0) { - u64 sublen = min_t(u64, length, PAGE_SIZE); - u64 mapped_length = sublen; - struct btrfs_bio *bbio = NULL; + sublen = min_t(u64, length, PAGE_SIZE); + mapped_length = sublen; + bbio = NULL; /* * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ - ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, - &mapped_length, &bbio, 0); + ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, + &mapped_length, &bbio, 0, 1); if (ret || !bbio || mapped_length < sublen) { - kfree(bbio); + btrfs_put_bbio(bbio); return -EIO; } + recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); + if (!recover) { + btrfs_put_bbio(bbio); + return -ENOMEM; + } + + atomic_set(&recover->refs, 1); + recover->bbio = bbio; + recover->map_length = mapped_length; + BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); - for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; + + nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); + + for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; struct scrub_page *page; - if (mirror_index >= BTRFS_MAX_MIRRORS) - continue; - sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; page = kzalloc(sizeof(*page), GFP_NOFS); @@ -1265,26 +1382,41 @@ leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); - kfree(bbio); + scrub_put_recover(recover); return -ENOMEM; } scrub_page_get(page); sblock->pagev[page_index] = page; page->logical = logical; - page->physical = bbio->stripes[mirror_index].physical; + + scrub_stripe_index_and_offset(logical, + bbio->map_type, + bbio->raid_map, + mapped_length, + bbio->num_stripes - + bbio->num_tgtdevs, + mirror_index, + &stripe_index, + &stripe_offset); + page->physical = bbio->stripes[stripe_index].physical + + stripe_offset; + page->dev = bbio->stripes[stripe_index].dev; + BUG_ON(page_index >= original_sblock->page_count); page->physical_for_dev_replace = original_sblock->pagev[page_index]-> physical_for_dev_replace; /* for missing devices, dev->bdev is NULL */ - page->dev = bbio->stripes[mirror_index].dev; page->mirror_num = mirror_index + 1; sblock->page_count++; page->page = alloc_page(GFP_NOFS); if (!page->page) goto leave_nomem; + + scrub_get_recover(recover); + page->recover = recover; } - kfree(bbio); + scrub_put_recover(recover); length -= sublen; logical += sublen; page_index++; @@ -1293,6 +1425,51 @@ leave_nomem: return 0; } +struct scrub_bio_ret { + struct completion event; + int error; +}; + +static void scrub_bio_wait_endio(struct bio *bio, int error) +{ + struct scrub_bio_ret *ret = bio->bi_private; + + ret->error = error; + complete(&ret->event); +} + +static inline int scrub_is_page_on_raid56(struct scrub_page *page) +{ + return page->recover && + (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); +} + +static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, + struct bio *bio, + struct scrub_page *page) +{ + struct scrub_bio_ret done; + int ret; + + init_completion(&done.event); + done.error = 0; + bio->bi_iter.bi_sector = page->logical >> 9; + bio->bi_private = &done; + bio->bi_end_io = scrub_bio_wait_endio; + + ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, + page->recover->map_length, + page->mirror_num, 0); + if (ret) + return ret; + + wait_for_completion(&done.event); + if (done.error) + return -EIO; + + return 0; +} + /* * this function will check the on disk data for checksum errors, header * errors and read I/O errors. If any I/O errors happen, the exact pages @@ -1303,7 +1480,7 @@ leave_nomem: static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, u8 *csum, u64 generation, - u16 csum_size) + u16 csum_size, int retry_failed_mirror) { int page_num; @@ -1329,11 +1506,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, continue; } bio->bi_bdev = page->dev->bdev; - bio->bi_iter.bi_sector = page->physical >> 9; bio_add_page(bio, page->page, PAGE_SIZE, 0); - if (btrfsic_submit_bio_wait(READ, bio)) - sblock->no_io_error_seen = 0; + if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { + if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) + sblock->no_io_error_seen = 0; + } else { + bio->bi_iter.bi_sector = page->physical >> 9; + + if (btrfsic_submit_bio_wait(READ, bio)) + sblock->no_io_error_seen = 0; + } bio_put(bio); } @@ -1414,8 +1597,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, } static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int force_write) + struct scrub_block *sblock_good) { int page_num; int ret = 0; @@ -1425,8 +1607,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, ret_sub = scrub_repair_page_from_good_copy(sblock_bad, sblock_good, - page_num, - force_write); + page_num, 1); if (ret_sub) ret = ret_sub; } @@ -1486,6 +1667,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) { int page_num; + /* + * This block is used for the check of the parity on the source device, + * so the data needn't be written into the destination device. + */ + if (sblock->sparity) + return; + for (page_num = 0; page_num < sblock->page_count; page_num++) { int ret; @@ -1859,14 +2047,17 @@ static int scrub_checksum_super(struct scrub_block *sblock) static void scrub_block_get(struct scrub_block *sblock) { - atomic_inc(&sblock->ref_count); + atomic_inc(&sblock->refs); } static void scrub_block_put(struct scrub_block *sblock) { - if (atomic_dec_and_test(&sblock->ref_count)) { + if (atomic_dec_and_test(&sblock->refs)) { int i; + if (sblock->sparity) + scrub_parity_put(sblock->sparity); + for (i = 0; i < sblock->page_count; i++) scrub_page_put(sblock->pagev[i]); kfree(sblock); @@ -1875,12 +2066,12 @@ static void scrub_block_put(struct scrub_block *sblock) static void scrub_page_get(struct scrub_page *spage) { - atomic_inc(&spage->ref_count); + atomic_inc(&spage->refs); } static void scrub_page_put(struct scrub_page *spage) { - if (atomic_dec_and_test(&spage->ref_count)) { + if (atomic_dec_and_test(&spage->refs)) { if (spage->page) __free_page(spage->page); kfree(spage); @@ -2006,7 +2197,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->ref_count, 1); + atomic_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; @@ -2124,9 +2315,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) scrub_pending_bio_dec(sctx); } +static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, + unsigned long *bitmap, + u64 start, u64 len) +{ + int offset; + int nsectors; + int sectorsize = sparity->sctx->dev_root->sectorsize; + + if (len >= sparity->stripe_len) { + bitmap_set(bitmap, 0, sparity->nsectors); + return; + } + + start -= sparity->logic_start; + offset = (int)do_div(start, sparity->stripe_len); + offset /= sectorsize; + nsectors = (int)len / sectorsize; + + if (offset + nsectors <= sparity->nsectors) { + bitmap_set(bitmap, offset, nsectors); + return; + } + + bitmap_set(bitmap, offset, sparity->nsectors - offset); + bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); +} + +static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, + u64 start, u64 len) +{ + __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); +} + +static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, + u64 start, u64 len) +{ + __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); +} + static void scrub_block_complete(struct scrub_block *sblock) { + int corrupted = 0; + if (!sblock->no_io_error_seen) { + corrupted = 1; scrub_handle_errored_block(sblock); } else { /* @@ -2134,9 +2367,19 @@ static void scrub_block_complete(struct scrub_block *sblock) * dev replace case, otherwise write here in dev replace * case. */ - if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) + corrupted = scrub_checksum(sblock); + if (!corrupted && sblock->sctx->is_dev_replace) scrub_write_block_to_dev_replace(sblock); } + + if (sblock->sparity && corrupted && !sblock->data_corrected) { + u64 start = sblock->pagev[0]->logical; + u64 end = sblock->pagev[sblock->page_count - 1]->logical + + PAGE_SIZE; + + scrub_parity_mark_sectors_error(sblock->sparity, + start, end - start); + } } static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, @@ -2228,6 +2471,132 @@ behind_scrub_pages: return 0; } +static int scrub_pages_for_parity(struct scrub_parity *sparity, + u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, + u64 flags, u64 gen, int mirror_num, u8 *csum) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct scrub_block *sblock; + int index; + + sblock = kzalloc(sizeof(*sblock), GFP_NOFS); + if (!sblock) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + /* one ref inside this function, plus one for each page added to + * a bio later on */ + atomic_set(&sblock->refs, 1); + sblock->sctx = sctx; + sblock->no_io_error_seen = 1; + sblock->sparity = sparity; + scrub_parity_get(sparity); + + for (index = 0; len > 0; index++) { + struct scrub_page *spage; + u64 l = min_t(u64, len, PAGE_SIZE); + + spage = kzalloc(sizeof(*spage), GFP_NOFS); + if (!spage) { +leave_nomem: + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + scrub_block_put(sblock); + return -ENOMEM; + } + BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); + /* For scrub block */ + scrub_page_get(spage); + sblock->pagev[index] = spage; + /* For scrub parity */ + scrub_page_get(spage); + list_add_tail(&spage->list, &sparity->spages); + spage->sblock = sblock; + spage->dev = dev; + spage->flags = flags; + spage->generation = gen; + spage->logical = logical; + spage->physical = physical; + spage->mirror_num = mirror_num; + if (csum) { + spage->have_csum = 1; + memcpy(spage->csum, csum, sctx->csum_size); + } else { + spage->have_csum = 0; + } + sblock->page_count++; + spage->page = alloc_page(GFP_NOFS); + if (!spage->page) + goto leave_nomem; + len -= l; + logical += l; + physical += l; + } + + WARN_ON(sblock->page_count == 0); + for (index = 0; index < sblock->page_count; index++) { + struct scrub_page *spage = sblock->pagev[index]; + int ret; + + ret = scrub_add_page_to_rd_bio(sctx, spage); + if (ret) { + scrub_block_put(sblock); + return ret; + } + } + + /* last one frees, either here or in bio completion for last page */ + scrub_block_put(sblock); + return 0; +} + +static int scrub_extent_for_parity(struct scrub_parity *sparity, + u64 logical, u64 len, + u64 physical, struct btrfs_device *dev, + u64 flags, u64 gen, int mirror_num) +{ + struct scrub_ctx *sctx = sparity->sctx; + int ret; + u8 csum[BTRFS_CSUM_SIZE]; + u32 blocksize; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + blocksize = sctx->sectorsize; + } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + blocksize = sctx->nodesize; + } else { + blocksize = sctx->sectorsize; + WARN_ON(1); + } + + while (len) { + u64 l = min_t(u64, len, blocksize); + int have_csum = 0; + + if (flags & BTRFS_EXTENT_FLAG_DATA) { + /* push csums to sbio */ + have_csum = scrub_find_csum(sctx, logical, l, csum); + if (have_csum == 0) + goto skip; + } + ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, + flags, gen, mirror_num, + have_csum ? csum : NULL); + if (ret) + return ret; +skip: + len -= l; + logical += l; + physical += l; + } + return 0; +} + /* * Given a physical address, this will calculate it's * logical offset. if this is a parity stripe, it will return @@ -2236,7 +2605,8 @@ behind_scrub_pages: * return 0 if it is a data stripe, 1 means parity stripe. */ static int get_raid56_logic_offset(u64 physical, int num, - struct map_lookup *map, u64 *offset) + struct map_lookup *map, u64 *offset, + u64 *stripe_start) { int i; int j = 0; @@ -2247,6 +2617,9 @@ static int get_raid56_logic_offset(u64 physical, int num, last_offset = (physical - map->stripes[num].physical) * nr_data_stripes(map); + if (stripe_start) + *stripe_start = last_offset; + *offset = last_offset; for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; @@ -2269,13 +2642,327 @@ static int get_raid56_logic_offset(u64 physical, int num, return 1; } +static void scrub_free_parity(struct scrub_parity *sparity) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct scrub_page *curr, *next; + int nbits; + + nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); + if (nbits) { + spin_lock(&sctx->stat_lock); + sctx->stat.read_errors += nbits; + sctx->stat.uncorrectable_errors += nbits; + spin_unlock(&sctx->stat_lock); + } + + list_for_each_entry_safe(curr, next, &sparity->spages, list) { + list_del_init(&curr->list); + scrub_page_put(curr); + } + + kfree(sparity); +} + +static void scrub_parity_bio_endio(struct bio *bio, int error) +{ + struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + struct scrub_ctx *sctx = sparity->sctx; + + if (error) + bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + sparity->nsectors); + + scrub_free_parity(sparity); + scrub_pending_bio_dec(sctx); + bio_put(bio); +} + +static void scrub_parity_check_and_repair(struct scrub_parity *sparity) +{ + struct scrub_ctx *sctx = sparity->sctx; + struct bio *bio; + struct btrfs_raid_bio *rbio; + struct scrub_page *spage; + struct btrfs_bio *bbio = NULL; + u64 length; + int ret; + + if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, + sparity->nsectors)) + goto out; + + length = sparity->logic_end - sparity->logic_start + 1; + ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, + sparity->logic_start, + &length, &bbio, 0, 1); + if (ret || !bbio || !bbio->raid_map) + goto bbio_out; + + bio = btrfs_io_bio_alloc(GFP_NOFS, 0); + if (!bio) + goto bbio_out; + + bio->bi_iter.bi_sector = sparity->logic_start >> 9; + bio->bi_private = sparity; + bio->bi_end_io = scrub_parity_bio_endio; + + rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, + length, sparity->scrub_dev, + sparity->dbitmap, + sparity->nsectors); + if (!rbio) + goto rbio_out; + + list_for_each_entry(spage, &sparity->spages, list) + raid56_parity_add_scrub_pages(rbio, spage->page, + spage->logical); + + scrub_pending_bio_inc(sctx); + raid56_parity_submit_scrub_rbio(rbio); + return; + +rbio_out: + bio_put(bio); +bbio_out: + btrfs_put_bbio(bbio); + bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + sparity->nsectors); + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); +out: + scrub_free_parity(sparity); +} + +static inline int scrub_calc_parity_bitmap_len(int nsectors) +{ + return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); +} + +static void scrub_parity_get(struct scrub_parity *sparity) +{ + atomic_inc(&sparity->refs); +} + +static void scrub_parity_put(struct scrub_parity *sparity) +{ + if (!atomic_dec_and_test(&sparity->refs)) + return; + + scrub_parity_check_and_repair(sparity); +} + +static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, + struct map_lookup *map, + struct btrfs_device *sdev, + struct btrfs_path *path, + u64 logic_start, + u64 logic_end) +{ + struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_root *csum_root = fs_info->csum_root; + struct btrfs_extent_item *extent; + u64 flags; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + u64 generation; + u64 extent_logical; + u64 extent_physical; + u64 extent_len; + struct btrfs_device *extent_dev; + struct scrub_parity *sparity; + int nsectors; + int bitmap_len; + int extent_mirror_num; + int stop_loop = 0; + + nsectors = map->stripe_len / root->sectorsize; + bitmap_len = scrub_calc_parity_bitmap_len(nsectors); + sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, + GFP_NOFS); + if (!sparity) { + spin_lock(&sctx->stat_lock); + sctx->stat.malloc_errors++; + spin_unlock(&sctx->stat_lock); + return -ENOMEM; + } + + sparity->stripe_len = map->stripe_len; + sparity->nsectors = nsectors; + sparity->sctx = sctx; + sparity->scrub_dev = sdev; + sparity->logic_start = logic_start; + sparity->logic_end = logic_end; + atomic_set(&sparity->refs, 1); + INIT_LIST_HEAD(&sparity->spages); + sparity->dbitmap = sparity->bitmap; + sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; + + ret = 0; + while (logic_start < logic_end) { + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = logic_start; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = btrfs_previous_extent_item(root, path, 0); + if (ret < 0) + goto out; + if (ret > 0) { + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } + } + + stop_loop = 0; + while (1) { + u64 bytes; + + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + stop_loop = 1; + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.type == BTRFS_METADATA_ITEM_KEY) + bytes = root->nodesize; + else + bytes = key.offset; + + if (key.objectid + bytes <= logic_start) + goto next; + + if (key.type != BTRFS_EXTENT_ITEM_KEY && + key.type != BTRFS_METADATA_ITEM_KEY) + goto next; + + if (key.objectid > logic_end) { + stop_loop = 1; + break; + } + + while (key.objectid >= logic_start + map->stripe_len) + logic_start += map->stripe_len; + + extent = btrfs_item_ptr(l, slot, + struct btrfs_extent_item); + flags = btrfs_extent_flags(l, extent); + generation = btrfs_extent_generation(l, extent); + + if (key.objectid < logic_start && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { + btrfs_err(fs_info, + "scrub: tree block %llu spanning stripes, ignored. logical=%llu", + key.objectid, logic_start); + goto next; + } +again: + extent_logical = key.objectid; + extent_len = bytes; + + if (extent_logical < logic_start) { + extent_len -= logic_start - extent_logical; + extent_logical = logic_start; + } + + if (extent_logical + extent_len > + logic_start + map->stripe_len) + extent_len = logic_start + map->stripe_len - + extent_logical; + + scrub_parity_mark_sectors_data(sparity, extent_logical, + extent_len); + + scrub_remap_extent(fs_info, extent_logical, + extent_len, &extent_physical, + &extent_dev, + &extent_mirror_num); + + ret = btrfs_lookup_csums_range(csum_root, + extent_logical, + extent_logical + extent_len - 1, + &sctx->csum_list, 1); + if (ret) + goto out; + + ret = scrub_extent_for_parity(sparity, extent_logical, + extent_len, + extent_physical, + extent_dev, flags, + generation, + extent_mirror_num); + if (ret) + goto out; + + scrub_free_csums(sctx); + if (extent_logical + extent_len < + key.objectid + bytes) { + logic_start += map->stripe_len; + + if (logic_start >= logic_end) { + stop_loop = 1; + break; + } + + if (logic_start < key.objectid + bytes) { + cond_resched(); + goto again; + } + } +next: + path->slots[0]++; + } + + btrfs_release_path(path); + + if (stop_loop) + break; + + logic_start += map->stripe_len; + } +out: + if (ret < 0) + scrub_parity_mark_sectors_error(sparity, logic_start, + logic_end - logic_start + 1); + scrub_parity_put(sparity); + scrub_submit(sctx); + mutex_lock(&sctx->wr_ctx.wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_ctx.wr_lock); + + btrfs_release_path(path); + return ret < 0 ? ret : 0; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *scrub_dev, int num, u64 base, u64 length, int is_dev_replace) { - struct btrfs_path *path; + struct btrfs_path *path, *ppath; struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; @@ -2302,6 +2989,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 extent_logical; u64 extent_physical; u64 extent_len; + u64 stripe_logical; + u64 stripe_end; struct btrfs_device *extent_dev; int extent_mirror_num; int stop_loop = 0; @@ -2325,9 +3014,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { increment = map->stripe_len; mirror_num = num % map->num_stripes + 1; - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { - get_raid56_logic_offset(physical, num, map, &offset); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + get_raid56_logic_offset(physical, num, map, &offset, NULL); increment = map->stripe_len * nr_data_stripes(map); mirror_num = 1; } else { @@ -2339,6 +3027,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, if (!path) return -ENOMEM; + ppath = btrfs_alloc_path(); + if (!ppath) { + btrfs_free_path(path); + return -ENOMEM; + } + /* * work on commit root. The related disk blocks are static as * long as COW is applied. This means, it is save to rewrite @@ -2347,6 +3041,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, path->search_commit_root = 1; path->skip_locking = 1; + ppath->search_commit_root = 1; + ppath->skip_locking = 1; /* * trigger the readahead for extent tree csum tree and wait for * completion. During readahead, the scrub is officially paused @@ -2354,10 +3050,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ logical = base + offset; physical_end = physical + nstripes * map->stripe_len; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { get_raid56_logic_offset(physical_end, num, - map, &logic_end); + map, &logic_end, NULL); logic_end += base; } else { logic_end = logical + increment * nstripes; @@ -2401,13 +3096,20 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, ret = 0; while (physical < physical_end) { /* for raid56, we skip parity stripe */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = get_raid56_logic_offset(physical, num, - map, &logical); + map, &logical, &stripe_logical); logical += base; - if (ret) + if (ret) { + stripe_logical += base; + stripe_end = stripe_logical + increment - 1; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + ppath, stripe_logical, + stripe_end); + if (ret) + goto out; goto skip; + } } /* * canceled? @@ -2552,19 +3254,30 @@ again: scrub_free_csums(sctx); if (extent_logical + extent_len < key.objectid + bytes) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { /* * loop until we find next data stripe * or we have finished all stripes. */ - do { - physical += map->stripe_len; - ret = get_raid56_logic_offset( - physical, num, - map, &logical); - logical += base; - } while (physical < physical_end && ret); +loop: + physical += map->stripe_len; + ret = get_raid56_logic_offset(physical, + num, map, &logical, + &stripe_logical); + logical += base; + + if (ret && physical < physical_end) { + stripe_logical += base; + stripe_end = stripe_logical + + increment - 1; + ret = scrub_raid56_parity(sctx, + map, scrub_dev, ppath, + stripe_logical, + stripe_end); + if (ret) + goto out; + goto loop; + } } else { physical += map->stripe_len; logical += increment; @@ -2605,6 +3318,7 @@ out: blk_finish_plug(&plug); btrfs_free_path(path); + btrfs_free_path(ppath); return ret < 0 ? ret : 0; } @@ -3034,7 +3748,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, scrub_workers_put(fs_info); mutex_unlock(&fs_info->scrub_lock); - scrub_free_ctx(sctx); + scrub_put_ctx(sctx); return ret; } @@ -3140,14 +3854,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < extent_len || !bbio->stripes[0].dev->bdev) { - kfree(bbio); + btrfs_put_bbio(bbio); return; } *extent_physical = bbio->stripes[0].physical; *extent_mirror_num = bbio->mirror_num; *extent_dev = bbio->stripes[0].dev; - kfree(bbio); + btrfs_put_bbio(bbio); } static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, @@ -3310,6 +4024,50 @@ out: scrub_pending_trans_workers_dec(sctx); } +static int check_extent_to_block(struct inode *inode, u64 start, u64 len, + u64 logical) +{ + struct extent_state *cached_state = NULL; + struct btrfs_ordered_extent *ordered; + struct extent_io_tree *io_tree; + struct extent_map *em; + u64 lockstart = start, lockend = start + len - 1; + int ret = 0; + + io_tree = &BTRFS_I(inode)->io_tree; + + lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, lockstart, len); + if (ordered) { + btrfs_put_ordered_extent(ordered); + ret = 1; + goto out_unlock; + } + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_unlock; + } + + /* + * This extent does not actually cover the logical extent anymore, + * move on to the next inode. + */ + if (em->block_start > logical || + em->block_start + em->block_len < logical + len) { + free_extent_map(em); + ret = 1; + goto out_unlock; + } + free_extent_map(em); + +out_unlock: + unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, + GFP_NOFS); + return ret; +} + static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct scrub_copy_nocow_ctx *nocow_ctx) { @@ -3318,13 +4076,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct inode *inode; struct page *page; struct btrfs_root *local_root; - struct btrfs_ordered_extent *ordered; - struct extent_map *em; - struct extent_state *cached_state = NULL; struct extent_io_tree *io_tree; u64 physical_for_dev_replace; + u64 nocow_ctx_logical; u64 len = nocow_ctx->len; - u64 lockstart = offset, lockend = offset + len - 1; unsigned long index; int srcu_index; int ret = 0; @@ -3356,30 +4111,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; io_tree = &BTRFS_I(inode)->io_tree; + nocow_ctx_logical = nocow_ctx->logical; - lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, lockstart, len); - if (ordered) { - btrfs_put_ordered_extent(ordered); - goto out_unlock; - } - - em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out_unlock; - } - - /* - * This extent does not actually cover the logical extent anymore, - * move on to the next inode. - */ - if (em->block_start > nocow_ctx->logical || - em->block_start + em->block_len < nocow_ctx->logical + len) { - free_extent_map(em); - goto out_unlock; + ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical); + if (ret) { + ret = ret > 0 ? 0 : ret; + goto out; } - free_extent_map(em); while (len >= PAGE_CACHE_SIZE) { index = offset >> PAGE_CACHE_SHIFT; @@ -3396,7 +4134,7 @@ again: goto next_page; } else { ClearPageError(page); - err = extent_read_full_page_nolock(io_tree, page, + err = extent_read_full_page(io_tree, page, btrfs_get_extent, nocow_ctx->mirror_num); if (err) { @@ -3421,6 +4159,14 @@ again: goto next_page; } } + + ret = check_extent_to_block(inode, offset, len, + nocow_ctx_logical); + if (ret) { + ret = ret > 0 ? 0 : ret; + goto next_page; + } + err = write_page_nocow(nocow_ctx->sctx, physical_for_dev_replace, page); if (err) @@ -3434,12 +4180,10 @@ next_page: offset += PAGE_CACHE_SIZE; physical_for_dev_replace += PAGE_CACHE_SIZE; + nocow_ctx_logical += PAGE_CACHE_SIZE; len -= PAGE_CACHE_SIZE; } ret = COPY_COMPLETE; -out_unlock: - unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, - GFP_NOFS); out: mutex_unlock(&inode->i_mutex); iput(inode); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 874828dd0a86..d6033f540cc7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -230,6 +230,7 @@ struct pending_dir_move { u64 parent_ino; u64 ino; u64 gen; + bool is_orphan; struct list_head update_refs; }; @@ -2471,12 +2472,9 @@ verbose_printk("btrfs: send_utimes %llu\n", ino); if (ret < 0) goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, - btrfs_inode_atime(ii)); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, - btrfs_inode_mtime(ii)); - TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, - btrfs_inode_ctime(ii)); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); /* TODO Add otime support when the otime patches get into upstream */ ret = send_cmd(sctx); @@ -2987,7 +2985,8 @@ static int add_pending_dir_move(struct send_ctx *sctx, u64 ino_gen, u64 parent_ino, struct list_head *new_refs, - struct list_head *deleted_refs) + struct list_head *deleted_refs, + const bool is_orphan) { struct rb_node **p = &sctx->pending_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -3002,6 +3001,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, pm->parent_ino = parent_ino; pm->ino = ino; pm->gen = ino_gen; + pm->is_orphan = is_orphan; INIT_LIST_HEAD(&pm->list); INIT_LIST_HEAD(&pm->update_refs); RB_CLEAR_NODE(&pm->node); @@ -3134,16 +3134,20 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) rmdir_ino = dm->rmdir_ino; free_waiting_dir_move(sctx, dm); - ret = get_first_ref(sctx->parent_root, pm->ino, - &parent_ino, &parent_gen, name); - if (ret < 0) - goto out; - - ret = get_cur_path(sctx, parent_ino, parent_gen, - from_path); - if (ret < 0) - goto out; - ret = fs_path_add_path(from_path, name); + if (pm->is_orphan) { + ret = gen_unique_name(sctx, pm->ino, + pm->gen, from_path); + } else { + ret = get_first_ref(sctx->parent_root, pm->ino, + &parent_ino, &parent_gen, name); + if (ret < 0) + goto out; + ret = get_cur_path(sctx, parent_ino, parent_gen, + from_path); + if (ret < 0) + goto out; + ret = fs_path_add_path(from_path, name); + } if (ret < 0) goto out; @@ -3153,7 +3157,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) LIST_HEAD(deleted_refs); ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, - &pm->update_refs, &deleted_refs); + &pm->update_refs, &deleted_refs, + pm->is_orphan); if (ret < 0) goto out; if (rmdir_ino) { @@ -3286,6 +3291,127 @@ out: return ret; } +/* + * We might need to delay a directory rename even when no ancestor directory + * (in the send root) with a higher inode number than ours (sctx->cur_ino) was + * renamed. This happens when we rename a directory to the old name (the name + * in the parent root) of some other unrelated directory that got its rename + * delayed due to some ancestor with higher number that got renamed. + * + * Example: + * + * Parent snapshot: + * . (ino 256) + * |---- a/ (ino 257) + * | |---- file (ino 260) + * | + * |---- b/ (ino 258) + * |---- c/ (ino 259) + * + * Send snapshot: + * . (ino 256) + * |---- a/ (ino 258) + * |---- x/ (ino 259) + * |---- y/ (ino 257) + * |----- file (ino 260) + * + * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257 + * from 'a' to 'x/y' happening first, which in turn depends on the rename of + * inode 259 from 'c' to 'x'. So the order of rename commands the send stream + * must issue is: + * + * 1 - rename 259 from 'c' to 'x' + * 2 - rename 257 from 'a' to 'x/y' + * 3 - rename 258 from 'b' to 'a' + * + * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can + * be done right away and < 0 on error. + */ +static int wait_for_dest_dir_move(struct send_ctx *sctx, + struct recorded_ref *parent_ref, + const bool is_orphan) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key di_key; + struct btrfs_dir_item *di; + u64 left_gen; + u64 right_gen; + int ret = 0; + + if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) + return 0; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + key.objectid = parent_ref->dir; + key.type = BTRFS_DIR_ITEM_KEY; + key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len); + + ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + goto out; + } + + di = btrfs_match_dir_item_name(sctx->parent_root, path, + parent_ref->name, parent_ref->name_len); + if (!di) { + ret = 0; + goto out; + } + /* + * di_key.objectid has the number of the inode that has a dentry in the + * parent directory with the same name that sctx->cur_ino is being + * renamed to. We need to check if that inode is in the send root as + * well and if it is currently marked as an inode with a pending rename, + * if it is, we need to delay the rename of sctx->cur_ino as well, so + * that it happens after that other inode is renamed. + */ + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key); + if (di_key.type != BTRFS_INODE_ITEM_KEY) { + ret = 0; + goto out; + } + + ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, + &left_gen, NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, + &right_gen, NULL, NULL, NULL, NULL); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } + + /* Different inode, no need to delay the rename of sctx->cur_ino */ + if (right_gen != left_gen) { + ret = 0; + goto out; + } + + if (is_waiting_for_move(sctx, di_key.objectid)) { + ret = add_pending_dir_move(sctx, + sctx->cur_ino, + sctx->cur_inode_gen, + di_key.objectid, + &sctx->new_refs, + &sctx->deleted_refs, + is_orphan); + if (!ret) + ret = 1; + } +out: + btrfs_free_path(path); + return ret; +} + static int wait_for_parent_move(struct send_ctx *sctx, struct recorded_ref *parent_ref) { @@ -3352,7 +3478,8 @@ out: sctx->cur_inode_gen, ino, &sctx->new_refs, - &sctx->deleted_refs); + &sctx->deleted_refs, + false); if (!ret) ret = 1; } @@ -3375,6 +3502,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int did_overwrite = 0; int is_orphan = 0; u64 last_dir_ino_rm = 0; + bool can_rename = true; verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); @@ -3493,12 +3621,22 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { + ret = wait_for_dest_dir_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move * it depending on the inode mode. */ - if (is_orphan) { + if (is_orphan && can_rename) { ret = send_rename(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -3506,7 +3644,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); ret = fs_path_copy(valid_path, cur->full_path); if (ret < 0) goto out; - } else { + } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* * Dirs can't be linked, so move it. For moved @@ -5507,6 +5645,51 @@ out: return ret; } +/* + * If orphan cleanup did remove any orphans from a root, it means the tree + * was modified and therefore the commit root is not the same as the current + * root anymore. This is a problem, because send uses the commit root and + * therefore can see inode items that don't exist in the current root anymore, + * and for example make calls to btrfs_iget, which will do tree lookups based + * on the current root and not on the commit root. Those lookups will fail, + * returning a -ESTALE error, and making send fail with that error. So make + * sure a send does not see any orphans we have just removed, and that it will + * see the same inodes regardless of whether a transaction commit happened + * before it started (meaning that the commit root will be the same as the + * current root) or not. + */ +static int ensure_commit_roots_uptodate(struct send_ctx *sctx) +{ + int i; + struct btrfs_trans_handle *trans = NULL; + +again: + if (sctx->parent_root && + sctx->parent_root->node != sctx->parent_root->commit_root) + goto commit_trans; + + for (i = 0; i < sctx->clone_roots_cnt; i++) + if (sctx->clone_roots[i].root->node != + sctx->clone_roots[i].root->commit_root) + goto commit_trans; + + if (trans) + return btrfs_end_transaction(trans, sctx->send_root); + + return 0; + +commit_trans: + /* Use any root, all fs roots will get their commit roots updated. */ + if (!trans) { + trans = btrfs_join_transaction(sctx->send_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + goto again; + } + + return btrfs_commit_transaction(trans, sctx->send_root); +} + static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) { spin_lock(&root->root_item_lock); @@ -5728,6 +5911,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) NULL); sort_clone_roots = 1; + ret = ensure_commit_roots_uptodate(sctx); + if (ret) + goto out; + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 54bd91ece35b..05fef198ff94 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ - if (!trans->blocks_used) { + if (!trans->blocks_used && list_empty(&trans->new_bgs)) { const char *errstr; errstr = btrfs_decode_error(errno); @@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) "disabling disk space caching"); break; case Opt_inode_cache: - btrfs_set_and_info(root, CHANGE_INODE_CACHE, + btrfs_set_pending_and_info(info, INODE_MAP_CACHE, "enabling inode map caching"); break; case Opt_noinode_cache: - btrfs_clear_and_info(root, CHANGE_INODE_CACHE, + btrfs_clear_pending_and_info(info, INODE_MAP_CACHE, "disabling inode map caching"); break; case Opt_clear_cache: @@ -993,9 +993,27 @@ int btrfs_sync_fs(struct super_block *sb, int wait) trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { /* no transaction, don't bother */ - if (PTR_ERR(trans) == -ENOENT) - return 0; - return PTR_ERR(trans); + if (PTR_ERR(trans) == -ENOENT) { + /* + * Exit unless we have some pending changes + * that need to go through commit + */ + if (fs_info->pending_changes == 0) + return 0; + /* + * A non-blocking test if the fs is frozen. We must not + * start a new transaction here otherwise a deadlock + * happens. The pending operations are delayed to the + * next commit after thawing. + */ + if (__sb_start_write(sb, SB_FREEZE_WRITE, false)) + __sb_end_write(sb, SB_FREEZE_WRITE); + else + return 0; + trans = btrfs_start_transaction(root, 0); + } + if (IS_ERR(trans)) + return PTR_ERR(trans); } return btrfs_commit_transaction(trans, root); } @@ -1644,8 +1662,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int i = 0, nr_devices; int ret; + /* + * We aren't under the device list lock, so this is racey-ish, but good + * enough for our purposes. + */ nr_devices = fs_info->fs_devices->open_devices; - BUG_ON(!nr_devices); + if (!nr_devices) { + smp_mb(); + nr_devices = fs_info->fs_devices->open_devices; + ASSERT(nr_devices); + if (!nr_devices) { + *free_bytes = 0; + return 0; + } + } devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), GFP_NOFS); @@ -1670,11 +1700,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) else min_stripe_size = BTRFS_STRIPE_LEN; - list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (fs_info->alloc_start) + mutex_lock(&fs_devices->device_list_mutex); + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (!device->in_fs_metadata || !device->bdev || device->is_tgtdev_for_dev_replace) continue; + if (i >= nr_devices) + break; + avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ @@ -1689,24 +1725,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) skip_space = 1024 * 1024; /* user can set the offset in fs_info->alloc_start. */ - if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= - device->total_bytes) + if (fs_info->alloc_start && + fs_info->alloc_start + BTRFS_STRIPE_LEN <= + device->total_bytes) { + rcu_read_unlock(); skip_space = max(fs_info->alloc_start, skip_space); - /* - * btrfs can not use the free space in [0, skip_space - 1], - * we must subtract it from the total. In order to implement - * it, we account the used space in this range first. - */ - ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, - &used_space); - if (ret) { - kfree(devices_info); - return ret; - } + /* + * btrfs can not use the free space in + * [0, skip_space - 1], we must subtract it from the + * total. In order to implement it, we account the used + * space in this range first. + */ + ret = btrfs_account_dev_extents_size(device, 0, + skip_space - 1, + &used_space); + if (ret) { + kfree(devices_info); + mutex_unlock(&fs_devices->device_list_mutex); + return ret; + } - /* calc the free space in [0, skip_space - 1] */ - skip_space -= used_space; + rcu_read_lock(); + + /* calc the free space in [0, skip_space - 1] */ + skip_space -= used_space; + } /* * we can use the free space in [0, skip_space - 1], subtract @@ -1725,6 +1769,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) i++; } + rcu_read_unlock(); + if (fs_info->alloc_start) + mutex_unlock(&fs_devices->device_list_mutex); nr_devices = i; @@ -1787,8 +1834,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * holding chunk_muext to avoid allocating new chunks, holding * device_list_mutex to avoid the device being removed */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); - mutex_lock(&fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { @@ -1824,17 +1869,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bfree -= block_rsv->size >> bits; spin_unlock(&block_rsv->lock); - buf->f_bavail = total_free_data; + buf->f_bavail = div_u64(total_free_data, factor); ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); - if (ret) { - mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + if (ret) return ret; - } buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; - mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bsize = dentry->d_sb->s_blocksize; @@ -1918,11 +1958,6 @@ static int btrfs_freeze(struct super_block *sb) return btrfs_commit_transaction(trans, root); } -static int btrfs_unfreeze(struct super_block *sb) -{ - return 0; -} - static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); @@ -1971,7 +2006,6 @@ static const struct super_operations btrfs_super_ops = { .statfs = btrfs_statfs, .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, - .unfreeze_fs = btrfs_unfreeze, }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b2e7bb4393f6..94edb0a2a026 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, { struct btrfs_fs_info *fs_info; struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); - struct btrfs_trans_handle *trans; u64 features, set, clear; unsigned long val; int ret; @@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, btrfs_info(fs_info, "%s %s feature flag", val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); - trans = btrfs_start_transaction(fs_info->fs_root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - spin_lock(&fs_info->super_lock); features = get_features(fs_info, fa->feature_set); if (val) @@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, set_features(fs_info, fa->feature_set, features); spin_unlock(&fs_info->super_lock); - ret = btrfs_commit_transaction(trans, fs_info->fs_root); - if (ret) - return ret; + /* + * We don't want to do full transaction commit from inside sysfs + */ + btrfs_set_pending(fs_info, COMMIT); + wake_up_process(fs_info->transaction_kthread); return count; } @@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj, const char *buf, size_t len) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - struct btrfs_trans_handle *trans; - struct btrfs_root *root = fs_info->fs_root; - int ret; size_t p_len; if (fs_info->sb->s_flags & MS_RDONLY) @@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj, if (p_len >= BTRFS_LABEL_SIZE) return -EINVAL; - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - spin_lock(&root->fs_info->super_lock); + spin_lock(&fs_info->super_lock); memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); memcpy(fs_info->super_copy->label, buf, p_len); - spin_unlock(&root->fs_info->super_lock); - ret = btrfs_commit_transaction(trans, root); + spin_unlock(&fs_info->super_lock); - if (!ret) - return len; + /* + * We don't want to do full transaction commit from inside sysfs + */ + btrfs_set_pending(fs_info, COMMIT); + wake_up_process(fs_info->transaction_kthread); - return ret; + return len; } BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); @@ -741,10 +733,18 @@ int btrfs_init_sysfs(void) ret = btrfs_init_debugfs(); if (ret) - return ret; + goto out1; init_feature_attrs(); ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); + if (ret) + goto out2; + + return 0; +out2: + debugfs_remove_recursive(btrfs_debugfs_root_dentry); +out1: + kset_unregister(btrfs_kset); return ret; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index cc286ce97d1e..f51963a8f929 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -53,7 +53,7 @@ static int test_btrfs_split_item(void) return -ENOMEM; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096); + path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096); if (!eb) { test_msg("Could not allocate dummy buffer\n"); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 7e99c2f98dd0..9e9f2368177d 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -258,8 +258,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, - (unsigned long)-1, GFP_NOFS); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS); out: if (locked_page) page_cache_release(locked_page); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 3ae0f5b8bb80..054fc0d97131 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void) goto out; } - root->node = alloc_dummy_extent_buffer(0, 4096); + root->node = alloc_dummy_extent_buffer(NULL, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -843,7 +843,7 @@ static int test_hole_first(void) goto out; } - root->node = alloc_dummy_extent_buffer(0, 4096); + root->node = alloc_dummy_extent_buffer(NULL, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -911,6 +911,197 @@ out: return ret; } +static int test_extent_accounting(void) +{ + struct inode *inode = NULL; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; + + inode = btrfs_new_test_inode(); + if (!inode) { + test_msg("Couldn't allocate inode\n"); + return ret; + } + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate root\n"); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + goto out; + } + + BTRFS_I(inode)->root = root; + btrfs_test_inode_set_ops(inode); + + /* [BTRFS_MAX_EXTENT_SIZE] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 1) { + ret = -EINVAL; + test_msg("Miscount, wanted 1, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, + BTRFS_MAX_EXTENT_SIZE + 4095, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + EXTENT_DELALLOC | EXTENT_DIRTY | + EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE][4K] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 2) { + ret = -EINVAL; + test_msg("Miscount, wanted 2, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K] + * + * I'm artificially adding 2 to outstanding_extents because in the + * buffered IO case we'd add things up as we go, but I don't feel like + * doing that here, this isn't the interesting case we want to test. + */ + BTRFS_I(inode)->outstanding_extents += 2; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192, + (BTRFS_MAX_EXTENT_SIZE << 1) + 12287, + NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 4) { + ret = -EINVAL; + test_msg("Miscount, wanted 4, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* + * Refill the hole again just for good measure, because I thought it + * might fail and I'd rather satisfy my paranoia at this point. + */ + BTRFS_I(inode)->outstanding_extents++; + ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096, + BTRFS_MAX_EXTENT_SIZE+8191, NULL); + if (ret) { + test_msg("btrfs_set_extent_delalloc returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents != 3) { + ret = -EINVAL; + test_msg("Miscount, wanted 3, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + + /* Empty */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + if (ret) { + test_msg("clear_extent_bit returned %d\n", ret); + goto out; + } + if (BTRFS_I(inode)->outstanding_extents) { + ret = -EINVAL; + test_msg("Miscount, wanted 0, got %u\n", + BTRFS_I(inode)->outstanding_extents); + goto out; + } + ret = 0; +out: + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, + NULL, GFP_NOFS); + iput(inode); + btrfs_free_dummy_root(root); + return ret; +} + int btrfs_test_inodes(void) { int ret; @@ -924,5 +1115,9 @@ int btrfs_test_inodes(void) if (ret) return ret; test_msg("Running hole first btrfs_get_extent test\n"); - return test_hole_first(); + ret = test_hole_first(); + if (ret) + return ret; + test_msg("Running outstanding_extents tests\n"); + return test_extent_accounting(); } diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ec3dcb202357..73f299ebdabb 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -404,12 +404,22 @@ int btrfs_test_qgroups(void) ret = -ENOMEM; goto out; } + /* We are using this root as our extent root */ + root->fs_info->extent_root = root; + + /* + * Some of the paths we test assume we have a filled out fs_info, so we + * just need to add the root in there so we don't panic. + */ + root->fs_info->tree_root = root; + root->fs_info->quota_root = root; + root->fs_info->quota_enabled = 1; /* * Can't use bytenr 0, some things freak out * *cough*backref walking code*cough* */ - root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); + root->node = alloc_test_extent_buffer(root->fs_info, 4096); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -448,17 +458,6 @@ int btrfs_test_qgroups(void) goto out; } - /* We are using this root as our extent root */ - root->fs_info->extent_root = root; - - /* - * Some of the paths we test assume we have a filled out fs_info, so we - * just need to addt he root in there so we don't panic. - */ - root->fs_info->tree_root = root; - root->fs_info->quota_root = root; - root->fs_info->quota_enabled = 1; - test_msg("Running qgroup tests\n"); ret = test_no_shared_qgroup(root); if (ret) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dcaae3616728..8be4278e25e8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } +static void clear_btree_io_tree(struct extent_io_tree *tree) +{ + spin_lock(&tree->lock); + while (!RB_EMPTY_ROOT(&tree->state)) { + struct rb_node *node; + struct extent_state *state; + + node = rb_first(&tree->state); + state = rb_entry(node, struct extent_state, rb_node); + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + /* + * btree io trees aren't supposed to have tasks waiting for + * changes in the flags of extent states ever. + */ + ASSERT(!waitqueue_active(&state->wq)); + free_extent_state(state); + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } + spin_unlock(&tree->lock); +} + static noinline void switch_commit_roots(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info) { @@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, root->commit_root = btrfs_root_node(root); if (is_fstree(root->objectid)) btrfs_unpin_free_ino(root); + clear_btree_io_tree(&root->dirty_log_pages); } up_write(&fs_info->commit_root_sem); } @@ -193,6 +220,7 @@ loop: * commit the transaction. */ atomic_set(&cur_trans->use_count, 2); + cur_trans->have_free_bgs = 0; cur_trans->start_time = get_seconds(); cur_trans->delayed_refs.href_root = RB_ROOT; @@ -220,6 +248,9 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); + INIT_LIST_HEAD(&cur_trans->pending_ordered); + INIT_LIST_HEAD(&cur_trans->dirty_bgs); + spin_lock_init(&cur_trans->dirty_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, fs_info->btree_inode->i_mapping); @@ -488,6 +519,7 @@ again: h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); + INIT_LIST_HEAD(&h->ordered); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && @@ -719,6 +751,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + if (!list_empty(&trans->ordered)) { + spin_lock(&info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); + spin_unlock(&info->trans_lock); + } + trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = @@ -828,17 +866,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { - convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - mark, &cached_state, GFP_NOFS); - cached_state = NULL; - err = filemap_fdatawrite_range(mapping, start, end); + bool wait_writeback = false; + + err = convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state, GFP_NOFS); + /* + * convert_extent_bit can return -ENOMEM, which is most of the + * time a temporary error. So when it happens, ignore the error + * and wait for writeback of this range to finish - because we + * failed to set the bit EXTENT_NEED_WAIT for the range, a call + * to btrfs_wait_marked_extents() would not know that writeback + * for this range started and therefore wouldn't wait for it to + * finish - we don't want to commit a superblock that points to + * btree nodes/leafs for which writeback hasn't finished yet + * (and without errors). + * We cleanup any entries left in the io tree when committing + * the transaction (through clear_btree_io_tree()). + */ + if (err == -ENOMEM) { + err = 0; + wait_writeback = true; + } + if (!err) + err = filemap_fdatawrite_range(mapping, start, end); if (err) werr = err; + else if (wait_writeback) + werr = filemap_fdatawait_range(mapping, start, end); + free_extent_state(cached_state); + cached_state = NULL; cond_resched(); start = end + 1; } - if (err) - werr = err; return werr; } @@ -862,11 +922,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, while (!find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { - clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, - 0, 0, &cached_state, GFP_NOFS); - err = filemap_fdatawait_range(mapping, start, end); + /* + * Ignore -ENOMEM errors returned by clear_extent_bit(). + * When committing the transaction, we'll remove any entries + * left in the io tree. For a log commit, we don't remove them + * after committing the log because the tree can be accessed + * concurrently - we do it only at transaction commit time when + * it's safe to do it (through clear_btree_io_tree()). + */ + err = clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + 0, 0, &cached_state, GFP_NOFS); + if (err == -ENOMEM) + err = 0; + if (!err) + err = filemap_fdatawait_range(mapping, start, end); if (err) werr = err; + free_extent_state(cached_state); + cached_state = NULL; cond_resched(); start = end + 1; } @@ -919,17 +993,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return 0; } -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, +static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - if (!trans || !trans->transaction) { - struct inode *btree_inode; - btree_inode = root->fs_info->btree_inode; - return filemap_write_and_wait(btree_inode->i_mapping); - } - return btrfs_write_and_wait_marked_extents(root, + int ret; + + ret = btrfs_write_and_wait_marked_extents(root, &trans->transaction->dirty_pages, EXTENT_DIRTY); + clear_btree_io_tree(&trans->transaction->dirty_pages); + + return ret; } /* @@ -951,7 +1025,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root = root->fs_info->tree_root; old_root_used = btrfs_root_used(&root->root_item); - btrfs_write_dirty_block_groups(trans, root); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); @@ -967,9 +1040,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, return ret; old_root_used = btrfs_root_used(&root->root_item); - ret = btrfs_write_dirty_block_groups(trans, root); - if (ret) - return ret; } return 0; @@ -986,14 +1056,11 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *next; struct extent_buffer *eb; int ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) - return ret; - eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb); @@ -1017,15 +1084,20 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, if (ret) return ret; + ret = btrfs_setup_space_cache(trans, root); + if (ret) + return ret; + /* run_qgroups might have added some more refs */ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); if (ret) return ret; - +again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); + clear_bit(BTRFS_ROOT_DIRTY, &root->state); if (root != fs_info->extent_root) list_add_tail(&root->dirty_list, @@ -1033,8 +1105,23 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, ret = update_cowonly_root(trans, root); if (ret) return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; } + while (!list_empty(dirty_bgs)) { + ret = btrfs_write_dirty_block_groups(trans, root); + if (ret) + return ret; + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) + return ret; + } + + if (!list_empty(&fs_info->dirty_cowonly_roots)) + goto again; + list_add_tail(&fs_info->extent_root->dirty_list, &trans->transaction->switch_commits); btrfs_after_dev_replace_commit(fs_info); @@ -1652,6 +1739,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) btrfs_wait_ordered_roots(fs_info, -1); } +static inline void +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_ordered_extent *ordered; + + spin_lock(&fs_info->trans_lock); + while (!list_empty(&cur_trans->pending_ordered)) { + ordered = list_first_entry(&cur_trans->pending_ordered, + struct btrfs_ordered_extent, + trans_list); + list_del_init(&ordered->trans_list); + spin_unlock(&fs_info->trans_lock); + + wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &ordered->flags)); + btrfs_put_ordered_extent(ordered); + spin_lock(&fs_info->trans_lock); + } + spin_unlock(&fs_info->trans_lock); +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -1702,6 +1811,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); + list_splice(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1709,6 +1819,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_for_commit(root, cur_trans); + if (unlikely(cur_trans->aborted)) + ret = cur_trans->aborted; + btrfs_put_transaction(cur_trans); return ret; @@ -1754,6 +1867,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_wait_delalloc_flush(root->fs_info); + btrfs_wait_pending_ordered(cur_trans, root->fs_info); + btrfs_scrub_pause(root); /* * Ok now we need to make sure to block out any other joins while we @@ -1842,13 +1957,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } /* - * Since the transaction is done, we should set the inode map cache flag - * before any other comming transaction. + * Since the transaction is done, we can apply the pending changes + * before the next transaction. */ - if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) - btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); - else - btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); + btrfs_apply_pending_changes(root->fs_info); /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots @@ -1890,6 +2002,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, switch_commit_roots(cur_trans, root->fs_info); assert_qgroups_uptodate(trans); + ASSERT(list_empty(&cur_trans->dirty_bgs)); update_super_roots(root); btrfs_set_super_log_root(root->fs_info->super_copy, 0); @@ -1933,6 +2046,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); + if (cur_trans->have_free_bgs) + btrfs_clear_space_info_full(root->fs_info); + root->fs_info->last_trans_committed = cur_trans->transid; /* * We needn't acquire the lock here because there is no other task @@ -2019,3 +2135,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) return (ret < 0) ? 0 : 1; } + +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) +{ + unsigned long prev; + unsigned long bit; + + prev = xchg(&fs_info->pending_changes, 0); + if (!prev) + return; + + bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; + if (prev & bit) + btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; + if (prev & bit) + btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); + prev &= ~bit; + + bit = 1 << BTRFS_PENDING_COMMIT; + if (prev & bit) + btrfs_debug(fs_info, "pending commit done"); + prev &= ~bit; + + if (prev) + btrfs_warn(fs_info, + "unknown pending changes left 0x%lx, ignoring", prev); +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d8f40e1a5d2d..937050a2b68e 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,6 +47,11 @@ struct btrfs_transaction { atomic_t num_writers; atomic_t use_count; + /* + * true if there is free bgs operations in this transaction + */ + int have_free_bgs; + /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; struct list_head list; @@ -56,7 +61,10 @@ struct btrfs_transaction { wait_queue_head_t commit_wait; struct list_head pending_snapshots; struct list_head pending_chunks; + struct list_head pending_ordered; struct list_head switch_commits; + struct list_head dirty_bgs; + spinlock_t dirty_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; }; @@ -105,6 +113,7 @@ struct btrfs_trans_handle { */ struct btrfs_root *root; struct seq_list delayed_ref_elem; + struct list_head ordered; struct list_head qgroup_ref_list; struct list_head new_bgs; }; @@ -145,8 +154,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); -int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); @@ -170,4 +177,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); +void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); + #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 286213cec861..c5b8ba37f88e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, insert: btrfs_release_path(path); /* try to insert the key into the destination tree */ + path->skip_release_on_error = 1; ret = btrfs_insert_empty_item(trans, root, path, key, item_size); + path->skip_release_on_error = 0; /* make sure any existing item is the correct size */ - if (ret == -EEXIST) { + if (ret == -EEXIST || ret == -EOVERFLOW) { u32 found_size; found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); @@ -488,8 +490,20 @@ insert: src_item = (struct btrfs_inode_item *)src_ptr; dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(eb, src_item) == 0) + if (btrfs_inode_generation(eb, src_item) == 0) { + struct extent_buffer *dst_eb = path->nodes[0]; + + if (S_ISREG(btrfs_inode_mode(eb, src_item)) && + S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { + struct btrfs_map_token token; + u64 ino_size = btrfs_inode_size(eb, src_item); + + btrfs_init_map_token(&token); + btrfs_set_token_inode_size(dst_eb, dst_item, + ino_size, &token); + } goto no_copy; + } if (overwrite_root && S_ISDIR(btrfs_inode_mode(eb, src_item)) && @@ -844,7 +858,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - char *name, int namelen) + const char *name, int namelen) { struct btrfs_path *path; struct btrfs_inode_ref *ref; @@ -998,7 +1012,7 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - extref = (struct btrfs_inode_extref *)base + cur_offset; + extref = (struct btrfs_inode_extref *)(base + cur_offset); victim_name_len = btrfs_inode_extref_name_len(leaf, extref); @@ -1254,13 +1268,14 @@ out: } static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset) + struct btrfs_root *root, u64 ino) { int ret; - ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, - offset, BTRFS_ORPHAN_ITEM_KEY, NULL); - if (ret > 0) - ret = btrfs_insert_orphan_item(trans, root, offset); + + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; + return ret; } @@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root, leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + cur_offset = 0; while (cur_offset < item_size) { extref = (struct btrfs_inode_extref *) (ptr + cur_offset); @@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root, } btrfs_release_path(path); - if (ret < 0) + if (ret < 0 && ret != -ENOENT) return ret; return nlink; } @@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, nlink = ret; ret = count_inode_extrefs(root, inode, path); - if (ret == -ENOENT) - ret = 0; - if (ret < 0) goto out; @@ -1557,6 +1570,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, } /* + * Return true if an inode reference exists in the log for the given name, + * inode and parent inode. + */ +static bool name_in_log_ref(struct btrfs_root *log_root, + const char *name, const int name_len, + const u64 dirid, const u64 ino) +{ + struct btrfs_key search_key; + + search_key.objectid = ino; + search_key.type = BTRFS_INODE_REF_KEY; + search_key.offset = dirid; + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) + return true; + + search_key.type = BTRFS_INODE_EXTREF_KEY; + search_key.offset = btrfs_extref_hash(dirid, name, name_len); + if (backref_in_log(log_root, &search_key, dirid, name, name_len)) + return true; + + return false; +} + +/* * take a single entry in a log directory item and replay it into * the subvolume. * @@ -1666,10 +1703,17 @@ out: return ret; insert: + if (name_in_log_ref(root->log_root, name, name_len, + key->objectid, log_key.objectid)) { + /* The dentry will be added later. */ + ret = 0; + update_size = false; + goto out; + } btrfs_release_path(path); ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - if (ret && ret != -ENOENT) + if (ret && ret != -ENOENT && ret != -EEXIST) goto out; update_size = false; ret = 0; @@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); - next = btrfs_find_create_tree_block(root, bytenr, blocksize); + next = btrfs_find_create_tree_block(root, bytenr); if (!next) return -ENOMEM; @@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, mutex_unlock(&root->log_mutex); if (atomic_read(&root->log_writers)) schedule(); - mutex_lock(&root->log_mutex); finish_wait(&root->log_writer_wait, &wait); + mutex_lock(&root->log_mutex); } } @@ -2591,6 +2635,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { + blk_finish_plug(&plug); mutex_unlock(&log_root_tree->log_mutex); ret = root_log_ctx.log_ret; goto out; @@ -2599,12 +2644,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); - btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, + mark); + btrfs_wait_logged_extents(trans, log, log_transid); wait_log_commit(trans, log_root_tree, root_log_ctx.log_transid); - btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); - ret = root_log_ctx.log_ret; + if (!ret) + ret = root_log_ctx.log_ret; goto out; } ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); @@ -2641,11 +2688,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } - btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); - btrfs_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages, - EXTENT_NEW | EXTENT_DIRTY); - btrfs_wait_logged_extents(log, log_transid); + ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + if (!ret) + ret = btrfs_wait_marked_extents(log_root_tree, + &log_root_tree->dirty_log_pages, + EXTENT_NEW | EXTENT_DIRTY); + if (ret) { + btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_free_logged_extents(log, log_transid); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } + btrfs_wait_logged_extents(trans, log, log_transid); btrfs_set_super_log_root(root->fs_info->super_for_commit, log_root_tree->node->start); @@ -3209,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, - struct inode *inode, int log_inode_only) + struct inode *inode, int log_inode_only, + u64 logged_isize) { struct btrfs_map_token token; @@ -3222,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * to say 'update this inode with these values' */ btrfs_set_token_inode_generation(leaf, item, 0, &token); - btrfs_set_token_inode_size(leaf, item, 0, &token); + btrfs_set_token_inode_size(leaf, item, logged_isize, &token); } else { btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, @@ -3235,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_sec(leaf, &item->atime, inode->i_atime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), + btrfs_set_token_timespec_nsec(leaf, &item->atime, inode->i_atime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_sec(leaf, &item->mtime, inode->i_mtime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), + btrfs_set_token_timespec_nsec(leaf, &item->mtime, inode->i_mtime.tv_nsec, &token); - btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_sec(leaf, &item->ctime, inode->i_ctime.tv_sec, &token); - btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), + btrfs_set_token_timespec_nsec(leaf, &item->ctime, inode->i_ctime.tv_nsec, &token); btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), @@ -3274,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, return ret; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); + fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0); btrfs_release_path(path); return 0; } @@ -3283,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *dst_path, struct btrfs_path *src_path, u64 *last_extent, - int start_slot, int nr, int inode_only) + int start_slot, int nr, int inode_only, + u64 logged_isize) { unsigned long src_offset; unsigned long dst_offset; @@ -3340,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, dst_path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, - inode, inode_only == LOG_INODE_EXISTS); + inode, inode_only == LOG_INODE_EXISTS, + logged_isize); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, src_offset, ins_sizes[i]); @@ -3626,6 +3683,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { + /* + * Clear the AS_EIO/AS_ENOSPC flags from the inode's + * i_mapping flags, so that the next fsync won't get + * an outdated io error too. + */ + btrfs_inode_check_errors(inode); *ordered_io_error = true; break; } @@ -3766,7 +3829,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, &token); if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) btrfs_set_token_file_extent_type(leaf, fi, @@ -3886,6 +3949,33 @@ process: return ret; } +static int logged_inode_size(struct btrfs_root *log, struct inode *inode, + struct btrfs_path *path, u64 *size_ret) +{ + struct btrfs_key key; + int ret; + + key.objectid = btrfs_ino(inode); + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *size_ret = i_size_read(inode); + } else { + struct btrfs_inode_item *item; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + *size_ret = btrfs_inode_size(path->nodes[0], item); + } + + btrfs_release_path(path); + return 0; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -3923,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool fast_search = false; u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 logged_isize = 0; path = btrfs_alloc_path(); if (!path) @@ -3950,20 +4041,27 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, max_key.type = (u8)-1; max_key.offset = (u64)-1; - /* Only run delayed items if we are a dir or a new file */ + /* + * Only run delayed items if we are a dir or a new file. + * Otherwise commit the delayed inode only, which is needed in + * order for the log replay code to mark inodes for link count + * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). + */ if (S_ISDIR(inode->i_mode) || - BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { + BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) ret = btrfs_commit_inode_delayed_items(trans, inode); - if (ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; - } + else + ret = btrfs_commit_inode_delayed_inode(inode); + + if (ret) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + return ret; } mutex_lock(&BTRFS_I(inode)->log_mutex); - btrfs_get_logged_extents(inode, &logged_list); + btrfs_get_logged_extents(inode, &logged_list, start, end); /* * a brute force approach to making sure we get the most uptodate @@ -3972,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - if (inode_only == LOG_INODE_EXISTS) - max_key_type = BTRFS_XATTR_ITEM_KEY; + if (inode_only == LOG_INODE_EXISTS) { + max_key_type = BTRFS_INODE_EXTREF_KEY; + max_key.type = max_key_type; + } ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { - if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); - ret = btrfs_truncate_inode_items(trans, log, - inode, 0, 0); - } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags) || + if (inode_only == LOG_INODE_EXISTS) { + /* + * Make sure the new inode item we write to the log has + * the same isize as the current one (if it exists). + * This is necessary to prevent data loss after log + * replay, and also to prevent doing a wrong expanding + * truncate - for e.g. create file, write 4K into offset + * 0, fsync, write 4K into offset 4096, add hard link, + * fsync some other file (to sync log), power fail - if + * we use the inode's current i_size, after log replay + * we get a 8Kb file, with the last 4Kb extent as a hole + * (zeroes), as if an expanding truncate happened, + * instead of getting a file of 4Kb only. + */ + err = logged_inode_size(log, inode, path, + &logged_isize); + if (err) + goto out_unlock; + } + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + if (inode_only == LOG_INODE_EXISTS) { + max_key.type = BTRFS_INODE_EXTREF_KEY; + ret = drop_objectid_items(trans, log, path, ino, + max_key.type); + } else { + clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); + ret = btrfs_truncate_inode_items(trans, log, + inode, 0, 0); + } + } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags) || inode_only == LOG_INODE_EXISTS) { - if (inode_only == LOG_INODE_ALL) + if (inode_only == LOG_INODE_ALL) { + clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags); fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; + } else { + max_key.type = BTRFS_INODE_EXTREF_KEY; + } ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { @@ -4031,7 +4163,8 @@ again: } ret = copy_items(trans, inode, dst_path, path, &last_extent, - ins_start_slot, ins_nr, inode_only); + ins_start_slot, ins_nr, inode_only, + logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4055,7 +4188,7 @@ next_slot: if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, &last_extent, ins_start_slot, - ins_nr, inode_only); + ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4076,7 +4209,8 @@ next_slot: } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, &last_extent, - ins_start_slot, ins_nr, inode_only); + ins_start_slot, ins_nr, inode_only, + logged_isize); if (ret < 0) { err = ret; goto out_unlock; @@ -4089,6 +4223,21 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (fast_search) { + /* + * Some ordered extents started by fsync might have completed + * before we collected the ordered extents in logged_list, which + * means they're gone, not in our logged_list nor in the inode's + * ordered tree. We want the application/user space to know an + * error happened while attempting to persist file data so that + * it can take proper action. If such error happened, we leave + * without writing to the log tree and the fsync must report the + * file data write error and not commit the current transaction. + */ + err = btrfs_inode_check_errors(inode); + if (err) { + ctx->io_err = err; + goto out_unlock; + } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, &logged_list, ctx); if (ret) { @@ -4242,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; + const struct dentry * const first_parent = parent; + const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > + last_committed); sb = inode->i_sb; @@ -4297,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - inode_only = LOG_INODE_EXISTS; while (1) { if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; @@ -4306,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (root != BTRFS_I(inode)->root) break; + /* + * On unlink we must make sure our immediate parent directory + * inode is fully logged. This is to prevent leaving dangling + * directory index entries and a wrong directory inode's i_size. + * Not doing so can result in a directory being impossible to + * delete after log replay (rmdir will always fail with error + * -ENOTEMPTY). + */ + if (did_unlink && parent == first_parent) + inode_only = LOG_INODE_ALL; + else + inode_only = LOG_INODE_EXISTS; + if (BTRFS_I(inode)->generation > - root->fs_info->last_trans_committed) { + root->fs_info->last_trans_committed || + inode_only == LOG_INODE_ALL) { ret = btrfs_log_inode(trans, root, inode, inode_only, 0, LLONG_MAX, ctx); if (ret) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d47289c715c8..8222f6f74147 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); -static void lock_chunks(struct btrfs_root *root) -{ - mutex_lock(&root->fs_info->chunk_mutex); -} - -static void unlock_chunks(struct btrfs_root *root) -{ - mutex_unlock(&root->fs_info->chunk_mutex); -} - static struct btrfs_fs_devices *__alloc_fs_devices(void) { struct btrfs_fs_devices *fs_devs; @@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, u64 *start, u64 len) { struct extent_map *em; + struct list_head *search_list = &trans->transaction->pending_chunks; int ret = 0; - list_for_each_entry(em, &trans->transaction->pending_chunks, list) { +again: + list_for_each_entry(em, search_list, list) { struct map_lookup *map; int i; @@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, ret = 1; } } + if (search_list == &trans->transaction->pending_chunks) { + search_list = &trans->root->fs_info->pinned_chunks; + goto again; + } return ret; } @@ -1314,6 +1310,8 @@ again: if (ret) { btrfs_error(root->fs_info, ret, "Failed to remove dev extent item"); + } else { + trans->transaction->have_free_bgs = 1; } out: btrfs_free_path(path); @@ -1489,7 +1487,7 @@ static void update_dev_time(char *path_name) struct file *filp; filp = filp_open(path_name, O_RDWR, 0); - if (!filp) + if (IS_ERR(filp)) return; file_update_time(filp); filp_close(filp, NULL); @@ -1800,8 +1798,8 @@ error_undo: goto error_brelse; } -void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev) +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) { struct btrfs_fs_devices *fs_devices; @@ -1829,6 +1827,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->bdev) fs_devices->open_devices--; +} + +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev) +{ + struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; call_rcu(&srcdev->rcu, free_device); @@ -2647,18 +2651,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } } - ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); goto out; } - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* once for the tree */ - free_extent_map(em); out: /* once for us */ free_extent_map(em); @@ -4200,7 +4198,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { - if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) + if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) return; btrfs_set_fs_incompat(info, RAID56); @@ -4505,6 +4503,8 @@ error_del_extent: free_extent_map(em); /* One for the tree reference */ free_extent_map(em); + /* One for the pending_chunks list reference */ + free_extent_map(em); error: kfree(devices_info); return ret; @@ -4805,10 +4805,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); - } free_extent_map(em); return len; } @@ -4828,8 +4826,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, BUG_ON(em->start > logical || em->start + em->len < logical); map = (struct map_lookup *)em->bdev; - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; free_extent_map(em); return ret; @@ -4878,7 +4875,7 @@ static inline int parity_smaller(u64 a, u64 b) } /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ -static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) +static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) { struct btrfs_bio_stripe s; int i; @@ -4887,24 +4884,64 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) while (again) { again = 0; - for (i = 0; i < bbio->num_stripes - 1; i++) { - if (parity_smaller(raid_map[i], raid_map[i+1])) { + for (i = 0; i < num_stripes - 1; i++) { + if (parity_smaller(bbio->raid_map[i], + bbio->raid_map[i+1])) { s = bbio->stripes[i]; - l = raid_map[i]; + l = bbio->raid_map[i]; bbio->stripes[i] = bbio->stripes[i+1]; - raid_map[i] = raid_map[i+1]; + bbio->raid_map[i] = bbio->raid_map[i+1]; bbio->stripes[i+1] = s; - raid_map[i+1] = l; + bbio->raid_map[i+1] = l; + again = 1; } } } } +static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) +{ + struct btrfs_bio *bbio = kzalloc( + /* the size of the btrfs_bio */ + sizeof(struct btrfs_bio) + + /* plus the variable array for the stripes */ + sizeof(struct btrfs_bio_stripe) * (total_stripes) + + /* plus the variable array for the tgt dev */ + sizeof(int) * (real_stripes) + + /* + * plus the raid_map, which includes both the tgt dev + * and the stripes + */ + sizeof(u64) * (total_stripes), + GFP_NOFS); + if (!bbio) + return NULL; + + atomic_set(&bbio->error, 0); + atomic_set(&bbio->refs, 1); + + return bbio; +} + +void btrfs_get_bbio(struct btrfs_bio *bbio) +{ + WARN_ON(!atomic_read(&bbio->refs)); + atomic_inc(&bbio->refs); +} + +void btrfs_put_bbio(struct btrfs_bio *bbio) +{ + if (!bbio) + return; + if (atomic_dec_and_test(&bbio->refs)) + kfree(bbio); +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, - int mirror_num, u64 **raid_map_ret) + int mirror_num, int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -4917,12 +4954,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr_orig; u64 stripe_nr_end; u64 stripe_len; - u64 *raid_map = NULL; int stripe_index; int i; int ret = 0; int num_stripes; int max_errors = 0; + int tgtdev_indexes = 0; struct btrfs_bio *bbio = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; @@ -4967,7 +5004,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_offset = offset - stripe_offset; /* if we're here for raid56, we need to know the stripe aligned start */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); raid56_full_stripe_start = offset; @@ -4980,8 +5017,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & REQ_DISCARD) { /* we don't discard raid56 yet */ - if (map->type & - (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; goto out; } @@ -4991,7 +5027,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* For writes to RAID[56], allow a full stripeset across all disks. For other RAID types and for RAID[56] reads, just allow a single stripe (on a single disk). */ - if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && (rw & REQ_WRITE)) { max_len = stripe_len * nr_data_stripes(map) - (offset - raid56_full_stripe_start); @@ -5038,7 +5074,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 physical_of_found = 0; ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0, NULL); + logical, &tmp_length, &tmp_bbio, 0, 0); if (ret) { WARN_ON(tmp_bbio != NULL); goto out; @@ -5052,7 +5088,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * is not left of the left cursor */ ret = -EIO; - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); goto out; } @@ -5087,11 +5123,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } else { WARN_ON(1); ret = -EIO; - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); goto out; } - kfree(tmp_bbio); + btrfs_put_bbio(tmp_bbio); } else if (mirror_num > map->num_stripes) { mirror_num = 0; } @@ -5157,50 +5193,24 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, mirror_num = stripe_index - old_stripe_index + 1; } - } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { - u64 tmp; - - if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) - && raid_map_ret) { - int i, rot; - + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + if (need_raid_map && + ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = raid56_full_stripe_start; - do_div(stripe_nr, stripe_len); - - stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + do_div(stripe_nr, stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; max_errors = nr_parity_stripes(map); - raid_map = kmalloc_array(num_stripes, sizeof(u64), - GFP_NOFS); - if (!raid_map) { - ret = -ENOMEM; - goto out; - } - - /* Work out the disk rotation on this stripe-set */ - tmp = stripe_nr; - rot = do_div(tmp, num_stripes); - - /* Fill in the logical address of each stripe */ - tmp = stripe_nr * nr_data_stripes(map); - for (i = 0; i < nr_data_stripes(map); i++) - raid_map[(i+rot) % num_stripes] = - em->start + (tmp + i) * map->stripe_len; - - raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; - if (map->type & BTRFS_BLOCK_GROUP_RAID6) - raid_map[(i+rot+1) % num_stripes] = - RAID6_Q_STRIPE; - *length = map->stripe_len; stripe_index = 0; stripe_offset = 0; } else { + u64 tmp; + /* * Mirror #0 or #1 means the original data block. * Mirror #2 is RAID5 parity block. @@ -5235,14 +5245,44 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_alloc_stripes <<= 1; if (rw & REQ_GET_READ_MIRRORS) num_alloc_stripes++; + tgtdev_indexes = num_stripes; } - bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); + + bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); if (!bbio) { - kfree(raid_map); ret = -ENOMEM; goto out; } - atomic_set(&bbio->error, 0); + if (dev_replace_is_ongoing) + bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); + + /* build raid_map */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && + need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || + mirror_num > 1)) { + u64 tmp; + int i, rot; + + bbio->raid_map = (u64 *)((void *)bbio->stripes + + sizeof(struct btrfs_bio_stripe) * + num_alloc_stripes + + sizeof(int) * tgtdev_indexes); + + /* Work out the disk rotation on this stripe-set */ + tmp = stripe_nr; + rot = do_div(tmp, num_stripes); + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + for (i = 0; i < nr_data_stripes(map); i++) + bbio->raid_map[(i+rot) % num_stripes] = + em->start + (tmp + i) * map->stripe_len; + + bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + bbio->raid_map[(i+rot+1) % num_stripes] = + RAID6_Q_STRIPE; + } if (rw & REQ_DISCARD) { int factor = 0; @@ -5327,6 +5367,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) max_errors = btrfs_chunk_max_errors(map); + if (bbio->raid_map) + sort_parity_stripes(bbio, num_stripes); + + tgtdev_indexes = 0; if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && dev_replace->tgtdev != NULL) { int index_where_to_add; @@ -5355,8 +5399,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, new->physical = old->physical; new->length = old->length; new->dev = dev_replace->tgtdev; + bbio->tgtdev_map[i] = index_where_to_add; index_where_to_add++; max_errors++; + tgtdev_indexes++; } } num_stripes = index_where_to_add; @@ -5402,16 +5448,20 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, tgtdev_stripe->length = bbio->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; + tgtdev_indexes++; num_stripes++; } } } *bbio_ret = bbio; + bbio->map_type = map->type; bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; + bbio->num_tgtdevs = tgtdev_indexes; /* * this is the case that REQ_READ && dev_replace_is_ongoing && @@ -5424,10 +5474,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, bbio->stripes[0].physical = physical_to_patch_in_first_stripe; bbio->mirror_num = map->num_stripes + 1; } - if (raid_map) { - sort_parity_stripes(bbio, raid_map); - *raid_map_ret = raid_map; - } out: if (dev_replace_is_ongoing) btrfs_dev_replace_unlock(dev_replace); @@ -5440,7 +5486,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, struct btrfs_bio **bbio_ret, int mirror_num) { return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, - mirror_num, NULL); + mirror_num, 0); +} + +/* For Scrub/replace */ +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, int mirror_num, + int need_raid_map) +{ + return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, + mirror_num, need_raid_map); } int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, @@ -5482,8 +5538,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, do_div(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) do_div(length, map->num_stripes); - else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6)) { + else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { do_div(length, nr_data_stripes(map)); rmap_len = map->stripe_len * nr_data_stripes(map); } @@ -5536,7 +5591,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e bio_endio_nodec(bio, err); else bio_endio(bio, err); - kfree(bbio); + btrfs_put_bbio(bbio); } static void btrfs_end_bio(struct bio *bio, int err) @@ -5779,7 +5834,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 logical = (u64)bio->bi_iter.bi_sector << 9; u64 length = 0; u64 map_length; - u64 *raid_map = NULL; int ret; int dev_nr = 0; int total_devs = 1; @@ -5790,7 +5844,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, btrfs_bio_counter_inc_blocked(root->fs_info); ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, - mirror_num, &raid_map); + mirror_num, 1); if (ret) { btrfs_bio_counter_dec(root->fs_info); return ret; @@ -5803,21 +5857,16 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); - if (raid_map) { + if (bbio->raid_map) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { - ret = raid56_parity_write(root, bio, bbio, - raid_map, map_length); + ret = raid56_parity_write(root, bio, bbio, map_length); } else { - ret = raid56_parity_recover(root, bio, bbio, - raid_map, map_length, - mirror_num); + ret = raid56_parity_recover(root, bio, bbio, map_length, + mirror_num, 1); } - /* - * FIXME, replace dosen't support raid56 yet, please fix - * it in the future. - */ + btrfs_bio_counter_dec(root->fs_info); return ret; } @@ -6212,17 +6261,22 @@ int btrfs_read_sys_array(struct btrfs_root *root) struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; - u8 *ptr; - unsigned long sb_ptr; + u8 *array_ptr; + unsigned long sb_array_offset; int ret = 0; u32 num_stripes; u32 array_size; u32 len = 0; - u32 cur; + u32 cur_offset; struct btrfs_key key; - sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, - BTRFS_SUPER_INFO_SIZE); + ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize); + /* + * This will create extent buffer of nodesize, superblock size is + * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will + * overallocate but we can keep it as-is, only the first page is used. + */ + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); @@ -6245,35 +6299,56 @@ int btrfs_read_sys_array(struct btrfs_root *root) write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); - ptr = super_copy->sys_chunk_array; - sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); - cur = 0; + array_ptr = super_copy->sys_chunk_array; + sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array); + cur_offset = 0; + + while (cur_offset < array_size) { + disk_key = (struct btrfs_disk_key *)array_ptr; + len = sizeof(*disk_key); + if (cur_offset + len > array_size) + goto out_short_read; - while (cur < array_size) { - disk_key = (struct btrfs_disk_key *)ptr; btrfs_disk_key_to_cpu(&key, disk_key); - len = sizeof(*disk_key); ptr += len; - sb_ptr += len; - cur += len; + array_ptr += len; + sb_array_offset += len; + cur_offset += len; if (key.type == BTRFS_CHUNK_ITEM_KEY) { - chunk = (struct btrfs_chunk *)sb_ptr; + chunk = (struct btrfs_chunk *)sb_array_offset; + /* + * At least one btrfs_chunk with one stripe must be + * present, exact stripe count check comes afterwards + */ + len = btrfs_chunk_item_size(1); + if (cur_offset + len > array_size) + goto out_short_read; + + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + if (cur_offset + len > array_size) + goto out_short_read; + ret = read_one_chunk(root, &key, sb, chunk); if (ret) break; - num_stripes = btrfs_chunk_num_stripes(sb, chunk); - len = btrfs_chunk_item_size(num_stripes); } else { ret = -EIO; break; } - ptr += len; - sb_ptr += len; - cur += len; + array_ptr += len; + sb_array_offset += len; + cur_offset += len; } free_extent_buffer(sb); return ret; + +out_short_read: + printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n", + len, cur_offset); + free_extent_buffer(sb); + return -EIO; } int btrfs_read_chunk_tree(struct btrfs_root *root) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 08980fa23039..83069dec6898 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -292,11 +292,13 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); -#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 +#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) struct btrfs_bio { + atomic_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; + u64 map_type; /* get from map_lookup->type */ bio_end_io_t *end_io; struct bio *orig_bio; unsigned long flags; @@ -305,6 +307,14 @@ struct btrfs_bio { int max_errors; int num_stripes; int mirror_num; + int num_tgtdevs; + int *tgtdev_map; + /* + * logical block numbers for the start of each stripe + * The last one or two are p/q. These are sorted, + * so raid_map[0] is the start of our full stripe + */ + u64 *raid_map; struct btrfs_bio_stripe stripes[]; }; @@ -386,13 +396,15 @@ struct btrfs_balance_control { int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); - -#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ - (sizeof(struct btrfs_bio_stripe) * (n))) - +void btrfs_get_bbio(struct btrfs_bio *bbio); +void btrfs_put_bbio(struct btrfs_bio *bbio); int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, int mirror_num, + int need_raid_map); int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); @@ -448,8 +460,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); -void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); +void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev); void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, @@ -513,4 +527,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, struct btrfs_transaction *transaction); + +static inline void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static inline void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + + #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index dcf20131fbe4..883b93623bc5 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -29,6 +29,7 @@ #include "xattr.h" #include "disk-io.h" #include "props.h" +#include "locking.h" ssize_t __btrfs_getxattr(struct inode *inode, const char *name, @@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags) { - struct btrfs_dir_item *di; + struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; size_t name_len = strlen(name); @@ -103,84 +104,123 @@ static int do_setxattr(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->skip_release_on_error = 1; - if (flags & XATTR_REPLACE) { - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, - name_len, -1); - if (IS_ERR(di)) { + if (!value) { + di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), + name, name_len, -1); + if (!di && (flags & XATTR_REPLACE)) + ret = -ENODATA; + else if (IS_ERR(di)) ret = PTR_ERR(di); - goto out; - } else if (!di) { + else if (di) + ret = btrfs_delete_one_dir_name(trans, root, path, di); + goto out; + } + + /* + * For a replace we can't just do the insert blindly. + * Do a lookup first (read-only btrfs_search_slot), and return if xattr + * doesn't exist. If it exists, fall down below to the insert/replace + * path - we can't race with a concurrent xattr delete, because the VFS + * locks the inode's i_mutex before calling setxattr or removexattr. + */ + if (flags & XATTR_REPLACE) { + ASSERT(mutex_is_locked(&inode->i_mutex)); + di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), + name, name_len, 0); + if (!di) ret = -ENODATA; - goto out; - } - ret = btrfs_delete_one_dir_name(trans, root, path, di); + else if (IS_ERR(di)) + ret = PTR_ERR(di); if (ret) goto out; btrfs_release_path(path); + di = NULL; + } + ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), + name, name_len, value, size); + if (ret == -EOVERFLOW) { /* - * remove the attribute + * We have an existing item in a leaf, split_leaf couldn't + * expand it. That item might have or not a dir_item that + * matches our target xattr, so lets check. */ - if (!value) - goto out; - } else { - di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), - name, name_len, 0); - if (IS_ERR(di)) { - ret = PTR_ERR(di); + ret = 0; + btrfs_assert_tree_locked(path->nodes[0]); + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (!di && !(flags & XATTR_REPLACE)) { + ret = -ENOSPC; goto out; } - if (!di && !value) - goto out; - btrfs_release_path(path); + } else if (ret == -EEXIST) { + ret = 0; + di = btrfs_match_dir_item_name(root, path, name, name_len); + ASSERT(di); /* logic error */ + } else if (ret) { + goto out; } -again: - ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), - name, name_len, value, size); - /* - * If we're setting an xattr to a new value but the new value is say - * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting - * back from split_leaf. This is because it thinks we'll be extending - * the existing item size, but we're asking for enough space to add the - * item itself. So if we get EOVERFLOW just set ret to EEXIST and let - * the rest of the function figure it out. - */ - if (ret == -EOVERFLOW) + if (di && (flags & XATTR_CREATE)) { ret = -EEXIST; + goto out; + } - if (ret == -EEXIST) { - if (flags & XATTR_CREATE) - goto out; + if (di) { /* - * We can't use the path we already have since we won't have the - * proper locking for a delete, so release the path and - * re-lookup to delete the thing. + * We're doing a replace, and it must be atomic, that is, at + * any point in time we have either the old or the new xattr + * value in the tree. We don't want readers (getxattr and + * listxattrs) to miss a value, this is specially important + * for ACLs. */ - btrfs_release_path(path); - di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), - name, name_len, -1); - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } else if (!di) { - /* Shouldn't happen but just in case... */ - btrfs_release_path(path); - goto again; + const int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + const u16 old_data_len = btrfs_dir_data_len(leaf, di); + const u32 item_size = btrfs_item_size_nr(leaf, slot); + const u32 data_size = sizeof(*di) + name_len + size; + struct btrfs_item *item; + unsigned long data_ptr; + char *ptr; + + if (size > old_data_len) { + if (btrfs_leaf_free_space(root, leaf) < + (size - old_data_len)) { + ret = -ENOSPC; + goto out; + } } - ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; + if (old_data_len + name_len + sizeof(*di) == item_size) { + /* No other xattrs packed in the same leaf item. */ + if (size > old_data_len) + btrfs_extend_item(root, path, + size - old_data_len); + else if (size < old_data_len) + btrfs_truncate_item(root, path, data_size, 1); + } else { + /* There are other xattrs packed in the same item. */ + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto out; + btrfs_extend_item(root, path, data_size); + } + item = btrfs_item_nr(slot); + ptr = btrfs_item_ptr(leaf, slot, char); + ptr += btrfs_item_size(leaf, item) - data_size; + di = (struct btrfs_dir_item *)ptr; + btrfs_set_dir_data_len(leaf, di, size); + data_ptr = ((unsigned long)(di + 1)) + name_len; + write_extent_buffer(leaf, value, data_ptr, size); + btrfs_mark_buffer_dirty(leaf); + } else { /* - * We have a value to set, so go back and try to insert it now. + * Insert, and we had space for the xattr, so path->slots[0] is + * where our xattr dir_item is and btrfs_insert_xattr_item() + * filled it. */ - if (value) { - btrfs_release_path(path); - goto again; - } } out: btrfs_free_path(path); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 759fa4e2de8f..fb22fd8d8fb8 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -299,6 +299,8 @@ done: zlib_inflateEnd(&workspace->strm); if (data_in) kunmap(pages_in[page_in_index]); + if (!ret) + btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset); return ret; } @@ -310,10 +312,14 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; int wbits = MAX_WBITS; - unsigned long bytes_left = destlen; + unsigned long bytes_left; unsigned long total_out = 0; + unsigned long pg_offset = 0; char *kaddr; + destlen = min_t(unsigned long, destlen, PAGE_SIZE); + bytes_left = destlen; + workspace->strm.next_in = data_in; workspace->strm.avail_in = srclen; workspace->strm.total_in = 0; @@ -341,7 +347,6 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, unsigned long buf_start; unsigned long buf_offset; unsigned long bytes; - unsigned long pg_offset = 0; ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); if (ret != Z_OK && ret != Z_STREAM_END) @@ -384,6 +389,17 @@ next: ret = 0; zlib_inflateEnd(&workspace->strm); + + /* + * this should only happen if zlib returned fewer bytes than we + * expected. btrfs_get_block is responsible for zeroing from the + * end of the inline extent (destlen) to the end of the page + */ + if (pg_offset < destlen) { + kaddr = kmap_atomic(dest_page); + memset(kaddr + pg_offset, 0, destlen - pg_offset); + kunmap_atomic(kaddr); + } return ret; } diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index ce1b115dcc28..f601def05bdf 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -574,7 +574,7 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args) /* extract the directory dentry from the cwd */ get_fs_pwd(current->fs, &path); - if (!S_ISDIR(path.dentry->d_inode->i_mode)) + if (!d_can_lookup(path.dentry)) goto notdir; cachefiles_begin_secure(cache, &saved_cred); @@ -646,7 +646,7 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args) /* extract the directory dentry from the cwd */ get_fs_pwd(current->fs, &path); - if (!S_ISDIR(path.dentry->d_inode->i_mode)) + if (!d_can_lookup(path.dentry)) goto notdir; cachefiles_begin_secure(cache, &saved_cred); diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 1c7293c3a93a..232426214fdd 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -437,7 +437,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) if (!object->backer) return -ENOBUFS; - ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + ASSERT(d_is_reg(object->backer)); fscache_set_store_limit(&object->fscache, ni_size); @@ -501,7 +501,7 @@ static void cachefiles_invalidate_object(struct fscache_operation *op) op->object->debug_id, (unsigned long long)ni_size); if (object->backer) { - ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + ASSERT(d_is_reg(object->backer)); fscache_set_store_limit(&object->fscache, ni_size); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index e12f189d539b..1e51714eb33e 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -102,8 +102,7 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, struct cachefiles_object *object; struct rb_node *p; - _enter(",'%*.*s'", - dentry->d_name.len, dentry->d_name.len, dentry->d_name.name); + _enter(",'%pd'", dentry); write_lock(&cache->active_lock); @@ -273,14 +272,12 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, char nbuffer[8 + 8 + 1]; int ret; - _enter(",'%*.*s','%*.*s'", - dir->d_name.len, dir->d_name.len, dir->d_name.name, - rep->d_name.len, rep->d_name.len, rep->d_name.name); + _enter(",'%pd','%pd'", dir, rep); _debug("remove %p from %p", rep, dir); /* non-directories can just be unlinked */ - if (!S_ISDIR(rep->d_inode->i_mode)) { + if (!d_is_dir(rep)) { _debug("unlink stale object"); path.mnt = cache->mnt; @@ -326,7 +323,7 @@ try_again: return 0; } - if (!S_ISDIR(cache->graveyard->d_inode->i_mode)) { + if (!d_can_lookup(cache->graveyard)) { unlock_rename(cache->graveyard, dir); cachefiles_io_error(cache, "Graveyard no longer a directory"); return -EIO; @@ -478,7 +475,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, ASSERT(parent->dentry); ASSERT(parent->dentry->d_inode); - if (!(S_ISDIR(parent->dentry->d_inode->i_mode))) { + if (!(d_is_dir(parent->dentry))) { // TODO: convert file to dir _leave("looking up in none directory"); return -ENOBUFS; @@ -542,7 +539,7 @@ lookup_again: _debug("mkdir -> %p{%p{ino=%lu}}", next, next->d_inode, next->d_inode->i_ino); - } else if (!S_ISDIR(next->d_inode->i_mode)) { + } else if (!d_can_lookup(next)) { pr_err("inode %lu is not a directory\n", next->d_inode->i_ino); ret = -ENOBUFS; @@ -571,8 +568,8 @@ lookup_again: _debug("create -> %p{%p{ino=%lu}}", next, next->d_inode, next->d_inode->i_ino); - } else if (!S_ISDIR(next->d_inode->i_mode) && - !S_ISREG(next->d_inode->i_mode) + } else if (!d_can_lookup(next) && + !d_is_reg(next) ) { pr_err("inode %lu is not a file or directory\n", next->d_inode->i_ino); @@ -597,8 +594,7 @@ lookup_again: /* if we've found that the terminal object exists, then we need to * check its attributes and delete it if it's out of date */ if (!object->new) { - _debug("validate '%*.*s'", - next->d_name.len, next->d_name.len, next->d_name.name); + _debug("validate '%pd'", next); ret = cachefiles_check_object_xattr(object, auxdata); if (ret == -ESTALE) { @@ -646,7 +642,7 @@ lookup_again: /* open a file interface onto a data file */ if (object->type != FSCACHE_COOKIE_TYPE_INDEX) { - if (S_ISREG(object->dentry->d_inode->i_mode)) { + if (d_is_reg(object->dentry)) { const struct address_space_operations *aops; ret = -EPERM; @@ -767,7 +763,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, /* we need to make sure the subdir is a directory */ ASSERT(subdir->d_inode); - if (!S_ISDIR(subdir->d_inode->i_mode)) { + if (!d_can_lookup(subdir)) { pr_err("%s is not a directory\n", dirname); ret = -EIO; goto check_error; @@ -827,8 +823,8 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, unsigned long start; int ret; - //_enter(",%*.*s/,%s", - // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + //_enter(",%pd/,%s", + // dir, filename); /* look up the victim */ mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); @@ -910,8 +906,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, struct dentry *victim; int ret; - _enter(",%*.*s/,%s", - dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + _enter(",%pd/,%s", dir, filename); victim = cachefiles_check_active(cache, dir, filename); if (IS_ERR(victim)) @@ -969,8 +964,8 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, { struct dentry *victim; - //_enter(",%*.*s/,%s", - // dir->d_name.len, dir->d_name.len, dir->d_name.name, filename); + //_enter(",%pd/,%s", + // dir, filename); victim = cachefiles_check_active(cache, dir, filename); if (IS_ERR(victim)) diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 616db0e77b44..c6cd8d7a4eef 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -900,7 +900,7 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) return -ENOBUFS; } - ASSERT(S_ISREG(object->backer->d_inode->i_mode)); + ASSERT(d_is_reg(object->backer)); cache = container_of(object->fscache.cache, struct cachefiles_cache, cache); diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index acbc1f094fb1..a8a68745e11d 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -51,9 +51,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object) } if (ret != -EEXIST) { - pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n", - dentry->d_name.len, dentry->d_name.len, - dentry->d_name.name, dentry->d_inode->i_ino, + pr_err("Can't set xattr on %pd [%lu] (err %d)\n", + dentry, dentry->d_inode->i_ino, -ret); goto error; } @@ -64,9 +63,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object) if (ret == -ERANGE) goto bad_type_length; - pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n", - dentry->d_name.len, dentry->d_name.len, - dentry->d_name.name, dentry->d_inode->i_ino, + pr_err("Can't read xattr on %pd [%lu] (err %d)\n", + dentry, dentry->d_inode->i_ino, -ret); goto error; } @@ -92,9 +90,8 @@ bad_type_length: bad_type: xtype[2] = 0; - pr_err("Cache object %*.*s [%lu] type %s not %s\n", - dentry->d_name.len, dentry->d_name.len, - dentry->d_name.name, dentry->d_inode->i_ino, + pr_err("Cache object %pd [%lu] type %s not %s\n", + dentry, dentry->d_inode->i_ino, xtype, type); ret = -EIO; goto error; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 5bd853ba44ff..64fa248343f6 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -40,20 +40,6 @@ static inline void ceph_set_cached_acl(struct inode *inode, spin_unlock(&ci->i_ceph_lock); } -static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode, - int type) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - struct posix_acl *acl = ACL_NOT_CACHED; - - spin_lock(&ci->i_ceph_lock); - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)) - acl = get_cached_acl(inode, type); - spin_unlock(&ci->i_ceph_lock); - - return acl; -} - struct posix_acl *ceph_get_acl(struct inode *inode, int type) { int size; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 18c06bbaf136..fd5599d32362 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -192,17 +192,35 @@ static int readpage_nounlock(struct file *filp, struct page *page) struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; int err = 0; + u64 off = page_offset(page); u64 len = PAGE_CACHE_SIZE; - err = ceph_readpage_from_fscache(inode, page); + if (off >= i_size_read(inode)) { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + if (ci->i_inline_version != CEPH_INLINE_NONE) { + /* + * Uptodate inline data should have been added + * into page cache while getting Fcr caps. + */ + if (off == 0) + return -EINVAL; + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + + err = ceph_readpage_from_fscache(inode, page); if (err == 0) goto out; dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - (u64) page_offset(page), &len, + off, &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) @@ -319,7 +337,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) off, len); vino = ceph_vino(inode); req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, - 1, CEPH_OSD_OP_READ, + 0, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -384,6 +402,9 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, int rc = 0; int max = 0; + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) + return -EINVAL; + rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, &nr_pages); @@ -673,7 +694,7 @@ static int ceph_writepages_start(struct address_space *mapping, int rc = 0; unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; - int do_sync; + int do_sync = 0; u64 truncate_size, snap_size; u32 truncate_seq; @@ -750,7 +771,6 @@ retry: last_snapc = snapc; while (!done && index <= end) { - int num_ops = do_sync ? 2 : 1; unsigned i; int first; pgoff_t next; @@ -850,7 +870,8 @@ get_more_pages: len = wsize; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, - offset, &len, num_ops, + offset, &len, 0, + do_sync ? 2 : 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, @@ -862,6 +883,9 @@ get_more_pages: break; } + if (do_sync) + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + req->r_callback = writepages_finish; req->r_inode = inode; @@ -1204,6 +1228,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; + struct page *pinned_page = NULL; loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; int want, got, ret; @@ -1215,7 +1240,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE; while (1) { got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, + -1, &got, &pinned_page); if (ret == 0) break; if (ret != -ERESTARTSYS) { @@ -1226,12 +1252,54 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) dout("filemap_fault %p %llu~%zd got cap refs on %s\n", inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); - ret = filemap_fault(vma, vmf); + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || + ci->i_inline_version == CEPH_INLINE_NONE) + ret = filemap_fault(vma, vmf); + else + ret = -EAGAIN; dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); + if (pinned_page) + page_cache_release(pinned_page); ceph_put_cap_refs(ci, got); + if (ret != -EAGAIN) + return ret; + + /* read inline data */ + if (off >= PAGE_CACHE_SIZE) { + /* does not support inline data > PAGE_SIZE */ + ret = VM_FAULT_SIGBUS; + } else { + int ret1; + struct address_space *mapping = inode->i_mapping; + struct page *page = find_or_create_page(mapping, 0, + mapping_gfp_mask(mapping) & + ~__GFP_FS); + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + ret1 = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (ret1 < 0 || off >= i_size_read(inode)) { + unlock_page(page); + page_cache_release(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + if (ret1 < PAGE_CACHE_SIZE) + zero_user_segment(page, ret1, PAGE_CACHE_SIZE); + else + flush_dcache_page(page); + SetPageUptodate(page); + vmf->page = page; + ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; + } +out: + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + inode, off, (size_t)PAGE_CACHE_SIZE, ret); return ret; } @@ -1250,6 +1318,19 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) size_t len; int want, got, ret; + if (ci->i_inline_version != CEPH_INLINE_NONE) { + struct page *locked_page = NULL; + if (off == 0) { + lock_page(page); + locked_page = page; + } + ret = ceph_uninline_data(vma->vm_file, locked_page); + if (locked_page) + unlock_page(locked_page); + if (ret < 0) + return VM_FAULT_SIGBUS; + } + if (off + PAGE_CACHE_SIZE <= size) len = PAGE_CACHE_SIZE; else @@ -1263,7 +1344,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; while (1) { got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + &got, NULL); if (ret == 0) break; if (ret != -ERESTARTSYS) { @@ -1297,11 +1379,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - if (ret != VM_FAULT_LOCKED) { + if (ret != VM_FAULT_LOCKED) unlock_page(page); - } else { + if (ret == VM_FAULT_LOCKED || + ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); spin_unlock(&ci->i_ceph_lock); if (dirty) @@ -1315,10 +1399,181 @@ out: return ret; } +void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + if (locked_page) { + page = locked_page; + } else { + if (i_size_read(inode) == 0) + return; + page = find_or_create_page(mapping, 0, + mapping_gfp_mask(mapping) & ~__GFP_FS); + if (!page) + return; + if (PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + return; + } + } + + dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n", + inode, ceph_vinop(inode), len, locked_page); + + if (len > 0) { + void *kaddr = kmap_atomic(page); + memcpy(kaddr, data, len); + kunmap_atomic(kaddr); + } + + if (page != locked_page) { + if (len < PAGE_CACHE_SIZE) + zero_user_segment(page, len, PAGE_CACHE_SIZE); + else + flush_dcache_page(page); + + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + } +} + +int ceph_uninline_data(struct file *filp, struct page *locked_page) +{ + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_request *req; + struct page *page = NULL; + u64 len, inline_version; + int err = 0; + bool from_pagecache = false; + + spin_lock(&ci->i_ceph_lock); + inline_version = ci->i_inline_version; + spin_unlock(&ci->i_ceph_lock); + + dout("uninline_data %p %llx.%llx inline_version %llu\n", + inode, ceph_vinop(inode), inline_version); + + if (inline_version == 1 || /* initial version, no data */ + inline_version == CEPH_INLINE_NONE) + goto out; + + if (locked_page) { + page = locked_page; + WARN_ON(!PageUptodate(page)); + } else if (ceph_caps_issued(ci) & + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { + page = find_get_page(inode->i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + from_pagecache = true; + lock_page(page); + } else { + page_cache_release(page); + page = NULL; + } + } + } + + if (page) { + len = i_size_read(inode); + if (len > PAGE_CACHE_SIZE) + len = PAGE_CACHE_SIZE; + } else { + page = __page_cache_alloc(GFP_NOFS); + if (!page) { + err = -ENOMEM; + goto out; + } + err = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, true); + if (err < 0) { + /* no inline data */ + if (err == -ENODATA) + err = 0; + goto out; + } + len = err; + } + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 0, 1, + CEPH_OSD_OP_CREATE, + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + ci->i_snap_realm->cached_context, + 0, 0, false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, req); + ceph_osdc_put_request(req); + if (err < 0) + goto out; + + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, + ceph_vino(inode), 0, &len, 1, 3, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + ci->i_snap_realm->cached_context, + ci->i_truncate_seq, ci->i_truncate_size, + false); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } + + osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); + + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, + "inline_version", &inline_version, + sizeof(inline_version), + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64); + if (err) + goto out_put; + + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, + "inline_version", &inline_version, + sizeof(inline_version), 0, 0); + if (err) + goto out_put; + + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, req); +out_put: + ceph_osdc_put_request(req); + if (err == -ECANCELED) + err = 0; +out: + if (page && page != locked_page) { + if (from_pagecache) { + unlock_page(page); + page_cache_release(page); + } else + __free_pages(page, 0); + } + + dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", + inode, ceph_vinop(inode), inline_version, err); + return err; +} + static struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index cefca661464b..8172775428a0 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -577,7 +577,6 @@ void ceph_add_cap(struct inode *inode, struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); if (realm) { - ceph_get_snap_realm(mdsc, realm); spin_lock(&realm->inodes_with_caps_lock); ci->i_snap_realm = realm; list_add(&ci->i_snap_realm_item, @@ -975,10 +974,12 @@ static int send_cap_msg(struct ceph_mds_session *session, kuid_t uid, kgid_t gid, umode_t mode, u64 xattr_version, struct ceph_buffer *xattrs_buf, - u64 follows) + u64 follows, bool inline_data) { struct ceph_mds_caps *fc; struct ceph_msg *msg; + void *p; + size_t extra_len; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" " seq %u/%u mseq %u follows %lld size %llu/%llu" @@ -988,7 +989,10 @@ static int send_cap_msg(struct ceph_mds_session *session, seq, issue_seq, mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); + /* flock buffer size + inline version + inline data size */ + extra_len = 4 + 8 + 4; + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, + GFP_NOFS, false); if (!msg) return -ENOMEM; @@ -1020,6 +1024,14 @@ static int send_cap_msg(struct ceph_mds_session *session, fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); fc->mode = cpu_to_le32(mode); + p = fc + 1; + /* flock buffer size */ + ceph_encode_32(&p, 0); + /* inline version */ + ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); + /* inline data size */ + ceph_encode_32(&p, 0); + fc->xattr_version = cpu_to_le64(xattr_version); if (xattrs_buf) { msg->middle = ceph_buffer_get(xattrs_buf); @@ -1126,6 +1138,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 flush_tid = 0; int i; int ret; + bool inline_data; held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; @@ -1209,13 +1222,15 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, xattr_version = ci->i_xattrs.version; } + inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, uid, gid, mode, xattr_version, xattr_blob, - follows); + follows, inline_data); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); delayed = 1; @@ -1336,7 +1351,7 @@ retry: capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, capsnap->xattr_version, capsnap->xattr_blob, - capsnap->follows); + capsnap->follows, capsnap->inline_data); next_follows = capsnap->follows + 1; ceph_put_cap_snap(capsnap); @@ -1435,8 +1450,8 @@ static int __mark_caps_flushing(struct inode *inode, spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; if (list_empty(&ci->i_flushing_item)) { + ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; dout(" inode %p now flushing seq %lld\n", inode, @@ -2057,7 +2072,7 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) * requested from the MDS. */ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - int *got, loff_t endoff, int *check_max, int *err) + loff_t endoff, int *got, int *check_max, int *err) { struct inode *inode = &ci->vfs_inode; int ret = 0; @@ -2066,6 +2081,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); + spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ @@ -2075,7 +2091,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, ceph_cap_string(need), ceph_cap_string(file_wanted)); *err = -EBADF; ret = 1; - goto out; + goto out_unlock; } /* finish pending truncate */ @@ -2095,7 +2111,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, *check_max = 1; ret = 1; } - goto out; + goto out_unlock; } /* * If a sync write is in progress, we must wait, so that we @@ -2103,7 +2119,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, */ if (__ceph_have_pending_cap_snap(ci)) { dout("get_cap_refs %p cap_snap_pending\n", inode); - goto out; + goto out_unlock; } } @@ -2125,11 +2141,27 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, ret = 1; } } else { + int session_readonly = false; + if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) { + struct ceph_mds_session *s = ci->i_auth_cap->session; + spin_lock(&s->s_cap_lock); + session_readonly = s->s_readonly; + spin_unlock(&s->s_cap_lock); + } + if (session_readonly) { + dout("get_cap_refs %p needed %s but mds%d readonly\n", + inode, ceph_cap_string(need), ci->i_auth_cap->mds); + *err = -EROFS; + ret = 1; + goto out_unlock; + } + dout("get_cap_refs %p have %s needed %s\n", inode, ceph_cap_string(have), ceph_cap_string(need)); } -out: +out_unlock: spin_unlock(&ci->i_ceph_lock); + dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); return ret; @@ -2168,25 +2200,55 @@ static void check_max_size(struct inode *inode, loff_t endoff) * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, - loff_t endoff) +int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, + loff_t endoff, int *got, struct page **pinned_page) { - int check_max, ret, err; + int _got, check_max, ret, err = 0; retry: if (endoff > 0) check_max_size(&ci->vfs_inode, endoff); + _got = 0; check_max = 0; - err = 0; ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, - got, endoff, - &check_max, &err)); + try_get_cap_refs(ci, need, want, endoff, + &_got, &check_max, &err)); if (err) ret = err; + if (ret < 0) + return ret; + if (check_max) goto retry; - return ret; + + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(&ci->vfs_inode) > 0) { + struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + goto out; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while holding + * caps refs can cause deadlock. + */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* getattr request will bring inline data into page cache */ + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, true); + if (ret < 0) + return ret; + goto retry; + } +out: + *got = _got; + return 0; } /* @@ -2382,11 +2444,13 @@ static void invalidate_aliases(struct inode *inode) */ static void handle_cap_grant(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *grant, - void *snaptrace, int snaptrace_len, + u64 inline_version, + void *inline_data, int inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, struct ceph_cap *cap, int issued) __releases(ci->i_ceph_lock) + __releases(mdsc->snap_rwsem) { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; @@ -2403,6 +2467,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, bool queue_invalidate = false; bool queue_revalidate = false; bool deleted_inode = false; + bool fill_inline = false; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, mds, seq, ceph_cap_string(newcaps)); @@ -2576,19 +2641,25 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, } BUG_ON(cap->issued & ~cap->implemented); + if (inline_version > 0 && inline_version >= ci->i_inline_version) { + ci->i_inline_version = inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) + fill_inline = true; + } + spin_unlock(&ci->i_ceph_lock); if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { - down_write(&mdsc->snap_rwsem); - ceph_update_snap_trace(mdsc, snaptrace, - snaptrace + snaptrace_len, false); - downgrade_write(&mdsc->snap_rwsem); kick_flushing_inode_caps(mdsc, session, inode); up_read(&mdsc->snap_rwsem); if (newcaps & ~issued) wake = true; } + if (fill_inline) + ceph_fill_inline_data(inode, NULL, inline_data, inline_len); + if (queue_trunc) { ceph_queue_vmtruncate(inode); ceph_queue_revalidate(inode); @@ -2989,6 +3060,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_cap *cap; struct ceph_mds_caps *h; struct ceph_mds_cap_peer *peer = NULL; + struct ceph_snap_realm *realm; int mds = session->s_mds; int op, issued; u32 seq, mseq; @@ -2996,11 +3068,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, u64 cap_id; u64 size, max_size; u64 tid; + u64 inline_version = 0; + void *inline_data = NULL; + u32 inline_len = 0; void *snaptrace; size_t snaptrace_len; - void *flock; - void *end; - u32 flock_len; + void *p, *end; dout("handle_caps from mds%d\n", mds); @@ -3021,30 +3094,37 @@ void ceph_handle_caps(struct ceph_mds_session *session, snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); + p = snaptrace + snaptrace_len; if (le16_to_cpu(msg->hdr.version) >= 2) { - void *p = snaptrace + snaptrace_len; + u32 flock_len; ceph_decode_32_safe(&p, end, flock_len, bad); if (p + flock_len > end) goto bad; - flock = p; - } else { - flock = NULL; - flock_len = 0; + p += flock_len; } if (le16_to_cpu(msg->hdr.version) >= 3) { if (op == CEPH_CAP_OP_IMPORT) { - void *p = flock + flock_len; if (p + sizeof(*peer) > end) goto bad; peer = p; + p += sizeof(*peer); } else if (op == CEPH_CAP_OP_EXPORT) { /* recorded in unused fields */ peer = (void *)&h->size; } } + if (le16_to_cpu(msg->hdr.version) >= 4) { + ceph_decode_64_safe(&p, end, inline_version, bad); + ceph_decode_32_safe(&p, end, inline_len, bad); + if (p + inline_len > end) + goto bad; + inline_data = p; + p += inline_len; + } + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); @@ -3082,10 +3162,23 @@ void ceph_handle_caps(struct ceph_mds_session *session, goto done_unlocked; case CEPH_CAP_OP_IMPORT: + realm = NULL; + if (snaptrace_len) { + down_write(&mdsc->snap_rwsem); + ceph_update_snap_trace(mdsc, snaptrace, + snaptrace + snaptrace_len, + false, &realm); + downgrade_write(&mdsc->snap_rwsem); + } else { + down_read(&mdsc->snap_rwsem); + } handle_cap_import(mdsc, inode, h, peer, session, &cap, &issued); - handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, + handle_cap_grant(mdsc, inode, h, + inline_version, inline_data, inline_len, msg->middle, session, cap, issued); + if (realm) + ceph_put_snap_realm(mdsc, realm); goto done_unlocked; } @@ -3105,8 +3198,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); - handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, - session, cap, issued); + handle_cap_grant(mdsc, inode, h, + inline_version, inline_data, inline_len, + msg->middle, session, cap, issued); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: @@ -3137,8 +3231,7 @@ flush_cap_releases: done: mutex_unlock(&session->s_mutex); done_unlocked: - if (inode) - iput(inode); + iput(inode); return; bad: diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 5d5a4c8c8496..1b2355109b9f 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -83,10 +83,9 @@ static int mdsc_show(struct seq_file *s, void *p) if (IS_ERR(path)) path = NULL; spin_lock(&req->r_dentry->d_lock); - seq_printf(s, " #%llx/%.*s (%s)", + seq_printf(s, " #%llx/%pd (%s)", ceph_ino(req->r_dentry->d_parent->d_inode), - req->r_dentry->d_name.len, - req->r_dentry->d_name.name, + req->r_dentry, path ? path : ""); spin_unlock(&req->r_dentry->d_lock); kfree(path); @@ -103,11 +102,10 @@ static int mdsc_show(struct seq_file *s, void *p) if (IS_ERR(path)) path = NULL; spin_lock(&req->r_old_dentry->d_lock); - seq_printf(s, " #%llx/%.*s (%s)", + seq_printf(s, " #%llx/%pd (%s)", req->r_old_dentry_dir ? ceph_ino(req->r_old_dentry_dir) : 0, - req->r_old_dentry->d_name.len, - req->r_old_dentry->d_name.name, + req->r_old_dentry, path ? path : ""); spin_unlock(&req->r_old_dentry->d_lock); kfree(path); @@ -150,8 +148,8 @@ static int dentry_lru_show(struct seq_file *s, void *ptr) spin_lock(&mdsc->dentry_lru_lock); list_for_each_entry(di, &mdsc->dentry_lru, lru) { struct dentry *dentry = di->dentry; - seq_printf(s, "%p %p\t%.*s\n", - di, dentry, dentry->d_name.len, dentry->d_name.name); + seq_printf(s, "%p %p\t%pd\n", + di, dentry, dentry); } spin_unlock(&mdsc->dentry_lru_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e6d63f8f98c0..83e9976f7189 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -26,8 +26,6 @@ * point by name. */ -const struct inode_operations ceph_dir_iops; -const struct file_operations ceph_dir_fops; const struct dentry_operations ceph_dentry_ops; /* @@ -111,7 +109,7 @@ static int fpos_cmp(loff_t l, loff_t r) /* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on - * d_u.d_child when we initially get results back from the MDS, and + * d_child when we initially get results back from the MDS, and * falling back to a "normal" sync readdir if any dentries in the dir * are dropped. * @@ -123,7 +121,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, u32 shared_gen) { struct ceph_file_info *fi = file->private_data; - struct dentry *parent = file->f_dentry; + struct dentry *parent = file->f_path.dentry; struct inode *dir = parent->d_inode; struct list_head *p; struct dentry *dentry, *last; @@ -147,11 +145,11 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, p = parent->d_subdirs.prev; dout(" initial p %p/%p\n", p->prev, p->next); } else { - p = last->d_u.d_child.prev; + p = last->d_child.prev; } more: - dentry = list_entry(p, struct dentry, d_u.d_child); + dentry = list_entry(p, struct dentry, d_child); di = ceph_dentry(dentry); while (1) { dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, @@ -168,13 +166,13 @@ more: ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && fpos_cmp(ctx->pos, di->offset) <= 0) break; - dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry, - dentry->d_name.len, dentry->d_name.name, di->offset, + dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry, + dentry, di->offset, ctx->pos, d_unhashed(dentry) ? " unhashed" : "", !dentry->d_inode ? " null" : ""); spin_unlock(&dentry->d_lock); p = p->prev; - dentry = list_entry(p, struct dentry, d_u.d_child); + dentry = list_entry(p, struct dentry, d_child); di = ceph_dentry(dentry); } @@ -183,15 +181,15 @@ more: spin_unlock(&parent->d_lock); /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { + if (!ceph_dir_is_complete_ordered(dir)) { dout(" lost dir complete on %p; falling back to mds\n", dir); dput(dentry); err = -EAGAIN; goto out; } - dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, - dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dentry, dentry, dentry->d_inode); if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), @@ -261,10 +259,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* always start with . and .. */ if (ctx->pos == 0) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - fi->dir_release_count = atomic_read(&ci->i_release_count); - dout("readdir off 0 -> '.'\n"); if (!dir_emit(ctx, ".", 1, ceph_translate_ino(inode->i_sb, inode->i_ino), @@ -274,7 +268,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) off = 1; } if (ctx->pos == 1) { - ino_t ino = parent_ino(file->f_dentry); + ino_t ino = parent_ino(file->f_path.dentry); dout("readdir off 1 -> '..'\n"); if (!dir_emit(ctx, "..", 2, ceph_translate_ino(inode->i_sb, ino), @@ -289,7 +283,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) if ((ctx->pos == 2 || fi->dentry) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && - __ceph_dir_is_complete(ci) && + __ceph_dir_is_complete_ordered(ci) && __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { u32 shared_gen = ci->i_shared_gen; spin_unlock(&ci->i_ceph_lock); @@ -312,6 +306,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* proceed with a normal readdir */ + if (ctx->pos == 2) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + fi->dir_release_count = atomic_read(&ci->i_release_count); + fi->dir_ordered_count = ci->i_ordered_count; + } + more: /* do we have the correct frag content buffered? */ if (fi->frag != frag || fi->last_readdir == NULL) { @@ -337,7 +338,7 @@ more: } req->r_inode = inode; ihold(inode); - req->r_dentry = dget(file->f_dentry); + req->r_dentry = dget(file->f_path.dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); @@ -446,8 +447,12 @@ more: */ spin_lock(&ci->i_ceph_lock); if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { - dout(" marking %p complete\n", inode); - __ceph_dir_set_complete(ci, fi->dir_release_count); + if (ci->i_ordered_count == fi->dir_ordered_count) + dout(" marking %p complete and ordered\n", inode); + else + dout(" marking %p complete\n", inode); + __ceph_dir_set_complete(ci, fi->dir_release_count, + fi->dir_ordered_count); } spin_unlock(&ci->i_ceph_lock); @@ -538,8 +543,8 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { struct inode *inode = ceph_get_snapdir(parent); - dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", - dentry, dentry->d_name.len, dentry->d_name.name, inode); + dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n", + dentry, dentry, inode); BUG_ON(!d_unhashed(dentry)); d_add(dentry, inode); err = 0; @@ -603,8 +608,8 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, int op; int err; - dout("lookup %p dentry %p '%.*s'\n", - dir, dentry, dentry->d_name.len, dentry->d_name.name); + dout("lookup %p dentry %p '%pd'\n", + dir, dentry, dentry); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -665,13 +670,17 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) /* * We created the item, then did a lookup, and found * it was already linked to another inode we already - * had in our cache (and thus got spliced). Link our - * dentry to that inode, but don't hash it, just in - * case the VFS wants to dereference it. + * had in our cache (and thus got spliced). To not + * confuse VFS (especially when inode is a directory), + * we don't link our dentry to that inode, return an + * error instead. + * + * This event should be rare and it happens only when + * we talk to old MDS. Recent MDS does not send traceless + * reply for request that creates new inode. */ - BUG_ON(!result->d_inode); - d_instantiate(dentry, result->d_inode); - return 0; + d_drop(result); + return -ESTALE; } return PTR_ERR(result); } @@ -774,8 +783,8 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (ceph_snap(dir) == CEPH_SNAPDIR) { /* mkdir .snap/foo is a MKSNAP */ op = CEPH_MDS_OP_MKSNAP; - dout("mksnap dir %p snap '%.*s' dn %p\n", dir, - dentry->d_name.len, dentry->d_name.name, dentry); + dout("mksnap dir %p snap '%pd' dn %p\n", dir, + dentry, dentry); } else if (ceph_snap(dir) == CEPH_NOSNAP) { dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); op = CEPH_MDS_OP_MKDIR; @@ -805,7 +814,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) acls.pagelist = NULL; } err = ceph_mdsc_do_request(mdsc, dir, req); - if (!err && !req->r_reply_info.head->is_dentry) + if (!err && + !req->r_reply_info.head->is_target && + !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); out: @@ -888,13 +899,12 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) if (ceph_snap(dir) == CEPH_SNAPDIR) { /* rmdir .snap/foo is RMSNAP */ - dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len, - dentry->d_name.name, dentry); + dout("rmsnap dir %p '%pd' dn %p\n", dir, dentry, dentry); op = CEPH_MDS_OP_RMSNAP; } else if (ceph_snap(dir) == CEPH_NOSNAP) { dout("unlink/rmdir dir %p dn %p inode %p\n", dir, dentry, inode); - op = S_ISDIR(dentry->d_inode->i_mode) ? + op = d_is_dir(dentry) ? CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK; } else goto out; @@ -1063,16 +1073,15 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry, - dentry->d_name.len, dentry->d_name.name, dentry->d_inode, - ceph_dentry(dentry)->offset); + dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, + dentry, dentry->d_inode, ceph_dentry(dentry)->offset); dir = ceph_get_dentry_parent_inode(dentry); /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { - dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry, - dentry->d_name.len, dentry->d_name.name, dentry->d_inode); + dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, + dentry, dentry->d_inode); valid = 1; } else if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { @@ -1265,8 +1274,7 @@ void ceph_dentry_lru_add(struct dentry *dn) struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_mds_client *mdsc; - dout("dentry_lru_add %p %p '%.*s'\n", di, dn, - dn->d_name.len, dn->d_name.name); + dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn); mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_add_tail(&di->lru, &mdsc->dentry_lru); @@ -1279,8 +1287,8 @@ void ceph_dentry_lru_touch(struct dentry *dn) struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_mds_client *mdsc; - dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, - dn->d_name.len, dn->d_name.name, di->offset); + dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn, + di->offset); mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_move_tail(&di->lru, &mdsc->dentry_lru); @@ -1292,8 +1300,7 @@ void ceph_dentry_lru_del(struct dentry *dn) struct ceph_dentry_info *di = ceph_dentry(dn); struct ceph_mds_client *mdsc; - dout("dentry_lru_del %p %p '%.*s'\n", di, dn, - dn->d_name.len, dn->d_name.name); + dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn); mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; spin_lock(&mdsc->dentry_lru_lock); list_del_init(&di->lru); @@ -1330,6 +1337,13 @@ const struct file_operations ceph_dir_fops = { .fsync = ceph_dir_fsync, }; +const struct file_operations ceph_snapdir_fops = { + .iterate = ceph_readdir, + .llseek = ceph_dir_llseek, + .open = ceph_open, + .release = ceph_release, +}; + const struct inode_operations ceph_dir_iops = { .lookup = ceph_lookup, .permission = ceph_permission, @@ -1352,6 +1366,14 @@ const struct inode_operations ceph_dir_iops = { .atomic_open = ceph_atomic_open, }; +const struct inode_operations ceph_snapdir_iops = { + .lookup = ceph_lookup, + .permission = ceph_permission, + .getattr = ceph_getattr, + .mkdir = ceph_mkdir, + .rmdir = ceph_unlink, +}; + const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_d_release, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d7e0da8366e6..d533075a823d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -211,7 +211,7 @@ int ceph_open(struct inode *inode, struct file *file) req->r_num_caps = 1; if (flags & O_CREAT) - parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); + parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); iput(parent_inode); if (!err) @@ -238,8 +238,8 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct ceph_acls_info acls = {}; int err; - dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", - dir, dentry, dentry->d_name.len, dentry->d_name.name, + dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", + dir, dentry, dentry, d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode); if (dentry->d_name.len > NAME_MAX) @@ -275,10 +275,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); + err = ceph_handle_snapdir(req, dentry, err); if (err) goto out_req; - err = ceph_handle_snapdir(req, dentry, err); if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); @@ -292,7 +292,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } if (err) goto out_req; - if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { + if (dn || dentry->d_inode == NULL || d_is_symlink(dentry)) { /* make vfs retry on splice, ENOENT, or symlink */ dout("atomic_open finish_no_open on dn %p\n", dn); err = finish_no_open(file, dn); @@ -333,6 +333,11 @@ int ceph_release(struct inode *inode, struct file *file) return 0; } +enum { + CHECK_EOF = 1, + READ_INLINE = 2, +}; + /* * Read a range of bytes striped over one or more objects. Iterate over * objects we stripe over. (That's not atomic, but good enough for now.) @@ -387,13 +392,14 @@ more: if (ret >= 0) { int didpages; if (was_short && (pos + ret < inode->i_size)) { - u64 tmp = min(this_len - ret, - inode->i_size - pos - ret); + int zlen = min(this_len - ret, + inode->i_size - pos - ret); + int zoff = (o_direct ? buf_align : io_align) + + read + ret; dout(" zero gap %llu to %llu\n", - pos + ret, pos + ret + tmp); - ceph_zero_page_vector_range(page_align + read + ret, - tmp, pages); - ret += tmp; + pos + ret, pos + ret + zlen); + ceph_zero_page_vector_range(zoff, zlen, pages); + ret += zlen; } didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; @@ -412,7 +418,7 @@ more: ret = read; /* did we bounce off eof? */ if (pos + left > inode->i_size) - *checkeof = 1; + *checkeof = CHECK_EOF; } dout("striped_read returns %d\n", ret); @@ -598,7 +604,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, + vino, pos, &len, 0, 2,/*include a 'startsync' command*/ CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, @@ -609,6 +615,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) break; } + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + n = iov_iter_get_pages_alloc(from, &pages, len, &start); if (unlikely(n < 0)) { ret = n; @@ -713,7 +721,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 1, + vino, pos, &len, 0, 1, CEPH_OSD_OP_WRITE, flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, @@ -803,9 +811,10 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) size_t len = iocb->ki_nbytes; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); + struct page *pinned_page = NULL; ssize_t ret; int want, got = 0; - int checkeof = 0, read = 0; + int retry_op = 0, read = 0; again: dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", @@ -815,7 +824,7 @@ again: want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); if (ret < 0) return ret; @@ -827,8 +836,12 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - /* hmm, this isn't really async... */ - ret = ceph_sync_read(iocb, to, &checkeof); + if (ci->i_inline_version == CEPH_INLINE_NONE) { + /* hmm, this isn't really async... */ + ret = ceph_sync_read(iocb, to, &retry_op); + } else { + retry_op = READ_INLINE; + } } else { dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, @@ -838,21 +851,69 @@ again: } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); + if (pinned_page) { + page_cache_release(pinned_page); + pinned_page = NULL; + } ceph_put_cap_refs(ci, got); + if (retry_op && ret >= 0) { + int statret; + struct page *page = NULL; + loff_t i_size; + if (retry_op == READ_INLINE) { + page = __page_cache_alloc(GFP_NOFS); + if (!page) + return -ENOMEM; + } - if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); + statret = __ceph_do_getattr(inode, page, + CEPH_STAT_CAP_INLINE_DATA, !!page); + if (statret < 0) { + __free_page(page); + if (statret == -ENODATA) { + BUG_ON(retry_op != READ_INLINE); + goto again; + } + return statret; + } + + i_size = i_size_read(inode); + if (retry_op == READ_INLINE) { + BUG_ON(ret > 0 || read > 0); + if (iocb->ki_pos < i_size && + iocb->ki_pos < PAGE_CACHE_SIZE) { + loff_t end = min_t(loff_t, i_size, + iocb->ki_pos + len); + end = min_t(loff_t, end, PAGE_CACHE_SIZE); + if (statret < end) + zero_user_segment(page, statret, end); + ret = copy_page_to_iter(page, + iocb->ki_pos & ~PAGE_MASK, + end - iocb->ki_pos, to); + iocb->ki_pos += ret; + read += ret; + } + if (iocb->ki_pos < i_size && read < len) { + size_t zlen = min_t(size_t, len - read, + i_size - iocb->ki_pos); + ret = iov_iter_zero(zlen, to); + iocb->ki_pos += ret; + read += ret; + } + __free_pages(page, 0); + return read; + } /* hit EOF or hole? */ - if (statret == 0 && iocb->ki_pos < inode->i_size && - ret < len) { + if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && + ret < len) { dout("sync_read hit hole, ppos %lld < size %lld" ", reading more\n", iocb->ki_pos, inode->i_size); read += ret; len -= ret; - checkeof = 0; + retry_op = 0; goto again; } } @@ -891,7 +952,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ - current->backing_dev_info = file->f_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ -909,6 +970,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) goto out; + if (ci->i_inline_version != CEPH_INLINE_NONE) { + err = ceph_uninline_data(file, NULL); + if (err < 0) + goto out; + } + retry_snap: if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { err = -ENOSPC; @@ -922,7 +989,8 @@ retry_snap: else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, + &got, NULL); if (err < 0) goto out; @@ -969,6 +1037,7 @@ retry_snap: if (written >= 0) { int dirty; spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); spin_unlock(&ci->i_ceph_lock); if (dirty) @@ -1111,7 +1180,7 @@ static int ceph_zero_partial_object(struct inode *inode, req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), offset, length, - 1, op, + 0, 1, op, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, NULL, 0, 0, false); @@ -1214,6 +1283,12 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } + if (ci->i_inline_version != CEPH_INLINE_NONE) { + ret = ceph_uninline_data(file, NULL); + if (ret < 0) + goto unlock; + } + size = i_size_read(inode); if (!(mode & FALLOC_FL_KEEP_SIZE)) endoff = offset + length; @@ -1223,7 +1298,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); if (ret < 0) goto unlock; @@ -1240,6 +1315,7 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); spin_unlock(&ci->i_ceph_lock); if (dirty) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7b6139004401..119c43c80638 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -82,8 +82,8 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; - inode->i_op = &ceph_dir_iops; - inode->i_fop = &ceph_dir_fops; + inode->i_op = &ceph_snapdir_iops; + inode->i_fop = &ceph_snapdir_fops; ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ ci->i_rbytes = 0; return inode; @@ -387,8 +387,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) spin_lock_init(&ci->i_ceph_lock); ci->i_version = 0; + ci->i_inline_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; + ci->i_ordered_count = 0; atomic_set(&ci->i_release_count, 1); atomic_set(&ci->i_complete_count, 0); ci->i_symlink = NULL; @@ -657,7 +659,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, * Populate an inode based on info from mds. May be called on new or * existing inodes. */ -static int fill_inode(struct inode *inode, +static int fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_reply_info_in *iinfo, struct ceph_mds_reply_dirfrag *dirinfo, struct ceph_mds_session *session, @@ -675,6 +677,7 @@ static int fill_inode(struct inode *inode, bool wake = false; bool queue_trunc = false; bool new_version = false; + bool fill_inline = false; dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", inode, ceph_vinop(inode), le64_to_cpu(info->version), @@ -780,8 +783,6 @@ static int fill_inode(struct inode *inode, } inode->i_mapping->a_ops = &ceph_aops; - inode->i_mapping->backing_dev_info = - &ceph_sb_to_client(inode->i_sb)->backing_dev_info; switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -837,29 +838,31 @@ static int fill_inode(struct inode *inode, ceph_vinop(inode), inode->i_mode); } - /* set dir completion flag? */ - if (S_ISDIR(inode->i_mode) && - ci->i_files == 0 && ci->i_subdirs == 0 && - ceph_snap(inode) == CEPH_NOSNAP && - (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && - (issued & CEPH_CAP_FILE_EXCL) == 0 && - !__ceph_dir_is_complete(ci)) { - dout(" marking %p complete (empty)\n", inode); - __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); - } - /* were we issued a capability? */ if (info->cap.caps) { if (ceph_snap(inode) == CEPH_NOSNAP) { + unsigned caps = le32_to_cpu(info->cap.caps); ceph_add_cap(inode, session, le64_to_cpu(info->cap.cap_id), - cap_fmode, - le32_to_cpu(info->cap.caps), + cap_fmode, caps, le32_to_cpu(info->cap.wanted), le32_to_cpu(info->cap.seq), le32_to_cpu(info->cap.mseq), le64_to_cpu(info->cap.realm), info->cap.flags, &new_cap); + + /* set dir completion flag? */ + if (S_ISDIR(inode->i_mode) && + ci->i_files == 0 && ci->i_subdirs == 0 && + (caps & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && + !__ceph_dir_is_complete(ci)) { + dout(" marking %p complete (empty)\n", inode); + __ceph_dir_set_complete(ci, + atomic_read(&ci->i_release_count), + ci->i_ordered_count); + } + wake = true; } else { dout(" %p got snap_caps %s\n", inode, @@ -873,8 +876,23 @@ static int fill_inode(struct inode *inode, ceph_vinop(inode)); __ceph_get_fmode(ci, cap_fmode); } + + if (iinfo->inline_version > 0 && + iinfo->inline_version >= ci->i_inline_version) { + int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; + ci->i_inline_version = iinfo->inline_version; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (locked_page || + (le32_to_cpu(info->cap.caps) & cache_caps))) + fill_inline = true; + } + spin_unlock(&ci->i_ceph_lock); + if (fill_inline) + ceph_fill_inline_data(inode, locked_page, + iinfo->inline_data, iinfo->inline_len); + if (wake) wake_up_all(&ci->i_cap_wq); @@ -967,7 +985,7 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, /* dn must be unhashed */ if (!d_unhashed(dn)) d_drop(dn); - realdn = d_materialise_unique(dn, in); + realdn = d_splice_alias(in, dn); if (IS_ERR(realdn)) { pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", PTR_ERR(realdn), dn, in, ceph_vinop(in)); @@ -1062,7 +1080,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct inode *dir = req->r_locked_dir; if (dir) { - err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, + err = fill_inode(dir, NULL, + &rinfo->diri, rinfo->dirfrag, session, req->r_request_started, -1, &req->r_caps_reservation); if (err < 0) @@ -1132,7 +1151,7 @@ retry_lookup: } req->r_target_inode = in; - err = fill_inode(in, &rinfo->targeti, NULL, + err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, session, req->r_request_started, (!req->r_aborted && rinfo->head->result == 0) ? req->r_fmode : -1, @@ -1186,28 +1205,26 @@ retry_lookup: struct inode *olddir = req->r_old_dentry_dir; BUG_ON(!olddir); - dout(" src %p '%.*s' dst %p '%.*s'\n", + dout(" src %p '%pd' dst %p '%pd'\n", req->r_old_dentry, - req->r_old_dentry->d_name.len, - req->r_old_dentry->d_name.name, - dn, dn->d_name.len, dn->d_name.name); + req->r_old_dentry, + dn, dn); dout("fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); d_move(req->r_old_dentry, dn); - dout(" src %p '%.*s' dst %p '%.*s'\n", + dout(" src %p '%pd' dst %p '%pd'\n", + req->r_old_dentry, req->r_old_dentry, - req->r_old_dentry->d_name.len, - req->r_old_dentry->d_name.name, - dn, dn->d_name.len, dn->d_name.name); + dn, dn); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_complete(dir); - ceph_dir_clear_complete(olddir); + ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(olddir); dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1219,6 +1236,7 @@ retry_lookup: if (!rinfo->head->is_target) { dout("fill_trace null dentry\n"); if (dn->d_inode) { + ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); d_delete(dn); } else { @@ -1235,7 +1253,7 @@ retry_lookup: /* attach proper inode */ if (!dn->d_inode) { - ceph_dir_clear_complete(dir); + ceph_dir_clear_ordered(dir); ihold(in); dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { @@ -1265,7 +1283,7 @@ retry_lookup: BUG_ON(!dir); BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); - ceph_dir_clear_complete(dir); + ceph_dir_clear_ordered(dir); ihold(in); dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { @@ -1302,7 +1320,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, dout("new_inode badness got %d\n", err); continue; } - rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, + rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation); if (rc < 0) { @@ -1399,7 +1417,7 @@ retry_lookup: /* reorder parent's d_subdirs */ spin_lock(&parent->d_lock); spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &parent->d_subdirs); + list_move(&dn->d_child, &parent->d_subdirs); spin_unlock(&dn->d_lock); spin_unlock(&parent->d_lock); } @@ -1418,7 +1436,7 @@ retry_lookup: } } - if (fill_inode(in, &rinfo->dir_in[i], NULL, session, + if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, req->r_request_started, -1, &req->r_caps_reservation) < 0) { pr_err("fill_inode badness on %p\n", in); @@ -1429,12 +1447,14 @@ retry_lookup: } if (!dn->d_inode) { - dn = splice_dentry(dn, in, NULL); - if (IS_ERR(dn)) { - err = PTR_ERR(dn); + struct dentry *realdn = splice_dentry(dn, in, NULL); + if (IS_ERR(realdn)) { + err = PTR_ERR(realdn); + d_drop(dn); dn = NULL; goto next_item; } + dn = realdn; } di = dn->d_fsdata; @@ -1901,7 +1921,8 @@ out_put: * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. */ -int ceph_do_getattr(struct inode *inode, int mask, bool force) +int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force) { struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1913,7 +1934,8 @@ int ceph_do_getattr(struct inode *inode, int mask, bool force) return 0; } - dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); + dout("do_getattr inode %p mask %s mode 0%o\n", + inode, ceph_cap_string(mask), inode->i_mode); if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; @@ -1924,7 +1946,19 @@ int ceph_do_getattr(struct inode *inode, int mask, bool force) ihold(inode); req->r_num_caps = 1; req->r_args.getattr.mask = cpu_to_le32(mask); + req->r_locked_page = locked_page; err = ceph_mdsc_do_request(mdsc, NULL, req); + if (locked_page && err == 0) { + u64 inline_version = req->r_reply_info.targeti.inline_version; + if (inline_version == 0) { + /* the reply is supposed to contain inline data */ + err = -EINVAL; + } else if (inline_version == CEPH_INLINE_NONE) { + err = -ENODATA; + } else { + err = req->r_reply_info.targeti.inline_len; + } + } ceph_mdsc_put_request(req); dout("do_getattr result=%d\n", err); return err; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index fbc39c47bacd..4347039ecc18 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -9,6 +9,8 @@ #include <linux/ceph/pagelist.h> static u64 lock_secret; +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); static inline u64 secure_addr(void *addr) { @@ -40,6 +42,9 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, u64 length = 0; u64 owner; + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) + wait = 0; + req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); @@ -68,6 +73,9 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; + if (wait) + req->r_wait_for_completion = ceph_lock_wait_for_completion; + err = ceph_mdsc_do_request(mdsc, inode, req); if (operation == CEPH_MDS_OP_GETFILELOCK) { @@ -96,6 +104,52 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, return err; } +static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req) +{ + struct ceph_mds_request *intr_req; + struct inode *inode = req->r_inode; + int err, lock_type; + + BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK); + if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL) + lock_type = CEPH_LOCK_FCNTL_INTR; + else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK) + lock_type = CEPH_LOCK_FLOCK_INTR; + else + BUG_ON(1); + BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK); + + err = wait_for_completion_interruptible(&req->r_completion); + if (!err) + return 0; + + dout("ceph_lock_wait_for_completion: request %llu was interrupted\n", + req->r_tid); + + intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK, + USE_AUTH_MDS); + if (IS_ERR(intr_req)) + return PTR_ERR(intr_req); + + intr_req->r_inode = inode; + ihold(inode); + intr_req->r_num_caps = 1; + + intr_req->r_args.filelock_change = req->r_args.filelock_change; + intr_req->r_args.filelock_change.rule = lock_type; + intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK; + + err = ceph_mdsc_do_request(mdsc, inode, intr_req); + ceph_mdsc_put_request(intr_req); + + if (err && err != -ERESTARTSYS) + return err; + + wait_for_completion(&req->r_completion); + return 0; +} + /** * Attempt to set an fcntl lock. * For now, this just goes away to the server. Later it may be more awesome. @@ -143,11 +197,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err); } } - - } else if (err == -ERESTARTSYS) { - dout("undoing lock\n"); - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - CEPH_LOCK_UNLOCK, 0, fl); } return err; } @@ -186,32 +235,30 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) file, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on flock_lock_file_wait, undid lock", err); } - } else if (err == -ERESTARTSYS) { - dout("undoing lock\n"); - ceph_lock_message(CEPH_LOCK_FLOCK, - CEPH_MDS_OP_SETFILELOCK, - file, CEPH_LOCK_UNLOCK, 0, fl); } return err; } -/** - * Must be called with lock_flocks() already held. Fills in the passed - * counter variables, so you can prepare pagelist metadata before calling - * ceph_encode_locks. +/* + * Fills in the passed counter variables, so you can prepare pagelist metadata + * before calling ceph_encode_locks. */ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) { struct file_lock *lock; + struct file_lock_context *ctx; *fcntl_count = 0; *flock_count = 0; - for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { - if (lock->fl_flags & FL_POSIX) + ctx = inode->i_flctx; + if (ctx) { + spin_lock(&ctx->flc_lock); + list_for_each_entry(lock, &ctx->flc_posix, fl_list) ++(*fcntl_count); - else if (lock->fl_flags & FL_FLOCK) + list_for_each_entry(lock, &ctx->flc_flock, fl_list) ++(*flock_count); + spin_unlock(&ctx->flc_lock); } dout("counted %d flock locks and %d fcntl locks", *flock_count, *fcntl_count); @@ -227,6 +274,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; + struct file_lock_context *ctx = inode->i_flctx; int err = 0; int seen_fcntl = 0; int seen_flock = 0; @@ -235,33 +283,34 @@ int ceph_encode_locks_to_buffer(struct inode *inode, dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); - for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { - if (lock->fl_flags & FL_POSIX) { - ++seen_fcntl; - if (seen_fcntl > num_fcntl_locks) { - err = -ENOSPC; - goto fail; - } - err = lock_to_ceph_filelock(lock, &flocks[l]); - if (err) - goto fail; - ++l; + if (!ctx) + return 0; + + spin_lock(&ctx->flc_lock); + list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; } - for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { - if (lock->fl_flags & FL_FLOCK) { - ++seen_flock; - if (seen_flock > num_flock_locks) { - err = -ENOSPC; - goto fail; - } - err = lock_to_ceph_filelock(lock, &flocks[l]); - if (err) - goto fail; - ++l; + list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; } fail: + spin_unlock(&ctx->flc_lock); return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a92d3f5c6c12..71c073f38e54 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -89,6 +89,16 @@ static int parse_reply_info_in(void **p, void *end, ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; *p += info->xattr_len; + + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + } else + info->inline_version = CEPH_INLINE_NONE; + return 0; bad: return err; @@ -470,6 +480,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, mdsc->max_sessions = newmax; } mdsc->sessions[mds] = s; + atomic_inc(&mdsc->num_sessions); atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, @@ -493,6 +504,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc, mdsc->sessions[s->s_mds] = NULL; ceph_con_close(&s->s_con); ceph_put_mds_session(s); + atomic_dec(&mdsc->num_sessions); } /* @@ -524,8 +536,7 @@ void ceph_mdsc_release_request(struct kref *kref) } if (req->r_locked_dir) ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); - if (req->r_target_inode) - iput(req->r_target_inode); + iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); if (req->r_old_dentry) @@ -833,8 +844,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 struct ceph_options *opt = mdsc->fsc->client->options; void *p; - const char* metadata[3][2] = { + const char* metadata[][2] = { {"hostname", utsname()->nodename}, + {"kernel_version", utsname()->release}, {"entity_id", opt->name ? opt->name : ""}, {NULL, NULL} }; @@ -861,8 +873,11 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 /* * Serialize client metadata into waiting buffer space, using * the format that userspace expects for map<string, string> + * + * ClientSession messages with metadata are v2 */ - msg->hdr.version = 2; /* ClientSession messages with metadata are v2 */ + msg->hdr.version = cpu_to_le16(2); + msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ p = msg->front.iov_base + sizeof(*h); @@ -1066,8 +1081,7 @@ out: session->s_cap_iterator = NULL; spin_unlock(&session->s_cap_lock); - if (last_inode) - iput(last_inode); + iput(last_inode); if (old_cap) ceph_put_cap(session->s_mdsc, old_cap); @@ -1453,19 +1467,33 @@ out_unlocked: return err; } +static int check_cap_flush(struct inode *inode, u64 want_flush_seq) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + spin_lock(&ci->i_ceph_lock); + if (ci->i_flushing_caps) + ret = ci->i_cap_flush_seq >= want_flush_seq; + else + ret = 1; + spin_unlock(&ci->i_ceph_lock); + return ret; +} + /* * flush all dirty inode data to disk. * * returns true if we've flushed through want_flush_seq */ -static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) { - int mds, ret = 1; + int mds; dout("check_cap_flush want %lld\n", want_flush_seq); mutex_lock(&mdsc->mutex); - for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { + for (mds = 0; mds < mdsc->max_sessions; mds++) { struct ceph_mds_session *session = mdsc->sessions[mds]; + struct inode *inode = NULL; if (!session) continue; @@ -1478,29 +1506,29 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) list_entry(session->s_cap_flushing.next, struct ceph_inode_info, i_flushing_item); - struct inode *inode = &ci->vfs_inode; - spin_lock(&ci->i_ceph_lock); - if (ci->i_cap_flush_seq <= want_flush_seq) { + if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { dout("check_cap_flush still flushing %p " - "seq %lld <= %lld to mds%d\n", inode, - ci->i_cap_flush_seq, want_flush_seq, - session->s_mds); - ret = 0; + "seq %lld <= %lld to mds%d\n", + &ci->vfs_inode, ci->i_cap_flush_seq, + want_flush_seq, session->s_mds); + inode = igrab(&ci->vfs_inode); } - spin_unlock(&ci->i_ceph_lock); } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); - if (!ret) - return ret; + if (inode) { + wait_event(mdsc->cap_flushing_wq, + check_cap_flush(inode, want_flush_seq)); + iput(inode); + } + mutex_lock(&mdsc->mutex); } mutex_unlock(&mdsc->mutex); dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); - return ret; } /* @@ -1874,7 +1902,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, goto out_free2; } - msg->hdr.version = 2; + msg->hdr.version = cpu_to_le16(2); msg->hdr.tid = cpu_to_le64(req->r_tid); head = msg->front.iov_base; @@ -1912,7 +1940,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->num_releases = cpu_to_le16(releases); /* time stamp */ - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + { + struct ceph_timespec ts; + ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; @@ -2001,7 +2033,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, /* time stamp */ p = msg->front.iov_base + req->r_request_release_offset; - ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); + { + struct ceph_timespec ts; + ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_copy(&p, &ts, sizeof(ts)); + } msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); @@ -2148,6 +2184,8 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) p = rb_next(p); if (req->r_got_unsafe) continue; + if (req->r_attempts > 0) + continue; /* only new requests */ if (req->r_session && req->r_session->s_mds == mds) { dout(" kicking tid %llu\n", req->r_tid); @@ -2208,6 +2246,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, &req->r_completion, req->r_timeout); if (err == 0) err = -EIO; + } else if (req->r_wait_for_completion) { + err = req->r_wait_for_completion(mdsc, req); } else { err = wait_for_completion_killable(&req->r_completion); } @@ -2273,6 +2313,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) struct ceph_mds_request *req; struct ceph_mds_reply_head *head = msg->front.iov_base; struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ + struct ceph_snap_realm *realm; u64 tid; int err, result; int mds = session->s_mds; @@ -2388,11 +2429,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) } /* snap trace */ + realm = NULL; if (rinfo->snapblob_len) { down_write(&mdsc->snap_rwsem); ceph_update_snap_trace(mdsc, rinfo->snapblob, - rinfo->snapblob + rinfo->snapblob_len, - le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); + rinfo->snapblob + rinfo->snapblob_len, + le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP, + &realm); downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); @@ -2410,6 +2453,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_unlock(&req->r_fill_mutex); up_read(&mdsc->snap_rwsem); + if (realm) + ceph_put_snap_realm(mdsc, realm); out_err: mutex_lock(&mdsc->mutex); if (!req->r_aborted) { @@ -2474,6 +2519,7 @@ static void handle_forward(struct ceph_mds_client *mdsc, dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); BUG_ON(req->r_err); BUG_ON(req->r_got_result); + req->r_attempts = 0; req->r_num_fwd = fwd_seq; req->r_resend_mds = next_mds; put_request_session(req); @@ -2567,6 +2613,14 @@ static void handle_session(struct ceph_mds_session *session, send_flushmsg_ack(mdsc, session, seq); break; + case CEPH_SESSION_FORCE_RO: + dout("force_session_readonly %p\n", session); + spin_lock(&session->s_cap_lock); + session->s_readonly = true; + spin_unlock(&session->s_cap_lock); + wake_up_session_caps(session, 0); + break; + default: pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); WARN_ON(1); @@ -2597,6 +2651,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_mds_request *req, *nreq; + struct rb_node *p; int err; dout("replay_unsafe_requests mds%d\n", session->s_mds); @@ -2609,6 +2664,28 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, ceph_con_send(&session->s_con, req->r_request); } } + + /* + * also re-send old requests when MDS enters reconnect stage. So that MDS + * can process completed request in clientreplay stage. + */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_got_unsafe) + continue; + if (req->r_attempts == 0) + continue; /* only old requests */ + if (req->r_session && + req->r_session->s_mds == session->s_mds) { + err = __prepare_send_request(mdsc, req, session->s_mds); + if (!err) { + ceph_msg_get(req->r_request); + ceph_con_send(&session->s_con, req->r_request); + } + } + } mutex_unlock(&mdsc->mutex); } @@ -2687,20 +2764,16 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_filelock *flocks; encode_again: - spin_lock(&inode->i_lock); ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - spin_unlock(&inode->i_lock); flocks = kmalloc((num_fcntl_locks+num_flock_locks) * sizeof(struct ceph_filelock), GFP_NOFS); if (!flocks) { err = -ENOMEM; goto out_free; } - spin_lock(&inode->i_lock); err = ceph_encode_locks_to_buffer(inode, flocks, num_fcntl_locks, num_flock_locks); - spin_unlock(&inode->i_lock); if (err) { kfree(flocks); if (err == -ENOSPC) @@ -2778,6 +2851,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, spin_unlock(&session->s_gen_ttl_lock); spin_lock(&session->s_cap_lock); + /* don't know if session is readonly */ + session->s_readonly = 0; /* * notify __ceph_remove_cap() that we are composing cap reconnect. * If a cap get released before being added to the cap reconnect, @@ -2924,9 +2999,6 @@ static void check_new_map(struct ceph_mds_client *mdsc, mutex_unlock(&s->s_mutex); s->s_state = CEPH_MDS_SESSION_RESTARTING; } - - /* kick any requests waiting on the recovering mds */ - kick_requests(mdsc, i); } else if (oldstate == newstate) { continue; /* nothing new with this mds */ } @@ -3286,6 +3358,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->sessions = NULL; + atomic_set(&mdsc->num_sessions, 0); mdsc->max_sessions = 0; mdsc->stopping = 0; init_rwsem(&mdsc->snap_rwsem); @@ -3419,14 +3492,17 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) dout("sync\n"); mutex_lock(&mdsc->mutex); want_tid = mdsc->last_tid; - want_flush = mdsc->cap_flush_seq; mutex_unlock(&mdsc->mutex); - dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); ceph_flush_dirty_caps(mdsc); + spin_lock(&mdsc->cap_dirty_lock); + want_flush = mdsc->cap_flush_seq; + spin_unlock(&mdsc->cap_dirty_lock); + + dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); wait_unsafe_requests(mdsc, want_tid); - wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); + wait_caps_flush(mdsc, want_flush); } /* @@ -3434,17 +3510,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) */ static bool done_closing_sessions(struct ceph_mds_client *mdsc) { - int i, n = 0; - if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return true; - - mutex_lock(&mdsc->mutex); - for (i = 0; i < mdsc->max_sessions; i++) - if (mdsc->sessions[i]) - n++; - mutex_unlock(&mdsc->mutex); - return n == 0; + return atomic_read(&mdsc->num_sessions) == 0; } /* @@ -3744,6 +3812,20 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con, return msg; } +static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + return ceph_auth_sign_message(auth, msg); +} + +static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_mds_session *s = con->private; + struct ceph_auth_handshake *auth = &s->s_auth; + return ceph_auth_check_message_signature(auth, msg); +} + static const struct ceph_connection_operations mds_con_ops = { .get = con_get, .put = con_put, @@ -3753,6 +3835,8 @@ static const struct ceph_connection_operations mds_con_ops = { .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, .alloc_msg = mds_alloc_msg, + .sign_message = sign_message, + .check_message_signature = check_message_signature, }; /* eof */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 3288359353e9..1875b5d985c6 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -41,6 +41,9 @@ struct ceph_mds_reply_info_in { char *symlink; u32 xattr_len; char *xattr_data; + u64 inline_version; + u32 inline_len; + char *inline_data; }; /* @@ -134,6 +137,7 @@ struct ceph_mds_session { int s_nr_caps, s_trim_caps; int s_num_cap_releases; int s_cap_reconnect; + int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; @@ -166,6 +170,11 @@ struct ceph_mds_client; */ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request *req); +/* + * wait for request completion callback + */ +typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, + struct ceph_mds_request *req); /* * an in-flight mds request @@ -215,6 +224,7 @@ struct ceph_mds_request { int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; + struct page *r_locked_page; int r_err; bool r_aborted; @@ -239,6 +249,7 @@ struct ceph_mds_request { struct completion r_completion; struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; + ceph_mds_request_wait_callback_t r_wait_for_completion; struct list_head r_unsafe_item; /* per-session unsafe list item */ bool r_got_unsafe, r_got_safe, r_got_result; @@ -262,6 +273,7 @@ struct ceph_mds_client { struct list_head waiting_for_map; struct ceph_mds_session **sessions; /* NULL for mds if no session */ + atomic_t num_sessions; int max_sessions; /* len of s_mds_sessions */ int stopping; /* true if shutting down */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index f01645a27752..a97e39f09ba6 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -70,13 +70,11 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc, * safe. we do need to protect against concurrent empty list * additions, however. */ - if (atomic_read(&realm->nref) == 0) { + if (atomic_inc_return(&realm->nref) == 1) { spin_lock(&mdsc->snap_empty_lock); list_del_init(&realm->empty_item); spin_unlock(&mdsc->snap_empty_lock); } - - atomic_inc(&realm->nref); } static void __insert_snap_realm(struct rb_root *root, @@ -116,7 +114,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm( if (!realm) return ERR_PTR(-ENOMEM); - atomic_set(&realm->nref, 0); /* tree does not take a ref */ + atomic_set(&realm->nref, 1); /* for caller */ realm->ino = ino; INIT_LIST_HEAD(&realm->children); INIT_LIST_HEAD(&realm->child_item); @@ -134,8 +132,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm( * * caller must hold snap_rwsem for write. */ -struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, - u64 ino) +static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) { struct rb_node *n = mdsc->snap_realms.rb_node; struct ceph_snap_realm *r; @@ -154,6 +152,16 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, return NULL; } +struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, + u64 ino) +{ + struct ceph_snap_realm *r; + r = __lookup_snap_realm(mdsc, ino); + if (r) + ceph_get_snap_realm(mdsc, r); + return r; +} + static void __put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); @@ -273,7 +281,6 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc, } realm->parent_ino = parentino; realm->parent = parent; - ceph_get_snap_realm(mdsc, parent); list_add(&realm->child_item, &parent->children); return 1; } @@ -288,6 +295,9 @@ static int cmpu64_rev(const void *a, const void *b) return 0; } + +static struct ceph_snap_context *empty_snapc; + /* * build the snap context for a given realm. */ @@ -328,6 +338,12 @@ static int build_snap_context(struct ceph_snap_realm *realm) return 0; } + if (num == 0 && realm->seq == empty_snapc->seq) { + ceph_get_snap_context(empty_snapc); + snapc = empty_snapc; + goto done; + } + /* alloc new snap context */ err = -ENOMEM; if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) @@ -365,8 +381,8 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); - if (realm->cached_context) - ceph_put_snap_context(realm->cached_context); +done: + ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; return 0; @@ -466,6 +482,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); + } else if (ci->i_snap_realm->cached_context == empty_snapc) { + dout("queue_cap_snap %p empty snapc\n", inode); + kfree(capsnap); } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { struct ceph_snap_context *snapc = ci->i_head_snapc; @@ -504,6 +523,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->xattr_version = 0; } + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + /* dirty page count moved from _head to this cap_snap; all subsequent writes page dirties occur _after_ this snapshot. */ @@ -590,15 +611,13 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) if (!inode) continue; spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); + iput(lastinode); lastinode = inode; ceph_queue_cap_snap(ci); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); + iput(lastinode); list_for_each_entry(child, &realm->children, child_item) { dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", @@ -619,12 +638,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) * Caller must hold snap_rwsem for write. */ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, - void *p, void *e, bool deletion) + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret) { struct ceph_mds_snap_realm *ri; /* encoded */ __le64 *snaps; /* encoded */ __le64 *prior_parent_snaps; /* encoded */ - struct ceph_snap_realm *realm; + struct ceph_snap_realm *realm = NULL; + struct ceph_snap_realm *first_realm = NULL; int invalidate = 0; int err = -ENOMEM; LIST_HEAD(dirty_realms); @@ -692,13 +713,18 @@ more: dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, realm, invalidate, p, e); - if (p < e) - goto more; - /* invalidate when we reach the _end_ (root) of the trace */ - if (invalidate) + if (invalidate && p >= e) rebuild_snap_realms(realm); + if (!first_realm) + first_realm = realm; + else + ceph_put_snap_realm(mdsc, realm); + + if (p < e) + goto more; + /* * queue cap snaps _after_ we've built the new snap contexts, * so that i_head_snapc can be set appropriately. @@ -709,12 +735,21 @@ more: queue_realm_cap_snaps(realm); } + if (realm_ret) + *realm_ret = first_realm; + else + ceph_put_snap_realm(mdsc, first_realm); + __cleanup_empty_realms(mdsc); return 0; bad: err = -EINVAL; fail: + if (realm && !IS_ERR(realm)) + ceph_put_snap_realm(mdsc, realm); + if (first_realm) + ceph_put_snap_realm(mdsc, first_realm); pr_err("update_snap_trace error %d\n", err); return err; } @@ -832,7 +867,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, if (IS_ERR(realm)) goto out; } - ceph_get_snap_realm(mdsc, realm); dout("splitting snap_realm %llx %p\n", realm->ino, realm); for (i = 0; i < num_split_inos; i++) { @@ -893,7 +927,7 @@ skip_inode: /* we may have taken some of the old realm's children. */ for (i = 0; i < num_split_realms; i++) { struct ceph_snap_realm *child = - ceph_lookup_snap_realm(mdsc, + __lookup_snap_realm(mdsc, le64_to_cpu(split_realms[i])); if (!child) continue; @@ -906,7 +940,7 @@ skip_inode: * snap, we can avoid queueing cap_snaps. */ ceph_update_snap_trace(mdsc, p, e, - op == CEPH_SNAP_OP_DESTROY); + op == CEPH_SNAP_OP_DESTROY, NULL); if (op == CEPH_SNAP_OP_SPLIT) /* we took a reference when we created the realm, above */ @@ -928,5 +962,16 @@ out: return; } +int __init ceph_snap_init(void) +{ + empty_snapc = ceph_create_snap_context(0, GFP_NOFS); + if (!empty_snapc) + return -ENOMEM; + empty_snapc->seq = 1; + return 0; +} - +void ceph_snap_exit(void) +{ + ceph_put_snap_context(empty_snapc); +} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f6e12377335c..a63997b8bcff 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -40,17 +40,6 @@ static void ceph_put_super(struct super_block *s) dout("put_super\n"); ceph_mdsc_close_sessions(fsc->mdsc); - - /* - * ensure we release the bdi before put_anon_super releases - * the device name. - */ - if (s->s_bdi == &fsc->backing_dev_info) { - bdi_unregister(&fsc->backing_dev_info); - s->s_bdi = NULL; - } - - return; } static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -425,6 +414,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noshare"); if (opt->flags & CEPH_OPT_NOCRC) seq_puts(m, ",nocrc"); + if (opt->flags & CEPH_OPT_NOMSGAUTH) + seq_puts(m, ",nocephx_require_signatures"); + if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) + seq_puts(m, ",notcp_nodelay"); if (opt->name) seq_printf(m, ",name=%s", opt->name); @@ -515,7 +508,8 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_fs_client *fsc; const u64 supported_features = CEPH_FEATURE_FLOCK | - CEPH_FEATURE_DIRLAYOUTHASH; + CEPH_FEATURE_DIRLAYOUTHASH | + CEPH_FEATURE_MDS_INLINE_DATA; const u64 required_features = 0; int page_count; size_t size; @@ -909,7 +903,7 @@ static int ceph_register_bdi(struct super_block *sb, >> PAGE_SHIFT; else fsc->backing_dev_info.ra_pages = - default_backing_dev_info.ra_pages; + VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); @@ -1001,11 +995,16 @@ out_final: static void ceph_kill_sb(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_client(s); + dev_t dev = s->s_dev; + dout("kill_sb %p\n", s); + ceph_mdsc_pre_umount(fsc->mdsc); - kill_anon_super(s); /* will call put_super after sb is r/o */ + generic_shutdown_super(s); ceph_mdsc_destroy(fsc); + destroy_fs_client(fsc); + free_anon_bdev(dev); } static struct file_system_type ceph_fs_type = { @@ -1017,9 +1016,6 @@ static struct file_system_type ceph_fs_type = { }; MODULE_ALIAS_FS("ceph"); -#define _STRINGIFY(x) #x -#define STRINGIFY(x) _STRINGIFY(x) - static int __init init_ceph(void) { int ret = init_caches(); @@ -1028,15 +1024,20 @@ static int __init init_ceph(void) ceph_flock_init(); ceph_xattr_init(); + ret = ceph_snap_init(); + if (ret) + goto out_xattr; ret = register_filesystem(&ceph_fs_type); if (ret) - goto out_icache; + goto out_snap; pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); return 0; -out_icache: +out_snap: + ceph_snap_exit(); +out_xattr: ceph_xattr_exit(); destroy_caches(); out: @@ -1047,6 +1048,7 @@ static void __exit exit_ceph(void) { dout("exit_ceph\n"); unregister_filesystem(&ceph_fs_type); + ceph_snap_exit(); ceph_xattr_exit(); destroy_caches(); } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b82f507979b8..04c8124ed30e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -161,6 +161,7 @@ struct ceph_cap_snap { u64 time_warp_seq; int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ + bool inline_data; }; static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) @@ -253,9 +254,11 @@ struct ceph_inode_info { spinlock_t i_ceph_lock; u64 i_version; + u64 i_inline_version; u32 i_time_warp_seq; unsigned i_ceph_flags; + int i_ordered_count; atomic_t i_release_count; atomic_t i_complete_count; @@ -434,14 +437,19 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ +#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ +#define CEPH_I_NODELAY 4 /* do not delay cap release */ +#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, - int release_count) + int release_count, int ordered_count) { atomic_set(&ci->i_complete_count, release_count); + if (ci->i_ordered_count == ordered_count) + ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; + else + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; } static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) @@ -455,16 +463,35 @@ static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) atomic_read(&ci->i_release_count); } +static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) +{ + return __ceph_dir_is_complete(ci) && + (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); +} + static inline void ceph_dir_clear_complete(struct inode *inode) { __ceph_dir_clear_complete(ceph_inode(inode)); } -static inline bool ceph_dir_is_complete(struct inode *inode) +static inline void ceph_dir_clear_ordered(struct inode *inode) { - return __ceph_dir_is_complete(ceph_inode(inode)); + struct ceph_inode_info *ci = ceph_inode(inode); + spin_lock(&ci->i_ceph_lock); + ci->i_ordered_count++; + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; + spin_unlock(&ci->i_ceph_lock); } +static inline bool ceph_dir_is_complete_ordered(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + bool ret; + spin_lock(&ci->i_ceph_lock); + ret = __ceph_dir_is_complete_ordered(ci); + spin_unlock(&ci->i_ceph_lock); + return ret; +} /* find a specific frag @f */ extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, @@ -580,6 +607,7 @@ struct ceph_file_info { char *last_name; /* last entry in previous chunk */ struct dentry *dentry; /* next dentry (for dcache readdir) */ int dir_release_count; + int dir_ordered_count; /* used for -o dirstat read() on directory thing */ char *dir_info; @@ -665,7 +693,8 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm); extern int ceph_update_snap_trace(struct ceph_mds_client *m, - void *p, void *e, bool deletion); + void *p, void *e, bool deletion, + struct ceph_snap_realm **realm_ret); extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); @@ -673,6 +702,8 @@ extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern int ceph_snap_init(void); +extern void ceph_snap_exit(void); /* * a cap_snap is "pending" if it is still awaiting an in-progress @@ -715,7 +746,12 @@ extern void ceph_queue_vmtruncate(struct inode *inode); extern void ceph_queue_invalidate(struct inode *inode); extern void ceph_queue_writeback(struct inode *inode); -extern int ceph_do_getattr(struct inode *inode, int mask, bool force); +extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, + int mask, bool force); +static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) +{ + return __ceph_do_getattr(inode, NULL, mask, force); +} extern int ceph_permission(struct inode *inode, int mask); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -830,7 +866,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, int mds, int drop, int unless); extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, - int *got, loff_t endoff); + loff_t endoff, int *got, struct page **pinned_page); /* for counting open files by mode */ static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) @@ -852,10 +888,14 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode, int *opened); extern int ceph_release(struct inode *inode, struct file *filp); - +extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, + char *data, size_t len); +int ceph_uninline_data(struct file *filp, struct page *locked_page); /* dir.c */ extern const struct file_operations ceph_dir_fops; +extern const struct file_operations ceph_snapdir_fops; extern const struct inode_operations ceph_dir_iops; +extern const struct inode_operations ceph_snapdir_iops; extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 678b0d2bbbc4..5a492caf34cb 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -854,7 +854,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_pagelist *pagelist = NULL; int err; - if (value) { + if (size > 0) { /* copy value into pagelist */ pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); if (!pagelist) @@ -864,7 +864,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = ceph_pagelist_append(pagelist, value, size); if (err) goto out; - } else { + } else if (!value) { flags |= CEPH_XATTR_REMOVE; } @@ -1001,6 +1001,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_setxattr(dentry, name, value, size, flags); + if (size == 0) + value = ""; /* empty EA, do not remove */ + return __ceph_setxattr(dentry, name, value, size, flags); } diff --git a/fs/char_dev.c b/fs/char_dev.c index f77f7702fabe..ea06a3d0364c 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -24,27 +24,6 @@ #include "internal.h" -/* - * capabilities for /dev/mem, /dev/kmem and similar directly mappable character - * devices - * - permits shared-mmap for read, write and/or exec - * - does not permit private mmap in NOMMU mode (can't do COW) - * - no readahead or I/O queue unplugging required - */ -struct backing_dev_info directly_mappable_cdev_bdi = { - .name = "char", - .capabilities = ( -#ifdef CONFIG_MMU - /* permit private copies of the data to be taken */ - BDI_CAP_MAP_COPY | -#endif - /* permit direct mmap, for read, write or exec */ - BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP | - /* no writeback happens */ - BDI_CAP_NO_ACCT_AND_WRITEBACK), -}; - static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); @@ -117,7 +96,6 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, goto out; } major = i; - ret = major; } cd->major = major; @@ -576,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data) void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); - if (bdi_init(&directly_mappable_cdev_bdi)) - panic("Failed to init directly mappable cdev bdi"); } @@ -591,4 +567,3 @@ EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); -EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 44ec72684df5..7febcf2475c5 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -34,27 +34,9 @@ void cifs_dump_mem(char *label, void *data, int length) { - int i, j; - int *intptr = data; - char *charptr = data; - char buf[10], line[80]; - - printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n", - label, length, data); - for (i = 0; i < length; i += 16) { - line[0] = 0; - for (j = 0; (j < 4) && (i + j * 4 < length); j++) { - sprintf(buf, " %08x", intptr[i / 4 + j]); - strcat(line, buf); - } - buf[0] = ' '; - buf[2] = 0; - for (j = 0; (j < 16) && (i + j < length); j++) { - buf[1] = isprint(charptr[i + j]) ? charptr[i + j] : '.'; - strcat(line, buf); - } - printk(KERN_DEBUG "%s\n", line); - } + pr_debug("%s: dump of %d bytes of data at 0x%p\n", label, length, data); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 16, 4, + data, length, true); } #ifdef CONFIG_CIFS_DEBUG @@ -68,7 +50,7 @@ void cifs_vfs_err(const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - printk(KERN_ERR "CIFS VFS: %pV", &vaf); + pr_err("CIFS VFS: %pV", &vaf); va_end(args); } @@ -274,6 +256,7 @@ static ssize_t cifs_stats_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char c; + bool bv; int rc; struct list_head *tmp1, *tmp2, *tmp3; struct TCP_Server_Info *server; @@ -284,7 +267,7 @@ static ssize_t cifs_stats_proc_write(struct file *file, if (rc) return rc; - if (c == '1' || c == 'y' || c == 'Y' || c == '0') { + if (strtobool(&c, &bv) == 0) { #ifdef CONFIG_CIFS_STATS2 atomic_set(&totBufAllocCount, 0); atomic_set(&totSmBufAllocCount, 0); @@ -451,15 +434,14 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char c; + bool bv; int rc; rc = get_user(c, buffer); if (rc) return rc; - if (c == '0' || c == 'n' || c == 'N') - cifsFYI = 0; - else if (c == '1' || c == 'y' || c == 'Y') - cifsFYI = 1; + if (strtobool(&c, &bv) == 0) + cifsFYI = bv; else if ((c > '1') && (c <= '9')) cifsFYI = (int) (c - '0'); /* see cifs_debug.h for meanings */ @@ -490,15 +472,18 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char c; + bool bv; int rc; rc = get_user(c, buffer); if (rc) return rc; - if (c == '0' || c == 'n' || c == 'N') - linuxExtEnabled = 0; - else if (c == '1' || c == 'y' || c == 'Y') - linuxExtEnabled = 1; + + rc = strtobool(&c, &bv); + if (rc) + return rc; + + linuxExtEnabled = bv; return count; } @@ -527,15 +512,18 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char c; + bool bv; int rc; rc = get_user(c, buffer); if (rc) return rc; - if (c == '0' || c == 'n' || c == 'N') - lookupCacheEnabled = 0; - else if (c == '1' || c == 'y' || c == 'Y') - lookupCacheEnabled = 1; + + rc = strtobool(&c, &bv); + if (rc) + return rc; + + lookupCacheEnabled = bv; return count; } @@ -564,15 +552,18 @@ static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char c; + bool bv; int rc; rc = get_user(c, buffer); if (rc) return rc; - if (c == '0' || c == 'n' || c == 'N') - traceSMB = 0; - else if (c == '1' || c == 'y' || c == 'Y') - traceSMB = 1; + + rc = strtobool(&c, &bv); + if (rc) + return rc; + + traceSMB = bv; return count; } @@ -615,9 +606,11 @@ cifs_security_flags_handle_must_flags(unsigned int *flags) *flags = CIFSSEC_MUST_NTLMV2; else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM) *flags = CIFSSEC_MUST_NTLM; - else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN) + else if (CIFSSEC_MUST_LANMAN && + (*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN) *flags = CIFSSEC_MUST_LANMAN; - else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT) + else if (CIFSSEC_MUST_PLNTXT && + (*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT) *flags = CIFSSEC_MUST_PLNTXT; *flags |= signflags; @@ -630,6 +623,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, unsigned int flags; char flags_string[12]; char c; + bool bv; if ((count < 1) || (count > 11)) return -EINVAL; @@ -642,11 +636,8 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, if (count < 3) { /* single char or single char followed by null */ c = flags_string[0]; - if (c == '0' || c == 'n' || c == 'N') { - global_secflags = CIFSSEC_DEF; /* default */ - return count; - } else if (c == '1' || c == 'y' || c == 'Y') { - global_secflags = CIFSSEC_MAX; + if (strtobool(&c, &bv) == 0) { + global_secflags = bv ? CIFSSEC_MAX : CIFSSEC_DEF; return count; } else if (!isdigit(c)) { cifs_dbg(VFS, "Invalid SecurityFlags: %s\n", diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index c99b40fb609b..f40fbaca1b2a 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -53,13 +53,12 @@ __printf(1, 2) void cifs_vfs_err(const char *fmt, ...); do { \ if (type == FYI) { \ if (cifsFYI & CIFS_INFO) { \ - printk(KERN_DEBUG "%s: " fmt, \ - __FILE__, ##__VA_ARGS__); \ + pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__); \ } \ } else if (type == VFS) { \ cifs_vfs_err(fmt, ##__VA_ARGS__); \ } else if (type == NOISY && type != 0) { \ - printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ + pr_debug(fmt, ##__VA_ARGS__); \ } \ } while (0) @@ -71,7 +70,7 @@ do { \ #define cifs_dbg(type, fmt, ...) \ do { \ if (0) \ - printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ + pr_debug(fmt, ##__VA_ARGS__); \ } while (0) #endif diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 6d00c419cbae..1ea780bc6376 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -38,7 +38,7 @@ static const struct cifs_sid sid_everyone = { 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; /* security id for Authenticated Users system group */ static const struct cifs_sid sid_authusers = { - 1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} }; + 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} }; /* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 4ac7445e6ec7..aa0dc2573374 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -1,6 +1,9 @@ /* * fs/cifs/cifsencrypt.c * + * Encryption and hashing operations relating to NTLM, NTLMv2. See MS-NLMP + * for more detailed information + * * Copyright (C) International Business Machines Corp., 2005,2013 * Author(s): Steve French (sfrench@us.ibm.com) * @@ -515,7 +518,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, __func__); return rc; } - } else if (ses->serverName) { + } else { + /* We use ses->serverName if no domain name available */ len = strlen(ses->serverName); server = kmalloc(2 + (len * 2), GFP_KERNEL); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 9d7996e8e793..d72fe37f5420 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -209,8 +209,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf) static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) { - struct super_block *sb = file->f_path.dentry->d_sb; - struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct TCP_Server_Info *server = tcon->ses->server; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 002e0c173939..252f5c15806b 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.05" +#define CIFS_VERSION "2.06" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 02a33e529904..22b289a3b1c4 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -661,16 +661,16 @@ set_credits(struct TCP_Server_Info *server, const int val) server->ops->set_credits(server, val); } -static inline __u64 +static inline __le64 get_next_mid64(struct TCP_Server_Info *server) { - return server->ops->get_next_mid(server); + return cpu_to_le64(server->ops->get_next_mid(server)); } static inline __le16 get_next_mid(struct TCP_Server_Info *server) { - __u16 mid = get_next_mid64(server); + __u16 mid = server->ops->get_next_mid(server); /* * The value in the SMB header should be little endian for easy * on-the-wire decoding. @@ -1168,6 +1168,12 @@ CIFS_SB(struct super_block *sb) return sb->s_fs_info; } +static inline struct cifs_sb_info * +CIFS_FILE_SB(struct file *file) +{ + return CIFS_SB(file_inode(file)->i_sb); +} + static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 61d00a6e398f..fa13d5e79f64 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2477,14 +2477,14 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, } parm_data = (struct cifs_posix_lock *) ((char *)&pSMBr->hdr.Protocol + data_offset); - if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) + if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) pLockData->fl_type = F_UNLCK; else { if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_RDLCK)) + cpu_to_le16(CIFS_RDLCK)) pLockData->fl_type = F_RDLCK; else if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_WRLCK)) + cpu_to_le16(CIFS_WRLCK)) pLockData->fl_type = F_WRLCK; pLockData->fl_start = le64_to_cpu(parm_data->start); @@ -3276,25 +3276,25 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); pSMB->TotalParameterCount = 0; - pSMB->TotalDataCount = __constant_cpu_to_le32(2); + pSMB->TotalDataCount = cpu_to_le32(2); pSMB->MaxParameterCount = 0; pSMB->MaxDataCount = 0; pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; - pSMB->DataCount = __constant_cpu_to_le32(2); + pSMB->DataCount = cpu_to_le32(2); pSMB->DataOffset = cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, compression_state) - 4); /* 84 */ pSMB->SetupCount = 4; - pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); pSMB->ParameterCount = 0; - pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); + pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION); pSMB->IsFsctl = 1; /* FSCTL */ pSMB->IsRootFlag = 0; pSMB->Fid = fid; /* file handle always le */ /* 3 byte pad, followed by 2 byte compress state */ - pSMB->ByteCount = __constant_cpu_to_le16(5); + pSMB->ByteCount = cpu_to_le16(5); inc_rfc1001_len(pSMB, 5); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -3430,10 +3430,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, cifs_acl->version = cpu_to_le16(1); if (acl_type == ACL_TYPE_ACCESS) { cifs_acl->access_entry_count = cpu_to_le16(count); - cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); } else if (acl_type == ACL_TYPE_DEFAULT) { cifs_acl->default_entry_count = cpu_to_le16(count); - cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); } else { cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 24fa08d261fb..480cf9c81d50 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1466,9 +1466,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->seal = 1; break; case Opt_noac: - printk(KERN_WARNING "CIFS: Mount option noac not " - "supported. Instead set " - "/proc/fs/cifs/LookupCacheEnabled to 0\n"); + pr_warn("CIFS: Mount option noac not supported. Instead set /proc/fs/cifs/LookupCacheEnabled to 0\n"); break; case Opt_fsc: #ifndef CONFIG_CIFS_FSCACHE @@ -1598,9 +1596,11 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (strnlen(string, CIFS_MAX_USERNAME_LEN) > CIFS_MAX_USERNAME_LEN) { - printk(KERN_WARNING "CIFS: username too long\n"); + pr_warn("CIFS: username too long\n"); goto cifs_parse_mount_err; } + + kfree(vol->username); vol->username = kstrdup(string, GFP_KERNEL); if (!vol->username) goto cifs_parse_mount_err; @@ -1662,8 +1662,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, temp_len = strlen(value); vol->password = kzalloc(temp_len+1, GFP_KERNEL); if (vol->password == NULL) { - printk(KERN_WARNING "CIFS: no memory " - "for password\n"); + pr_warn("CIFS: no memory for password\n"); goto cifs_parse_mount_err; } @@ -1687,8 +1686,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (!cifs_convert_address(dstaddr, string, strlen(string))) { - printk(KERN_ERR "CIFS: bad ip= option (%s).\n", - string); + pr_err("CIFS: bad ip= option (%s).\n", string); goto cifs_parse_mount_err; } got_ip = true; @@ -1700,15 +1698,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (strnlen(string, CIFS_MAX_DOMAINNAME_LEN) == CIFS_MAX_DOMAINNAME_LEN) { - printk(KERN_WARNING "CIFS: domain name too" - " long\n"); + pr_warn("CIFS: domain name too long\n"); goto cifs_parse_mount_err; } + kfree(vol->domainname); vol->domainname = kstrdup(string, GFP_KERNEL); if (!vol->domainname) { - printk(KERN_WARNING "CIFS: no memory " - "for domainname\n"); + pr_warn("CIFS: no memory for domainname\n"); goto cifs_parse_mount_err; } cifs_dbg(FYI, "Domain name set\n"); @@ -1721,8 +1718,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (!cifs_convert_address( (struct sockaddr *)&vol->srcaddr, string, strlen(string))) { - printk(KERN_WARNING "CIFS: Could not parse" - " srcaddr: %s\n", string); + pr_warn("CIFS: Could not parse srcaddr: %s\n", + string); goto cifs_parse_mount_err; } break; @@ -1732,17 +1729,16 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto out_nomem; if (strnlen(string, 1024) >= 65) { - printk(KERN_WARNING "CIFS: iocharset name " - "too long.\n"); + pr_warn("CIFS: iocharset name too long.\n"); goto cifs_parse_mount_err; } if (strncasecmp(string, "default", 7) != 0) { + kfree(vol->iocharset); vol->iocharset = kstrdup(string, GFP_KERNEL); if (!vol->iocharset) { - printk(KERN_WARNING "CIFS: no memory" - "for charset\n"); + pr_warn("CIFS: no memory for charset\n"); goto cifs_parse_mount_err; } } @@ -1773,9 +1769,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, * set at top of the function */ if (i == RFC1001_NAME_LEN && string[i] != 0) - printk(KERN_WARNING "CIFS: netbiosname" - " longer than 15 truncated.\n"); - + pr_warn("CIFS: netbiosname longer than 15 truncated.\n"); break; case Opt_servern: /* servernetbiosname specified override *SMBSERVER */ @@ -1801,8 +1795,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* The string has 16th byte zero still from set at top of the function */ if (i == RFC1001_NAME_LEN && string[i] != 0) - printk(KERN_WARNING "CIFS: server net" - "biosname longer than 15 truncated.\n"); + pr_warn("CIFS: server netbiosname longer than 15 truncated.\n"); break; case Opt_ver: string = match_strdup(args); @@ -1814,8 +1807,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, break; } /* For all other value, error */ - printk(KERN_WARNING "CIFS: Invalid version" - " specified\n"); + pr_warn("CIFS: Invalid version specified\n"); goto cifs_parse_mount_err; case Opt_vers: string = match_strdup(args); @@ -1856,7 +1848,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } if (!sloppy && invalid) { - printk(KERN_ERR "CIFS: Unknown mount option \"%s\"\n", invalid); + pr_err("CIFS: Unknown mount option \"%s\"\n", invalid); goto cifs_parse_mount_err; } @@ -1882,8 +1874,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* No ip= option specified? Try to get it from UNC */ if (!cifs_convert_address(dstaddr, &vol->UNC[2], strlen(&vol->UNC[2]))) { - printk(KERN_ERR "Unable to determine destination " - "address.\n"); + pr_err("Unable to determine destination address.\n"); goto cifs_parse_mount_err; } } @@ -1894,20 +1885,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (uid_specified) vol->override_uid = override_uid; else if (override_uid == 1) - printk(KERN_NOTICE "CIFS: ignoring forceuid mount option " - "specified with no uid= option.\n"); + pr_notice("CIFS: ignoring forceuid mount option specified with no uid= option.\n"); if (gid_specified) vol->override_gid = override_gid; else if (override_gid == 1) - printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " - "specified with no gid= option.\n"); + pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n"); kfree(mountdata_copy); return 0; out_nomem: - printk(KERN_WARNING "Could not allocate temporary buffer\n"); + pr_warn("Could not allocate temporary buffer\n"); cifs_parse_mount_err: kfree(string); kfree(mountdata_copy); @@ -2928,8 +2917,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * calling name ends in null (byte 16) from old smb * convention. */ - if (server->workstation_RFC1001_name && - server->workstation_RFC1001_name[0] != 0) + if (server->workstation_RFC1001_name[0] != 0) rfc1002mangle(ses_init_buf->trailer. session_req.calling_name, server->workstation_RFC1001_name, @@ -3461,7 +3449,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) int referral_walks_count = 0; #endif - rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs"); if (rc) return rc; @@ -3707,6 +3695,12 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, #endif /* CIFS_WEAK_PW_HASH */ rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr, nls_codepage); + if (rc) { + cifs_dbg(FYI, "%s Can't generate NTLM rsp. Error: %d\n", + __func__, rc); + cifs_buf_release(smb_buffer); + return rc; + } bcc_ptr += CIFS_AUTH_RESP_SIZE; if (ses->capabilities & CAP_UNICODE) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 3e4d00a06c44..ca30c391a894 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -366,6 +366,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) struct cifsLockInfo *li, *tmp; struct cifs_fid fid; struct cifs_pending_open open; + bool oplock_break_cancelled; spin_lock(&cifs_file_list_lock); if (--cifs_file->count > 0) { @@ -397,7 +398,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) } spin_unlock(&cifs_file_list_lock); - cancel_work_sync(&cifs_file->oplock_break); + oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); if (!tcon->need_reconnect && !cifs_file->invalidHandle) { struct TCP_Server_Info *server = tcon->ses->server; @@ -409,6 +410,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) _free_xid(xid); } + if (oplock_break_cancelled) + cifs_done_oplock_break(cifsi); + cifs_del_pending_open(&open); /* @@ -1066,7 +1070,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { free_xid(xid); return -ENOMEM; @@ -1109,11 +1113,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return rc; } -/* copied from fs/locks.c with a name change */ -#define cifs_for_each_lock(inode, lockp) \ - for (lockp = &inode->i_flock; *lockp != NULL; \ - lockp = &(*lockp)->fl_next) - struct lock_to_push { struct list_head llist; __u64 offset; @@ -1128,8 +1127,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) { struct inode *inode = cfile->dentry->d_inode; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct file_lock *flock, **before; - unsigned int count = 0, i = 0; + struct file_lock *flock; + struct file_lock_context *flctx = inode->i_flctx; + unsigned int count = 0, i; int rc = 0, xid, type; struct list_head locks_to_send, *el; struct lock_to_push *lck, *tmp; @@ -1137,12 +1137,14 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) xid = get_xid(); - spin_lock(&inode->i_lock); - cifs_for_each_lock(inode, before) { - if ((*before)->fl_flags & FL_POSIX) - count++; + if (!flctx) + goto out; + + spin_lock(&flctx->flc_lock); + list_for_each(el, &flctx->flc_posix) { + count++; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); INIT_LIST_HEAD(&locks_to_send); @@ -1151,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) * added to the list while we are holding cinode->lock_sem that * protects locking operations of this inode. */ - for (; i < count; i++) { + for (i = 0; i < count; i++) { lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); if (!lck) { rc = -ENOMEM; @@ -1161,11 +1163,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) } el = locks_to_send.next; - spin_lock(&inode->i_lock); - cifs_for_each_lock(inode, before) { - flock = *before; - if ((flock->fl_flags & FL_POSIX) == 0) - continue; + spin_lock(&flctx->flc_lock); + list_for_each_entry(flock, &flctx->flc_posix, fl_list) { if (el == &locks_to_send) { /* * The list ended. We don't have enough allocated @@ -1185,9 +1184,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) lck->length = length; lck->type = type; lck->offset = flock->fl_start; - el = el->next; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { int stored_rc; @@ -1401,7 +1399,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) return -ENOMEM; @@ -1586,7 +1584,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag, tcon->ses->server); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + cifs_sb = CIFS_FILE_SB(file); netfid = cfile->fid.netfid; cinode = CIFS_I(file_inode(file)); @@ -1825,6 +1823,7 @@ refind_writable: cifsFileInfo_put(inv_file); spin_lock(&cifs_file_list_lock); ++refind; + inv_file = NULL; goto refind_writable; } } @@ -2305,7 +2304,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct cifsFileInfo *smbfile = file->private_data; - struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct inode *inode = file->f_mapping->host; rc = filemap_write_and_wait_range(inode->i_mapping, start, end); @@ -2585,7 +2584,7 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) iov_iter_truncate(from, len); INIT_LIST_HEAD(&wdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + cifs_sb = CIFS_FILE_SB(file); open_file = file->private_data; tcon = tlink_tcon(open_file->tlink); @@ -3010,7 +3009,7 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) return 0; INIT_LIST_HEAD(&rdata_list); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + cifs_sb = CIFS_FILE_SB(file); open_file = file->private_data; tcon = tlink_tcon(open_file->tlink); @@ -3155,7 +3154,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) __u32 pid; xid = get_xid(); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + cifs_sb = CIFS_FILE_SB(file); /* FIXME: set up handlers for larger reads and/or convert to async */ rsize = min_t(unsigned int, cifs_sb->rsize, CIFSMaxBufSize); @@ -3244,7 +3243,6 @@ static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) @@ -3462,7 +3460,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, int rc; struct list_head tmplist; struct cifsFileInfo *open_file = file->private_data; - struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct TCP_Server_Info *server; pid_t pid; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 197cb503d528..3e126d7bb2ea 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -771,6 +771,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, cifs_buf_release(srchinf->ntwrk_buf_start); } kfree(srchinf); + if (rc) + goto cgii_exit; } else goto cgii_exit; @@ -895,7 +897,7 @@ inode_has_hashed_dentries(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { if (!d_unhashed(dentry) || IS_ROOT(dentry)) { spin_unlock(&inode->i_lock); return true; @@ -937,8 +939,6 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; - if (S_ISREG(inode->i_mode)) - inode->i_data.backing_dev_info = sb->s_bdi; #ifdef CONFIG_CIFS_FSCACHE /* initialize per-inode cache cookie pointer */ CIFS_I(inode)->fscache = NULL; diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 45cb59bcc791..8b7898b7670f 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -86,21 +86,16 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, } src_inode = file_inode(src_file.file); + rc = -EINVAL; + if (S_ISDIR(src_inode->i_mode)) + goto out_fput; /* * Note: cifs case is easier than btrfs since server responsible for * checks for proper open modes and file type and if it wants * server could even support copy of range where source = target */ - - /* so we do not deadlock racing two ioctls on same files */ - if (target_inode < src_inode) { - mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_CHILD); - } + lock_two_nondirectories(target_inode, src_inode); /* determine range to clone */ rc = -EINVAL; @@ -124,13 +119,7 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, out_unlock: /* although unlocking in the reverse order from locking is not strictly necessary here it is a little cleaner to be consistent */ - if (target_inode < src_inode) { - mutex_unlock(&src_inode->i_mutex); - mutex_unlock(&target_inode->i_mutex); - } else { - mutex_unlock(&target_inode->i_mutex); - mutex_unlock(&src_inode->i_mutex); - } + unlock_two_nondirectories(src_inode, target_inode); out_fput: fdput(src_file); out_drop_write: diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index b7415d596dbd..337946355b29 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -513,39 +513,11 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) void dump_smb(void *buf, int smb_buf_length) { - int i, j; - char debug_line[17]; - unsigned char *buffer = buf; - if (traceSMB == 0) return; - for (i = 0, j = 0; i < smb_buf_length; i++, j++) { - if (i % 8 == 0) { - /* have reached the beginning of line */ - printk(KERN_DEBUG "| "); - j = 0; - } - printk("%0#4x ", buffer[i]); - debug_line[2 * j] = ' '; - if (isprint(buffer[i])) - debug_line[1 + (2 * j)] = buffer[i]; - else - debug_line[1 + (2 * j)] = '_'; - - if (i % 8 == 7) { - /* reached end of line, time to print ascii */ - debug_line[16] = 0; - printk(" | %s\n", debug_line); - } - } - for (; j < 8; j++) { - printk(" "); - debug_line[2 * j] = ' '; - debug_line[1 + (2 * j)] = ' '; - } - printk(" | %s\n", debug_line); - return; + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 8, 2, buf, + smb_buf_length, true); } void diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index b333ff60781d..abae6dd2c6b9 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -926,6 +926,7 @@ cifs_NTtimeToUnix(__le64 ntutc) /* Subtract the NTFS time offset, then convert to 1s intervals. */ s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; + u64 abs_t; /* * Unfortunately can not use normal 64 bit division on 32 bit arch, but @@ -933,13 +934,14 @@ cifs_NTtimeToUnix(__le64 ntutc) * to special case them */ if (t < 0) { - t = -t; - ts.tv_nsec = (long)(do_div(t, 10000000) * 100); + abs_t = -t; + ts.tv_nsec = (long)(do_div(abs_t, 10000000) * 100); ts.tv_nsec = -ts.tv_nsec; - ts.tv_sec = -t; + ts.tv_sec = -abs_t; } else { - ts.tv_nsec = (long)do_div(t, 10000000) * 100; - ts.tv_sec = t; + abs_t = t; + ts.tv_nsec = (long)do_div(abs_t, 10000000) * 100; + ts.tv_sec = abs_t; } return ts; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 8fd2a95860ba..c295338e0a98 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -69,7 +69,8 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT * * Find the dentry that matches "name". If there isn't one, create one. If it's - * a negative dentry or the uniqueid changed, then drop it and recreate it. + * a negative dentry or the uniqueid or filetype(mode) changed, + * then drop it and recreate it. */ static void cifs_prime_dcache(struct dentry *parent, struct qstr *name, @@ -97,8 +98,11 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) fattr->cf_uniqueid = CIFS_I(inode)->uniqueid; - /* update inode in place if i_ino didn't change */ - if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { + /* update inode in place + * if both i_ino and i_mode didn't change */ + if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid && + (inode->i_mode & S_IFMT) == + (fattr->cf_mode & S_IFMT)) { cifs_fattr_to_inode(inode, fattr); goto out; } @@ -123,7 +127,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, if (!inode) goto out; - alias = d_materialise_unique(dentry, inode); + alias = d_splice_alias(inode, dentry); if (alias && !IS_ERR(alias)) dput(alias); out: @@ -261,7 +265,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file) int rc = 0; char *full_path = NULL; struct cifsFileInfo *cifsFile; - struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct tcon_link *tlink = NULL; struct cifs_tcon *tcon; struct TCP_Server_Info *server; @@ -561,7 +565,7 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, loff_t first_entry_in_buffer; loff_t index_to_find = pos; struct cifsFileInfo *cfile = file->private_data; - struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); + struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct TCP_Server_Info *server = tcon->ses->server; /* check if index in the buffer */ @@ -679,7 +683,7 @@ static int cifs_filldir(char *find_entry, struct file *file, char *scratch_buf, unsigned int max_len) { struct cifsFileInfo *file_info = file->private_data; - struct super_block *sb = file->f_path.dentry->d_sb; + struct super_block *sb = file_inode(file)->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_dirent de = { NULL, }; struct cifs_fattr fattr; @@ -753,7 +757,7 @@ static int cifs_filldir(char *find_entry, struct file *file, */ fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; - cifs_prime_dcache(file->f_dentry, &name, &fattr); + cifs_prime_dcache(file->f_path.dentry, &name, &fattr); ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); return !dir_emit(ctx, name.name, name.len, ino, fattr.cf_dtype); @@ -794,10 +798,6 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) if it before then restart search if after then keep searching till find it */ - if (file->private_data == NULL) { - rc = -EINVAL; - goto rddir2_exit; - } cifsFile = file->private_data; if (cifsFile->srch_inf.endOfSearch) { if (cifsFile->srch_inf.emptyDir) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 57db63ff88da..bce6fdcd5d48 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -46,7 +46,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - pSMB->req.VcNumber = __constant_cpu_to_le16(1); + pSMB->req.VcNumber = cpu_to_le16(1); /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ @@ -1303,6 +1303,11 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ + if (ses->Suid != smb_buf->Uid) { + ses->Suid = smb_buf->Uid; + cifs_dbg(FYI, "UID changed! new UID = %llu\n", ses->Suid); + } + bytes_remaining = get_bcc(smb_buf); bcc_ptr = pByteArea(smb_buf); blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 45992944e238..7198eac5dddd 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -111,7 +111,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, return -EINVAL; max_num = max_buf / sizeof(struct smb2_lock_element); - buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) return -ENOMEM; @@ -247,7 +247,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) } max_num = max_buf / sizeof(struct smb2_lock_element); - buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) { free_xid(xid); return -ENOMEM; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1a08a34838fc..22dfdf17d065 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -32,12 +32,14 @@ static int check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) { + __u64 wire_mid = le64_to_cpu(hdr->MessageId); + /* * Make sure that this really is an SMB, that it is a response, * and that the message ids match. */ if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) && - (mid == hdr->MessageId)) { + (mid == wire_mid)) { if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) return 0; else { @@ -51,11 +53,11 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) cifs_dbg(VFS, "Bad protocol string signature header %x\n", *(unsigned int *) hdr->ProtocolId); - if (mid != hdr->MessageId) + if (mid != wire_mid) cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", - mid, hdr->MessageId); + mid, wire_mid); } - cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", hdr->MessageId); + cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", wire_mid); return 1; } @@ -67,27 +69,27 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) * indexed by command in host byte order */ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { - /* SMB2_NEGOTIATE */ __constant_cpu_to_le16(65), - /* SMB2_SESSION_SETUP */ __constant_cpu_to_le16(9), - /* SMB2_LOGOFF */ __constant_cpu_to_le16(4), - /* SMB2_TREE_CONNECT */ __constant_cpu_to_le16(16), - /* SMB2_TREE_DISCONNECT */ __constant_cpu_to_le16(4), - /* SMB2_CREATE */ __constant_cpu_to_le16(89), - /* SMB2_CLOSE */ __constant_cpu_to_le16(60), - /* SMB2_FLUSH */ __constant_cpu_to_le16(4), - /* SMB2_READ */ __constant_cpu_to_le16(17), - /* SMB2_WRITE */ __constant_cpu_to_le16(17), - /* SMB2_LOCK */ __constant_cpu_to_le16(4), - /* SMB2_IOCTL */ __constant_cpu_to_le16(49), + /* SMB2_NEGOTIATE */ cpu_to_le16(65), + /* SMB2_SESSION_SETUP */ cpu_to_le16(9), + /* SMB2_LOGOFF */ cpu_to_le16(4), + /* SMB2_TREE_CONNECT */ cpu_to_le16(16), + /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4), + /* SMB2_CREATE */ cpu_to_le16(89), + /* SMB2_CLOSE */ cpu_to_le16(60), + /* SMB2_FLUSH */ cpu_to_le16(4), + /* SMB2_READ */ cpu_to_le16(17), + /* SMB2_WRITE */ cpu_to_le16(17), + /* SMB2_LOCK */ cpu_to_le16(4), + /* SMB2_IOCTL */ cpu_to_le16(49), /* BB CHECK this ... not listed in documentation */ - /* SMB2_CANCEL */ __constant_cpu_to_le16(0), - /* SMB2_ECHO */ __constant_cpu_to_le16(4), - /* SMB2_QUERY_DIRECTORY */ __constant_cpu_to_le16(9), - /* SMB2_CHANGE_NOTIFY */ __constant_cpu_to_le16(9), - /* SMB2_QUERY_INFO */ __constant_cpu_to_le16(9), - /* SMB2_SET_INFO */ __constant_cpu_to_le16(2), + /* SMB2_CANCEL */ cpu_to_le16(0), + /* SMB2_ECHO */ cpu_to_le16(4), + /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9), + /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9), + /* SMB2_QUERY_INFO */ cpu_to_le16(9), + /* SMB2_SET_INFO */ cpu_to_le16(2), /* BB FIXME can also be 44 for lease break */ - /* SMB2_OPLOCK_BREAK */ __constant_cpu_to_le16(24) + /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) }; int @@ -95,7 +97,7 @@ smb2_check_message(char *buf, unsigned int length) { struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; - __u64 mid = hdr->MessageId; + __u64 mid = le64_to_cpu(hdr->MessageId); __u32 len = get_rfc1002_length(buf); __u32 clc_len; /* calculated length */ int command; @@ -320,7 +322,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) /* return pointer to beginning of data area, ie offset from SMB start */ if ((*off != 0) && (*len != 0)) - return hdr->ProtocolId + *off; + return (char *)(&hdr->ProtocolId[0]) + *off; else return NULL; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index c5f521bcdee2..eab05e1aa587 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -176,10 +176,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) { struct mid_q_entry *mid; struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + __u64 wire_mid = le64_to_cpu(hdr->MessageId); spin_lock(&GlobalMid_Lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { - if ((mid->mid == hdr->MessageId) && + if ((mid->mid == wire_mid) && (mid->mid_state == MID_REQUEST_SUBMITTED) && (mid->command == hdr->Command)) { spin_unlock(&GlobalMid_Lock); @@ -600,7 +601,7 @@ smb2_clone_range(const unsigned int xid, goto cchunk_out; /* For now array only one chunk long, will make more flexible later */ - pcchunk->ChunkCount = __constant_cpu_to_le32(1); + pcchunk->ChunkCount = cpu_to_le32(1); pcchunk->Reserved = 0; pcchunk->Reserved2 = 0; @@ -683,7 +684,8 @@ smb2_clone_range(const unsigned int xid, /* No need to change MaxChunks since already set to 1 */ chunk_sizes_updated = true; - } + } else + goto cchunk_out; } cchunk_out: @@ -1102,6 +1104,64 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, return rc; } +static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, + loff_t off, loff_t len, bool keep_size) +{ + struct inode *inode; + struct cifsInodeInfo *cifsi; + struct cifsFileInfo *cfile = file->private_data; + long rc = -EOPNOTSUPP; + unsigned int xid; + + xid = get_xid(); + + inode = cfile->dentry->d_inode; + cifsi = CIFS_I(inode); + + /* if file not oplocked can't be sure whether asking to extend size */ + if (!CIFS_CACHE_READ(cifsi)) + if (keep_size == false) + return -EOPNOTSUPP; + + /* + * Files are non-sparse by default so falloc may be a no-op + * Must check if file sparse. If not sparse, and not extending + * then no need to do anything since file already allocated + */ + if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) { + if (keep_size == true) + return 0; + /* check if extending file */ + else if (i_size_read(inode) >= off + len) + /* not extending file and already not sparse */ + return 0; + /* BB: in future add else clause to extend file */ + else + return -EOPNOTSUPP; + } + + if ((keep_size == true) || (i_size_read(inode) >= off + len)) { + /* + * Check if falloc starts within first few pages of file + * and ends within a few pages of the end of file to + * ensure that most of file is being forced to be + * fallocated now. If so then setting whole file sparse + * ie potentially making a few extra pages at the beginning + * or end of the file non-sparse via set_sparse is harmless. + */ + if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) + return -EOPNOTSUPP; + + rc = smb2_set_sparse(xid, tcon, cfile, inode, false); + } + /* BB: else ... in future add code to extend file and set sparse */ + + + free_xid(xid); + return rc; +} + + static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, loff_t off, loff_t len) { @@ -1112,7 +1172,10 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, if (mode & FALLOC_FL_KEEP_SIZE) return smb3_zero_range(file, tcon, off, len, true); return smb3_zero_range(file, tcon, off, len, false); - } + } else if (mode == FALLOC_FL_KEEP_SIZE) + return smb3_simple_falloc(file, tcon, off, len, true); + else if (mode == 0) + return smb3_simple_falloc(file, tcon, off, len, false); return -EOPNOTSUPP; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 8f1672bb82d5..65cd7a84c8bc 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -431,8 +431,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) if (rc) goto neg_exit; if (blob_length) - rc = decode_neg_token_init(security_blob, blob_length, - &server->sec_type); + rc = decode_negTokenInit(security_blob, blob_length, server); if (rc == 1) rc = 0; else if (rc == 0) { @@ -1219,7 +1218,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp; struct TCP_Server_Info *server; - struct cifs_ses *ses = tcon->ses; + struct cifs_ses *ses; struct kvec iov[2]; int resp_buftype; int num_iovecs; @@ -1234,6 +1233,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, if (plen) *plen = 0; + if (tcon) + ses = tcon->ses; + else + return -EIO; + if (ses && (ses->server)) server = ses->server; else @@ -1297,14 +1301,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; if ((rc != 0) && (rc != -EINVAL)) { - if (tcon) - cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); + cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); goto ioctl_exit; } else if (rc == -EINVAL) { if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) && (opcode != FSCTL_SRV_COPYCHUNK)) { - if (tcon) - cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); + cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); goto ioctl_exit; } } @@ -1359,7 +1361,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, char *ret_data = NULL; fsctl_input.CompressionState = - __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, @@ -1630,7 +1632,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0); - if ((rc != 0) && tcon) + if (rc != 0) cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE); free_rsp_buf(resp_buftype, iov[0].iov_base); @@ -2115,7 +2117,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, struct kvec iov[2]; int rc = 0; int len; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; unsigned char *bufptr; struct TCP_Server_Info *server; struct cifs_ses *ses = tcon->ses; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index e3188abdafd0..70867d54fb8b 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -85,7 +85,7 @@ /* BB FIXME - analyze following length BB */ #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ -#define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe) +#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) /* * SMB2 Header Definition @@ -96,7 +96,7 @@ * */ -#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64) +#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64) struct smb2_hdr { __be32 smb2_buf_length; /* big endian on wire */ @@ -110,7 +110,7 @@ struct smb2_hdr { __le16 CreditRequest; /* CreditResponse */ __le32 Flags; __le32 NextCommand; - __u64 MessageId; /* opaque - so can stay little endian */ + __le64 MessageId; __le32 ProcessId; __u32 TreeId; /* opaque - so do not make little endian */ __u64 SessionId; /* opaque - so do not make little endian */ @@ -137,16 +137,16 @@ struct smb2_transform_hdr { } __packed; /* Encryption Algorithms */ -#define SMB2_ENCRYPTION_AES128_CCM __constant_cpu_to_le16(0x0001) +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) /* * SMB2 flag definitions */ -#define SMB2_FLAGS_SERVER_TO_REDIR __constant_cpu_to_le32(0x00000001) -#define SMB2_FLAGS_ASYNC_COMMAND __constant_cpu_to_le32(0x00000002) -#define SMB2_FLAGS_RELATED_OPERATIONS __constant_cpu_to_le32(0x00000004) -#define SMB2_FLAGS_SIGNED __constant_cpu_to_le32(0x00000008) -#define SMB2_FLAGS_DFS_OPERATIONS __constant_cpu_to_le32(0x10000000) +#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) +#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) +#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) +#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) +#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) /* * Definitions for SMB2 Protocol Data Units (network frames) @@ -157,7 +157,7 @@ struct smb2_transform_hdr { * */ -#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9) +#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) struct smb2_err_rsp { struct smb2_hdr hdr; @@ -502,12 +502,12 @@ struct create_context { #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 #define SMB2_LEASE_WRITE_CACHING_HE 0x04 -#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00) -#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01) -#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02) -#define SMB2_LEASE_WRITE_CACHING __constant_cpu_to_le32(0x04) +#define SMB2_LEASE_NONE cpu_to_le32(0x00) +#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01) +#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02) +#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04) -#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x02) +#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02) #define SMB2_LEASE_KEY_SIZE 16 @@ -836,6 +836,25 @@ struct smb2_query_directory_rsp { #define SMB2_O_INFO_SECURITY 0x03 #define SMB2_O_INFO_QUOTA 0x04 +/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */ +#define OWNER_SECINFO 0x00000001 +#define GROUP_SECINFO 0x00000002 +#define DACL_SECINFO 0x00000004 +#define SACL_SECINFO 0x00000008 +#define LABEL_SECINFO 0x00000010 +#define ATTRIBUTE_SECINFO 0x00000020 +#define SCOPE_SECINFO 0x00000040 +#define BACKUP_SECINFO 0x00010000 +#define UNPROTECTED_SACL_SECINFO 0x10000000 +#define UNPROTECTED_DACL_SECINFO 0x20000000 +#define PROTECTED_SACL_SECINFO 0x40000000 +#define PROTECTED_DACL_SECINFO 0x80000000 + +/* Flags used for FileFullEAinfo */ +#define SL_RESTART_SCAN 0x00000001 +#define SL_RETURN_SINGLE_ENTRY 0x00000002 +#define SL_INDEX_SPECIFIED 0x00000004 + struct smb2_query_info_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 41 */ diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 5111e7272db6..d4c5b6f109a7 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -490,7 +490,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer, return temp; else { memset(temp, 0, sizeof(struct mid_q_entry)); - temp->mid = smb_buffer->MessageId; /* always LE */ + temp->mid = le64_to_cpu(smb_buffer->MessageId); temp->pid = current->pid; temp->command = smb_buffer->Command; /* Always LE */ temp->when_alloc = jiffies; diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 6c1566366a66..a4232ec4f2ba 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -221,7 +221,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16, } rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16)); - memset(wpwd, 0, 129 * sizeof(__le16)); + memzero_explicit(wpwd, sizeof(wpwd)); return rc; } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 9d087f4e7d4e..126f46b887cc 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -99,9 +99,9 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) something is wrong, unless it is quite a slow link or server */ if ((now - midEntry->when_alloc) > HZ) { if ((cifsFYI & CIFS_TIMER) && (midEntry->command != command)) { - printk(KERN_DEBUG " CIFS slow rsp: cmd %d mid %llu", + pr_debug(" CIFS slow rsp: cmd %d mid %llu", midEntry->command, midEntry->mid); - printk(" A: 0x%lx S: 0x%lx R: 0x%lx\n", + pr_info(" A: 0x%lx S: 0x%lx R: 0x%lx\n", now - midEntry->when_alloc, now - midEntry->when_sent, now - midEntry->when_received); diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 278f8fdeb9ef..46ee6f238985 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -92,7 +92,7 @@ static void coda_flag_children(struct dentry *parent, int flag) struct dentry *de; spin_lock(&parent->d_lock); - list_for_each_entry(de, &parent->d_subdirs, d_u.d_child) { + list_for_each_entry(de, &parent->d_subdirs, d_child) { /* don't know what to do with negative dentries */ if (de->d_inode ) coda_flag_inode(de->d_inode, flag); diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 1326d38960db..f1714cfb589c 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -40,12 +40,6 @@ int coda_iscontrol(const char *name, size_t length) (strncmp(name, CODA_CONTROL, CODA_CONTROLLEN) == 0)); } -/* recognize /coda inode */ -int coda_isroot(struct inode *i) -{ - return ( i->i_sb->s_root->d_inode == i ); -} - unsigned short coda_flags_to_cflags(unsigned short flags) { unsigned short coda_flags = 0; diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index d42b725b1d21..d6f7a76a1f5b 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -52,7 +52,6 @@ int coda_setattr(struct dentry *, struct iattr *); /* this file: heloers */ char *coda_f2s(struct CodaFid *f); -int coda_isroot(struct inode *i); int coda_iscontrol(const char *name, size_t length); void coda_vattr_to_iattr(struct inode *, struct coda_vattr *); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 9c3dedc000d1..60cb88c1dd2b 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -28,29 +28,6 @@ #include "coda_int.h" -/* dir inode-ops */ -static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, bool excl); -static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, unsigned int flags); -static int coda_link(struct dentry *old_dentry, struct inode *dir_inode, - struct dentry *entry); -static int coda_unlink(struct inode *dir_inode, struct dentry *entry); -static int coda_symlink(struct inode *dir_inode, struct dentry *entry, - const char *symname); -static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode); -static int coda_rmdir(struct inode *dir_inode, struct dentry *entry); -static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, - struct inode *new_inode, struct dentry *new_dentry); - -/* dir file-ops */ -static int coda_readdir(struct file *file, struct dir_context *ctx); - -/* dentry ops */ -static int coda_dentry_revalidate(struct dentry *de, unsigned int flags); -static int coda_dentry_delete(const struct dentry *); - -/* support routines */ -static int coda_venus_readdir(struct file *, struct dir_context *); - /* same as fs/bad_inode.c */ static int coda_return_EIO(void) { @@ -58,38 +35,6 @@ static int coda_return_EIO(void) } #define CODA_EIO_ERROR ((void *) (coda_return_EIO)) -const struct dentry_operations coda_dentry_operations = -{ - .d_revalidate = coda_dentry_revalidate, - .d_delete = coda_dentry_delete, -}; - -const struct inode_operations coda_dir_inode_operations = -{ - .create = coda_create, - .lookup = coda_lookup, - .link = coda_link, - .unlink = coda_unlink, - .symlink = coda_symlink, - .mkdir = coda_mkdir, - .rmdir = coda_rmdir, - .mknod = CODA_EIO_ERROR, - .rename = coda_rename, - .permission = coda_permission, - .getattr = coda_getattr, - .setattr = coda_setattr, -}; - -const struct file_operations coda_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .iterate = coda_readdir, - .open = coda_open, - .release = coda_release, - .fsync = coda_fsync, -}; - - /* inode operations for directories */ /* access routines: lookup, readlink, permission */ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) @@ -107,7 +52,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig } /* control object, create inode on the fly */ - if (coda_isroot(dir) && coda_iscontrol(name, length)) { + if (is_root_inode(dir) && coda_iscontrol(name, length)) { inode = coda_cnode_makectl(sb); type = CODA_NOCACHE; } else { @@ -195,7 +140,7 @@ static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, bool struct CodaFid newfid; struct coda_vattr attrs; - if (coda_isroot(dir) && coda_iscontrol(name, length)) + if (is_root_inode(dir) && coda_iscontrol(name, length)) return -EPERM; error = venus_create(dir->i_sb, coda_i2f(dir), name, length, @@ -227,7 +172,7 @@ static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode) int error; struct CodaFid newfid; - if (coda_isroot(dir) && coda_iscontrol(name, len)) + if (is_root_inode(dir) && coda_iscontrol(name, len)) return -EPERM; attrs.va_mode = mode; @@ -261,7 +206,7 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode, int len = de->d_name.len; int error; - if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) + if (is_root_inode(dir_inode) && coda_iscontrol(name, len)) return -EPERM; error = venus_link(dir_inode->i_sb, coda_i2f(inode), @@ -287,7 +232,7 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de, int symlen; int error; - if (coda_isroot(dir_inode) && coda_iscontrol(name, len)) + if (is_root_inode(dir_inode) && coda_iscontrol(name, len)) return -EPERM; symlen = strlen(symname); @@ -359,7 +304,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, (const char *) old_name, (const char *)new_name); if (!error) { if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) { + if (d_is_dir(new_dentry)) { coda_dir_drop_nlink(old_dir); coda_dir_inc_nlink(new_dir); } @@ -374,33 +319,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, return error; } - -/* file operations for directories */ -static int coda_readdir(struct file *coda_file, struct dir_context *ctx) -{ - struct coda_file_info *cfi; - struct file *host_file; - int ret; - - cfi = CODA_FTOC(coda_file); - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - host_file = cfi->cfi_container; - - if (host_file->f_op->iterate) { - struct inode *host_inode = file_inode(host_file); - mutex_lock(&host_inode->i_mutex); - ret = -ENOENT; - if (!IS_DEADDIR(host_inode)) { - ret = host_file->f_op->iterate(host_file, ctx); - file_accessed(host_file); - } - mutex_unlock(&host_inode->i_mutex); - return ret; - } - /* Venus: we must read Venus dirents from a file */ - return coda_venus_readdir(coda_file, ctx); -} - static inline unsigned int CDT2DT(unsigned char cdt) { unsigned int dt; @@ -426,7 +344,6 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) struct coda_file_info *cfi; struct coda_inode_info *cii; struct file *host_file; - struct dentry *de; struct venus_dirent *vdir; unsigned long vdir_size = offsetof(struct venus_dirent, d_name); unsigned int type; @@ -438,8 +355,7 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - de = coda_file->f_path.dentry; - cii = ITOC(de->d_inode); + cii = ITOC(file_inode(coda_file)); vdir = kmalloc(sizeof(*vdir), GFP_KERNEL); if (!vdir) return -ENOMEM; @@ -497,6 +413,33 @@ out: return 0; } +/* file operations for directories */ +static int coda_readdir(struct file *coda_file, struct dir_context *ctx) +{ + struct coda_file_info *cfi; + struct file *host_file; + int ret; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (host_file->f_op->iterate) { + struct inode *host_inode = file_inode(host_file); + + mutex_lock(&host_inode->i_mutex); + ret = -ENOENT; + if (!IS_DEADDIR(host_inode)) { + ret = host_file->f_op->iterate(host_file, ctx); + file_accessed(host_file); + } + mutex_unlock(&host_inode->i_mutex); + return ret; + } + /* Venus: we must read Venus dirents from a file */ + return coda_venus_readdir(coda_file, ctx); +} + /* called when a cache lookup succeeds */ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) { @@ -507,7 +450,7 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) return -ECHILD; inode = de->d_inode; - if (!inode || coda_isroot(inode)) + if (!inode || is_root_inode(inode)) goto out; if (is_bad_inode(inode)) goto bad; @@ -605,3 +548,32 @@ int coda_revalidate_inode(struct inode *inode) } return 0; } + +const struct dentry_operations coda_dentry_operations = { + .d_revalidate = coda_dentry_revalidate, + .d_delete = coda_dentry_delete, +}; + +const struct inode_operations coda_dir_inode_operations = { + .create = coda_create, + .lookup = coda_lookup, + .link = coda_link, + .unlink = coda_unlink, + .symlink = coda_symlink, + .mkdir = coda_mkdir, + .rmdir = coda_rmdir, + .mknod = CODA_EIO_ERROR, + .rename = coda_rename, + .permission = coda_permission, + .getattr = coda_getattr, + .setattr = coda_setattr, +}; + +const struct file_operations coda_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = coda_readdir, + .open = coda_open, + .release = coda_release, + .fsync = coda_fsync, +}; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index b945410bfcd5..82ec68b59208 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) goto unlock_out; } - error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); + error = bdi_setup_and_register(&vc->bdi, "coda"); if (error) goto unlock_out; diff --git a/fs/compat.c b/fs/compat.c index b13df99f3534..6fd272d455e4 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -847,10 +847,12 @@ struct compat_readdir_callback { int result; }; -static int compat_fillonedir(void *__buf, const char *name, int namlen, - loff_t offset, u64 ino, unsigned int d_type) +static int compat_fillonedir(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) { - struct compat_readdir_callback *buf = __buf; + struct compat_readdir_callback *buf = + container_of(ctx, struct compat_readdir_callback, ctx); struct compat_old_linux_dirent __user *dirent; compat_ulong_t d_ino; @@ -915,11 +917,12 @@ struct compat_getdents_callback { int error; }; -static int compat_filldir(void *__buf, const char *name, int namlen, +static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct compat_linux_dirent __user * dirent; - struct compat_getdents_callback *buf = __buf; + struct compat_getdents_callback *buf = + container_of(ctx, struct compat_getdents_callback, ctx); compat_ulong_t d_ino; int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + namlen + 2, sizeof(compat_long_t)); @@ -1001,11 +1004,13 @@ struct compat_getdents_callback64 { int error; }; -static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t offset, - u64 ino, unsigned int d_type) +static int compat_filldir64(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) { struct linux_dirent64 __user *dirent; - struct compat_getdents_callback64 *buf = __buf; + struct compat_getdents_callback64 *buf = + container_of(ctx, struct compat_getdents_callback64, ctx); int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); u64 off; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index bd4a3c167091..b65d1ef532d5 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -69,16 +69,13 @@ extern struct kmem_cache *configfs_dir_cachep; extern int configfs_is_root(struct config_item *item); extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); -extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); -extern int configfs_inode_init(void); -extern void configfs_inode_exit(void); +extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *)); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, void *, umode_t, int); extern int configfs_dirent_is_ready(struct configfs_dirent *); -extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int); extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 668dcabc5695..cf0db005d2f5 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -240,60 +240,26 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd, return 0; } -static int init_dir(struct inode * inode) +static void init_dir(struct inode * inode) { inode->i_op = &configfs_dir_inode_operations; inode->i_fop = &configfs_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); - return 0; } -static int configfs_init_file(struct inode * inode) +static void configfs_init_file(struct inode * inode) { inode->i_size = PAGE_SIZE; inode->i_fop = &configfs_file_operations; - return 0; } -static int init_symlink(struct inode * inode) +static void init_symlink(struct inode * inode) { inode->i_op = &configfs_symlink_inode_operations; - return 0; -} - -static int create_dir(struct config_item *k, struct dentry *d) -{ - int error; - umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; - struct dentry *p = d->d_parent; - - BUG_ON(!k); - - error = configfs_dirent_exists(p->d_fsdata, d->d_name.name); - if (!error) - error = configfs_make_dirent(p->d_fsdata, d, k, mode, - CONFIGFS_DIR | CONFIGFS_USET_CREATING); - if (!error) { - configfs_set_dir_dirent_depth(p->d_fsdata, d->d_fsdata); - error = configfs_create(d, mode, init_dir); - if (!error) { - inc_nlink(p->d_inode); - } else { - struct configfs_dirent *sd = d->d_fsdata; - if (sd) { - spin_lock(&configfs_dirent_lock); - list_del_init(&sd->s_sibling); - spin_unlock(&configfs_dirent_lock); - configfs_put(sd); - } - } - } - return error; } - /** * configfs_create_dir - create a directory for an config_item. * @item: config_itemwe're creating directory for. @@ -303,11 +269,37 @@ static int create_dir(struct config_item *k, struct dentry *d) * until it is validated by configfs_dir_set_ready() */ -static int configfs_create_dir(struct config_item * item, struct dentry *dentry) +static int configfs_create_dir(struct config_item *item, struct dentry *dentry) { - int error = create_dir(item, dentry); - if (!error) + int error; + umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + struct dentry *p = dentry->d_parent; + + BUG_ON(!item); + + error = configfs_dirent_exists(p->d_fsdata, dentry->d_name.name); + if (unlikely(error)) + return error; + + error = configfs_make_dirent(p->d_fsdata, dentry, item, mode, + CONFIGFS_DIR | CONFIGFS_USET_CREATING); + if (unlikely(error)) + return error; + + configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata); + error = configfs_create(dentry, mode, init_dir); + if (!error) { + inc_nlink(p->d_inode); item->ci_dentry = dentry; + } else { + struct configfs_dirent *sd = dentry->d_fsdata; + if (sd) { + spin_lock(&configfs_dirent_lock); + list_del_init(&sd->s_sibling); + spin_unlock(&configfs_dirent_lock); + configfs_put(sd); + } + } return error; } @@ -386,7 +378,7 @@ static void remove_dir(struct dentry * d) if (d->d_inode) simple_rmdir(parent->d_inode,d); - pr_debug(" o %s removing done (%d)\n",d->d_name.name, d_count(d)); + pr_debug(" o %pd removing done (%d)\n", d, d_count(d)); dput(parent); } diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 1d1c41f1014d..56d2cdc9ae0a 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -313,21 +313,6 @@ const struct file_operations configfs_file_operations = { .release = configfs_release, }; - -int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type) -{ - struct configfs_dirent * parent_sd = dir->d_fsdata; - umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; - int error = 0; - - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); - error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); - mutex_unlock(&dir->d_inode->i_mutex); - - return error; -} - - /** * configfs_create_file - create an attribute file for an item. * @item: item we're creating for. @@ -336,9 +321,16 @@ int configfs_add_file(struct dentry * dir, const struct configfs_attribute * att int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr) { - BUG_ON(!item || !item->ci_dentry || !attr); + struct dentry *dir = item->ci_dentry; + struct configfs_dirent *parent_sd = dir->d_fsdata; + umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; + int error = 0; - return configfs_add_file(item->ci_dentry, attr, - CONFIGFS_ITEM_ATTR); + mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, + CONFIGFS_ITEM_ATTR); + mutex_unlock(&dir->d_inode->i_mutex); + + return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 5946ad98053f..5423a6a6ecc8 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = { .write_end = simple_write_end, }; -static struct backing_dev_info configfs_backing_dev_info = { - .name = "configfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; @@ -137,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, if (inode) { inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &configfs_aops; - inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { @@ -183,7 +176,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, #endif /* CONFIG_LOCKDEP */ -int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct inode *)) +int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct inode *)) { int error = 0; struct inode *inode = NULL; @@ -205,13 +198,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, int (*init)(struct ino p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; configfs_set_inode_lock_class(sd, inode); - if (init) { - error = init(inode); - if (error) { - iput(inode); - return error; - } - } + init(inode); d_instantiate(dentry, inode); if (S_ISDIR(mode) || S_ISLNK(mode)) dget(dentry); /* pin link and directory dentries in core */ @@ -249,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dentry->d_lock); - if (!(d_unhashed(dentry) && dentry->d_inode)) { + if (!d_unhashed(dentry) && dentry->d_inode) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); @@ -283,13 +270,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) } mutex_unlock(&dir->d_inode->i_mutex); } - -int __init configfs_inode_init(void) -{ - return bdi_init(&configfs_backing_dev_info); -} - -void configfs_inode_exit(void) -{ - bdi_destroy(&configfs_backing_dev_info); -} diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index f6c285833390..da94e41bdbf6 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -145,19 +145,13 @@ static int __init configfs_init(void) if (!config_kobj) goto out2; - err = configfs_inode_init(); - if (err) - goto out3; - err = register_filesystem(&configfs_fs_type); if (err) - goto out4; + goto out3; return 0; -out4: - pr_err("Unable to register filesystem!\n"); - configfs_inode_exit(); out3: + pr_err("Unable to register filesystem!\n"); kobject_put(config_kobj); out2: kmem_cache_destroy(configfs_dir_cachep); @@ -172,7 +166,6 @@ static void __exit configfs_exit(void) kobject_put(config_kobj); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; - configfs_inode_exit(); } MODULE_AUTHOR("Oracle"); diff --git a/fs/dax.c b/fs/dax.c new file mode 100644 index 000000000000..ed1619ec6537 --- /dev/null +++ b/fs/dax.c @@ -0,0 +1,534 @@ +/* + * fs/dax.c - Direct Access filesystem code + * Copyright (c) 2013-2014 Intel Corporation + * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> + * Author: Ross Zwisler <ross.zwisler@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/atomic.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/highmem.h> +#include <linux/memcontrol.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/uio.h> +#include <linux/vmstat.h> + +int dax_clear_blocks(struct inode *inode, sector_t block, long size) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + sector_t sector = block << (inode->i_blkbits - 9); + + might_sleep(); + do { + void *addr; + unsigned long pfn; + long count; + + count = bdev_direct_access(bdev, sector, &addr, &pfn, size); + if (count < 0) + return count; + BUG_ON(size < count); + while (count > 0) { + unsigned pgsz = PAGE_SIZE - offset_in_page(addr); + if (pgsz > count) + pgsz = count; + if (pgsz < PAGE_SIZE) + memset(addr, 0, pgsz); + else + clear_page(addr); + addr += pgsz; + size -= pgsz; + count -= pgsz; + BUG_ON(pgsz & 511); + sector += pgsz / 512; + cond_resched(); + } + } while (size); + + return 0; +} +EXPORT_SYMBOL_GPL(dax_clear_blocks); + +static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +{ + unsigned long pfn; + sector_t sector = bh->b_blocknr << (blkbits - 9); + return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); +} + +static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, + loff_t end) +{ + loff_t final = end - pos + first; /* The final byte of the buffer */ + + if (first > 0) + memset(addr, 0, first); + if (final < size) + memset(addr + final, 0, size - final); +} + +static bool buffer_written(struct buffer_head *bh) +{ + return buffer_mapped(bh) && !buffer_unwritten(bh); +} + +/* + * When ext4 encounters a hole, it returns without modifying the buffer_head + * which means that we can't trust b_size. To cope with this, we set b_state + * to 0 before calling get_block and, if any bit is set, we know we can trust + * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is + * and would save us time calling get_block repeatedly. + */ +static bool buffer_size_valid(struct buffer_head *bh) +{ + return bh->b_state != 0; +} + +static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, + loff_t start, loff_t end, get_block_t get_block, + struct buffer_head *bh) +{ + ssize_t retval = 0; + loff_t pos = start; + loff_t max = start; + loff_t bh_max = start; + void *addr; + bool hole = false; + + if (rw != WRITE) + end = min(end, i_size_read(inode)); + + while (pos < end) { + unsigned len; + if (pos == max) { + unsigned blkbits = inode->i_blkbits; + sector_t block = pos >> blkbits; + unsigned first = pos - (block << blkbits); + long size; + + if (pos == bh_max) { + bh->b_size = PAGE_ALIGN(end - pos); + bh->b_state = 0; + retval = get_block(inode, block, bh, + rw == WRITE); + if (retval) + break; + if (!buffer_size_valid(bh)) + bh->b_size = 1 << blkbits; + bh_max = pos - first + bh->b_size; + } else { + unsigned done = bh->b_size - + (bh_max - (pos - first)); + bh->b_blocknr += done >> blkbits; + bh->b_size -= done; + } + + hole = (rw != WRITE) && !buffer_written(bh); + if (hole) { + addr = NULL; + size = bh->b_size - first; + } else { + retval = dax_get_addr(bh, &addr, blkbits); + if (retval < 0) + break; + if (buffer_unwritten(bh) || buffer_new(bh)) + dax_new_buf(addr, retval, first, pos, + end); + addr += first; + size = retval - first; + } + max = min(pos + size, end); + } + + if (rw == WRITE) + len = copy_from_iter(addr, max - pos, iter); + else if (!hole) + len = copy_to_iter(addr, max - pos, iter); + else + len = iov_iter_zero(max - pos, iter); + + if (!len) + break; + + pos += len; + addr += len; + } + + return (pos == start) ? retval : pos - start; +} + +/** + * dax_do_io - Perform I/O to a DAX file + * @rw: READ to read or WRITE to write + * @iocb: The control block for this I/O + * @inode: The file which the I/O is directed at + * @iter: The addresses to do I/O from or to + * @pos: The file offset where the I/O starts + * @get_block: The filesystem method used to translate file offsets to blocks + * @end_io: A filesystem callback for I/O completion + * @flags: See below + * + * This function uses the same locking scheme as do_blockdev_direct_IO: + * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the + * caller for writes. For reads, we take and release the i_mutex ourselves. + * If DIO_LOCKING is not set, the filesystem takes care of its own locking. + * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O + * is in progress. + */ +ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, + struct iov_iter *iter, loff_t pos, + get_block_t get_block, dio_iodone_t end_io, int flags) +{ + struct buffer_head bh; + ssize_t retval = -EINVAL; + loff_t end = pos + iov_iter_count(iter); + + memset(&bh, 0, sizeof(bh)); + + if ((flags & DIO_LOCKING) && (rw == READ)) { + struct address_space *mapping = inode->i_mapping; + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, pos, end - 1); + if (retval) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + + /* Protects against truncate */ + atomic_inc(&inode->i_dio_count); + + retval = dax_io(rw, inode, iter, pos, end, get_block, &bh); + + if ((flags & DIO_LOCKING) && (rw == READ)) + mutex_unlock(&inode->i_mutex); + + if ((retval > 0) && end_io) + end_io(iocb, pos, retval, bh.b_private); + + inode_dio_done(inode); + out: + return retval; +} +EXPORT_SYMBOL_GPL(dax_do_io); + +/* + * The user has performed a load from a hole in the file. Allocating + * a new page in the file would cause excessive storage usage for + * workloads with sparse files. We allocate a page cache page instead. + * We'll kick it out of the page cache if it's ever written to, + * otherwise it will simply fall out of the page cache under memory + * pressure without ever having been dirtied. + */ +static int dax_load_hole(struct address_space *mapping, struct page *page, + struct vm_fault *vmf) +{ + unsigned long size; + struct inode *inode = mapping->host; + if (!page) + page = find_or_create_page(mapping, vmf->pgoff, + GFP_KERNEL | __GFP_ZERO); + if (!page) + return VM_FAULT_OOM; + /* Recheck i_size under page lock to avoid truncate race */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) { + unlock_page(page); + page_cache_release(page); + return VM_FAULT_SIGBUS; + } + + vmf->page = page; + return VM_FAULT_LOCKED; +} + +static int copy_user_bh(struct page *to, struct buffer_head *bh, + unsigned blkbits, unsigned long vaddr) +{ + void *vfrom, *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) + return -EIO; + vto = kmap_atomic(to); + copy_user_page(vto, vfrom, vaddr, to); + kunmap_atomic(vto); + return 0; +} + +static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct address_space *mapping = inode->i_mapping; + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + unsigned long vaddr = (unsigned long)vmf->virtual_address; + void *addr; + unsigned long pfn; + pgoff_t size; + int error; + + i_mmap_lock_read(mapping); + + /* + * Check truncate didn't happen while we were allocating a block. + * If it did, this block may or may not be still allocated to the + * file. We can't tell the filesystem to free it because we can't + * take i_mutex here. In the worst case, the file still has blocks + * allocated past the end of the file. + */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + error = -EIO; + goto out; + } + + error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); + if (error < 0) + goto out; + if (error < PAGE_SIZE) { + error = -EIO; + goto out; + } + + if (buffer_unwritten(bh) || buffer_new(bh)) + clear_page(addr); + + error = vm_insert_mixed(vma, vaddr, pfn); + + out: + i_mmap_unlock_read(mapping); + + if (bh->b_end_io) + bh->b_end_io(bh, 1); + + return error; +} + +static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head bh; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned blkbits = inode->i_blkbits; + sector_t block; + pgoff_t size; + int error; + int major = 0; + + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_size = PAGE_SIZE; + + repeat: + page = find_get_page(mapping, vmf->pgoff); + if (page) { + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return VM_FAULT_RETRY; + } + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + /* + * We have a struct page covering a hole in the file + * from a read fault and we've raced with a truncate + */ + error = -EIO; + goto unlock_page; + } + } + + error = get_block(inode, block, &bh, 0); + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; /* fs corruption? */ + if (error) + goto unlock_page; + + if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_page; + } else { + return dax_load_hole(mapping, page, vmf); + } + } + + if (vmf->cow_page) { + struct page *new_page = vmf->cow_page; + if (buffer_written(&bh)) + error = copy_user_bh(new_page, &bh, blkbits, vaddr); + else + clear_user_highpage(new_page, vaddr); + if (error) + goto unlock_page; + vmf->page = page; + if (!page) { + i_mmap_lock_read(mapping); + /* Check we didn't race with truncate */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + if (vmf->pgoff >= size) { + i_mmap_unlock_read(mapping); + error = -EIO; + goto out; + } + } + return VM_FAULT_LOCKED; + } + + /* Check we didn't race with a read fault installing a new page */ + if (!page && major) + page = find_lock_page(mapping, vmf->pgoff); + + if (page) { + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_CACHE_SIZE, 0); + delete_from_page_cache(page); + unlock_page(page); + page_cache_release(page); + } + + error = dax_insert_mapping(inode, &bh, vma, vmf); + + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if ((error < 0) && (error != -EBUSY)) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; + + unlock_page: + if (page) { + unlock_page(page); + page_cache_release(page); + } + goto out; +} + +/** + * dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. + */ +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (vmf->flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = do_dax_fault(vma, vmf, get_block); + if (vmf->flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_fault); + +/** + * dax_zero_page_range - zero a range within a page of a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @length: The number of bytes to zero + * @get_block: The filesystem method used to translate file offsets to blocks + * + * This function can be called by a filesystem when it is zeroing part of a + * page in a DAX file. This is intended for hole-punch operations. If + * you are truncating a file, the helper function dax_truncate_page() may be + * more convenient. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, + get_block_t get_block) +{ + struct buffer_head bh; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + int err; + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + BUG_ON((offset + length) > PAGE_CACHE_SIZE); + + memset(&bh, 0, sizeof(bh)); + bh.b_size = PAGE_CACHE_SIZE; + err = get_block(inode, index, &bh, 0); + if (err < 0) + return err; + if (buffer_written(&bh)) { + void *addr; + err = dax_get_addr(&bh, &addr, inode->i_blkbits); + if (err < 0) + return err; + memset(addr + offset, 0, length); + } + + return 0; +} +EXPORT_SYMBOL_GPL(dax_zero_page_range); + +/** + * dax_truncate_page - handle a partial page being truncated in a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @get_block: The filesystem method used to translate file offsets to blocks + * + * Similar to block_truncate_page(), this function can be called by a + * filesystem when it is truncating a DAX file to handle the partial page. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +{ + unsigned length = PAGE_CACHE_ALIGN(from) - from; + return dax_zero_page_range(inode, from, length, get_block); +} +EXPORT_SYMBOL_GPL(dax_truncate_page); diff --git a/fs/dcache.c b/fs/dcache.c index 3ffef7f4e5cd..c71e3732e53b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -38,13 +38,15 @@ #include <linux/prefetch.h> #include <linux/ratelimit.h> #include <linux/list_lru.h> +#include <linux/kasan.h> + #include "internal.h" #include "mount.h" /* * Usage: * dcache->d_inode->i_lock protects: - * - i_dentry, d_alias, d_inode of aliases + * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table * s_anon bl list spinlock protects: @@ -59,7 +61,7 @@ * - d_unhashed() * - d_parent and d_subdirs * - childrens' d_child and d_parent - * - d_alias, d_inode + * - d_u.d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock @@ -252,14 +254,12 @@ static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - WARN_ON(!hlist_unhashed(&dentry->d_alias)); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); - WARN_ON(!hlist_unhashed(&dentry->d_alias)); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } @@ -271,6 +271,7 @@ static inline int dname_external(const struct dentry *dentry) static void dentry_free(struct dentry *dentry) { + WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->u.count))) { @@ -311,7 +312,7 @@ static void dentry_iput(struct dentry * dentry) struct inode *inode = dentry->d_inode; if (inode) { dentry->d_inode = NULL; - hlist_del_init(&dentry->d_alias); + hlist_del_init(&dentry->d_u.d_alias); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -336,7 +337,7 @@ static void dentry_unlink_inode(struct dentry * dentry) struct inode *inode = dentry->d_inode; __d_clear_type(dentry); dentry->d_inode = NULL; - hlist_del_init(&dentry->d_alias); + hlist_del_init(&dentry->d_u.d_alias); dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); @@ -401,19 +402,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list) * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ -static void d_lru_isolate(struct dentry *dentry) +static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); - list_del_init(&dentry->d_lru); + list_lru_isolate(lru, &dentry->d_lru); } -static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list) +static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, + struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; - list_move_tail(&dentry->d_lru, list); + list_lru_isolate_move(lru, &dentry->d_lru, list); } /* @@ -496,7 +498,7 @@ static void __dentry_kill(struct dentry *dentry) } /* if it was on the hash then remove it */ __d_drop(dentry); - list_del(&dentry->d_u.d_child); + __list_del_entry(&dentry->d_child); /* * Inform d_walk() that we are no longer attached to the * dentry tree @@ -509,7 +511,7 @@ static void __dentry_kill(struct dentry *dentry) * dentry_iput drops the locks, at which point nobody (except * transient RCU lookups) can reach this dentry. */ - BUG_ON((int)dentry->d_lockref.count > 0); + BUG_ON(dentry->d_lockref.count > 0); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -562,7 +564,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry) struct dentry *parent = dentry->d_parent; if (IS_ROOT(dentry)) return NULL; - if (unlikely((int)dentry->d_lockref.count < 0)) + if (unlikely(dentry->d_lockref.count < 0)) return NULL; if (likely(spin_trylock(&parent->d_lock))) return parent; @@ -591,6 +593,110 @@ again: return parent; } +/* + * Try to do a lockless dput(), and return whether that was successful. + * + * If unsuccessful, we return false, having already taken the dentry lock. + * + * The caller needs to hold the RCU read lock, so that the dentry is + * guaranteed to stay around even if the refcount goes down to zero! + */ +static inline bool fast_dput(struct dentry *dentry) +{ + int ret; + unsigned int d_flags; + + /* + * If we have a d_op->d_delete() operation, we sould not + * let the dentry count go to zero, so use "put__or_lock". + */ + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) + return lockref_put_or_lock(&dentry->d_lockref); + + /* + * .. otherwise, we can try to just decrement the + * lockref optimistically. + */ + ret = lockref_put_return(&dentry->d_lockref); + + /* + * If the lockref_put_return() failed due to the lock being held + * by somebody else, the fast path has failed. We will need to + * get the lock, and then check the count again. + */ + if (unlikely(ret < 0)) { + spin_lock(&dentry->d_lock); + if (dentry->d_lockref.count > 1) { + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + return 1; + } + return 0; + } + + /* + * If we weren't the last ref, we're done. + */ + if (ret) + return 1; + + /* + * Careful, careful. The reference count went down + * to zero, but we don't hold the dentry lock, so + * somebody else could get it again, and do another + * dput(), and we need to not race with that. + * + * However, there is a very special and common case + * where we don't care, because there is nothing to + * do: the dentry is still hashed, it does not have + * a 'delete' op, and it's referenced and already on + * the LRU list. + * + * NOTE! Since we aren't locked, these values are + * not "stable". However, it is sufficient that at + * some point after we dropped the reference the + * dentry was hashed and the flags had the proper + * value. Other dentry users may have re-gotten + * a reference to the dentry and change that, but + * our work is done - we can leave the dentry + * around with a zero refcount. + */ + smp_rmb(); + d_flags = ACCESS_ONCE(dentry->d_flags); + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST; + + /* Nothing to do? Dropping the reference was all we needed? */ + if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) + return 1; + + /* + * Not the fast normal case? Get the lock. We've already decremented + * the refcount, but we'll need to re-check the situation after + * getting the lock. + */ + spin_lock(&dentry->d_lock); + + /* + * Did somebody else grab a reference to it in the meantime, and + * we're no longer the last user after all? Alternatively, somebody + * else could have killed it and marked it dead. Either way, we + * don't need to do anything else. + */ + if (dentry->d_lockref.count) { + spin_unlock(&dentry->d_lock); + return 1; + } + + /* + * Re-get the reference we optimistically dropped. We hold the + * lock, and we just tested that it was zero, so we can just + * set it to 1. + */ + dentry->d_lockref.count = 1; + return 0; +} + + /* * This is dput * @@ -623,8 +729,14 @@ void dput(struct dentry *dentry) return; repeat: - if (lockref_put_or_lock(&dentry->d_lockref)) + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); return; + } + + /* Slow case: now with the dentry lock held */ + rcu_read_unlock(); /* Unreachable? Get rid of it */ if (unlikely(d_unhashed(dentry))) @@ -722,7 +834,7 @@ static struct dentry *__d_find_alias(struct inode *inode) again: discon_alias = NULL; - hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { if (IS_ROOT(alias) && @@ -772,12 +884,13 @@ void d_prune_aliases(struct inode *inode) struct dentry *dentry; restart: spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) { struct dentry *parent = lock_parent(dentry); if (likely(!dentry->d_lockref.count)) { __dentry_kill(dentry); + dput(parent); goto restart; } if (parent) @@ -810,7 +923,7 @@ static void shrink_dentry_list(struct list_head *list) * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. */ - if ((int)dentry->d_lockref.count > 0) { + if (dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); if (parent) spin_unlock(&parent->d_lock); @@ -869,8 +982,8 @@ static void shrink_dentry_list(struct list_head *list) } } -static enum lru_status -dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +static enum lru_status dentry_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -890,7 +1003,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * another pass through the LRU. */ if (dentry->d_lockref.count) { - d_lru_isolate(dentry); + d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } @@ -921,7 +1034,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) return LRU_ROTATE; } - d_lru_shrink_move(dentry, freeable); + d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; @@ -930,30 +1043,28 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) /** * prune_dcache_sb - shrink the dcache * @sb: superblock - * @nr_to_scan : number of entries to try to free - * @nid: which node to scan for freeable entities + * @sc: shrink control, passed to list_lru_shrink_walk() * - * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is - * done when we need more memory an called from the superblock shrinker + * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This + * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ -long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid) +long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; - freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, - &dispose, &nr_to_scan); + freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, + dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, - spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -966,7 +1077,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; - d_lru_shrink_move(dentry, freeable); + d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; @@ -1050,7 +1161,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); @@ -1082,33 +1193,31 @@ resume: /* * All done at this level ... ascend and resume the search. */ + rcu_read_lock(); +ascend: if (this_parent != parent) { struct dentry *child = this_parent; this_parent = child->d_parent; - rcu_read_lock(); spin_unlock(&child->d_lock); spin_lock(&this_parent->d_lock); - /* - * might go back up the wrong parent if we have had a rename - * or deletion - */ - if (this_parent != child->d_parent || - (child->d_flags & DCACHE_DENTRY_KILLED) || - need_seqretry(&rename_lock, seq)) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + /* might go back up the wrong parent if we have had a rename. */ + if (need_seqretry(&rename_lock, seq)) goto rename_retry; + next = child->d_child.next; + while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)) { + if (next == &this_parent->d_subdirs) + goto ascend; + child = list_entry(next, struct dentry, d_child); + next = next->next; } rcu_read_unlock(); - next = child->d_u.d_child.next; goto resume; } - if (need_seqretry(&rename_lock, seq)) { - spin_unlock(&this_parent->d_lock); + if (need_seqretry(&rename_lock, seq)) goto rename_retry; - } + rcu_read_unlock(); if (finish) finish(data); @@ -1118,6 +1227,9 @@ out_unlock: return; rename_retry: + spin_unlock(&this_parent->d_lock); + rcu_read_unlock(); + BUG_ON(seq & 1); if (!retry) return; seq = 1; @@ -1429,6 +1541,9 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) } atomic_set(&p->u.count, 1); dname = p->name; + if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS)) + kasan_unpoison_shadow(dname, + round_up(name->len + 1, sizeof(unsigned long))); } else { dname = dentry->d_iname; } @@ -1454,8 +1569,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); - INIT_HLIST_NODE(&dentry->d_alias); - INIT_LIST_HEAD(&dentry->d_u.d_child); + INIT_HLIST_NODE(&dentry->d_u.d_alias); + INIT_LIST_HEAD(&dentry->d_child); d_set_d_op(dentry, dentry->d_sb->s_d_op); this_cpu_inc(nr_dentry); @@ -1485,7 +1600,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) */ __dget_dlock(parent); dentry->d_parent = parent; - list_add(&dentry->d_u.d_child, &parent->d_subdirs); + list_add(&dentry->d_child, &parent->d_subdirs); spin_unlock(&parent->d_lock); return dentry; @@ -1544,9 +1659,25 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) } EXPORT_SYMBOL(d_set_d_op); + +/* + * d_set_fallthru - Mark a dentry as falling through to a lower layer + * @dentry - The dentry to mark + * + * Mark a dentry as falling through to the lower layer (as set with + * d_pin_lower()). This flag may be recorded on the medium. + */ +void d_set_fallthru(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_FALLTHRU; + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_set_fallthru); + static unsigned d_flags_for_inode(struct inode *inode) { - unsigned add_flags = DCACHE_FILE_TYPE; + unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; @@ -1559,13 +1690,21 @@ static unsigned d_flags_for_inode(struct inode *inode) else inode->i_opflags |= IOP_LOOKUP; } - } else if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (unlikely(inode->i_op->follow_link)) + goto type_determined; + } + + if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { + if (unlikely(inode->i_op->follow_link)) { add_flags = DCACHE_SYMLINK_TYPE; - else - inode->i_opflags |= IOP_NOFOLLOW; + goto type_determined; + } + inode->i_opflags |= IOP_NOFOLLOW; } + if (unlikely(!S_ISREG(inode->i_mode))) + add_flags = DCACHE_SPECIAL_TYPE; + +type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; @@ -1576,9 +1715,10 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - __d_set_type(dentry, add_flags); + dentry->d_flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + dentry->d_flags |= add_flags; if (inode) - hlist_add_head(&dentry->d_alias, &inode->i_dentry); + hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); dentry->d_inode = inode; dentry_rcuwalk_barrier(dentry); spin_unlock(&dentry->d_lock); @@ -1602,7 +1742,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { - BUG_ON(!hlist_unhashed(&entry->d_alias)); + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) spin_lock(&inode->i_lock); __d_instantiate(entry, inode); @@ -1641,7 +1781,7 @@ static struct dentry *__d_instantiate_unique(struct dentry *entry, return NULL; } - hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or @@ -1667,7 +1807,7 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) { struct dentry *result; - BUG_ON(!hlist_unhashed(&entry->d_alias)); + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) spin_lock(&inode->i_lock); @@ -1698,7 +1838,7 @@ EXPORT_SYMBOL(d_instantiate_unique); */ int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode) { - BUG_ON(!hlist_unhashed(&entry->d_alias)); + BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) { @@ -1737,7 +1877,7 @@ static struct dentry * __d_find_any_alias(struct inode *inode) if (hlist_empty(&inode->i_dentry)) return NULL; - alias = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); __dget(alias); return alias; } @@ -1799,7 +1939,7 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) spin_lock(&tmp->d_lock); tmp->d_inode = inode; tmp->d_flags |= add_flags; - hlist_add_head(&tmp->d_alias, &inode->i_dentry); + hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry); hlist_bl_lock(&tmp->d_sb->s_anon); hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); hlist_bl_unlock(&tmp->d_sb->s_anon); @@ -1888,51 +2028,19 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); - if (unlikely(IS_ERR(found))) - goto err_out; if (!found) { new = d_alloc(dentry->d_parent, name); if (!new) { found = ERR_PTR(-ENOMEM); - goto err_out; - } - - found = d_splice_alias(inode, new); - if (found) { - dput(new); - return found; - } - return new; - } - - /* - * If a matching dentry exists, and it's not negative use it. - * - * Decrement the reference count to balance the iget() done - * earlier on. - */ - if (found->d_inode) { - if (unlikely(found->d_inode != inode)) { - /* This can't happen because bad inodes are unhashed. */ - BUG_ON(!is_bad_inode(inode)); - BUG_ON(!is_bad_inode(found->d_inode)); + } else { + found = d_splice_alias(inode, new); + if (found) { + dput(new); + return found; + } + return new; } - iput(inode); - return found; } - - /* - * Negative dentry: instantiate it unless the inode is a directory and - * already has a dentry. - */ - new = d_splice_alias(inode, found); - if (new) { - dput(found); - found = new; - } - return found; - -err_out: iput(inode); return found; } @@ -2218,37 +2326,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) } EXPORT_SYMBOL(d_hash_and_lookup); -/** - * d_validate - verify dentry provided from insecure source (deprecated) - * @dentry: The dentry alleged to be valid child of @dparent - * @dparent: The parent dentry (known to be valid) - * - * An insecure source has sent us a dentry, here we verify it and dget() it. - * This is used by ncpfs in its readdir implementation. - * Zero is returned in the dentry is invalid. - * - * This function is slow for big directories, and deprecated, do not use it. - */ -int d_validate(struct dentry *dentry, struct dentry *dparent) -{ - struct dentry *child; - - spin_lock(&dparent->d_lock); - list_for_each_entry(child, &dparent->d_subdirs, d_u.d_child) { - if (dentry == child) { - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - __dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dparent->d_lock); - return 1; - } - } - spin_unlock(&dparent->d_lock); - - return 0; -} -EXPORT_SYMBOL(d_validate); - /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry @@ -2392,6 +2469,8 @@ static void swap_names(struct dentry *dentry, struct dentry *target) */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); + kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN); + kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); @@ -2525,13 +2604,13 @@ static void __d_move(struct dentry *dentry, struct dentry *target, /* splicing a tree */ dentry->d_parent = target->d_parent; target->d_parent = target; - list_del_init(&target->d_u.d_child); - list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + list_del_init(&target->d_child); + list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); } else { /* swapping two dentries */ swap(dentry->d_parent, target->d_parent); - list_move(&target->d_u.d_child, &target->d_parent->d_subdirs); - list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); + list_move(&target->d_child, &target->d_parent->d_subdirs); + list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); if (exchange) fsnotify_d_move(target); fsnotify_d_move(dentry); @@ -2607,11 +2686,11 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ -static struct dentry *__d_unalias(struct inode *inode, +static int __d_unalias(struct inode *inode, struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL, *m2 = NULL; - struct dentry *ret = ERR_PTR(-EBUSY); + int ret = -EBUSY; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) @@ -2626,7 +2705,7 @@ static struct dentry *__d_unalias(struct inode *inode, m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: __d_move(alias, dentry, false); - ret = alias; + ret = 0; out_err: spin_unlock(&inode->i_lock); if (m2) @@ -2661,130 +2740,57 @@ out_err: */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { - struct dentry *new = NULL; - if (IS_ERR(inode)) return ERR_CAST(inode); - if (inode && S_ISDIR(inode->i_mode)) { - spin_lock(&inode->i_lock); - new = __d_find_any_alias(inode); - if (new) { - if (!IS_ROOT(new)) { - spin_unlock(&inode->i_lock); - dput(new); - iput(inode); - return ERR_PTR(-EIO); - } - if (d_ancestor(new, dentry)) { - spin_unlock(&inode->i_lock); - dput(new); - iput(inode); - return ERR_PTR(-EIO); - } - write_seqlock(&rename_lock); - __d_move(new, dentry, false); - write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); - security_d_instantiate(new, inode); - iput(inode); - } else { - /* already taking inode->i_lock, so d_add() by hand */ - __d_instantiate(dentry, inode); - spin_unlock(&inode->i_lock); - security_d_instantiate(dentry, inode); - d_rehash(dentry); - } - } else { - d_instantiate(dentry, inode); - if (d_unhashed(dentry)) - d_rehash(dentry); - } - return new; -} -EXPORT_SYMBOL(d_splice_alias); - -/** - * d_materialise_unique - introduce an inode into the tree - * @dentry: candidate dentry - * @inode: inode to bind to the dentry, to which aliases may be attached - * - * Introduces an dentry into the tree, substituting an extant disconnected - * root directory alias in its place if there is one. Caller must hold the - * i_mutex of the parent directory. - */ -struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) -{ - struct dentry *actual; - BUG_ON(!d_unhashed(dentry)); if (!inode) { - actual = dentry; __d_instantiate(dentry, NULL); - d_rehash(actual); - goto out_nolock; + goto out; } - spin_lock(&inode->i_lock); - if (S_ISDIR(inode->i_mode)) { - struct dentry *alias; - - /* Does an aliased dentry already exist? */ - alias = __d_find_alias(inode); - if (alias) { - actual = alias; + struct dentry *new = __d_find_any_alias(inode); + if (unlikely(new)) { write_seqlock(&rename_lock); - - if (d_ancestor(alias, dentry)) { - /* Check for loops */ - actual = ERR_PTR(-ELOOP); + if (unlikely(d_ancestor(new, dentry))) { + write_sequnlock(&rename_lock); spin_unlock(&inode->i_lock); - } else if (IS_ROOT(alias)) { - /* Is this an anonymous mountpoint that we - * could splice into our tree? */ - __d_move(alias, dentry, false); + dput(new); + new = ERR_PTR(-ELOOP); + pr_warn_ratelimited( + "VFS: Lookup of '%s' in %s %s" + " would have caused loop\n", + dentry->d_name.name, + inode->i_sb->s_type->name, + inode->i_sb->s_id); + } else if (!IS_ROOT(new)) { + int err = __d_unalias(inode, dentry, new); write_sequnlock(&rename_lock); - goto found; + if (err) { + dput(new); + new = ERR_PTR(err); + } } else { - /* Nope, but we must(!) avoid directory - * aliasing. This drops inode->i_lock */ - actual = __d_unalias(inode, dentry, alias); - } - write_sequnlock(&rename_lock); - if (IS_ERR(actual)) { - if (PTR_ERR(actual) == -ELOOP) - pr_warn_ratelimited( - "VFS: Lookup of '%s' in %s %s" - " would have caused loop\n", - dentry->d_name.name, - inode->i_sb->s_type->name, - inode->i_sb->s_id); - dput(alias); + __d_move(new, dentry, false); + write_sequnlock(&rename_lock); + spin_unlock(&inode->i_lock); + security_d_instantiate(new, inode); } - goto out_nolock; + iput(inode); + return new; } } - - /* Add a unique reference */ - actual = __d_instantiate_unique(dentry, inode); - if (!actual) - actual = dentry; - - d_rehash(actual); -found: + /* already taking inode->i_lock, so d_add() by hand */ + __d_instantiate(dentry, inode); spin_unlock(&inode->i_lock); -out_nolock: - if (actual == dentry) { - security_d_instantiate(dentry, inode); - return NULL; - } - - iput(inode); - return actual; +out: + security_d_instantiate(dentry, inode); + d_rehash(dentry); + return NULL; } -EXPORT_SYMBOL_GPL(d_materialise_unique); +EXPORT_SYMBOL(d_splice_alias); static int prepend(char **buffer, int *buflen, const char *str, int namelen) { @@ -3320,7 +3326,7 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) { inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || - !hlist_unhashed(&dentry->d_alias) || + !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 76c08c2beb2f..517e64938438 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -22,6 +22,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/atomic.h> +#include <linux/device.h> static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -692,18 +693,19 @@ EXPORT_SYMBOL_GPL(debugfs_create_u32_array); * because some peripherals have several blocks of identical registers, * for example configuration of dma channels */ -int debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, - int nregs, void __iomem *base, char *prefix) +void debugfs_print_regs32(struct seq_file *s, const struct debugfs_reg32 *regs, + int nregs, void __iomem *base, char *prefix) { - int i, ret = 0; + int i; for (i = 0; i < nregs; i++, regs++) { if (prefix) - ret += seq_printf(s, "%s", prefix); - ret += seq_printf(s, "%s = 0x%08x\n", regs->name, - readl(base + regs->offset)); + seq_printf(s, "%s", prefix); + seq_printf(s, "%s = 0x%08x\n", regs->name, + readl(base + regs->offset)); + if (seq_has_overflowed(s)) + break; } - return ret; } EXPORT_SYMBOL_GPL(debugfs_print_regs32); @@ -761,3 +763,56 @@ struct dentry *debugfs_create_regset32(const char *name, umode_t mode, EXPORT_SYMBOL_GPL(debugfs_create_regset32); #endif /* CONFIG_HAS_IOMEM */ + +struct debugfs_devm_entry { + int (*read)(struct seq_file *seq, void *data); + struct device *dev; +}; + +static int debugfs_devm_entry_open(struct inode *inode, struct file *f) +{ + struct debugfs_devm_entry *entry = inode->i_private; + + return single_open(f, entry->read, entry->dev); +} + +static const struct file_operations debugfs_devm_entry_ops = { + .owner = THIS_MODULE, + .open = debugfs_devm_entry_open, + .release = single_release, + .read = seq_read, + .llseek = seq_lseek +}; + +/** + * debugfs_create_devm_seqfile - create a debugfs file that is bound to device. + * + * @dev: device related to this debugfs file. + * @name: name of the debugfs file. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @read_fn: function pointer called to print the seq_file content. + */ +struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, + struct dentry *parent, + int (*read_fn)(struct seq_file *s, + void *data)) +{ + struct debugfs_devm_entry *entry; + + if (IS_ERR(parent)) + return ERR_PTR(-ENOENT); + + entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + entry->read = read_fn; + entry->dev = dev; + + return debugfs_create_file(name, S_IRUGO, parent, entry, + &debugfs_devm_entry_ops); +} +EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); + diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 1e3b99d3db0d..61e72d44cf94 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -34,93 +34,16 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev, - void *data, const struct file_operations *fops) - +static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); - if (inode) { inode->i_ino = get_next_ino(); - inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - switch (mode & S_IFMT) { - default: - init_special_inode(inode, mode, dev); - break; - case S_IFREG: - inode->i_fop = fops ? fops : &debugfs_file_operations; - inode->i_private = data; - break; - case S_IFLNK: - inode->i_op = &debugfs_link_operations; - inode->i_private = data; - break; - case S_IFDIR: - inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* directory inodes start off with i_nlink == 2 - * (for "." entry) */ - inc_nlink(inode); - break; - } } return inode; } -/* SMP-safe */ -static int debugfs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t dev, void *data, - const struct file_operations *fops) -{ - struct inode *inode; - int error = -EPERM; - - if (dentry->d_inode) - return -EEXIST; - - inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops); - if (inode) { - d_instantiate(dentry, inode); - dget(dentry); - error = 0; - } - return error; -} - -static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - int res; - - mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; - res = debugfs_mknod(dir, dentry, mode, 0, NULL, NULL); - if (!res) { - inc_nlink(dir); - fsnotify_mkdir(dir, dentry); - } - return res; -} - -static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode, - void *data) -{ - mode = (mode & S_IALLUGO) | S_IFLNK; - return debugfs_mknod(dir, dentry, mode, 0, data, NULL); -} - -static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - void *data, const struct file_operations *fops) -{ - int res; - - mode = (mode & S_IALLUGO) | S_IFREG; - res = debugfs_mknod(dir, dentry, mode, 0, data, fops); - if (!res) - fsnotify_create(dir, dentry); - return res; -} - static inline int debugfs_positive(struct dentry *dentry) { return dentry->d_inode && !d_unhashed(dentry); @@ -246,10 +169,31 @@ static int debugfs_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void debugfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_private); +} + static const struct super_operations debugfs_super_operations = { .statfs = simple_statfs, .remount_fs = debugfs_remount, .show_options = debugfs_show_options, + .evict_inode = debugfs_evict_inode, +}; + +static struct vfsmount *debugfs_automount(struct path *path) +{ + struct vfsmount *(*f)(void *); + f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; + return f(path->dentry->d_inode->i_private); +} + +static const struct dentry_operations debugfs_dops = { + .d_delete = always_delete_dentry, + .d_automount = debugfs_automount, }; static int debug_fill_super(struct super_block *sb, void *data, int silent) @@ -276,6 +220,7 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent) goto fail; sb->s_op = &debugfs_super_operations; + sb->s_d_op = &debugfs_dops; debugfs_apply_options(sb); @@ -302,19 +247,20 @@ static struct file_system_type debug_fs_type = { }; MODULE_ALIAS_FS("debugfs"); -static struct dentry *__create_file(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops) +static struct dentry *start_creating(const char *name, struct dentry *parent) { - struct dentry *dentry = NULL; + struct dentry *dentry; int error; pr_debug("debugfs: creating file '%s'\n",name); + if (IS_ERR(parent)) + return parent; + error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) - goto exit; + return ERR_PTR(error); /* If the parent is not specified, we create it in the root. * We need the root dentry to do this, which is in the super @@ -326,31 +272,26 @@ static struct dentry *__create_file(const char *name, umode_t mode, mutex_lock(&parent->d_inode->i_mutex); dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(dentry)) { - switch (mode & S_IFMT) { - case S_IFDIR: - error = debugfs_mkdir(parent->d_inode, dentry, mode); - - break; - case S_IFLNK: - error = debugfs_link(parent->d_inode, dentry, mode, - data); - break; - default: - error = debugfs_create(parent->d_inode, dentry, mode, - data, fops); - break; - } + if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); - } else - error = PTR_ERR(dentry); - mutex_unlock(&parent->d_inode->i_mutex); - - if (error) { - dentry = NULL; - simple_release_fs(&debugfs_mount, &debugfs_mount_count); + dentry = ERR_PTR(-EEXIST); } -exit: + if (IS_ERR(dentry)) + mutex_unlock(&parent->d_inode->i_mutex); + return dentry; +} + +static struct dentry *failed_creating(struct dentry *dentry) +{ + mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + dput(dentry); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + return NULL; +} + +static struct dentry *end_creating(struct dentry *dentry) +{ + mutex_unlock(&dentry->d_parent->d_inode->i_mutex); return dentry; } @@ -384,19 +325,71 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { - switch (mode & S_IFMT) { - case S_IFREG: - case 0: - break; - default: - BUG(); - } + struct dentry *dentry; + struct inode *inode; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + BUG_ON(!S_ISREG(mode)); + dentry = start_creating(name, parent); - return __create_file(name, mode, parent, data, fops); + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = mode; + inode->i_fop = fops ? fops : &debugfs_file_operations; + inode->i_private = data; + d_instantiate(dentry, inode); + fsnotify_create(dentry->d_parent->d_inode, dentry); + return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_file); /** + * debugfs_create_file_size - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * @file_size: initial file size + * + * This is the basic "create a file" function for debugfs. It allows for a + * wide range of flexibility in creating a file, or a directory (if you want + * to create a directory, the debugfs_create_dir() function is + * recommended to be used instead.) + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_file_size(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops, + loff_t file_size) +{ + struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); + + if (de) + de->d_inode->i_size = file_size; + return de; +} +EXPORT_SYMBOL_GPL(debugfs_create_file_size); + +/** * debugfs_create_dir - create a directory in the debugfs filesystem * @name: a pointer to a string containing the name of the directory to * create. @@ -416,12 +409,65 @@ EXPORT_SYMBOL_GPL(debugfs_create_file); */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { - return __create_file(name, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, - parent, NULL, NULL); + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); /** + * debugfs_create_automount - create automount point in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @f: function to be called when pathname resolution steps on that one. + * @data: opaque argument to pass to f(). + * + * @f should return what ->d_automount() would. + */ +struct dentry *debugfs_create_automount(const char *name, + struct dentry *parent, + struct vfsmount *(*f)(void *), + void *data) +{ + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_flags |= S_AUTOMOUNT; + inode->i_private = data; + dentry->d_fsdata = (void *)f; + d_instantiate(dentry, inode); + return end_creating(dentry); +} +EXPORT_SYMBOL(debugfs_create_automount); + +/** * debugfs_create_symlink- create a symbolic link in the debugfs filesystem * @name: a pointer to a string containing the name of the symbolic link to * create. @@ -447,17 +493,28 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *target) { - struct dentry *result; - char *link; - - link = kstrdup(target, GFP_KERNEL); + struct dentry *dentry; + struct inode *inode; + char *link = kstrdup(target, GFP_KERNEL); if (!link) return NULL; - result = __create_file(name, S_IFLNK | S_IRWXUGO, parent, link, NULL); - if (!result) + dentry = start_creating(name, parent); + if (IS_ERR(dentry)) { kfree(link); - return result; + return NULL; + } + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) { + kfree(link); + return failed_creating(dentry); + } + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_op = &debugfs_link_operations; + inode->i_private = link; + d_instantiate(dentry, inode); + return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); @@ -466,23 +523,14 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) int ret = 0; if (debugfs_positive(dentry)) { - if (dentry->d_inode) { - dget(dentry); - switch (dentry->d_inode->i_mode & S_IFMT) { - case S_IFDIR: - ret = simple_rmdir(parent->d_inode, dentry); - break; - case S_IFLNK: - kfree(dentry->d_inode->i_private); - /* fall through */ - default: - simple_unlink(parent->d_inode, dentry); - break; - } - if (!ret) - d_delete(dentry); - dput(dentry); - } + dget(dentry); + if (S_ISDIR(dentry->d_inode->i_mode)) + ret = simple_rmdir(parent->d_inode, dentry); + else + simple_unlink(parent->d_inode, dentry); + if (!ret) + d_delete(dentry); + dput(dentry); } return ret; } @@ -553,7 +601,7 @@ void debugfs_remove_recursive(struct dentry *dentry) * use the d_u.d_child as the rcu head and corrupt this list. */ spin_lock(&parent->d_lock); - list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) { + list_for_each_entry(child, &parent->d_subdirs, d_child) { if (!debugfs_positive(child)) continue; @@ -645,7 +693,7 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, } d_move(old_dentry, dentry); fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, - S_ISDIR(old_dentry->d_inode->i_mode), + d_is_dir(old_dentry), NULL, old_dentry); fsnotify_oldname_free(old_name); unlock_rename(new_dir, old_dir); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 1323c568e362..eea64912c9c0 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -48,8 +48,8 @@ static char *print_lockmode(int mode) } } -static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, - struct dlm_rsb *res) +static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *res) { seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); @@ -68,21 +68,17 @@ static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - return seq_puts(s, "\n"); + seq_puts(s, "\n"); } -static int print_format1(struct dlm_rsb *res, struct seq_file *s) +static void print_format1(struct dlm_rsb *res, struct seq_file *s) { struct dlm_lkb *lkb; int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; - int rv; lock_rsb(res); - rv = seq_printf(s, "\nResource %p Name (len=%d) \"", - res, res->res_length); - if (rv) - goto out; + seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length); for (i = 0; i < res->res_length; i++) { if (isprint(res->res_name[i])) @@ -92,17 +88,16 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) } if (res->res_nodeid > 0) - rv = seq_printf(s, "\"\nLocal Copy, Master is node %d\n", - res->res_nodeid); + seq_printf(s, "\"\nLocal Copy, Master is node %d\n", + res->res_nodeid); else if (res->res_nodeid == 0) - rv = seq_puts(s, "\"\nMaster Copy\n"); + seq_puts(s, "\"\nMaster Copy\n"); else if (res->res_nodeid == -1) - rv = seq_printf(s, "\"\nLooking up master (lkid %x)\n", - res->res_first_lkid); + seq_printf(s, "\"\nLooking up master (lkid %x)\n", + res->res_first_lkid); else - rv = seq_printf(s, "\"\nInvalid master %d\n", - res->res_nodeid); - if (rv) + seq_printf(s, "\"\nInvalid master %d\n", res->res_nodeid); + if (seq_has_overflowed(s)) goto out; /* Print the LVB: */ @@ -116,8 +111,8 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) } if (rsb_flag(res, RSB_VALNOTVALID)) seq_puts(s, " (INVALID)"); - rv = seq_puts(s, "\n"); - if (rv) + seq_puts(s, "\n"); + if (seq_has_overflowed(s)) goto out; } @@ -125,32 +120,30 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) recover_list = !list_empty(&res->res_recover_list); if (root_list || recover_list) { - rv = seq_printf(s, "Recovery: root %d recover %d flags %lx " - "count %d\n", root_list, recover_list, - res->res_flags, res->res_recover_locks_count); - if (rv) - goto out; + seq_printf(s, "Recovery: root %d recover %d flags %lx count %d\n", + root_list, recover_list, + res->res_flags, res->res_recover_locks_count); } /* Print the locks attached to this resource */ seq_puts(s, "Granted Queue\n"); list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) { - rv = print_format1_lock(s, lkb, res); - if (rv) + print_format1_lock(s, lkb, res); + if (seq_has_overflowed(s)) goto out; } seq_puts(s, "Conversion Queue\n"); list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) { - rv = print_format1_lock(s, lkb, res); - if (rv) + print_format1_lock(s, lkb, res); + if (seq_has_overflowed(s)) goto out; } seq_puts(s, "Waiting Queue\n"); list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) { - rv = print_format1_lock(s, lkb, res); - if (rv) + print_format1_lock(s, lkb, res); + if (seq_has_overflowed(s)) goto out; } @@ -159,23 +152,23 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s) seq_puts(s, "Lookup Queue\n"); list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) { - rv = seq_printf(s, "%08x %s", lkb->lkb_id, - print_lockmode(lkb->lkb_rqmode)); + seq_printf(s, "%08x %s", + lkb->lkb_id, print_lockmode(lkb->lkb_rqmode)); if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - rv = seq_puts(s, "\n"); + seq_puts(s, "\n"); + if (seq_has_overflowed(s)) + goto out; } out: unlock_rsb(res); - return rv; } -static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, - struct dlm_rsb *r) +static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *r) { u64 xid = 0; u64 us; - int rv; if (lkb->lkb_flags & DLM_IFL_USER) { if (lkb->lkb_ua) @@ -188,103 +181,97 @@ static int print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us r_nodeid r_len r_name */ - rv = seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n", - lkb->lkb_id, - lkb->lkb_nodeid, - lkb->lkb_remid, - lkb->lkb_ownpid, - (unsigned long long)xid, - lkb->lkb_exflags, - lkb->lkb_flags, - lkb->lkb_status, - lkb->lkb_grmode, - lkb->lkb_rqmode, - (unsigned long long)us, - r->res_nodeid, - r->res_length, - r->res_name); - return rv; + seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + (unsigned long long)us, + r->res_nodeid, + r->res_length, + r->res_name); } -static int print_format2(struct dlm_rsb *r, struct seq_file *s) +static void print_format2(struct dlm_rsb *r, struct seq_file *s) { struct dlm_lkb *lkb; - int rv = 0; lock_rsb(r); list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { - rv = print_format2_lock(s, lkb, r); - if (rv) + print_format2_lock(s, lkb, r); + if (seq_has_overflowed(s)) goto out; } list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { - rv = print_format2_lock(s, lkb, r); - if (rv) + print_format2_lock(s, lkb, r); + if (seq_has_overflowed(s)) goto out; } list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { - rv = print_format2_lock(s, lkb, r); - if (rv) + print_format2_lock(s, lkb, r); + if (seq_has_overflowed(s)) goto out; } out: unlock_rsb(r); - return rv; } -static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, +static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, int rsb_lookup) { u64 xid = 0; - int rv; if (lkb->lkb_flags & DLM_IFL_USER) { if (lkb->lkb_ua) xid = lkb->lkb_ua->xid; } - rv = seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n", - lkb->lkb_id, - lkb->lkb_nodeid, - lkb->lkb_remid, - lkb->lkb_ownpid, - (unsigned long long)xid, - lkb->lkb_exflags, - lkb->lkb_flags, - lkb->lkb_status, - lkb->lkb_grmode, - lkb->lkb_rqmode, - lkb->lkb_last_bast.mode, - rsb_lookup, - lkb->lkb_wait_type, - lkb->lkb_lvbseq, - (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), - (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time)); - return rv; + seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + lkb->lkb_last_bast.mode, + rsb_lookup, + lkb->lkb_wait_type, + lkb->lkb_lvbseq, + (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), + (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time)); } -static int print_format3(struct dlm_rsb *r, struct seq_file *s) +static void print_format3(struct dlm_rsb *r, struct seq_file *s) { struct dlm_lkb *lkb; int i, lvblen = r->res_ls->ls_lvblen; int print_name = 1; - int rv; lock_rsb(r); - rv = seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ", - r, - r->res_nodeid, - r->res_first_lkid, - r->res_flags, - !list_empty(&r->res_root_list), - !list_empty(&r->res_recover_list), - r->res_recover_locks_count, - r->res_length); - if (rv) + seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ", + r, + r->res_nodeid, + r->res_first_lkid, + r->res_flags, + !list_empty(&r->res_root_list), + !list_empty(&r->res_recover_list), + r->res_recover_locks_count, + r->res_length); + if (seq_has_overflowed(s)) goto out; for (i = 0; i < r->res_length; i++) { @@ -292,7 +279,7 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s) print_name = 0; } - seq_printf(s, "%s", print_name ? "str " : "hex"); + seq_puts(s, print_name ? "str " : "hex"); for (i = 0; i < r->res_length; i++) { if (print_name) @@ -300,8 +287,8 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - rv = seq_puts(s, "\n"); - if (rv) + seq_puts(s, "\n"); + if (seq_has_overflowed(s)) goto out; if (!r->res_lvbptr) @@ -311,65 +298,62 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s) for (i = 0; i < lvblen; i++) seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); - rv = seq_puts(s, "\n"); - if (rv) + seq_puts(s, "\n"); + if (seq_has_overflowed(s)) goto out; do_locks: list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { - rv = print_format3_lock(s, lkb, 0); - if (rv) + print_format3_lock(s, lkb, 0); + if (seq_has_overflowed(s)) goto out; } list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) { - rv = print_format3_lock(s, lkb, 0); - if (rv) + print_format3_lock(s, lkb, 0); + if (seq_has_overflowed(s)) goto out; } list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) { - rv = print_format3_lock(s, lkb, 0); - if (rv) + print_format3_lock(s, lkb, 0); + if (seq_has_overflowed(s)) goto out; } list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) { - rv = print_format3_lock(s, lkb, 1); - if (rv) + print_format3_lock(s, lkb, 1); + if (seq_has_overflowed(s)) goto out; } out: unlock_rsb(r); - return rv; } -static int print_format4(struct dlm_rsb *r, struct seq_file *s) +static void print_format4(struct dlm_rsb *r, struct seq_file *s) { int our_nodeid = dlm_our_nodeid(); int print_name = 1; - int i, rv; + int i; lock_rsb(r); - rv = seq_printf(s, "rsb %p %d %d %d %d %lu %lx %d ", - r, - r->res_nodeid, - r->res_master_nodeid, - r->res_dir_nodeid, - our_nodeid, - r->res_toss_time, - r->res_flags, - r->res_length); - if (rv) - goto out; + seq_printf(s, "rsb %p %d %d %d %d %lu %lx %d ", + r, + r->res_nodeid, + r->res_master_nodeid, + r->res_dir_nodeid, + our_nodeid, + r->res_toss_time, + r->res_flags, + r->res_length); for (i = 0; i < r->res_length; i++) { if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) print_name = 0; } - seq_printf(s, "%s", print_name ? "str " : "hex"); + seq_puts(s, print_name ? "str " : "hex"); for (i = 0; i < r->res_length; i++) { if (print_name) @@ -377,10 +361,9 @@ static int print_format4(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - rv = seq_puts(s, "\n"); - out: + seq_puts(s, "\n"); + unlock_rsb(r); - return rv; } struct rsbtbl_iter { @@ -390,47 +373,45 @@ struct rsbtbl_iter { int header; }; -/* seq_printf returns -1 if the buffer is full, and 0 otherwise. - If the buffer is full, seq_printf can be called again, but it - does nothing and just returns -1. So, the these printing routines - periodically check the return value to avoid wasting too much time - trying to print to a full buffer. */ +/* + * If the buffer is full, seq_printf can be called again, but it + * does nothing. So, the these printing routines periodically check + * seq_has_overflowed to avoid wasting too much time trying to print to + * a full buffer. + */ static int table_seq_show(struct seq_file *seq, void *iter_ptr) { struct rsbtbl_iter *ri = iter_ptr; - int rv = 0; switch (ri->format) { case 1: - rv = print_format1(ri->rsb, seq); + print_format1(ri->rsb, seq); break; case 2: if (ri->header) { - seq_printf(seq, "id nodeid remid pid xid exflags " - "flags sts grmode rqmode time_ms " - "r_nodeid r_len r_name\n"); + seq_puts(seq, "id nodeid remid pid xid exflags flags sts grmode rqmode time_ms r_nodeid r_len r_name\n"); ri->header = 0; } - rv = print_format2(ri->rsb, seq); + print_format2(ri->rsb, seq); break; case 3: if (ri->header) { - seq_printf(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); + seq_puts(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); ri->header = 0; } - rv = print_format3(ri->rsb, seq); + print_format3(ri->rsb, seq); break; case 4: if (ri->header) { - seq_printf(seq, "version 4 rsb 2\n"); + seq_puts(seq, "version 4 rsb 2\n"); ri->header = 0; } - rv = print_format4(ri->rsb, seq); + print_format4(ri->rsb, seq); break; } - return rv; + return 0; } static const struct seq_operations format1_seq_ops; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 83f3d5520307..35502d4046f5 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -5886,6 +5886,78 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, return error; } +/* + * The caller asks for an orphan lock on a given resource with a given mode. + * If a matching lock exists, it's moved to the owner's list of locks and + * the lkid is returned. + */ + +int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs, uint32_t *lkid) +{ + struct dlm_lkb *lkb; + struct dlm_user_args *ua; + int found_other_mode = 0; + int found = 0; + int rv = 0; + + mutex_lock(&ls->ls_orphans_mutex); + list_for_each_entry(lkb, &ls->ls_orphans, lkb_ownqueue) { + if (lkb->lkb_resource->res_length != namelen) + continue; + if (memcmp(lkb->lkb_resource->res_name, name, namelen)) + continue; + if (lkb->lkb_grmode != mode) { + found_other_mode = 1; + continue; + } + + found = 1; + list_del_init(&lkb->lkb_ownqueue); + lkb->lkb_flags &= ~DLM_IFL_ORPHAN; + *lkid = lkb->lkb_id; + break; + } + mutex_unlock(&ls->ls_orphans_mutex); + + if (!found && found_other_mode) { + rv = -EAGAIN; + goto out; + } + + if (!found) { + rv = -ENOENT; + goto out; + } + + lkb->lkb_exflags = flags; + lkb->lkb_ownpid = (int) current->pid; + + ua = lkb->lkb_ua; + + ua->proc = ua_tmp->proc; + ua->xid = ua_tmp->xid; + ua->castparam = ua_tmp->castparam; + ua->castaddr = ua_tmp->castaddr; + ua->bastparam = ua_tmp->bastparam; + ua->bastaddr = ua_tmp->bastaddr; + ua->user_lksb = ua_tmp->user_lksb; + + /* + * The lkb reference from the ls_orphans list was not + * removed above, and is now considered the reference + * for the proc locks list. + */ + + spin_lock(&ua->proc->locks_spin); + list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks); + spin_unlock(&ua->proc->locks_spin); + out: + kfree(ua_tmp); + return rv; +} + int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid, char *lvb_in) { @@ -6029,7 +6101,7 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) struct dlm_args args; int error; - hold_lkb(lkb); + hold_lkb(lkb); /* reference for the ls_orphans list */ mutex_lock(&ls->ls_orphans_mutex); list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans); mutex_unlock(&ls->ls_orphans_mutex); @@ -6217,7 +6289,7 @@ int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc, { int error = 0; - if (nodeid != dlm_our_nodeid()) { + if (nodeid && (nodeid != dlm_our_nodeid())) { error = send_purge(ls, nodeid, pid); } else { dlm_lock_recovery(ls); diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 5e0c72e36a9b..ed8ebd3a8593 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -49,6 +49,9 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs); +int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, void *name, unsigned int namelen, + unsigned long timeout_cs, uint32_t *lkid); int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, uint32_t flags, uint32_t lkid, char *lvb_in); int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index e7cfbaf8d0e2..1e6e227134d7 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -56,13 +56,8 @@ static int send_data(struct sk_buff *skb) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); void *data = genlmsg_data(genlhdr); - int rv; - rv = genlmsg_end(skb, data); - if (rv < 0) { - nlmsg_free(skb); - return rv; - } + genlmsg_end(skb, data); return genlmsg_unicast(&init_net, skb, listener_nlportid); } diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 142e21655eed..fb85f32e9eca 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -238,6 +238,7 @@ static int device_user_lock(struct dlm_user_proc *proc, { struct dlm_ls *ls; struct dlm_user_args *ua; + uint32_t lkid; int error = -ENOMEM; ls = dlm_find_lockspace_local(proc->lockspace); @@ -260,12 +261,20 @@ static int device_user_lock(struct dlm_user_proc *proc, ua->bastaddr = params->bastaddr; ua->xid = params->xid; - if (params->flags & DLM_LKF_CONVERT) + if (params->flags & DLM_LKF_CONVERT) { error = dlm_user_convert(ls, ua, params->mode, params->flags, params->lkid, params->lvb, (unsigned long) params->timeout); - else { + } else if (params->flags & DLM_LKF_ORPHAN) { + error = dlm_user_adopt_orphan(ls, ua, + params->mode, params->flags, + params->name, params->namelen, + (unsigned long) params->timeout, + &lkid); + if (!error) + error = lkid; + } else { error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen, diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 1de7294aad20..5718cb9f7273 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -37,19 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) iput(toput_inode); } -static void drop_slab(void) -{ - int nr_objects; - struct shrink_control shrink = { - .gfp_mask = GFP_KERNEL, - }; - - nodes_setall(shrink.nodes_to_scan); - do { - nr_objects = shrink_slab(&shrink, 1000, 1000); - } while (nr_objects > 10); -} - int drop_caches_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 2f6735dbf1a9..719e1ce1c609 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1373,7 +1373,7 @@ out: int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode) { struct dentry *lower_dentry = - ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry; + ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry; ssize_t size; int rc = 0; @@ -1917,7 +1917,6 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, break; case 2: dst[dst_byte_offset++] |= (src_byte); - dst[dst_byte_offset] = 0; current_bit_offset = 0; break; } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 90d1882b306f..5ba029e627cc 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -124,7 +124,7 @@ ecryptfs_get_key_payload_data(struct key *key) } #define ECRYPTFS_MAX_KEYSET_SIZE 1024 -#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32 +#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 31 #define ECRYPTFS_MAX_NUM_ENC_KEYS 64 #define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */ #define ECRYPTFS_SALT_BYTES 2 @@ -237,7 +237,7 @@ struct ecryptfs_crypt_stat { struct crypto_ablkcipher *tfm; struct crypto_hash *hash_tfm; /* Crypto context for generating * the initialization vectors */ - unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; unsigned char key[ECRYPTFS_MAX_KEY_BYTES]; unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES]; struct list_head keysig_list; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index f5bce9096555..fd39bad6f1bd 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -75,11 +75,11 @@ struct ecryptfs_getdents_callback { /* Inspired by generic filldir in fs/readdir.c */ static int -ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, - loff_t offset, u64 ino, unsigned int d_type) +ecryptfs_filldir(struct dir_context *ctx, const char *lower_name, + int lower_namelen, loff_t offset, u64 ino, unsigned int d_type) { struct ecryptfs_getdents_callback *buf = - (struct ecryptfs_getdents_callback *)dirent; + container_of(ctx, struct ecryptfs_getdents_callback, ctx); size_t name_size; char *name; int rc; @@ -190,23 +190,11 @@ static int ecryptfs_open(struct inode *inode, struct file *file) { int rc = 0; struct ecryptfs_crypt_stat *crypt_stat = NULL; - struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct dentry *ecryptfs_dentry = file->f_path.dentry; /* Private value of ecryptfs_dentry allocated in * ecryptfs_lookup() */ struct ecryptfs_file_info *file_info; - mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; - if ((mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) - && ((file->f_flags & O_WRONLY) || (file->f_flags & O_RDWR) - || (file->f_flags & O_CREAT) || (file->f_flags & O_TRUNC) - || (file->f_flags & O_APPEND))) { - printk(KERN_WARNING "Mount has encrypted view enabled; " - "files may only be read\n"); - rc = -EPERM; - goto out; - } /* Released in ecryptfs_release or end of function if failure */ file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); ecryptfs_set_file_private(file, file_info); @@ -242,7 +230,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); - if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + if (d_is_dir(ecryptfs_dentry)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); mutex_lock(&crypt_stat->cs_mutex); crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); @@ -315,9 +303,22 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOTTY; - if (lower_file->f_op->unlocked_ioctl) + if (!lower_file->f_op->unlocked_ioctl) + return rc; + + switch (cmd) { + case FITRIM: + case FS_IOC_GETFLAGS: + case FS_IOC_SETFLAGS: + case FS_IOC_GETVERSION: + case FS_IOC_SETVERSION: rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); - return rc; + fsstack_copy_attr_all(file_inode(file), file_inode(lower_file)); + + return rc; + default: + return rc; + } } #ifdef CONFIG_COMPAT @@ -327,9 +328,22 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOIOCTLCMD; - if (lower_file->f_op->compat_ioctl) + if (!lower_file->f_op->compat_ioctl) + return rc; + + switch (cmd) { + case FITRIM: + case FS_IOC32_GETFLAGS: + case FS_IOC32_SETFLAGS: + case FS_IOC32_GETVERSION: + case FS_IOC32_SETVERSION: rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); - return rc; + fsstack_copy_attr_all(file_inode(file), file_inode(lower_file)); + + return rc; + default: + return rc; + } } #endif diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 1686dc2da9fd..b08b5187f662 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) inode->i_ino = lower_inode->i_ino; inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; if (S_ISLNK(inode->i_mode)) inode->i_op = &ecryptfs_symlink_iops; @@ -908,9 +907,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) lower_inode = ecryptfs_inode_to_lower(inode); lower_dentry = ecryptfs_dentry_to_lower(dentry); mutex_lock(&crypt_stat->cs_mutex); - if (S_ISDIR(dentry->d_inode->i_mode)) + if (d_is_dir(dentry)) crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - else if (S_ISREG(dentry->d_inode->i_mode) + else if (d_is_reg(dentry) && (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED) || !(crypt_stat->flags & ECRYPTFS_KEY_VALID))) { struct ecryptfs_mount_crypt_stat *mount_crypt_stat; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 635e8e16a5b7..6bd67e2011f0 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -100,12 +100,12 @@ int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, (*size) = 0; if (data[0] < 192) { /* One-byte length */ - (*size) = (unsigned char)data[0]; + (*size) = data[0]; (*length_size) = 1; } else if (data[0] < 224) { /* Two-byte length */ - (*size) = (((unsigned char)(data[0]) - 192) * 256); - (*size) += ((unsigned char)(data[1]) + 192); + (*size) = (data[0] - 192) * 256; + (*size) += data[1] + 192; (*length_size) = 2; } else if (data[0] == 255) { /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */ @@ -891,7 +891,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack { struct blkcipher_desc desc; char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; char iv[ECRYPTFS_MAX_IV_BYTES]; - char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; + char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; }; /** diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index c4cd1fd86cc2..c095d3264259 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -407,7 +407,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, if (!cipher_name_set) { int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); - BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); + BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE); strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } @@ -493,6 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags { struct super_block *s; struct ecryptfs_sb_info *sbi; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; @@ -511,6 +512,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags err = "Error parsing options"; goto out; } + mount_crypt_stat = &sbi->mount_crypt_stat; s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) { @@ -518,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs"); if (rc) goto out1; @@ -557,11 +559,19 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags /** * Set the POSIX ACL flag based on whether they're enabled in the lower - * mount. Force a read-only eCryptfs mount if the lower mount is ro. - * Allow a ro eCryptfs mount even when the lower mount is rw. + * mount. */ s->s_flags = flags & ~MS_POSIXACL; - s->s_flags |= path.dentry->d_sb->s_flags & (MS_RDONLY | MS_POSIXACL); + s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL; + + /** + * Force a read-only eCryptfs mount when: + * 1) The lower mount is ro + * 2) The ecryptfs_encrypted_view mount option is specified + */ + if (path.dentry->d_sb->s_flags & MS_RDONLY || + mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + s->s_flags |= MS_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 564a1fa34b99..4626976794e7 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -419,7 +419,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) ssize_t size; void *xattr_virt; struct dentry *lower_dentry = - ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry; + ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry; struct inode *lower_inode = lower_dentry->d_inode; int rc; diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig index 367bbb10c543..c2499ef174a2 100644 --- a/fs/efivarfs/Kconfig +++ b/fs/efivarfs/Kconfig @@ -1,6 +1,7 @@ config EFIVAR_FS tristate "EFI Variable filesystem" depends on EFI + default m help efivarfs is a replacement filesystem for the old EFI variable support via sysfs, as it doesn't suffer from the diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index cdb2971192a5..90001da9abfd 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -47,8 +47,8 @@ static ssize_t efivarfs_file_write(struct file *file, if (bytes == -ENOENT) { drop_nlink(inode); - d_delete(file->f_dentry); - dput(file->f_dentry); + d_delete(file->f_path.dentry); + dput(file->f_path.dentry); } else { mutex_lock(&inode->i_mutex); i_size_write(inode, datasize + sizeof(attributes)); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 0a48886e069c..ddbce42548c9 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -140,7 +140,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len] = '-'; - efi_guid_unparse(&entry->var.VendorGuid, name + len + 1); + efi_guid_to_str(&entry->var.VendorGuid, name + len + 1); name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; @@ -236,6 +236,7 @@ static void efivarfs_kill_sb(struct super_block *sb) } static struct file_system_type efivarfs_type = { + .owner = THIS_MODULE, .name = "efivarfs", .mount = efivarfs_mount, .kill_sb = efivarfs_kill_sb, @@ -244,17 +245,23 @@ static struct file_system_type efivarfs_type = { static __init int efivarfs_init(void) { if (!efi_enabled(EFI_RUNTIME_SERVICES)) - return 0; + return -ENODEV; if (!efivars_kobject()) - return 0; + return -ENODEV; return register_filesystem(&efivarfs_type); } +static __exit void efivarfs_exit(void) +{ + unregister_filesystem(&efivarfs_type); +} + MODULE_AUTHOR("Matthew Garrett, Jeremy Kerr"); MODULE_DESCRIPTION("EFI Variable Filesystem"); MODULE_LICENSE("GPL"); MODULE_ALIAS_FS("efivarfs"); module_init(efivarfs_init); +module_exit(efivarfs_exit); diff --git a/fs/eventfd.c b/fs/eventfd.c index d6a88e7812f3..8d0c0df01854 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -118,18 +118,18 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; unsigned int events = 0; - unsigned long flags; + u64 count; poll_wait(file, &ctx->wqh, wait); + smp_rmb(); + count = ctx->count; - spin_lock_irqsave(&ctx->wqh.lock, flags); - if (ctx->count > 0) + if (count > 0) events |= POLLIN; - if (ctx->count == ULLONG_MAX) + if (count == ULLONG_MAX) events |= POLLERR; - if (ULLONG_MAX - 1 > ctx->count) + if (ULLONG_MAX - 1 > count) events |= POLLOUT; - spin_unlock_irqrestore(&ctx->wqh.lock, flags); return events; } @@ -287,17 +287,14 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c } #ifdef CONFIG_PROC_FS -static int eventfd_show_fdinfo(struct seq_file *m, struct file *f) +static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { struct eventfd_ctx *ctx = f->private_data; - int ret; spin_lock_irq(&ctx->wqh.lock); - ret = seq_printf(m, "eventfd-count: %16llx\n", - (unsigned long long)ctx->count); + seq_printf(m, "eventfd-count: %16llx\n", + (unsigned long long)ctx->count); spin_unlock_irq(&ctx->wqh.lock); - - return ret; } #endif diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7bcfff900f05..1e009cad8d5c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -870,25 +870,22 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) } #ifdef CONFIG_PROC_FS -static int ep_show_fdinfo(struct seq_file *m, struct file *f) +static void ep_show_fdinfo(struct seq_file *m, struct file *f) { struct eventpoll *ep = f->private_data; struct rb_node *rbp; - int ret = 0; mutex_lock(&ep->mtx); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); - ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n", - epi->ffd.fd, epi->event.events, - (long long)epi->event.data); - if (ret) + seq_printf(m, "tfd: %8d events: %8x data: %16llx\n", + epi->ffd.fd, epi->event.events, + (long long)epi->event.data); + if (seq_has_overflowed(m)) break; } mutex_unlock(&ep->mtx); - - return ret; } #endif @@ -1642,9 +1639,9 @@ fetch_events: spin_lock_irqsave(&ep->lock, flags); } - __remove_wait_queue(&ep->wq, &wait); - set_current_state(TASK_RUNNING); + __remove_wait_queue(&ep->wq, &wait); + __set_current_state(TASK_RUNNING); } check_events: /* Is it worth to try to dig for events ? */ diff --git a/fs/exec.c b/fs/exec.c index 7302b75a9820..c7f9b733406d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -277,6 +277,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) goto err; mm->stack_vm = mm->total_vm = 1; + arch_bprm_mm_init(mm, vma); up_write(&mm->mmap_sem); bprm->p = vma->vm_end - sizeof(void *); return 0; @@ -747,18 +748,25 @@ EXPORT_SYMBOL(setup_arg_pages); #endif /* CONFIG_MMU */ -static struct file *do_open_exec(struct filename *name) +static struct file *do_open_execat(int fd, struct filename *name, int flags) { struct file *file; int err; - static const struct open_flags open_exec_flags = { + struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC | MAY_OPEN, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; - file = do_filp_open(AT_FDCWD, name, &open_exec_flags); + if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) + return ERR_PTR(-EINVAL); + if (flags & AT_SYMLINK_NOFOLLOW) + open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + open_exec_flags.lookup_flags |= LOOKUP_EMPTY; + + file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) goto out; @@ -769,12 +777,13 @@ static struct file *do_open_exec(struct filename *name) if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) goto exit; - fsnotify_open(file); - err = deny_write_access(file); if (err) goto exit; + if (name->name[0] != '\0') + fsnotify_open(file); + out: return file; @@ -785,8 +794,14 @@ exit: struct file *open_exec(const char *name) { - struct filename tmp = { .name = name }; - return do_open_exec(&tmp); + struct filename *filename = getname_kernel(name); + struct file *f = ERR_CAST(filename); + + if (!IS_ERR(filename)) { + f = do_open_execat(AT_FDCWD, filename, 0); + putname(filename); + } + return f; } EXPORT_SYMBOL(open_exec); @@ -1427,10 +1442,12 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execve_common(struct filename *filename, - struct user_arg_ptr argv, - struct user_arg_ptr envp) +static int do_execveat_common(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags) { + char *pathbuf = NULL; struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; @@ -1471,7 +1488,7 @@ static int do_execve_common(struct filename *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = do_open_exec(filename); + file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1479,7 +1496,28 @@ static int do_execve_common(struct filename *filename, sched_exec(); bprm->file = file; - bprm->filename = bprm->interp = filename->name; + if (fd == AT_FDCWD || filename->name[0] == '/') { + bprm->filename = filename->name; + } else { + if (filename->name[0] == '\0') + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); + else + pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", + fd, filename->name); + if (!pathbuf) { + retval = -ENOMEM; + goto out_unmark; + } + /* + * Record that a name derived from an O_CLOEXEC fd will be + * inaccessible after exec. Relies on having exclusive access to + * current->files (due to unshare_files above). + */ + if (close_on_exec(fd, rcu_dereference_raw(current->files->fdt))) + bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; + bprm->filename = pathbuf; + } + bprm->interp = bprm->filename; retval = bprm_mm_init(bprm); if (retval) @@ -1520,6 +1558,7 @@ static int do_execve_common(struct filename *filename, acct_update_integrals(current); task_numa_free(current); free_bprm(bprm); + kfree(pathbuf); putname(filename); if (displaced) put_files_struct(displaced); @@ -1537,6 +1576,7 @@ out_unmark: out_free: free_bprm(bprm); + kfree(pathbuf); out_files: if (displaced) @@ -1552,7 +1592,18 @@ int do_execve(struct filename *filename, { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; - return do_execve_common(filename, argv, envp); + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +int do_execveat(int fd, struct filename *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + + return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT @@ -1568,7 +1619,23 @@ static int compat_do_execve(struct filename *filename, .is_compat = true, .ptr.compat = __envp, }; - return do_execve_common(filename, argv, envp); + return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); +} + +static int compat_do_execveat(int fd, struct filename *filename, + const compat_uptr_t __user *__argv, + const compat_uptr_t __user *__envp, + int flags) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execveat_common(fd, filename, argv, envp, flags); } #endif @@ -1608,6 +1675,20 @@ SYSCALL_DEFINE3(execve, { return do_execve(getname(filename), argv, envp); } + +SYSCALL_DEFINE5(execveat, + int, fd, const char __user *, filename, + const char __user *const __user *, argv, + const char __user *const __user *, envp, + int, flags) +{ + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; + + return do_execveat(fd, + getname_flags(filename, lookup_flags, NULL), + argv, envp, flags); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, @@ -1615,4 +1696,17 @@ COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, { return compat_do_execve(getname(filename), argv, envp); } + +COMPAT_SYSCALL_DEFINE5(execveat, int, fd, + const char __user *, filename, + const compat_uptr_t __user *, argv, + const compat_uptr_t __user *, envp, + int, flags) +{ + int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; + + return compat_do_execveat(fd, + getname_flags(filename, lookup_flags, NULL), + argv, envp, flags); +} #endif diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f1d3d4eb8c4f..a198e94813fe 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = { .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ - .get_xip_mem = NULL, .migratepage = NULL, .launder_page = NULL, .is_partially_uptodate = NULL, @@ -1214,7 +1213,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); } - inode->i_mapping->backing_dev_info = sb->s_bdi; if (S_ISREG(inode->i_mode)) { inode->i_op = &exofs_file_inode_operations; inode->i_fop = &exofs_file_operations; @@ -1314,7 +1312,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode) set_obj_2bcreated(oi); - inode->i_mapping->backing_dev_info = sb->s_bdi; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 95965503afcb..fcc2e565f540 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + ret = bdi_setup_and_register(&sbi->bdi, "exofs"); if (ret) { EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); dput(sb->s_root); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index b01fbfb51f43..714cd37a6ba3 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -50,7 +50,7 @@ find_acceptable_alias(struct dentry *result, inode = result->d_inode; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { dget(dentry); spin_unlock(&inode->i_lock); if (toput) @@ -241,10 +241,11 @@ struct getdents_callback { * A rather strange filldir function to capture * the name matching the specified inode number. */ -static int filldir_one(void * __buf, const char * name, int len, +static int filldir_one(struct dir_context *ctx, const char *name, int len, loff_t pos, u64 ino, unsigned int d_type) { - struct getdents_callback *buf = __buf; + struct getdents_callback *buf = + container_of(ctx, struct getdents_callback, ctx); int result = 0; buf->sequence++; @@ -428,7 +429,7 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, if (IS_ERR(result)) return result; - if (S_ISDIR(result->d_inode->i_mode)) { + if (d_is_dir(result)) { /* * This request is for a directory. * diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 14a6780fd034..c634874e12d9 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -42,14 +42,3 @@ config EXT2_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. - -config EXT2_FS_XIP - bool "Ext2 execute in place support" - depends on EXT2_FS && MMU - help - Execute in place can be used on memory-backed block devices. If you - enable this option, you can select to mount block devices which are - capable of this feature without using the page cache. - - If you do not use a block device that is capable of using this, - or if unsure, say N. diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index f42af45cfd88..445b0e996a12 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile @@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o -ext2-$(CONFIG_EXT2_FS_XIP) += xip.o diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index d9a17d0b124d..678f9ab08c48 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -380,10 +380,15 @@ struct ext2_inode { #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ #define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ -#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ +#define EXT2_MOUNT_XIP 0x010000 /* Obsolete, use DAX */ #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ +#ifdef CONFIG_FS_DAX +#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */ +#else +#define EXT2_MOUNT_DAX 0 +#endif #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt @@ -689,6 +694,9 @@ struct ext2_inode_info { struct mutex truncate_mutex; struct inode vfs_inode; struct list_head i_orphan; /* unlinked but open inodes */ +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif }; /* @@ -785,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; -extern const struct file_operations ext2_xip_file_operations; +extern const struct file_operations ext2_dax_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; -extern const struct address_space_operations ext2_aops_xip; extern const struct address_space_operations ext2_nobh_aops; /* namei.c */ diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 7c87b22a7228..e31701713516 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -25,6 +25,36 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_FS_DAX +static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext2_get_block); +} + +static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext2_get_block); +} + +static const struct vm_operations_struct ext2_dax_vm_ops = { + .fault = ext2_dax_fault, + .page_mkwrite = ext2_dax_mkwrite, +}; + +static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_mmap(file, vma); + + file_accessed(file); + vma->vm_ops = &ext2_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + return 0; +} +#else +#define ext2_file_mmap generic_file_mmap +#endif + /* * Called when filp is released. This happens when all file descriptors * for a single struct file are closed. Note that different open() calls @@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, @@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = { .splice_write = iter_file_splice_write, }; -#ifdef CONFIG_EXT2_FS_XIP -const struct file_operations ext2_xip_file_operations = { +#ifdef CONFIG_FS_DAX +const struct file_operations ext2_dax_file_operations = { .llseek = generic_file_llseek, - .read = xip_file_read, - .write = xip_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = xip_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 7d66fb0e4cca..6c14bb8322fa 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode) struct ext2_group_desc * gdp; struct backing_dev_info *bdi; - bdi = inode->i_mapping->backing_dev_info; + bdi = inode_to_bdi(inode); if (bdi_read_congested(bdi)) return; if (bdi_write_congested(bdi)) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 36d35c36311d..6434bc000125 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -34,7 +34,6 @@ #include <linux/aio.h> #include "ext2.h" #include "acl.h" -#include "xip.h" #include "xattr.h" static int __ext2_write_inode(struct inode *inode, int do_sync); @@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode, goto cleanup; } - if (ext2_use_xip(inode->i_sb)) { + if (IS_DAX(inode)) { /* - * we need to clear the block + * block must be initialised before we put it in the tree + * so that it's not found by another thread before it's + * initialised */ - err = ext2_clear_xip_target (inode, - le32_to_cpu(chain[depth-1].key)); + err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), + 1 << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; @@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block, + NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + ext2_get_block); if (ret < 0 && (rw & WRITE)) ext2_write_failed(mapping, offset + count); return ret; @@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = { .error_remove_page = generic_error_remove_page, }; -const struct address_space_operations ext2_aops_xip = { - .bmap = ext2_bmap, - .get_xip_mem = ext2_get_xip_mem, -}; - const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, @@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (mapping_is_xip(inode->i_mapping)) - error = xip_truncate_page(inode->i_mapping, newsize); + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, ext2_get_block); else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); @@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode) { unsigned int flags = EXT2_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC | S_DAX); if (flags & EXT2_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & EXT2_APPEND_FL) @@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + inode->i_flags |= S_DAX; } /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ @@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index c268d0af1db9..148f6e3789ea 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -35,7 +35,6 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -#include "xip.h" static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) { @@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; @@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 170dc41e8bf4..d0e746e96511 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -35,7 +35,6 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -#include "xip.h" static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, int wait); @@ -166,6 +165,10 @@ static struct inode *ext2_alloc_inode(struct super_block *sb) return NULL; ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; +#ifdef CONFIG_QUOTA + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); +#endif + return &ei->vfs_inode; } @@ -288,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",grpquota"); #endif -#if defined(CONFIG_EXT2_FS_XIP) +#ifdef CONFIG_FS_DAX if (sbi->s_mount_opt & EXT2_MOUNT_XIP) seq_puts(seq, ",xip"); + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) + seq_puts(seq, ",dax"); #endif if (!test_opt(sb, RESERVATION)) @@ -303,6 +308,10 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) #ifdef CONFIG_QUOTA static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); +static struct dquot **ext2_get_dquots(struct inode *inode) +{ + return EXT2_I(inode)->i_dquot; +} #endif static const struct super_operations ext2_sops = { @@ -320,6 +329,7 @@ static const struct super_operations ext2_sops = { #ifdef CONFIG_QUOTA .quota_read = ext2_quota_read, .quota_write = ext2_quota_write, + .get_dquots = ext2_get_dquots, #endif }; @@ -394,7 +404,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, - Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, + Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation }; @@ -423,6 +433,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_xip, "xip"}, + {Opt_dax, "dax"}, {Opt_grpquota, "grpquota"}, {Opt_ignore, "noquota"}, {Opt_quota, "quota"}, @@ -550,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb) break; #endif case Opt_xip: -#ifdef CONFIG_EXT2_FS_XIP - set_opt (sbi->s_mount_opt, XIP); + ext2_msg(sb, KERN_INFO, "use dax instead of xip"); + set_opt(sbi->s_mount_opt, XIP); + /* Fall through */ + case Opt_dax: +#ifdef CONFIG_FS_DAX + set_opt(sbi->s_mount_opt, DAX); #else - ext2_msg(sb, KERN_INFO, "xip option not supported"); + ext2_msg(sb, KERN_INFO, "dax option not supported"); #endif break; @@ -868,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); - ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset - EXT2_MOUNT_XIP if not */ - if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || @@ -900,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { - if (!silent) + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for xip"); - goto failed_mount; + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext2_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } } /* If the blocksize doesn't match, re-read the thing.. */ @@ -1090,6 +1108,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; sb->s_qcop = &dquot_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif root = ext2_iget(sb, EXT2_ROOT_INO); @@ -1249,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) { struct ext2_sb_info * sbi = EXT2_SB(sb); struct ext2_super_block * es; - unsigned long old_mount_opt = sbi->s_mount_opt; struct ext2_mount_options old_opts; unsigned long old_sb_flags; int err; @@ -1274,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); - ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset - EXT2_MOUNT_XIP if not */ - - if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { - ext2_msg(sb, KERN_WARNING, - "warning: unsupported blocksize for xip"); - err = -EINVAL; - goto restore_opts; - } - es = sbi->s_es; - if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) { ext2_msg(sb, KERN_WARNING, "warning: refusing change of " - "xip flag with busy inodes while remounting"); - sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; - sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT2_MOUNT_DAX; } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { spin_unlock(&sbi->s_lock); diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c deleted file mode 100644 index e98171a11cfe..000000000000 --- a/fs/ext2/xip.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * linux/fs/ext2/xip.c - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte (cotte@de.ibm.com) - */ - -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/buffer_head.h> -#include <linux/blkdev.h> -#include "ext2.h" -#include "xip.h" - -static inline int -__inode_direct_access(struct inode *inode, sector_t block, - void **kaddr, unsigned long *pfn) -{ - struct block_device *bdev = inode->i_sb->s_bdev; - const struct block_device_operations *ops = bdev->bd_disk->fops; - sector_t sector; - - sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ - - BUG_ON(!ops->direct_access); - return ops->direct_access(bdev, sector, kaddr, pfn); -} - -static inline int -__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, - sector_t *result) -{ - struct buffer_head tmp; - int rc; - - memset(&tmp, 0, sizeof(struct buffer_head)); - tmp.b_size = 1 << inode->i_blkbits; - rc = ext2_get_block(inode, pgoff, &tmp, create); - *result = tmp.b_blocknr; - - /* did we get a sparse block (hole in the file)? */ - if (!tmp.b_blocknr && !rc) { - BUG_ON(create); - rc = -ENODATA; - } - - return rc; -} - -int -ext2_clear_xip_target(struct inode *inode, sector_t block) -{ - void *kaddr; - unsigned long pfn; - int rc; - - rc = __inode_direct_access(inode, block, &kaddr, &pfn); - if (!rc) - clear_page(kaddr); - return rc; -} - -void ext2_xip_verify_sb(struct super_block *sb) -{ - struct ext2_sb_info *sbi = EXT2_SB(sb); - - if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && - !sb->s_bdev->bd_disk->fops->direct_access) { - sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); - ext2_msg(sb, KERN_WARNING, - "warning: ignoring xip option - " - "not supported by bdev"); - } -} - -int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, - void **kmem, unsigned long *pfn) -{ - int rc; - sector_t block; - - /* first, retrieve the sector number */ - rc = __ext2_get_block(mapping->host, pgoff, create, &block); - if (rc) - return rc; - - /* retrieve address of the target data */ - rc = __inode_direct_access(mapping->host, block, kmem, pfn); - return rc; -} diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h deleted file mode 100644 index 18b34d2f31b3..000000000000 --- a/fs/ext2/xip.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * linux/fs/ext2/xip.h - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte (cotte@de.ibm.com) - */ - -#ifdef CONFIG_EXT2_FS_XIP -extern void ext2_xip_verify_sb (struct super_block *); -extern int ext2_clear_xip_target (struct inode *, sector_t); - -static inline int ext2_use_xip (struct super_block *sb) -{ - struct ext2_sb_info *sbi = EXT2_SB(sb); - return (sbi->s_mount_opt & EXT2_MOUNT_XIP); -} -int ext2_get_xip_mem(struct address_space *, pgoff_t, int, - void **, unsigned long *); -#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem) -#else -#define mapping_is_xip(map) 0 -#define ext2_xip_verify_sb(sb) do { } while (0) -#define ext2_use_xip(sb) 0 -#define ext2_clear_xip_target(inode, chain) 0 -#define ext2_get_xip_mem NULL -#endif diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h index fc3cdcf24aed..f483a80b3fe7 100644 --- a/fs/ext3/ext3.h +++ b/fs/ext3/ext3.h @@ -615,6 +615,10 @@ struct ext3_inode_info { atomic_t i_sync_tid; atomic_t i_datasync_tid; +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + struct inode vfs_inode; }; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index eb742d0e67ff..d4dbf3c259b3 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -466,6 +466,8 @@ static void ext3_put_super (struct super_block * sb) } sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); + mutex_destroy(&sbi->s_orphan_lock); + mutex_destroy(&sbi->s_resize_lock); kfree(sbi); } @@ -485,6 +487,10 @@ static struct inode *ext3_alloc_inode(struct super_block *sb) ei->vfs_inode.i_version = 1; atomic_set(&ei->i_datasync_tid, 0); atomic_set(&ei->i_sync_tid, 0); +#ifdef CONFIG_QUOTA + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); +#endif + return &ei->vfs_inode; } @@ -764,6 +770,10 @@ static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext3_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); +static struct dquot **ext3_get_dquots(struct inode *inode) +{ + return EXT3_I(inode)->i_dquot; +} static const struct dquot_operations ext3_quota_operations = { .write_dquot = ext3_write_dquot, @@ -803,6 +813,7 @@ static const struct super_operations ext3_sops = { #ifdef CONFIG_QUOTA .quota_read = ext3_quota_read, .quota_write = ext3_quota_write, + .get_dquots = ext3_get_dquots, #endif .bdev_try_to_free_page = bdev_try_to_free_page, }; @@ -2001,6 +2012,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) #ifdef CONFIG_QUOTA sb->s_qcop = &ext3_qctl_operations; sb->dq_op = &ext3_quota_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c55a1faaed58..f63c3d5805c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -158,17 +158,8 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED (1 << BH_Mapped) #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) -/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of - * ext4_map_blocks wants to know whether or not the underlying cluster has - * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that - * the requested mapping was from previously mapped (or delayed allocated) - * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster - * should never appear on buffer_head's state flags. - */ -#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ - EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_FROM_CLUSTER) + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -373,7 +364,8 @@ struct flex_groups { #define EXT4_DIRTY_FL 0x00000100 #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ -#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ /* End compression flags --- maybe not all used */ #define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ @@ -430,7 +422,7 @@ enum { EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ - EXT4_INODE_ECOMPR = 11, /* Compression error */ + EXT4_INODE_ENCRYPT = 11, /* Compression error */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ @@ -475,7 +467,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(DIRTY); CHECK_FLAG_VALUE(COMPRBLK); CHECK_FLAG_VALUE(NOCOMPR); - CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(ENCRYPT); CHECK_FLAG_VALUE(INDEX); CHECK_FLAG_VALUE(IMAGIC); CHECK_FLAG_VALUE(JOURNAL_DATA); @@ -565,10 +557,8 @@ enum { #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* Do not take i_data_sem locking in ext4_map_blocks */ #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 - /* Do not put hole in extent cache */ -#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* Convert written extents to unwritten */ -#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 /* * The bit position of these flags must not overlap with any of the @@ -889,10 +879,12 @@ struct ext4_inode_info { /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; - struct list_head i_es_lru; + struct list_head i_es_list; unsigned int i_es_all_nr; /* protected by i_es_lock */ - unsigned int i_es_lru_nr; /* protected by i_es_lock */ - unsigned long i_touch_when; /* jiffies of last accessing */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -941,6 +933,10 @@ struct ext4_inode_info { tid_t i_sync_tid; tid_t i_datasync_tid; +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; }; @@ -970,6 +966,11 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ @@ -1048,6 +1049,12 @@ extern void ext4_set_bits(void *bm, int cur, int len); /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 +/* Encryption algorithms */ +#define EXT4_ENCRYPTION_MODE_INVALID 0 +#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 +#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 +#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 + /* * Structure of the super block */ @@ -1161,7 +1168,8 @@ struct ext4_super_block { __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ - __le32 s_reserved[106]; /* Padding to the end of the block */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __le32 s_reserved[105]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1333,10 +1341,11 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; - struct list_head s_es_lru; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; struct ext4_es_stats s_es_stats; struct mb_cache *s_mb_cache; - spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ struct ratelimit_state s_err_ratelimit_state; @@ -1526,6 +1535,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) * GDT_CSUM bits are mutually exclusive. */ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1541,6 +1551,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -2192,7 +2203,6 @@ extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); -extern void ext4_kvfree(void *ptr); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, @@ -2583,6 +2593,7 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; +extern const struct file_operations ext4_dax_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ @@ -2643,7 +2654,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, int *retval); extern int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - int *has_inline); + int *has_inline, __u64 start, __u64 len); extern int ext4_try_to_evict_inline_data(handle_t *handle, struct inode *inode, int needed); @@ -2791,16 +2802,6 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); /* - * Note that these flags will never ever appear in a buffer_head's state flag. - * See EXT4_MAP_... to see where this is used. - */ -enum ext4_state_bits { - BH_AllocFromCluster /* allocated blocks were part of already - * allocated cluster. */ - = BH_JBDPrivateStart -}; - -/* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough * to mark the bitmap uptodate. We need to also zero-out the bitmap diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0b16fb4c06d3..bed43081720f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2306,16 +2306,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { int depth = ext_depth(inode); - unsigned long len = 0; - ext4_lblk_t lblock = 0; + ext4_lblk_t len; + ext4_lblk_t lblock; struct ext4_extent *ex; + struct extent_status es; ex = path[depth].p_ext; if (ex == NULL) { - /* - * there is no extent yet, so gap is [0;-] and we - * don't cache it - */ + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCKS; ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -2324,9 +2324,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block, le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); - if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) - ext4_es_insert_extent(inode, lblock, len, ~0, - EXTENT_STATUS_HOLE); } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; @@ -2340,14 +2337,19 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block); BUG_ON(next == lblock); len = next - lblock; - if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) - ext4_es_insert_extent(inode, lblock, len, ~0, - EXTENT_STATUS_HOLE); } else { BUG(); } - ext_debug(" -> %u:%lu\n", lblock, len); + ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); + if (es.es_len) { + /* There's delayed extent containing lblock? */ + if (es.es_lblk <= lblock) + return; + len = min(es.es_lblk - lblock, len); + } + ext_debug(" -> %u:%u\n", lblock, len); + ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); } /* @@ -2481,7 +2483,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned short ee_len = ext4_ext_get_actual_len(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; int flags = get_default_free_blocks_flags(inode); @@ -2490,7 +2492,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * at the beginning of the extent. Instead, we make a note * that we tried freeing the cluster, and check to see if we * need to free it on a subsequent call to ext4_remove_blocks, - * or at the end of the ext4_truncate() operation. + * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; @@ -2501,8 +2503,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * partial cluster here. */ pblk = ext4_ext_pblock(ex) + ee_len - 1; - if ((*partial_cluster > 0) && - (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + if (*partial_cluster > 0 && + *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), sbi->s_cluster_ratio, flags); @@ -2528,7 +2530,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; - unsigned int unaligned; + long long first_cluster; num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; @@ -2538,7 +2540,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * used by any other extent (partial_cluster is negative). */ if (*partial_cluster < 0 && - -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) + *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; ext_debug("free last %u blocks starting %llu partial %lld\n", @@ -2549,21 +2551,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * beginning of a cluster, and we removed the entire * extent and the cluster is not used by any other extent, * save the partial cluster here, since we might need to - * delete if we determine that the truncate operation has - * removed all of the blocks in the cluster. + * delete if we determine that the truncate or punch hole + * operation has removed all of the blocks in the cluster. + * If that cluster is used by another extent, preserve its + * negative value so it isn't freed later on. * - * On the other hand, if we did not manage to free the whole - * extent, we have to mark the cluster as used (store negative - * cluster number in partial_cluster). + * If the whole extent wasn't freed, we've reached the + * start of the truncated/punched region and have finished + * removing blocks. If there's a partial cluster here it's + * shared with the remainder of the extent and is no longer + * a candidate for removal. */ - unaligned = EXT4_PBLK_COFF(sbi, pblk); - if (unaligned && (ee_len == num) && - (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) - *partial_cluster = EXT4_B2C(sbi, pblk); - else if (unaligned) - *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); - else if (*partial_cluster > 0) + if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { + first_cluster = (long long) EXT4_B2C(sbi, pblk); + if (first_cluster != -*partial_cluster) + *partial_cluster = first_cluster; + } else { *partial_cluster = 0; + } } else ext4_error(sbi->s_sb, "strange request: removal(2) " "%u-%u from %u:%u\n", @@ -2574,15 +2579,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, /* * ext4_ext_rm_leaf() Removes the extents associated with the - * blocks appearing between "start" and "end", and splits the extents - * if "start" and "end" appear in the same extent + * blocks appearing between "start" and "end". Both "start" + * and "end" must appear in the same extent or EIO is returned. * * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf * @partial_cluster: The cluster which we'll have to free if all extents - * has been released from it. It gets negative in case - * that the cluster is still used. + * has been released from it. However, if this value is + * negative, it's a cluster just to the right of the + * punched region and it must not be freed. * @start: The first block to remove * @end: The last block to remove */ @@ -2621,27 +2627,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); - /* - * If we're starting with an extent other than the last one in the - * node, we need to see if it shares a cluster with the extent to - * the right (towards the end of the file). If its leftmost cluster - * is this extent's rightmost cluster and it is not cluster aligned, - * we'll mark it as a partial that is not to be deallocated. - */ - - if (ex != EXT_LAST_EXTENT(eh)) { - ext4_fsblk_t current_pblk, right_pblk; - long long current_cluster, right_cluster; - - current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; - current_cluster = (long long)EXT4_B2C(sbi, current_pblk); - right_pblk = ext4_ext_pblock(ex + 1); - right_cluster = (long long)EXT4_B2C(sbi, right_pblk); - if (current_cluster == right_cluster && - EXT4_PBLK_COFF(sbi, right_pblk)) - *partial_cluster = -right_cluster; - } - trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); while (ex >= EXT_FIRST_EXTENT(eh) && @@ -2666,14 +2651,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (end < ex_ee_block) { /* * We're going to skip this extent and move to another, - * so if this extent is not cluster aligned we have - * to mark the current cluster as used to avoid - * accidentally freeing it later on + * so note that its first cluster is in use to avoid + * freeing it when removing blocks. Eventually, the + * right edge of the truncated/punched region will + * be just to the left. */ - pblk = ext4_ext_pblock(ex); - if (EXT4_PBLK_COFF(sbi, pblk)) + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex); *partial_cluster = - -((long long)EXT4_B2C(sbi, pblk)); + -(long long) EXT4_B2C(sbi, pblk); + } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2749,8 +2736,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } else if (*partial_cluster > 0) - *partial_cluster = 0; + } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -2769,20 +2755,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, /* * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the - * current extent. If there's a partial cluster and no extents - * remain in the leaf, it can't be freed here. It can only be - * freed when it's possible to determine if it's not shared with - * any other extent - when the next leaf is processed or when space - * removal is complete. + * current extent. If it is shared with the current extent + * we zero partial_cluster because we've reached the start of the + * truncated/punched region and we're done removing blocks. */ - if (*partial_cluster > 0 && eh->eh_entries && - (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != - *partial_cluster)) { - int flags = get_default_free_blocks_flags(inode); - - ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, flags); + if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { + pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, + get_default_free_blocks_flags(inode)); + } *partial_cluster = 0; } @@ -2819,7 +2803,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { - struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; long long partial_cluster = 0; @@ -2845,9 +2829,10 @@ again: */ if (end < EXT_MAX_BLOCKS - 1) { struct ext4_extent *ex; - ext4_lblk_t ee_block; + ext4_lblk_t ee_block, ex_end, lblk; + ext4_fsblk_t pblk; - /* find extent for this block */ + /* find extent for or closest extent to this block */ path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { ext4_journal_stop(handle); @@ -2867,6 +2852,7 @@ again: } ee_block = le32_to_cpu(ex->ee_block); + ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; /* * See if the last block is inside the extent, if so split @@ -2874,8 +2860,19 @@ again: * tail of the first part of the split extent in * ext4_ext_rm_leaf(). */ - if (end >= ee_block && - end < ee_block + ext4_ext_get_actual_len(ex) - 1) { + if (end >= ee_block && end < ex_end) { + + /* + * If we're going to split the extent, note that + * the cluster containing the block after 'end' is + * in use to avoid freeing it when removing blocks. + */ + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex) + end - ee_block + 2; + partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); + } + /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not @@ -2886,6 +2883,24 @@ again: end + 1, 1); if (err < 0) goto out; + + } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { + /* + * If there's an extent to the right its first cluster + * contains the immediate right boundary of the + * truncated/punched region. Set partial_cluster to + * its negative value so it won't be freed if shared + * with the current extent. The end < ee_block case + * is handled in ext4_ext_rm_leaf(). + */ + lblk = ex_end + 1; + err = ext4_ext_search_right(inode, path, &lblk, &pblk, + &ex); + if (err) + goto out; + if (pblk) + partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); } } /* @@ -2996,16 +3011,18 @@ again: trace_ext4_ext_remove_space_done(inode, start, end, depth, partial_cluster, path->p_hdr->eh_entries); - /* If we still have something in the partial cluster and we have removed + /* + * If we still have something in the partial cluster and we have removed * even the first extent, then we should free the blocks in the partial - * cluster as well. */ - if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { - int flags = get_default_free_blocks_flags(inode); - + * cluster as well. (This code will only run when there are no leaves + * to the immediate left of the truncated/punched region.) + */ + if (partial_cluster > 0 && err == 0) { + /* don't zero partial_cluster since it's not used afterwards */ ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(EXT4_SB(sb), partial_cluster), - EXT4_SB(sb)->s_cluster_ratio, flags); - partial_cluster = 0; + EXT4_C2B(sbi, partial_cluster), + sbi->s_cluster_ratio, + get_default_free_blocks_flags(inode)); } /* TODO: flexible tree reduction should be here */ @@ -4267,6 +4284,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; int set_unwritten = 0; + bool map_from_cluster = false; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -4343,10 +4361,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } } - if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero @@ -4356,15 +4370,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * put just found gap into cache to speed up * subsequent requests */ - if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } /* * Okay, we need to do block allocation. */ - map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); @@ -4376,7 +4388,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + map_from_cluster = true; goto got_allocated_blocks; } @@ -4397,7 +4409,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + map_from_cluster = true; goto got_allocated_blocks; } @@ -4523,7 +4535,7 @@ got_allocated_blocks: */ reserved_clusters = get_reserved_cluster_alloc(inode, map->m_lblk, allocated); - if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { + if (map_from_cluster) { if (reserved_clusters) { /* * We have clusters reserved for this range. @@ -4620,7 +4632,6 @@ out2: trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); - ext4_es_lru_add(inode); return err ? err : allocated; } @@ -5140,7 +5151,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ext4_has_inline_data(inode)) { int has_inline = 1; - error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline, + start, len); if (has_inline) return error; @@ -5179,7 +5191,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, error = ext4_fill_fiemap_extents(inode, start_blk, len_blks, fieinfo); } - ext4_es_lru_add(inode); return error; } @@ -5239,8 +5250,6 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, return -EIO; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); - if (!ex_last) - return -EIO; err = ext4_access_path(handle, inode, path + depth); if (err) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 94e7855ae71b..e04d45733976 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -147,10 +147,9 @@ static struct kmem_cache *ext4_es_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end); -static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, - int nr_to_scan); -static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, - struct ext4_inode_info *locked_ei); +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); int __init ext4_init_es(void) { @@ -298,6 +297,36 @@ out: trace_ext4_es_find_delayed_extent_range_exit(inode, es); } +static void ext4_es_list_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (!list_empty(&ei->i_es_list)) + return; + + spin_lock(&sbi->s_es_lock); + if (list_empty(&ei->i_es_list)) { + list_add_tail(&ei->i_es_list, &sbi->s_es_list); + sbi->s_es_nr_inode++; + } + spin_unlock(&sbi->s_es_lock); +} + +static void ext4_es_list_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lock); + if (!list_empty(&ei->i_es_list)) { + list_del_init(&ei->i_es_list); + sbi->s_es_nr_inode--; + WARN_ON_ONCE(sbi->s_es_nr_inode < 0); + } + spin_unlock(&sbi->s_es_lock); +} + static struct extent_status * ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) @@ -314,9 +343,10 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, * We don't count delayed extent because we never try to reclaim them */ if (!ext4_es_is_delayed(es)) { - EXT4_I(inode)->i_es_lru_nr++; + if (!EXT4_I(inode)->i_es_shk_nr++) + ext4_es_list_add(inode); percpu_counter_inc(&EXT4_SB(inode->i_sb)-> - s_es_stats.es_stats_lru_cnt); + s_es_stats.es_stats_shk_cnt); } EXT4_I(inode)->i_es_all_nr++; @@ -330,12 +360,13 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) EXT4_I(inode)->i_es_all_nr--; percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); - /* Decrease the lru counter when this es is not delayed */ + /* Decrease the shrink counter when this es is not delayed */ if (!ext4_es_is_delayed(es)) { - BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); - EXT4_I(inode)->i_es_lru_nr--; + BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); + if (!--EXT4_I(inode)->i_es_shk_nr) + ext4_es_list_del(inode); percpu_counter_dec(&EXT4_SB(inode->i_sb)-> - s_es_stats.es_stats_lru_cnt); + s_es_stats.es_stats_shk_cnt); } kmem_cache_free(ext4_es_cachep, es); @@ -351,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { - if (ext4_es_status(es1) != ext4_es_status(es2)) + if (ext4_es_type(es1) != ext4_es_type(es2)) return 0; if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { @@ -394,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es1, es)) { es1->es_len += es->es_len; + if (ext4_es_is_referenced(es)) + ext4_es_set_referenced(es1); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); es = es1; @@ -416,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es, es1)) { es->es_len += es1->es_len; + if (ext4_es_is_referenced(es1)) + ext4_es_set_referenced(es); rb_erase(node, &tree->root); ext4_es_free_extent(inode, es1); } @@ -683,8 +718,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, goto error; retry: err = __es_insert_extent(inode, &newes); - if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, - EXT4_I(inode))) + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) goto retry; if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) err = 0; @@ -782,6 +817,8 @@ out: es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; + if (!ext4_es_is_referenced(es)) + ext4_es_set_referenced(es); stats->es_stats_cache_hits++; } else { stats->es_stats_cache_misses++; @@ -841,8 +878,8 @@ retry: es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; if ((err == -ENOMEM) && - __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, - EXT4_I(inode))) + __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) goto retry; goto out; } @@ -914,6 +951,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, end = lblk + len - 1; BUG_ON(end < lblk); + /* + * ext4_clear_inode() depends on us taking i_es_lock unconditionally + * so that we are sure __es_shrink() is done with the inode before it + * is reclaimed. + */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end); write_unlock(&EXT4_I(inode)->i_es_lock); @@ -921,114 +963,75 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } -static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, - struct list_head *b) -{ - struct ext4_inode_info *eia, *eib; - eia = list_entry(a, struct ext4_inode_info, i_es_lru); - eib = list_entry(b, struct ext4_inode_info, i_es_lru); - - if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && - !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) - return 1; - if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && - ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) - return -1; - if (eia->i_touch_when == eib->i_touch_when) - return 0; - if (time_after(eia->i_touch_when, eib->i_touch_when)) - return 1; - else - return -1; -} - -static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, - struct ext4_inode_info *locked_ei) +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; struct ext4_es_stats *es_stats; - struct list_head *cur, *tmp; - LIST_HEAD(skipped); ktime_t start_time; u64 scan_time; + int nr_to_walk; int nr_shrunk = 0; - int retried = 0, skip_precached = 1, nr_skipped = 0; + int retried = 0, nr_skipped = 0; es_stats = &sbi->s_es_stats; start_time = ktime_get(); - spin_lock(&sbi->s_es_lru_lock); retry: - list_for_each_safe(cur, tmp, &sbi->s_es_lru) { - int shrunk; - - /* - * If we have already reclaimed all extents from extent - * status tree, just stop the loop immediately. - */ - if (percpu_counter_read_positive( - &es_stats->es_stats_lru_cnt) == 0) - break; - - ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + spin_lock(&sbi->s_es_lock); + nr_to_walk = sbi->s_es_nr_inode; + while (nr_to_walk-- > 0) { + if (list_empty(&sbi->s_es_list)) { + spin_unlock(&sbi->s_es_lock); + goto out; + } + ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, + i_es_list); + /* Move the inode to the tail */ + list_move_tail(&ei->i_es_list, &sbi->s_es_list); /* - * Skip the inode that is newer than the last_sorted - * time. Normally we try hard to avoid shrinking - * precached inodes, but we will as a last resort. + * Normally we try hard to avoid shrinking precached inodes, + * but we will as a last resort. */ - if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || - (skip_precached && ext4_test_inode_state(&ei->vfs_inode, - EXT4_STATE_EXT_PRECACHED))) { + if (!retried && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) { nr_skipped++; - list_move_tail(cur, &skipped); continue; } - if (ei->i_es_lru_nr == 0 || ei == locked_ei || - !write_trylock(&ei->i_es_lock)) + if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { + nr_skipped++; continue; + } + /* + * Now we hold i_es_lock which protects us from inode reclaim + * freeing inode under us + */ + spin_unlock(&sbi->s_es_lock); - shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); - if (ei->i_es_lru_nr == 0) - list_del_init(&ei->i_es_lru); + nr_shrunk += es_reclaim_extents(ei, &nr_to_scan); write_unlock(&ei->i_es_lock); - nr_shrunk += shrunk; - nr_to_scan -= shrunk; - if (nr_to_scan == 0) - break; + if (nr_to_scan <= 0) + goto out; + spin_lock(&sbi->s_es_lock); } - - /* Move the newer inodes into the tail of the LRU list. */ - list_splice_tail(&skipped, &sbi->s_es_lru); - INIT_LIST_HEAD(&skipped); + spin_unlock(&sbi->s_es_lock); /* * If we skipped any inodes, and we weren't able to make any - * forward progress, sort the list and try again. + * forward progress, try again to scan precached inodes. */ if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; - list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - es_stats->es_stats_last_sorted = jiffies; - ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, - i_es_lru); - /* - * If there are no non-precached inodes left on the - * list, start releasing precached extents. - */ - if (ext4_test_inode_state(&ei->vfs_inode, - EXT4_STATE_EXT_PRECACHED)) - skip_precached = 0; goto retry; } - spin_unlock(&sbi->s_es_lru_lock); - if (locked_ei && nr_shrunk == 0) - nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan); +out: scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); if (likely(es_stats->es_stats_scan_time)) es_stats->es_stats_scan_time = (scan_time + @@ -1043,7 +1046,7 @@ retry: else es_stats->es_stats_shrunk = nr_shrunk; - trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, nr_skipped, retried); return nr_shrunk; } @@ -1055,7 +1058,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, struct ext4_sb_info *sbi; sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); - nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } @@ -1068,13 +1071,13 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; - ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) return ret; - nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); + nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; @@ -1102,28 +1105,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) return 0; /* here we just find an inode that has the max nr. of objects */ - spin_lock(&sbi->s_es_lru_lock); - list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { + spin_lock(&sbi->s_es_lock); + list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { inode_cnt++; if (max && max->i_es_all_nr < ei->i_es_all_nr) max = ei; else if (!max) max = ei; } - spin_unlock(&sbi->s_es_lru_lock); + spin_unlock(&sbi->s_es_lock); seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), - percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); + percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); seq_printf(seq, " %lu/%lu cache hits/misses\n", es_stats->es_stats_cache_hits, es_stats->es_stats_cache_misses); - if (es_stats->es_stats_last_sorted != 0) - seq_printf(seq, " %u ms last sorted interval\n", - jiffies_to_msecs(jiffies - - es_stats->es_stats_last_sorted)); if (inode_cnt) - seq_printf(seq, " %d inodes on lru list\n", inode_cnt); + seq_printf(seq, " %d inodes on list\n", inode_cnt); seq_printf(seq, "average:\n %llu us scan time\n", div_u64(es_stats->es_stats_scan_time, 1000)); @@ -1132,7 +1131,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) seq_printf(seq, "maximum:\n %lu inode (%u objects, %u reclaimable)\n" " %llu us max scan time\n", - max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, div_u64(es_stats->es_stats_max_scan_time, 1000)); return 0; @@ -1181,9 +1180,11 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { int err; - INIT_LIST_HEAD(&sbi->s_es_lru); - spin_lock_init(&sbi->s_es_lru_lock); - sbi->s_es_stats.es_stats_last_sorted = 0; + /* Make sure we have enough bits for physical block number */ + BUILD_BUG_ON(ES_SHIFT < 48); + INIT_LIST_HEAD(&sbi->s_es_list); + sbi->s_es_nr_inode = 0; + spin_lock_init(&sbi->s_es_lock); sbi->s_es_stats.es_stats_shrunk = 0; sbi->s_es_stats.es_stats_cache_hits = 0; sbi->s_es_stats.es_stats_cache_misses = 0; @@ -1192,7 +1193,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); if (err) return err; - err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL); + err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); if (err) goto err1; @@ -1210,7 +1211,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) return 0; err2: - percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); err1: percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); return err; @@ -1221,71 +1222,83 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) if (sbi->s_proc) remove_proc_entry("es_shrinker_info", sbi->s_proc); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); - percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); unregister_shrinker(&sbi->s_es_shrinker); } -void ext4_es_lru_add(struct inode *inode) +/* + * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at + * most *nr_to_scan extents, update *nr_to_scan accordingly. + * + * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. + * Increment *nr_shrunk by the number of reclaimed extents. Also update + * ei->i_es_shrink_lblk to where we should continue scanning. + */ +static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, + int *nr_to_scan, int *nr_shrunk) { - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - ei->i_touch_when = jiffies; - - if (!list_empty(&ei->i_es_lru)) - return; + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; + struct extent_status *es; + struct rb_node *node; - spin_lock(&sbi->s_es_lru_lock); - if (list_empty(&ei->i_es_lru)) - list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); - spin_unlock(&sbi->s_es_lru_lock); -} + es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); + if (!es) + goto out_wrap; + node = &es->rb_node; + while (*nr_to_scan > 0) { + if (es->es_lblk > end) { + ei->i_es_shrink_lblk = end + 1; + return 0; + } -void ext4_es_lru_del(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + (*nr_to_scan)--; + node = rb_next(&es->rb_node); + /* + * We can't reclaim delayed extent from status tree because + * fiemap, bigallic, and seek_data/hole need to use it. + */ + if (ext4_es_is_delayed(es)) + goto next; + if (ext4_es_is_referenced(es)) { + ext4_es_clear_referenced(es); + goto next; + } - spin_lock(&sbi->s_es_lru_lock); - if (!list_empty(&ei->i_es_lru)) - list_del_init(&ei->i_es_lru); - spin_unlock(&sbi->s_es_lru_lock); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + (*nr_shrunk)++; +next: + if (!node) + goto out_wrap; + es = rb_entry(node, struct extent_status, rb_node); + } + ei->i_es_shrink_lblk = es->es_lblk; + return 1; +out_wrap: + ei->i_es_shrink_lblk = 0; + return 0; } -static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, - int nr_to_scan) +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) { struct inode *inode = &ei->vfs_inode; - struct ext4_es_tree *tree = &ei->i_es_tree; - struct rb_node *node; - struct extent_status *es; - unsigned long nr_shrunk = 0; + int nr_shrunk = 0; + ext4_lblk_t start = ei->i_es_shrink_lblk; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if (ei->i_es_lru_nr == 0) + if (ei->i_es_shk_nr == 0) return 0; if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && __ratelimit(&_rs)) ext4_warning(inode->i_sb, "forced shrink of precached extents"); - node = rb_first(&tree->root); - while (node != NULL) { - es = rb_entry(node, struct extent_status, rb_node); - node = rb_next(&es->rb_node); - /* - * We can't reclaim delayed extent from status tree because - * fiemap, bigallic, and seek_data/hole need to use it. - */ - if (!ext4_es_is_delayed(es)) { - rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(inode, es); - nr_shrunk++; - if (--nr_to_scan == 0) - break; - } - } - tree->cache_es = NULL; + if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) && + start != 0) + es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk); + + ei->i_es_tree.cache_es = NULL; return nr_shrunk; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index efd5f970b501..691b52613ce4 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -29,25 +29,28 @@ /* * These flags live in the high bits of extent_status.es_pblk */ -#define ES_SHIFT 60 - -#define EXTENT_STATUS_WRITTEN (1 << 3) -#define EXTENT_STATUS_UNWRITTEN (1 << 2) -#define EXTENT_STATUS_DELAYED (1 << 1) -#define EXTENT_STATUS_HOLE (1 << 0) +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; -#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ - EXTENT_STATUS_UNWRITTEN | \ - EXTENT_STATUS_DELAYED | \ - EXTENT_STATUS_HOLE) +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) -#define ES_WRITTEN (1ULL << 63) -#define ES_UNWRITTEN (1ULL << 62) -#define ES_DELAYED (1ULL << 61) -#define ES_HOLE (1ULL << 60) +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) -#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ - ES_DELAYED | ES_HOLE) +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) struct ext4_sb_info; struct ext4_extent; @@ -65,14 +68,13 @@ struct ext4_es_tree { }; struct ext4_es_stats { - unsigned long es_stats_last_sorted; unsigned long es_stats_shrunk; unsigned long es_stats_cache_hits; unsigned long es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; - struct percpu_counter es_stats_lru_cnt; + struct percpu_counter es_stats_shk_cnt; }; extern int __init ext4_init_es(void); @@ -93,29 +95,49 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode, extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + static inline int ext4_es_is_written(struct extent_status *es) { - return (es->es_pblk & ES_WRITTEN) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { - return (es->es_pblk & ES_UNWRITTEN) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { - return (es->es_pblk & ES_DELAYED) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { - return (es->es_pblk & ES_HOLE) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } -static inline unsigned int ext4_es_status(struct extent_status *es) +static inline void ext4_es_set_referenced(struct extent_status *es) { - return es->es_pblk >> ES_SHIFT; + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status *es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) @@ -135,23 +157,19 @@ static inline void ext4_es_store_pblock(struct extent_status *es, static inline void ext4_es_store_status(struct extent_status *es, unsigned int status) { - es->es_pblk = (((ext4_fsblk_t) - (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | - (es->es_pblk & ~ES_MASK)); + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { - es->es_pblk = (((ext4_fsblk_t) - (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | - (pb & ~ES_MASK)); + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); -extern void ext4_es_lru_add(struct inode *inode); -extern void ext4_es_lru_del(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8131be8c0af3..33a09da16c9c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct mutex *aio_mutex = NULL; struct blk_plug plug; - int o_direct = file->f_flags & O_DIRECT; + int o_direct = io_is_direct(file); int overwrite = 0; size_t length = iov_iter_count(from); ssize_t ret; @@ -191,17 +191,41 @@ errout: return ret; } +#ifdef CONFIG_FS_DAX +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext4_get_block); + /* Is this the right get_block? */ +} + +static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext4_get_block); +} + +static const struct vm_operations_struct ext4_dax_vm_ops = { + .fault = ext4_dax_fault, + .page_mkwrite = ext4_dax_mkwrite, +}; +#else +#define ext4_dax_vm_ops ext4_file_vm_ops +#endif + static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); - vma->vm_ops = &ext4_file_vm_ops; + if (IS_DAX(file_inode(file))) { + vma->vm_ops = &ext4_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + } else { + vma->vm_ops = &ext4_file_vm_ops; + } return 0; } @@ -600,6 +624,26 @@ const struct file_operations ext4_file_operations = { .fallocate = ext4_fallocate, }; +#ifdef CONFIG_FS_DAX +const struct file_operations ext4_dax_file_operations = { + .llseek = ext4_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + /* Splice not yet supported with DAX */ + .fallocate = ext4_fallocate, +}; +#endif + const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 36b369697a13..45fe924f82bc 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -689,14 +689,22 @@ retry: inode_dio_done(inode); goto locked; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, offset, - ext4_get_block, NULL, NULL, 0); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, 0); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: - ret = blockdev_direct_IO(rw, iocb, inode, iter, - offset, ext4_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); @@ -1393,10 +1401,7 @@ end_range: * to free. Everything was covered by the start * of the range. */ - return 0; - } else { - /* Shared branch grows from an indirect block */ - partial2--; + goto do_indirects; } } else { /* @@ -1427,56 +1432,96 @@ end_range: /* Punch happened within the same level (n == n2) */ partial = ext4_find_shared(inode, n, offsets, chain, &nr); partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); - /* - * ext4_find_shared returns Indirect structure which - * points to the last element which should not be - * removed by truncate. But this is end of the range - * in punch_hole so we need to point to the next element - */ - partial2->p++; - while ((partial > chain) || (partial2 > chain2)) { - /* We're at the same block, so we're almost finished */ - if ((partial->bh && partial2->bh) && - (partial->bh->b_blocknr == partial2->bh->b_blocknr)) { - if ((partial > chain) && (partial2 > chain2)) { + + /* Free top, but only if partial2 isn't its subtree. */ + if (nr) { + int level = min(partial - chain, partial2 - chain2); + int i; + int subtree = 1; + + for (i = 0; i <= level; i++) { + if (offsets[i] != offsets2[i]) { + subtree = 0; + break; + } + } + + if (!subtree) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, + (chain+n-1) - partial); + *partial->p = 0; + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, - partial->p + 1, - partial2->p, + partial->p, + partial->p+1, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); } - return 0; } + } + + if (!nr2) { /* - * Clear the ends of indirect blocks on the shared branch - * at the start of the range + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element */ - if (partial > chain) { + partial2->p++; + } + + while (partial > chain || partial2 > chain2) { + int depth = (chain+n-1) - partial; + int depth2 = (chain2+n2-1) - partial2; + + if (partial > chain && partial2 > chain2 && + partial->bh->b_blocknr == partial2->bh->b_blocknr) { + /* + * We've converged on the same block. Clear the range, + * then we're done. + */ ext4_free_branches(handle, inode, partial->bh, - partial->p + 1, - (__le32 *)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); + partial->p + 1, + partial2->p, + (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); - partial--; + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + return 0; } + /* - * Clear the ends of indirect blocks on the shared branch - * at the end of the range + * The start and end partial branches may not be at the same + * level even though the punch happened within one level. So, we + * give them a chance to arrive at the same level, then walk + * them in step with each other until we converge on the same + * block. */ - if (partial2 > chain2) { + if (partial > chain && depth <= depth2) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + if (partial2 > chain2 && depth2 <= depth) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, - (chain2+n-1) - partial2); + (chain2+n2-1) - partial2); BUFFER_TRACE(partial2->bh, "call brelse"); brelse(partial2->bh); partial2--; } } + return 0; do_indirects: /* Kill the remaining (whole) subtrees */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3ea62695abce..4b143febf21f 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -811,8 +811,11 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, ret = __block_write_begin(page, 0, inline_size, ext4_da_get_block_prep); if (ret) { + up_read(&EXT4_I(inode)->xattr_sem); + unlock_page(page); + page_cache_release(page); ext4_truncate_failed_write(inode); - goto out; + return ret; } SetPageDirty(page); @@ -870,6 +873,12 @@ retry_journal: goto out_journal; } + /* + * We cannot recurse into the filesystem as the transaction + * is already started. + */ + flags |= AOP_FLAG_NOFS; + if (ret == -ENOSPC) { ret = ext4_da_convert_inline_data_to_extent(mapping, inode, @@ -882,11 +891,6 @@ retry_journal: goto out; } - /* - * We cannot recurse into the filesystem as the transaction - * is already started. - */ - flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { @@ -1807,11 +1811,12 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - int *has_inline) + int *has_inline, __u64 start, __u64 len) { __u64 physical = 0; - __u64 length; - __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; + __u64 inline_len; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_LAST; int error = 0; struct ext4_iloc iloc; @@ -1820,6 +1825,13 @@ int ext4_inline_data_fiemap(struct inode *inode, *has_inline = 0; goto out; } + inline_len = min_t(size_t, ext4_get_inline_size(inode), + i_size_read(inode)); + if (start >= inline_len) + goto out; + if (start + len < inline_len) + inline_len = start + len; + inline_len -= start; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -1828,11 +1840,10 @@ int ext4_inline_data_fiemap(struct inode *inode, physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; physical += offsetof(struct ext4_inode, i_block); - length = i_size_read(inode); if (physical) - error = fiemap_fill_next_extent(fieinfo, 0, physical, - length, flags); + error = fiemap_fill_next_extent(fieinfo, start, physical, + inline_len, flags); brelse(iloc.bh); out: up_read(&EXT4_I(inode)->xattr_sem); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3356ab5395f4..5cb9a212b86f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -416,11 +416,6 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) up_read((&EXT4_I(inode)->i_data_sem)); - /* - * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag - * because it shouldn't be marked in es_map->m_flags. - */ - map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); /* * We don't check m_len because extent will be collpased in status @@ -491,7 +486,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { - ext4_es_lru_add(inode); if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -663,6 +657,18 @@ has_zeroout: return retval; } +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -700,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + bh->b_assoc_map = inode->i_mapping; + bh->b_private = (void *)(unsigned long)iblock; + bh->b_end_io = ext4_end_io_unwritten; + } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -1013,6 +1024,7 @@ static int ext4_write_end(struct file *file, { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; @@ -1043,6 +1055,8 @@ static int ext4_write_end(struct file *file, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock @@ -1084,6 +1098,7 @@ static int ext4_journalled_write_end(struct file *file, { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; @@ -1116,6 +1131,9 @@ static int ext4_journalled_write_end(struct file *file, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) @@ -1393,7 +1411,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, &es)) { - ext4_es_lru_add(inode); if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); @@ -1434,24 +1451,12 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - if (ext4_has_inline_data(inode)) { - /* - * We will soon create blocks for this page, and let - * us pretend as if the blocks aren't allocated yet. - * In case of clusters, we have to handle the work - * of mapping from cluster so that the reserved space - * is calculated properly. - */ - if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + if (ext4_has_inline_data(inode)) retval = 0; - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, - EXT4_GET_BLOCKS_NO_PUT_HOLE); + else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(NULL, inode, map, 0); else - retval = ext4_ind_map_blocks(NULL, inode, map, - EXT4_GET_BLOCKS_NO_PUT_HOLE); + retval = ext4_ind_map_blocks(NULL, inode, map, 0); add_delayed: if (retval == 0) { @@ -1465,7 +1470,8 @@ add_delayed: * then we don't need to reserve it again. However we still need * to reserve metadata for every block we're going to write. */ - if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { + if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 || + !ext4_find_delalloc_cluster(inode, map->m_lblk)) { ret = ext4_da_reserve_space(inode, iblock); if (ret) { /* not enough space to reserve */ @@ -1481,11 +1487,6 @@ add_delayed: goto out_unlock; } - /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served - * and it should not appear on the bh->b_state. - */ - map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; - map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); @@ -3033,13 +3034,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, - offset, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func, + ext4_end_io_dio, dio_flags); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + get_block_func, + ext4_end_io_dio, NULL, dio_flags); /* * Put our reference to io_end. This can free the io_end structure e.g. @@ -3203,19 +3205,12 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -static int ext4_block_zero_page_range(handle_t *handle, +static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; + unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3228,14 +3223,6 @@ static int ext4_block_zero_page_range(handle_t *handle, return -ENOMEM; blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -3301,6 +3288,33 @@ unlock: } /* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + struct inode *inode = mapping->host; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + + if (IS_DAX(inode)) + return dax_zero_page_range(inode, from, length, ext4_get_block); + return __ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. We need to physically zero the tail end @@ -3643,7 +3657,7 @@ out_stop: * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the + * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. */ if (inode->i_nlink) @@ -3821,8 +3835,10 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + new_fl |= S_DAX; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4075,7 +4091,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; @@ -4162,6 +4181,65 @@ static int ext4_inode_blocks_set(handle_t *handle, return 0; } +struct other_inode { + unsigned long orig_ino; + struct ext4_inode *raw_inode; +}; + +static int other_inode_match(struct inode * inode, unsigned long ino, + void *data) +{ + struct other_inode *oi = (struct other_inode *) data; + + if ((inode->i_ino != ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | + I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + ((inode->i_state & I_DIRTY_TIME) == 0)) + return 0; + spin_lock(&inode->i_lock); + if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | + I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) && + (inode->i_state & I_DIRTY_TIME)) { + struct ext4_inode_info *ei = EXT4_I(inode); + + inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + spin_unlock(&inode->i_lock); + + spin_lock(&ei->i_raw_lock); + EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode); + ext4_inode_csum_set(inode, oi->raw_inode, ei); + spin_unlock(&ei->i_raw_lock); + trace_ext4_other_inode_update_time(inode, oi->orig_ino); + return -1; + } + spin_unlock(&inode->i_lock); + return -1; +} + +/* + * Opportunistically update the other time fields for other inodes in + * the same inode table block. + */ +static void ext4_update_other_inodes_time(struct super_block *sb, + unsigned long orig_ino, char *buf) +{ + struct other_inode oi; + unsigned long ino; + int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int inode_size = EXT4_INODE_SIZE(sb); + + oi.orig_ino = orig_ino; + ino = orig_ino & ~(inodes_per_block - 1); + for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { + if (ino == orig_ino) + continue; + oi.raw_inode = (struct ext4_inode *) buf; + (void) find_inode_nowait(sb, ino, other_inode_match, &oi); + } +} + /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the @@ -4271,10 +4349,11 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le16(ei->i_extra_isize); } } - ext4_inode_csum_set(inode, raw_inode, ei); - spin_unlock(&ei->i_raw_lock); + if (inode->i_sb->s_flags & MS_LAZYTIME) + ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, + bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); rc = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -4557,7 +4636,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ - truncate_pagecache(inode, inode->i_size); + truncate_pagecache(inode, inode->i_size); } /* * We want to call ext4_truncate() even if attr->ia_size == @@ -4863,11 +4942,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. + * + * If only the I_DIRTY_TIME flag is set, we can skip everything. If + * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need + * to copy into the on-disk inode structure are the timestamp files. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; + if (flags == I_DIRTY_TIME) + return; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) goto out; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bfda18a15592..f58a0d106726 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -78,8 +78,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); - ext4_es_lru_del(inode1); - ext4_es_lru_del(inode2); isize = i_size_read(inode1); i_size_write(inode1, i_size_read(inode2)); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dbfe15c2533c..8d1e60214ef0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2358,7 +2358,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) if (sbi->s_group_info) { memcpy(new_groupinfo, sbi->s_group_info, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); - ext4_kvfree(sbi->s_group_info); + kvfree(sbi->s_group_info); } sbi->s_group_info = new_groupinfo; sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); @@ -2385,7 +2385,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); - meta_group_info = kmalloc(metalen, GFP_KERNEL); + meta_group_info = kmalloc(metalen, GFP_NOFS); if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); @@ -2399,7 +2399,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); - meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; @@ -2428,7 +2428,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, { struct buffer_head *bh; meta_group_info[i]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); + kmalloc(sb->s_blocksize, GFP_NOFS); BUG_ON(meta_group_info[i]->bb_bitmap == NULL); bh = ext4_read_block_bitmap(sb, group); BUG_ON(bh == NULL); @@ -2495,7 +2495,7 @@ err_freebuddy: kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - ext4_kvfree(sbi->s_group_info); + kvfree(sbi->s_group_info); return -ENOMEM; } @@ -2708,12 +2708,11 @@ int ext4_mb_release(struct super_block *sb) EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - ext4_kvfree(sbi->s_group_info); + kvfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); - if (sbi->s_buddy_cache) - iput(sbi->s_buddy_cache); + iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { ext4_msg(sb, KERN_INFO, "mballoc: %u blocks %u reqs (%u success)", diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a432634f2e6a..3cb267aee802 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -592,7 +592,7 @@ err_out: /* * set the i_blocks count to zero - * so that the ext4_delete_inode does the + * so that the ext4_evict_inode() does the * right job * * We don't need to take the i_lock because diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 9f2311bc9c4f..370420bfae8d 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -267,12 +267,12 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; - unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; int err2, jblocks, retries = 0; int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + struct super_block *sb = orig_inode->i_sb; /* * It needs twice the amount of ordinary journal buffers because @@ -287,9 +287,6 @@ again: return 0; } - if (segment_eq(get_fs(), KERNEL_DS)) - w_flags |= AOP_FLAG_UNINTERRUPTIBLE; - orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; @@ -405,10 +402,13 @@ unlock_pages: page_cache_release(pagep[1]); stop_journal: ext4_journal_stop(handle); + if (*err == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + goto again; /* Buffer was busy because probably is pinned to journal transaction, * force transaction commit may help to free it. */ - if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, - &retries)) + if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) goto again; return replaced_count; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 426211882f72..28fe71a2904c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2235,7 +2235,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) @@ -2299,7 +2302,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); @@ -2814,7 +2820,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_orphan_add(handle, inode); inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); - retval = 0; end_unlink: brelse(bh); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index ca4588388fc3..8a8ec6293b19 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -24,6 +24,18 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT4_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + ext4_warning(sb, "won't resize using backup superblock at %llu", + (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + /* * We are not allowed to do online-resizing on a filesystem mounted * with error, because it can destroy the filesystem easily. */ @@ -758,18 +770,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", gdb_num); - /* - * If we are not using the primary superblock/GDT copy don't resize, - * because the user tools have no way of handling this. Probably a - * bad time to do it anyways. - */ - if (EXT4_SB(sb)->s_sbh->b_blocknr != - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { - ext4_warning(sb, "won't resize using backup superblock at %llu", - (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); - return -EPERM; - } - gdb_bh = sb_bread(sb, gdblock); if (!gdb_bh) return -EIO; @@ -856,7 +856,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - ext4_kvfree(o_group_desc); + kvfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); err = ext4_handle_dirty_super(handle, sb); @@ -866,7 +866,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return err; exit_inode: - ext4_kvfree(n_group_desc); + kvfree(n_group_desc); brelse(iloc.bh); exit_dind: brelse(dind); @@ -909,7 +909,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb, n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - ext4_kvfree(o_group_desc); + kvfree(o_group_desc); BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2c9e6864abd9..e061e66c8280 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -176,15 +176,6 @@ void *ext4_kvzalloc(size_t size, gfp_t flags) return ret; } -void ext4_kvfree(void *ptr) -{ - if (is_vmalloc_addr(ptr)) - vfree(ptr); - else - kfree(ptr); - -} - ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { @@ -343,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func, static int block_device_ejected(struct super_block *sb) { struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(bd_inode); return bdi->dev == NULL; } @@ -811,8 +802,8 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); - ext4_kvfree(sbi->s_group_desc); - ext4_kvfree(sbi->s_flex_groups); + kvfree(sbi->s_group_desc); + kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -880,10 +871,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); - INIT_LIST_HEAD(&ei->i_es_lru); + INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; - ei->i_es_lru_nr = 0; - ei->i_touch_when = 0; + ei->i_es_shk_nr = 0; + ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -892,6 +883,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&(ei->i_block_reservation_lock)); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); @@ -972,7 +964,6 @@ void ext4_clear_inode(struct inode *inode) dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); - ext4_es_lru_del(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1055,10 +1046,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, struct path *path); -static int ext4_quota_on_sysfile(struct super_block *sb, int type, - int format_id); static int ext4_quota_off(struct super_block *sb, int type); -static int ext4_quota_off_sysfile(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1068,6 +1056,11 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static int ext4_enable_quotas(struct super_block *sb); +static struct dquot **ext4_get_dquots(struct inode *inode) +{ + return EXT4_I(inode)->i_dquot; +} + static const struct dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, @@ -1088,16 +1081,6 @@ static const struct quotactl_ops ext4_qctl_operations = { .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk }; - -static const struct quotactl_ops ext4_qctl_sysfile_operations = { - .quota_on_meta = ext4_quota_on_sysfile, - .quota_off = ext4_quota_off_sysfile, - .quota_sync = dquot_quota_sync, - .get_info = dquot_get_dqinfo, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .set_dqblk = dquot_set_dqblk -}; #endif static const struct super_operations ext4_sops = { @@ -1117,6 +1100,7 @@ static const struct super_operations ext4_sops = { #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, + .get_dquots = ext4_get_dquots, #endif .bdev_try_to_free_page = bdev_try_to_free_page, }; @@ -1140,13 +1124,14 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_lazytime, Opt_nolazytime, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, - Opt_max_dir_size_kb, + Opt_max_dir_size_kb, Opt_nojournal_checksum, }; static const match_table_t tokens = { @@ -1180,6 +1165,7 @@ static const match_table_t tokens = { {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_path, "journal_path=%s"}, {Opt_journal_checksum, "journal_checksum"}, + {Opt_nojournal_checksum, "nojournal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, @@ -1202,8 +1188,11 @@ static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, + {Opt_dax, "dax"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_removed, "mblk_io_submit"}, {Opt_removed, "nomblk_io_submit"}, @@ -1361,6 +1350,8 @@ static const struct mount_opts { MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, + MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | @@ -1384,6 +1375,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -1459,6 +1451,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, case Opt_i_version: sb->s_flags |= MS_I_VERSION; return 1; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + return 1; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + return 1; } for (m = ext4_mount_opts; m->token != Opt_err; m++) @@ -1620,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif +#ifndef CONFIG_FS_DAX + } else if (token == Opt_dax) { + ext4_msg(sb, KERN_INFO, "dax option not supported"); + return -1; +#endif } else { if (!args->from) arg = 1; @@ -1702,6 +1705,12 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " + "in data=ordered mode"); + return 0; + } return 1; } @@ -1939,7 +1948,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) memcpy(new_groups, sbi->s_flex_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups))); - ext4_kvfree(sbi->s_flex_groups); + kvfree(sbi->s_flex_groups); } sbi->s_flex_groups = new_groups; sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); @@ -2770,6 +2779,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) if (readonly) return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) { + ext4_msg(sb, KERN_INFO, "filesystem is read-only"); + sb->s_flags |= MS_RDONLY; + return 1; + } + /* Check that feature set is OK for a read-write mount */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " @@ -3310,7 +3325,7 @@ int ext4_calculate_overhead(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; - char *buf = (char *) get_zeroed_page(GFP_KERNEL); + char *buf = (char *) get_zeroed_page(GFP_NOFS); if (!buf) return -ENOMEM; @@ -3338,8 +3353,8 @@ int ext4_calculate_overhead(struct super_block *sb) memset(buf, 0, PAGE_SIZE); cond_resched(); } - /* Add the journal blocks as well */ - if (sbi->s_journal) + /* Add the internal journal blocks as well */ + if (sbi->s_journal && !sbi->journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); sbi->s_overhead = overhead; @@ -3476,7 +3491,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) - ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are " + ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); /* Check for a known checksum algorithm */ @@ -3596,6 +3611,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + goto failed_mount; + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } @@ -3659,6 +3679,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext4_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } + } + if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { @@ -3909,9 +3942,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); - init_timer(&sbi->s_err_report); - sbi->s_err_report.function = print_daily_error_info; - sbi->s_err_report.data = (unsigned long) sb; + setup_timer(&sbi->s_err_report, print_daily_error_info, + (unsigned long) sb); /* Register extent status tree shrinker */ if (ext4_es_register_shrinker(sbi)) @@ -3929,9 +3961,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) - sb->s_qcop = &ext4_qctl_sysfile_operations; + sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -4224,7 +4257,7 @@ failed_mount7: failed_mount6: ext4_mb_release(sb); if (sbi->s_flex_groups) - ext4_kvfree(sbi->s_flex_groups); + kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -4253,7 +4286,7 @@ failed_mount3: failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - ext4_kvfree(sbi->s_group_desc); + kvfree(sbi->s_group_desc); failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); @@ -4838,9 +4871,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal && sbi->s_journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; - /* - * Allow the "check" option to be passed as a remount option. - */ if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; @@ -4849,9 +4879,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " - "during remount not supported"); - err = -EINVAL; - goto restore_opts; + "during remount not supported; ignoring"); + sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { @@ -4867,6 +4896,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + err = -EINVAL; + goto restore_opts; + } + } + + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + ext4_msg(sb, KERN_WARNING, "warning: refusing change of " + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT4_MOUNT_DAX; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) @@ -4915,7 +4956,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_mark_recovery_complete(sb, es); } else { /* Make sure we can mount this feature set readwrite */ - if (!ext4_feature_set_ok(sb, 0)) { + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_READONLY) || + !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } @@ -5005,6 +5048,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } #endif + *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME); ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); kfree(orig_data); return 0; @@ -5273,21 +5317,6 @@ static int ext4_enable_quotas(struct super_block *sb) return 0; } -/* - * quota_on function that is used when QUOTA feature is set. - */ -static int ext4_quota_on_sysfile(struct super_block *sb, int type, - int format_id) -{ - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) - return -EINVAL; - - /* - * USAGE was enabled at mount time. Only need to enable LIMITS now. - */ - return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED); -} - static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; @@ -5314,18 +5343,6 @@ out: return dquot_quota_off(sb, type); } -/* - * quota_off function that is used when QUOTA feature is set. - */ -static int ext4_quota_off_sysfile(struct super_block *sb, int type) -{ - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) - return -EINVAL; - - /* Disable only the limits. */ - return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); -} - /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 736a348509f7..94e2d2ffabe1 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -71,3 +71,13 @@ config F2FS_CHECK_FS Enables BUG_ONs which check the filesystem consistency in runtime. If you want to improve the performance, say N. + +config F2FS_IO_TRACE + bool "F2FS IO tracer" + depends on F2FS_FS + depends on FUNCTION_TRACER + help + F2FS IO trace is based on a function trace, which gathers process + information and block IO patterns in the filesystem level. + + If unsure, say N. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 2e35da12d292..d92397731db8 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -5,3 +5,4 @@ f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o +f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 83b9b5a8d112..742202779bd5 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -62,7 +62,7 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) if (count == 0) return NULL; - acl = posix_acl_alloc(count, GFP_KERNEL); + acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); @@ -116,7 +116,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) int i; f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * - sizeof(struct f2fs_acl_entry), GFP_KERNEL); + sizeof(struct f2fs_acl_entry), GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -162,7 +162,8 @@ fail: return ERR_PTR(-EINVAL); } -struct posix_acl *f2fs_get_acl(struct inode *inode, int type) +static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, + struct page *dpage) { int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; void *value = NULL; @@ -172,12 +173,13 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) if (type == ACL_TYPE_ACCESS) name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - retval = f2fs_getxattr(inode, name_index, "", NULL, 0); + retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { value = kmalloc(retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); - retval = f2fs_getxattr(inode, name_index, "", value, retval); + retval = f2fs_getxattr(inode, name_index, "", value, + retval, dpage); } if (retval > 0) @@ -194,6 +196,11 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) return acl; } +struct posix_acl *f2fs_get_acl(struct inode *inode, int type) +{ + return __f2fs_get_acl(inode, type, NULL); +} + static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { @@ -229,7 +236,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (acl) { value = f2fs_acl_to_disk(acl, &size); if (IS_ERR(value)) { - cond_clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(fi, FI_ACL_MODE); return (int)PTR_ERR(value); } } @@ -240,7 +247,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (!error) set_cached_acl(inode, type, acl); - cond_clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(fi, FI_ACL_MODE); return error; } @@ -249,12 +256,137 @@ int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return __f2fs_set_acl(inode, type, acl, NULL); } -int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) +/* + * Most part of f2fs_acl_clone, f2fs_acl_create_masq, f2fs_acl_create + * are copied from posix_acl.c + */ +static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl, + gfp_t flags) +{ + struct posix_acl *clone = NULL; + + if (acl) { + int size = sizeof(struct posix_acl) + acl->a_count * + sizeof(struct posix_acl_entry); + clone = kmemdup(acl, size, flags); + if (clone) + atomic_set(&clone->a_refcount, 1); + } + return clone; +} + +static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) +{ + struct posix_acl_entry *pa, *pe; + struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; + umode_t mode = *mode_p; + int not_equiv = 0; + + /* assert(atomic_read(acl->a_refcount) == 1); */ + + FOREACH_ACL_ENTRY(pa, acl, pe) { + switch(pa->e_tag) { + case ACL_USER_OBJ: + pa->e_perm &= (mode >> 6) | ~S_IRWXO; + mode &= (pa->e_perm << 6) | ~S_IRWXU; + break; + + case ACL_USER: + case ACL_GROUP: + not_equiv = 1; + break; + + case ACL_GROUP_OBJ: + group_obj = pa; + break; + + case ACL_OTHER: + pa->e_perm &= mode | ~S_IRWXO; + mode &= pa->e_perm | ~S_IRWXO; + break; + + case ACL_MASK: + mask_obj = pa; + not_equiv = 1; + break; + + default: + return -EIO; + } + } + + if (mask_obj) { + mask_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (mask_obj->e_perm << 3) | ~S_IRWXG; + } else { + if (!group_obj) + return -EIO; + group_obj->e_perm &= (mode >> 3) | ~S_IRWXO; + mode &= (group_obj->e_perm << 3) | ~S_IRWXG; + } + + *mode_p = (*mode_p & ~S_IRWXUGO) | mode; + return not_equiv; +} + +static int f2fs_acl_create(struct inode *dir, umode_t *mode, + struct posix_acl **default_acl, struct posix_acl **acl, + struct page *dpage) +{ + struct posix_acl *p; + int ret; + + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) + goto no_acl; + + p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage); + if (IS_ERR(p)) { + if (p == ERR_PTR(-EOPNOTSUPP)) + goto apply_umask; + return PTR_ERR(p); + } + + if (!p) + goto apply_umask; + + *acl = f2fs_acl_clone(p, GFP_NOFS); + if (!*acl) + return -ENOMEM; + + ret = f2fs_acl_create_masq(*acl, mode); + if (ret < 0) { + posix_acl_release(*acl); + return -ENOMEM; + } + + if (ret == 0) { + posix_acl_release(*acl); + *acl = NULL; + } + + if (!S_ISDIR(*mode)) { + posix_acl_release(p); + *default_acl = NULL; + } else { + *default_acl = p; + } + return 0; + +apply_umask: + *mode &= ~current_umask(); +no_acl: + *default_acl = NULL; + *acl = NULL; + return 0; +} + +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, + struct page *dpage) { - struct posix_acl *default_acl, *acl; + struct posix_acl *default_acl = NULL, *acl = NULL; int error = 0; - error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage); if (error) return error; @@ -264,7 +396,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage) posix_acl_release(default_acl); } if (acl) { - if (error) + if (!error) error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, ipage); posix_acl_release(acl); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index e0864651cdc1..997ca8edb6cb 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -38,14 +38,15 @@ struct f2fs_acl_header { extern struct posix_acl *f2fs_get_acl(struct inode *, int); extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int f2fs_init_acl(struct inode *, struct inode *, struct page *); +extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, + struct page *); #else #define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, - struct page *page) + struct page *ipage, struct page *dpage) { return 0; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index dd10a031c052..7f794b72b3b7 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -20,10 +20,11 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "trace.h" #include <trace/events/f2fs.h> static struct kmem_cache *ino_entry_slab; -static struct kmem_cache *inode_entry_slab; +struct kmem_cache *inode_entry_slab; /* * We guarantee no failure on the returned page. @@ -50,6 +51,11 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { struct address_space *mapping = META_MAPPING(sbi); struct page *page; + struct f2fs_io_info fio = { + .type = META, + .rw = READ_SYNC | REQ_META | REQ_PRIO, + .blk_addr = index, + }; repeat: page = grab_cache_page(mapping, index); if (!page) { @@ -59,8 +65,7 @@ repeat: if (PageUptodate(page)) goto out; - if (f2fs_submit_page_bio(sbi, page, index, - READ_SYNC | REQ_META | REQ_PRIO)) + if (f2fs_submit_page_bio(sbi, page, &fio)) goto repeat; lock_page(page); @@ -72,36 +77,36 @@ out: return page; } -struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index) -{ - bool readahead = false; - struct page *page; - - page = find_get_page(META_MAPPING(sbi), index); - if (!page || (page && !PageUptodate(page))) - readahead = true; - f2fs_put_page(page, 0); - - if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR); - return get_meta_page(sbi, index); -} - -static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type) +static inline bool is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) { switch (type) { case META_NAT: - return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK; + break; case META_SIT: - return SIT_BLK_CNT(sbi); + if (unlikely(blkaddr >= SIT_BLK_CNT(sbi))) + return false; + break; case META_SSA: + if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) || + blkaddr < SM_I(sbi)->ssa_blkaddr)) + return false; + break; case META_CP: - return 0; + if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr || + blkaddr < __start_cp_addr(sbi))) + return false; + break; case META_POR: - return MAX_BLKADDR(sbi); + if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || + blkaddr < MAIN_BLKADDR(sbi))) + return false; + break; default: BUG(); } + + return true; } /* @@ -112,48 +117,43 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type block_t prev_blk_addr = 0; struct page *page; block_t blkno = start; - block_t max_blks = get_max_meta_blks(sbi, type); - struct f2fs_io_info fio = { .type = META, .rw = READ_SYNC | REQ_META | REQ_PRIO }; for (; nrpages-- > 0; blkno++) { - block_t blk_addr; + + if (!is_valid_blkaddr(sbi, blkno, type)) + goto out; switch (type) { case META_NAT: - /* get nat block addr */ - if (unlikely(blkno >= max_blks)) + if (unlikely(blkno >= + NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) blkno = 0; - blk_addr = current_nat_addr(sbi, + /* get nat block addr */ + fio.blk_addr = current_nat_addr(sbi, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: /* get sit block addr */ - if (unlikely(blkno >= max_blks)) - goto out; - blk_addr = current_sit_addr(sbi, + fio.blk_addr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); - if (blkno != start && prev_blk_addr + 1 != blk_addr) + if (blkno != start && prev_blk_addr + 1 != fio.blk_addr) goto out; - prev_blk_addr = blk_addr; + prev_blk_addr = fio.blk_addr; break; case META_SSA: case META_CP: case META_POR: - if (unlikely(blkno >= max_blks)) - goto out; - if (unlikely(blkno < SEG0_BLKADDR(sbi))) - goto out; - blk_addr = blkno; + fio.blk_addr = blkno; break; default: BUG(); } - page = grab_cache_page(META_MAPPING(sbi), blk_addr); + page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr); if (!page) continue; if (PageUptodate(page)) { @@ -161,7 +161,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type continue; } - f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); + f2fs_submit_page_mbio(sbi, page, &fio); f2fs_put_page(page, 0); } out: @@ -169,6 +169,20 @@ out: return blkno - start; } +void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct page *page; + bool readahead = false; + + page = find_get_page(META_MAPPING(sbi), index); + if (!page || (page && !PageUptodate(page))) + readahead = true; + f2fs_put_page(page, 0); + + if (readahead) + ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR); +} + static int f2fs_write_meta_page(struct page *page, struct writeback_control *wbc) { @@ -176,9 +190,9 @@ static int f2fs_write_meta_page(struct page *page, trace_f2fs_writepage(page, META); - if (unlikely(sbi->por_doing)) + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (wbc->for_reclaim) + if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) goto redirty_out; if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; @@ -187,6 +201,9 @@ static int f2fs_write_meta_page(struct page *page, write_meta_page(sbi, page); dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); + + if (wbc->for_reclaim) + f2fs_submit_merged_bio(sbi, META, WRITE); return 0; redirty_out: @@ -285,6 +302,8 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); + SetPagePrivate(page); + f2fs_trace_pid(page); return 1; } return 0; @@ -294,50 +313,63 @@ const struct address_space_operations f2fs_meta_aops = { .writepage = f2fs_write_meta_page, .writepages = f2fs_write_meta_pages, .set_page_dirty = f2fs_set_meta_page_dirty, + .invalidatepage = f2fs_invalidate_page, + .releasepage = f2fs_release_page, }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { + struct inode_management *im = &sbi->im[type]; struct ino_entry *e; retry: - spin_lock(&sbi->ino_lock[type]); + if (radix_tree_preload(GFP_NOFS)) { + cond_resched(); + goto retry; + } + + spin_lock(&im->ino_lock); - e = radix_tree_lookup(&sbi->ino_root[type], ino); + e = radix_tree_lookup(&im->ino_root, ino); if (!e) { e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC); if (!e) { - spin_unlock(&sbi->ino_lock[type]); + spin_unlock(&im->ino_lock); + radix_tree_preload_end(); goto retry; } - if (radix_tree_insert(&sbi->ino_root[type], ino, e)) { - spin_unlock(&sbi->ino_lock[type]); + if (radix_tree_insert(&im->ino_root, ino, e)) { + spin_unlock(&im->ino_lock); kmem_cache_free(ino_entry_slab, e); + radix_tree_preload_end(); goto retry; } memset(e, 0, sizeof(struct ino_entry)); e->ino = ino; - list_add_tail(&e->list, &sbi->ino_list[type]); + list_add_tail(&e->list, &im->ino_list); + if (type != ORPHAN_INO) + im->ino_num++; } - spin_unlock(&sbi->ino_lock[type]); + spin_unlock(&im->ino_lock); + radix_tree_preload_end(); } static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { + struct inode_management *im = &sbi->im[type]; struct ino_entry *e; - spin_lock(&sbi->ino_lock[type]); - e = radix_tree_lookup(&sbi->ino_root[type], ino); + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); if (e) { list_del(&e->list); - radix_tree_delete(&sbi->ino_root[type], ino); - if (type == ORPHAN_INO) - sbi->n_orphans--; - spin_unlock(&sbi->ino_lock[type]); + radix_tree_delete(&im->ino_root, ino); + im->ino_num--; + spin_unlock(&im->ino_lock); kmem_cache_free(ino_entry_slab, e); return; } - spin_unlock(&sbi->ino_lock[type]); + spin_unlock(&im->ino_lock); } void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -355,10 +387,12 @@ void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) /* mode should be APPEND_INO or UPDATE_INO */ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) { + struct inode_management *im = &sbi->im[mode]; struct ino_entry *e; - spin_lock(&sbi->ino_lock[mode]); - e = radix_tree_lookup(&sbi->ino_root[mode], ino); - spin_unlock(&sbi->ino_lock[mode]); + + spin_lock(&im->ino_lock); + e = radix_tree_lookup(&im->ino_root, ino); + spin_unlock(&im->ino_lock); return e ? true : false; } @@ -368,36 +402,42 @@ void release_dirty_inode(struct f2fs_sb_info *sbi) int i; for (i = APPEND_INO; i <= UPDATE_INO; i++) { - spin_lock(&sbi->ino_lock[i]); - list_for_each_entry_safe(e, tmp, &sbi->ino_list[i], list) { + struct inode_management *im = &sbi->im[i]; + + spin_lock(&im->ino_lock); + list_for_each_entry_safe(e, tmp, &im->ino_list, list) { list_del(&e->list); - radix_tree_delete(&sbi->ino_root[i], e->ino); + radix_tree_delete(&im->ino_root, e->ino); kmem_cache_free(ino_entry_slab, e); + im->ino_num--; } - spin_unlock(&sbi->ino_lock[i]); + spin_unlock(&im->ino_lock); } } int acquire_orphan_inode(struct f2fs_sb_info *sbi) { + struct inode_management *im = &sbi->im[ORPHAN_INO]; int err = 0; - spin_lock(&sbi->ino_lock[ORPHAN_INO]); - if (unlikely(sbi->n_orphans >= sbi->max_orphans)) + spin_lock(&im->ino_lock); + if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else - sbi->n_orphans++; - spin_unlock(&sbi->ino_lock[ORPHAN_INO]); + im->ino_num++; + spin_unlock(&im->ino_lock); return err; } void release_orphan_inode(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->ino_lock[ORPHAN_INO]); - f2fs_bug_on(sbi, sbi->n_orphans == 0); - sbi->n_orphans--; - spin_unlock(&sbi->ino_lock[ORPHAN_INO]); + struct inode_management *im = &sbi->im[ORPHAN_INO]; + + spin_lock(&im->ino_lock); + f2fs_bug_on(sbi, im->ino_num == 0); + im->ino_num--; + spin_unlock(&im->ino_lock); } void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -429,7 +469,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) return; - sbi->por_doing = true; + set_sbi_flag(sbi, SBI_POR_DOING); start_blk = __start_cp_addr(sbi) + 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); @@ -450,7 +490,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi) } /* clear Orphan Flag */ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); - sbi->por_doing = false; + clear_sbi_flag(sbi, SBI_POR_DOING); return; } @@ -460,17 +500,19 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) struct f2fs_orphan_block *orphan_blk = NULL; unsigned int nentries = 0; unsigned short index; - unsigned short orphan_blocks = - (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans); + unsigned short orphan_blocks; struct page *page = NULL; struct ino_entry *orphan = NULL; + struct inode_management *im = &sbi->im[ORPHAN_INO]; + + orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num); for (index = 0; index < orphan_blocks; index++) grab_meta_page(sbi, start_blk + index); index = 1; - spin_lock(&sbi->ino_lock[ORPHAN_INO]); - head = &sbi->ino_list[ORPHAN_INO]; + spin_lock(&im->ino_lock); + head = &im->ino_list; /* loop for each orphan inode entry and write them in Jornal block */ list_for_each_entry(orphan, head, list) { @@ -510,7 +552,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) f2fs_put_page(page, 1); } - spin_unlock(&sbi->ino_lock[ORPHAN_INO]); + spin_unlock(&im->ino_lock); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -532,7 +574,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (crc_offset >= blk_size) goto invalid_cp1; - crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); + crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp1; @@ -547,7 +589,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, if (crc_offset >= blk_size) goto invalid_cp2; - crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset))); + crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); if (!f2fs_crc_valid(crc, cp_block, crc_offset)) goto invalid_cp2; @@ -634,7 +676,7 @@ fail_no_cp: return -EINVAL; } -static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) +static int __add_dirty_inode(struct inode *inode, struct inode_entry *new) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -651,7 +693,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) void update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dir_inode_entry *new; + struct inode_entry *new; int ret = 0; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) @@ -675,12 +717,13 @@ void update_dirty_page(struct inode *inode, struct page *page) kmem_cache_free(inode_entry_slab, new); out: SetPagePrivate(page); + f2fs_trace_pid(page); } void add_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dir_inode_entry *new = + struct inode_entry *new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); int ret = 0; @@ -698,7 +741,7 @@ void add_dirty_dir_inode(struct inode *inode) void remove_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dir_inode_entry *entry; + struct inode_entry *entry; if (!S_ISDIR(inode->i_mode)) return; @@ -728,9 +771,12 @@ void remove_dirty_dir_inode(struct inode *inode) void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) { struct list_head *head; - struct dir_inode_entry *entry; + struct inode_entry *entry; struct inode *inode; retry: + if (unlikely(f2fs_cp_error(sbi))) + return; + spin_lock(&sbi->dir_inode_lock); head = &sbi->dir_inode_list; @@ -738,7 +784,7 @@ retry: spin_unlock(&sbi->dir_inode_lock); return; } - entry = list_entry(head->next, struct dir_inode_entry, list); + entry = list_entry(head->next, struct inode_entry, list); inode = igrab(entry->inode); spin_unlock(&sbi->dir_inode_lock); if (inode) { @@ -830,6 +876,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; nid_t last_nid = nm_i->next_scan_nid; block_t start_blk; struct page *cp_page; @@ -883,34 +930,41 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->next_free_nid = cpu_to_le32(last_nid); /* 2 cp + n data seg summary + orphan inode blocks */ - data_sum_blocks = npages_for_summary_flush(sbi); + data_sum_blocks = npages_for_summary_flush(sbi, false); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans); + orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + orphan_blocks); - if (cpc->reason == CP_UMOUNT) { - set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + if (__remain_node_summaries(cpc->reason)) ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ cp_payload_blks + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); - } else { - clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks); - } - if (sbi->n_orphans) + if (cpc->reason == CP_UMOUNT) + set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason == CP_FASTBOOT) + set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); else clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - if (sbi->need_fsck) + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) set_ckpt_flags(ckpt, CP_FSCK_FLAG); /* update SIT/NAT bitmap */ @@ -927,27 +981,26 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write out checkpoint buffer at block 0 */ cp_page = grab_meta_page(sbi, start_blk++); kaddr = page_address(cp_page); - memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); + memcpy(kaddr, ckpt, F2FS_BLKSIZE); set_page_dirty(cp_page); f2fs_put_page(cp_page, 1); for (i = 1; i < 1 + cp_payload_blks; i++) { cp_page = grab_meta_page(sbi, start_blk++); kaddr = page_address(cp_page); - memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, - (1 << sbi->log_blocksize)); + memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE); set_page_dirty(cp_page); f2fs_put_page(cp_page, 1); } - if (sbi->n_orphans) { + if (orphan_num) { write_orphan_inodes(sbi, start_blk); start_blk += orphan_blocks; } write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; - if (cpc->reason == CP_UMOUNT) { + if (__remain_node_summaries(cpc->reason)) { write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; } @@ -955,7 +1008,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* writeout checkpoint block */ cp_page = grab_meta_page(sbi, start_blk); kaddr = page_address(cp_page); - memcpy(kaddr, ckpt, (1 << sbi->log_blocksize)); + memcpy(kaddr, ckpt, F2FS_BLKSIZE); set_page_dirty(cp_page); f2fs_put_page(cp_page, 1); @@ -975,13 +1028,16 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + /* wait for previous submitted meta pages writeback */ + wait_on_all_pages_writeback(sbi); + release_dirty_inode(sbi); if (unlikely(f2fs_cp_error(sbi))) return; clear_prefree_segments(sbi); - F2FS_RESET_SB_DIRT(sbi); + clear_sbi_flag(sbi, SBI_IS_DIRTY); } /* @@ -996,10 +1052,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); - if (!sbi->s_dirty && cpc->reason != CP_DISCARD) + if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && + cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT) goto out; if (unlikely(f2fs_cp_error(sbi))) goto out; + if (f2fs_readonly(sbi->sb)) + goto out; if (block_operations(sbi)) goto out; @@ -1036,9 +1095,12 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi) int i; for (i = 0; i < MAX_INO_ENTRY; i++) { - INIT_RADIX_TREE(&sbi->ino_root[i], GFP_ATOMIC); - spin_lock_init(&sbi->ino_lock[i]); - INIT_LIST_HEAD(&sbi->ino_list[i]); + struct inode_management *im = &sbi->im[i]; + + INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC); + spin_lock_init(&im->ino_lock); + INIT_LIST_HEAD(&im->ino_list); + im->ino_num = 0; } /* @@ -1047,7 +1109,6 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi) * orphan entries with the limitation one reserved segment * for cp pack we can have max 1020*504 orphan entries */ - sbi->n_orphans = 0; sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK; } @@ -1058,8 +1119,8 @@ int __init create_checkpoint_caches(void) sizeof(struct ino_entry)); if (!ino_entry_slab) return -ENOMEM; - inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", - sizeof(struct dir_inode_entry)); + inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", + sizeof(struct inode_entry)); if (!inode_entry_slab) { kmem_cache_destroy(ino_entry_slab); return -ENOMEM; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8e58c4cc2cb9..985ed023a750 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -22,6 +22,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "trace.h" #include <trace/events/f2fs.h> static void f2fs_read_end_io(struct bio *bio, int err) @@ -61,11 +62,6 @@ static void f2fs_write_end_io(struct bio *bio, int err) dec_page_count(sbi, F2FS_WRITEBACK); } - if (sbi->wait_io) { - complete(sbi->wait_io); - sbi->wait_io = NULL; - } - if (!get_pages(sbi, F2FS_WRITEBACK) && !list_empty(&sbi->cp_wait.task_list)) wake_up(&sbi->cp_wait); @@ -95,34 +91,16 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; - int rw; if (!io->bio) return; - rw = fio->rw; - - if (is_read_io(rw)) { - trace_f2fs_submit_read_bio(io->sbi->sb, rw, - fio->type, io->bio); - submit_bio(rw, io->bio); - } else { - trace_f2fs_submit_write_bio(io->sbi->sb, rw, - fio->type, io->bio); - /* - * META_FLUSH is only from the checkpoint procedure, and we - * should wait this metadata bio for FS consistency. - */ - if (fio->type == META_FLUSH) { - DECLARE_COMPLETION_ONSTACK(wait); - io->sbi->wait_io = &wait; - submit_bio(rw, io->bio); - wait_for_completion(&wait); - } else { - submit_bio(rw, io->bio); - } - } + if (is_read_io(fio->rw)) + trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); + else + trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); + submit_bio(fio->rw, io->bio); io->bio = NULL; } @@ -153,14 +131,15 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, * Return unlocked page. */ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, int rw) + struct f2fs_io_info *fio) { struct bio *bio; - trace_f2fs_submit_page_bio(page, blk_addr, rw); + trace_f2fs_submit_page_bio(page, fio); + f2fs_trace_ios(page, fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw)); + bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw)); if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { bio_put(bio); @@ -168,12 +147,12 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, return -EFAULT; } - submit_bio(rw, bio); + submit_bio(fio->rw, bio); return 0; } void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, - block_t blk_addr, struct f2fs_io_info *fio) + struct f2fs_io_info *fio) { enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; @@ -181,21 +160,21 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, io = is_read ? &sbi->read_io : &sbi->write_io[btype]; - verify_block_addr(sbi, blk_addr); + verify_block_addr(sbi, fio->blk_addr); down_write(&io->io_rwsem); if (!is_read) inc_page_count(sbi, F2FS_WRITEBACK); - if (io->bio && (io->last_block_in_bio != blk_addr - 1 || + if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 || io->fio.rw != fio->rw)) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); + io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read); io->fio = *fio; } @@ -205,10 +184,11 @@ alloc_new: goto alloc_new; } - io->last_block_in_bio = blk_addr; + io->last_block_in_bio = fio->blk_addr; + f2fs_trace_ios(page, fio, 0); up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr); + trace_f2fs_submit_page_mbio(page, fio); } /* @@ -217,7 +197,7 @@ alloc_new: * ->node_page * update block addresses in the node page */ -static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) +static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn; __le32 *addr_array; @@ -230,7 +210,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[ofs_in_node] = cpu_to_le32(new_addr); + addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); set_page_dirty(node_page); } @@ -245,8 +225,8 @@ int reserve_new_block(struct dnode_of_data *dn) trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); - __set_data_blkaddr(dn, NEW_ADDR); dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; @@ -257,9 +237,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) bool need_put = dn->inode_page ? false : true; int err; - /* if inode_page exists, index should be zero */ - f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index); - err = get_dnode_of_data(dn, index, ALLOC_NODE); if (err) return err; @@ -297,7 +274,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, unsigned int blkbits = inode->i_sb->s_blocksize_bits; size_t count; - clear_buffer_new(bh_result); + set_buffer_new(bh_result); map_bh(bh_result, inode->i_sb, start_blkaddr + pgofs - start_fofs); count = end_fofs - pgofs + 1; @@ -314,23 +291,24 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, return 0; } -void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) +void update_extent_cache(struct dnode_of_data *dn) { struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs, start_fofs, end_fofs; block_t start_blkaddr, end_blkaddr; int need_update = true; - f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR); - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; + f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); /* Update the page address in the parent node */ - __set_data_blkaddr(dn, blk_addr); + __set_data_blkaddr(dn); if (is_inode_flag_set(fi, FI_NO_EXTENT)) return; + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; + write_lock(&fi->ext.ext_lock); start_fofs = fi->ext.fofs; @@ -344,16 +322,16 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) /* Initial extent */ if (fi->ext.len == 0) { - if (blk_addr != NULL_ADDR) { + if (dn->data_blkaddr != NULL_ADDR) { fi->ext.fofs = fofs; - fi->ext.blk_addr = blk_addr; + fi->ext.blk_addr = dn->data_blkaddr; fi->ext.len = 1; } goto end_update; } /* Front merge */ - if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) { + if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) { fi->ext.fofs--; fi->ext.blk_addr--; fi->ext.len++; @@ -361,7 +339,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) } /* Back merge */ - if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) { + if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) { fi->ext.len++; goto end_update; } @@ -400,6 +378,10 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) struct dnode_of_data dn; struct page *page; int err; + struct f2fs_io_info fio = { + .type = DATA, + .rw = sync ? READ_SYNC : READA, + }; page = find_get_page(mapping, index); if (page && PageUptodate(page)) @@ -428,8 +410,8 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return page; } - err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr, - sync ? READ_SYNC : READA); + fio.blk_addr = dn.data_blkaddr; + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); if (err) return ERR_PTR(err); @@ -454,7 +436,10 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) struct dnode_of_data dn; struct page *page; int err; - + struct f2fs_io_info fio = { + .type = DATA, + .rw = READ_SYNC, + }; repeat: page = grab_cache_page(mapping, index); if (!page) @@ -488,8 +473,8 @@ repeat: return page; } - err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, - dn.data_blkaddr, READ_SYNC); + fio.blk_addr = dn.data_blkaddr; + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); if (err) return ERR_PTR(err); @@ -539,8 +524,12 @@ repeat: zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { - err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, - dn.data_blkaddr, READ_SYNC); + struct f2fs_io_info fio = { + .type = DATA, + .rw = READ_SYNC, + .blk_addr = dn.data_blkaddr, + }; + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); if (err) goto put_err; @@ -574,30 +563,25 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_inode_info *fi = F2FS_I(dn->inode); struct f2fs_summary sum; - block_t new_blkaddr; struct node_info ni; + int seg = CURSEG_WARM_DATA; pgoff_t fofs; - int type; if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) return -ENOSPC; - __set_data_blkaddr(dn, NEW_ADDR); - dn->data_blkaddr = NEW_ADDR; - get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - type = CURSEG_WARM_DATA; + if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) + seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type); + allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg); /* direct IO doesn't use extent cache to maximize the performance */ - set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); - update_extent_cache(new_blkaddr, dn); - clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + __set_data_blkaddr(dn); /* update i_size */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + @@ -605,10 +589,59 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); - dn->data_blkaddr = new_blkaddr; return 0; } +static void __allocate_data_blocks(struct inode *inode, loff_t offset, + size_t count) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + u64 start = F2FS_BYTES_TO_BLK(offset); + u64 len = F2FS_BYTES_TO_BLK(count); + bool allocated; + u64 end_offset; + + while (len) { + f2fs_balance_fs(sbi); + f2fs_lock_op(sbi); + + /* When reading holes, we need its node page */ + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (get_dnode_of_data(&dn, start, ALLOC_NODE)) + goto out; + + allocated = false; + end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + + while (dn.ofs_in_node < end_offset && len) { + if (dn.data_blkaddr == NULL_ADDR) { + if (__allocate_data_block(&dn)) + goto sync_out; + allocated = true; + } + len--; + start++; + dn.ofs_in_node++; + } + + if (allocated) + sync_inode_page(&dn); + + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + } + return; + +sync_out: + if (allocated) + sync_inode_page(&dn); + f2fs_put_dnode(&dn); +out: + f2fs_unlock_op(sbi); + return; +} + /* * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh. * If original data blocks are allocated, then give them to blockdev. @@ -634,10 +667,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (check_extent_cache(inode, pgofs, bh_result)) goto out; - if (create) { - f2fs_balance_fs(F2FS_I_SB(inode)); + if (create) f2fs_lock_op(F2FS_I_SB(inode)); - } /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -651,12 +682,14 @@ static int __get_data_block(struct inode *inode, sector_t iblock, goto put_out; if (dn.data_blkaddr != NULL_ADDR) { + set_buffer_new(bh_result); map_bh(bh_result, inode->i_sb, dn.data_blkaddr); } else if (create) { err = __allocate_data_block(&dn); if (err) goto put_out; allocated = true; + set_buffer_new(bh_result); map_bh(bh_result, inode->i_sb, dn.data_blkaddr); } else { goto put_out; @@ -740,14 +773,14 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, static int f2fs_read_data_page(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; - int ret; + int ret = -EAGAIN; trace_f2fs_readpage(page, DATA); /* If the file has inline data, try to read it directly */ if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); - else + if (ret == -EAGAIN) ret = mpage_readpage(page, get_data_block); return ret; @@ -769,7 +802,6 @@ static int f2fs_read_data_pages(struct file *file, int do_write_data_page(struct page *page, struct f2fs_io_info *fio) { struct inode *inode = page->mapping->host; - block_t old_blkaddr, new_blkaddr; struct dnode_of_data dn; int err = 0; @@ -778,10 +810,10 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) if (err) return err; - old_blkaddr = dn.data_blkaddr; + fio->blk_addr = dn.data_blkaddr; /* This page is already truncated */ - if (old_blkaddr == NULL_ADDR) + if (fio->blk_addr == NULL_ADDR) goto out_writepage; set_page_writeback(page); @@ -790,14 +822,14 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(old_blkaddr != NEW_ADDR && + if (unlikely(fio->blk_addr != NEW_ADDR && !is_cold_data(page) && need_inplace_update(inode))) { - rewrite_data_page(page, old_blkaddr, fio); + rewrite_data_page(page, fio); set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); } else { - write_data_page(page, &dn, &new_blkaddr, fio); - update_extent_cache(new_blkaddr, &dn); + write_data_page(page, &dn, fio); + update_extent_cache(&dn); set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); } out_writepage: @@ -836,7 +868,12 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_CACHE_SIZE); write: - if (unlikely(sbi->por_doing)) + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (f2fs_is_drop_cache(inode)) + goto out; + if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim && + available_free_memory(sbi, BASE_CHECK)) goto redirty_out; /* Dentry blocks are controlled by checkpoint */ @@ -850,7 +887,6 @@ write: /* we should bypass data pages to proceed the kworkder jobs */ if (unlikely(f2fs_cp_error(sbi))) { SetPageError(page); - unlock_page(page); goto out; } @@ -859,10 +895,11 @@ write: else if (has_not_enough_free_secs(sbi, 0)) goto redirty_out; + err = -EAGAIN; f2fs_lock_op(sbi); - if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode)) - err = f2fs_write_inline_data(inode, page, offset); - else + if (f2fs_has_inline_data(inode)) + err = f2fs_write_inline_data(inode, page); + if (err == -EAGAIN) err = do_write_data_page(page, &fio); f2fs_unlock_op(sbi); done: @@ -951,7 +988,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *page; + struct page *page, *ipage; pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; struct dnode_of_data dn; int err = 0; @@ -959,45 +996,60 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); f2fs_balance_fs(sbi); -repeat: - err = f2fs_convert_inline_data(inode, pos + len, NULL); - if (err) - goto fail; + /* + * We should check this at this moment to avoid deadlock on inode page + * and #0 page. The locking rule for inline_data conversion should be: + * lock_page(page #0) -> lock_page(inode_page) + */ + if (index != 0) { + err = f2fs_convert_inline_inode(inode); + if (err) + goto fail; + } +repeat: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { err = -ENOMEM; goto fail; } - /* to avoid latency during memory pressure */ - unlock_page(page); - *pagep = page; - if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA) - goto inline_data; - f2fs_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = f2fs_reserve_block(&dn, index); - f2fs_unlock_op(sbi); - if (err) { - f2fs_put_page(page, 0); - goto fail; - } -inline_data: - lock_page(page); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); - goto repeat; + + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_fail; } - f2fs_wait_on_page_writeback(page, DATA); + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + sync_inode_page(&dn); + goto put_next; + } + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto put_fail; + } + err = f2fs_reserve_block(&dn, index); + if (err) + goto put_fail; +put_next: + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); if ((len == PAGE_CACHE_SIZE) || PageUptodate(page)) return 0; + f2fs_wait_on_page_writeback(page, DATA); + if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { unsigned start = pos & (PAGE_CACHE_SIZE - 1); unsigned end = start + len; @@ -1010,18 +1062,14 @@ inline_data: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { - if (f2fs_has_inline_data(inode)) { - err = f2fs_read_inline_data(inode, page); - if (err) { - page_cache_release(page); - goto fail; - } - } else { - err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, - READ_SYNC); - if (err) - goto fail; - } + struct f2fs_io_info fio = { + .type = DATA, + .rw = READ_SYNC, + .blk_addr = dn.data_blkaddr, + }; + err = f2fs_submit_page_bio(sbi, page, &fio); + if (err) + goto fail; lock_page(page); if (unlikely(!PageUptodate(page))) { @@ -1038,6 +1086,12 @@ out: SetPageUptodate(page); clear_cold_data(page); return 0; + +put_fail: + f2fs_put_dnode(&dn); +unlock_fail: + f2fs_unlock_op(sbi); + f2fs_put_page(page, 1); fail: f2fs_write_failed(mapping, pos + len); return err; @@ -1052,10 +1106,7 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); - if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) - register_inmem_page(inode, page); - else - set_page_dirty(page); + set_page_dirty(page); if (pos + copied > i_size_read(inode)) { i_size_write(inode, pos + copied); @@ -1093,15 +1144,21 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); int err; - /* Let buffer I/O handle the inline data case. */ - if (f2fs_has_inline_data(inode)) - return 0; + /* we don't need to use inline_data strictly */ + if (f2fs_has_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } if (check_direct_IO(inode, rw, iter, offset)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); + if (rw & WRITE) + __allocate_data_blocks(inode, offset, count); + err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block); if (err < 0 && (rw & WRITE)) f2fs_write_failed(mapping, offset + count); @@ -1111,21 +1168,33 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, return err; } -static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, - unsigned int length) +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length) { struct inode *inode = page->mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE) + if (inode->i_ino >= F2FS_ROOT_INO(sbi) && + (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)) return; - if (PageDirty(page)) - inode_dec_dirty_pages(inode); + if (PageDirty(page)) { + if (inode->i_ino == F2FS_META_INO(sbi)) + dec_page_count(sbi, F2FS_DIRTY_META); + else if (inode->i_ino == F2FS_NODE_INO(sbi)) + dec_page_count(sbi, F2FS_DIRTY_NODES); + else + inode_dec_dirty_pages(inode); + } ClearPagePrivate(page); } -static int f2fs_release_data_page(struct page *page, gfp_t wait) +int f2fs_release_page(struct page *page, gfp_t wait) { + /* If this is dirty page, keep PagePrivate */ + if (PageDirty(page)) + return 0; + ClearPagePrivate(page); return 1; } @@ -1138,6 +1207,12 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); SetPageUptodate(page); + + if (f2fs_is_atomic_file(inode)) { + register_inmem_page(inode, page); + return 1; + } + mark_inode_dirty(inode); if (!PageDirty(page)) { @@ -1152,9 +1227,12 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - if (f2fs_has_inline_data(inode)) - return 0; - + /* we don't need to use inline_data strictly */ + if (f2fs_has_inline_data(inode)) { + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } return generic_block_bmap(mapping, block, get_data_block); } @@ -1166,8 +1244,8 @@ const struct address_space_operations f2fs_dblock_aops = { .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, .set_page_dirty = f2fs_set_data_page_dirty, - .invalidatepage = f2fs_invalidate_data_page, - .releasepage = f2fs_release_data_page, + .invalidatepage = f2fs_invalidate_page, + .releasepage = f2fs_release_page, .direct_IO = f2fs_direct_IO, .bmap = f2fs_bmap, }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0a91ab813a9e..e671373cc8ab 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -39,13 +39,16 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); + si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); si->valid_count = valid_user_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); - si->inline_inode = sbi->inline_inode; + si->inline_inode = atomic_read(&sbi->inline_inode); + si->inline_dir = atomic_read(&sbi->inline_dir); si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); @@ -55,7 +58,9 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->node_pages = NODE_MAPPING(sbi)->nrpages; si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; - si->sits = SIT_I(sbi)->dirty_sentries; + si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; + si->sits = MAIN_SEGS(sbi); + si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->fnids = NM_I(sbi)->fcnt; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -77,6 +82,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->segment_count[i] = sbi->segment_count[i]; si->block_count[i] = sbi->block_count[i]; } + + si->inplace_count = atomic_read(&sbi->inplace_count); } /* @@ -118,6 +125,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); unsigned npages; + int i; if (si->base_mem) goto get_cache; @@ -134,6 +142,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); si->base_mem += __bitmap_size(sbi, SIT_BITMAP); @@ -156,19 +165,32 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); +get_cache: + si->cache_mem = 0; + /* build gc */ - si->base_mem += sizeof(struct f2fs_gc_kthread); + if (sbi->gc_thread) + si->cache_mem += sizeof(struct f2fs_gc_kthread); + + /* build merge flush thread */ + if (SM_I(sbi)->cmd_control_info) + si->cache_mem += sizeof(struct flush_cmd_control); -get_cache: /* free nids */ - si->cache_mem = NM_I(sbi)->fcnt; - si->cache_mem += NM_I(sbi)->nat_cnt; + si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); + si->cache_mem += NM_I(sbi)->dirty_nat_cnt * + sizeof(struct nat_entry_set); + si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); + si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); + for (i = 0; i <= UPDATE_INO; i++) + si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); + + si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->cache_mem += npages << PAGE_CACHE_SHIFT; + si->page_mem += npages << PAGE_CACHE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->cache_mem += npages << PAGE_CACHE_SHIFT; - si->cache_mem += sbi->n_orphans * sizeof(struct ino_entry); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); + si->page_mem += npages << PAGE_CACHE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -200,6 +222,8 @@ static int stat_show(struct seq_file *s, void *v) si->valid_count - si->valid_node_count); seq_printf(s, " - Inline_data Inode: %u\n", si->inline_inode); + seq_printf(s, " - Inline_dentry Inode: %u\n", + si->inline_dir); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -244,14 +268,16 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", si->hit_ext, si->total_ext); seq_puts(s, "\nBalancing F2FS Async:\n"); + seq_printf(s, " - inmem: %4d, wb: %4d\n", + si->inmem_pages, si->wb_pages); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - NATs: %9d\n - SITs: %9d\n", - si->nats, si->sits); + seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", + si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d\n", si->fnids); seq_puts(s, "\nDistribution of User Blocks:"); @@ -269,6 +295,7 @@ static int stat_show(struct seq_file *s, void *v) for (j = 0; j < si->util_free; j++) seq_putc(s, '-'); seq_puts(s, "]\n\n"); + seq_printf(s, "IPU: %u blocks\n", si->inplace_count); seq_printf(s, "SSR: %u blocks in %u segments\n", si->block_count[SSR], si->segment_count[SSR]); seq_printf(s, "LFS: %u blocks in %u segments\n", @@ -281,9 +308,14 @@ static int stat_show(struct seq_file *s, void *v) /* memory footprint */ update_mem_info(si->sbi); - seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", - (si->base_mem + si->cache_mem) >> 10, - si->base_mem >> 10, si->cache_mem >> 10); + seq_printf(s, "\nMemory: %u KB\n", + (si->base_mem + si->cache_mem + si->page_mem) >> 10); + seq_printf(s, " - static: %u KB\n", + si->base_mem >> 10); + seq_printf(s, " - cached: %u KB\n", + si->cache_mem >> 10); + seq_printf(s, " - paged : %u KB\n", + si->page_mem >> 10); } mutex_unlock(&f2fs_stat_mutex); return 0; @@ -321,6 +353,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) si->sbi = sbi; sbi->stat_info = si; + atomic_set(&sbi->inline_inode, 0); + atomic_set(&sbi->inline_dir, 0); + atomic_set(&sbi->inplace_count, 0); + mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); mutex_unlock(&f2fs_stat_mutex); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index b54f87149c09..b74097a7f6d9 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } -static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { +unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_UNKNOWN] = DT_UNKNOWN, [F2FS_FT_REG_FILE] = DT_REG, [F2FS_FT_DIR] = DT_DIR, @@ -59,7 +59,7 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, }; -static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) +void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) { umode_t mode = inode->i_mode; de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; @@ -90,51 +90,70 @@ static bool early_match_name(size_t namelen, f2fs_hash_t namehash, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - struct qstr *name, int *max_slots, - f2fs_hash_t namehash, struct page **res_page) + struct qstr *name, int *max_slots, + struct page **res_page) +{ + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dir_entry *de; + struct f2fs_dentry_ptr d; + + dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); + + make_dentry_ptr(&d, (void *)dentry_blk, 1); + de = find_target_dentry(name, max_slots, &d); + + if (de) + *res_page = dentry_page; + else + kunmap(dentry_page); + + /* + * For the most part, it should be a bug when name_len is zero. + * We stop here for figuring out where the bugs has occurred. + */ + f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0); + return de; +} + +struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots, + struct f2fs_dentry_ptr *d) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; - struct f2fs_dentry_block *dentry_blk = kmap(dentry_page); - const void *dentry_bits = &dentry_blk->dentry_bitmap; + f2fs_hash_t namehash = f2fs_dentry_hash(name); int max_len = 0; - while (bit_pos < NR_DENTRY_IN_BLOCK) { - if (!test_bit_le(bit_pos, dentry_bits)) { + if (max_slots) + *max_slots = 0; + while (bit_pos < d->max) { + if (!test_bit_le(bit_pos, d->bitmap)) { if (bit_pos == 0) max_len = 1; - else if (!test_bit_le(bit_pos - 1, dentry_bits)) + else if (!test_bit_le(bit_pos - 1, d->bitmap)) max_len++; bit_pos++; continue; } - de = &dentry_blk->dentry[bit_pos]; - if (early_match_name(name->len, namehash, de)) { - if (!memcmp(dentry_blk->filename[bit_pos], - name->name, - name->len)) { - *res_page = dentry_page; - goto found; - } - } - if (max_len > *max_slots) { + de = &d->dentry[bit_pos]; + if (early_match_name(name->len, namehash, de) && + !memcmp(d->filename[bit_pos], name->name, name->len)) + goto found; + + if (max_slots && *max_slots >= 0 && max_len > *max_slots) { *max_slots = max_len; max_len = 0; } - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. - */ - f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len); + /* remain bug on condition */ + if (unlikely(!de->name_len)) + d->max = -1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } de = NULL; - kunmap(dentry_page); found: - if (max_len > *max_slots) + if (max_slots && max_len > *max_slots) *max_slots = max_len; return de; } @@ -149,7 +168,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct page *dentry_page; struct f2fs_dir_entry *de = NULL; bool room = false; - int max_slots = 0; + int max_slots; f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); @@ -168,8 +187,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, continue; } - de = find_in_block(dentry_page, name, &max_slots, - namehash, res_page); + de = find_in_block(dentry_page, name, &max_slots, res_page); if (de) break; @@ -201,6 +219,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; + if (f2fs_has_inline_dentry(dir)) + return find_in_inline_dir(dir, child, res_page); + if (npages == 0) return NULL; @@ -227,6 +248,9 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) struct f2fs_dir_entry *de; struct f2fs_dentry_block *dentry_blk; + if (f2fs_has_inline_dentry(dir)) + return f2fs_parent_inline_dir(dir, p); + page = get_lock_data_page(dir, 0); if (IS_ERR(page)) return NULL; @@ -247,7 +271,7 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) de = f2fs_find_entry(dir, qstr, &page); if (de) { res = le32_to_cpu(de->ino); - kunmap(page); + f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); } @@ -257,11 +281,12 @@ ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { + enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; lock_page(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, type); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode); - kunmap(page); + f2fs_dentry_kunmap(dir, page); set_page_dirty(page); dir->i_mtime = dir->i_ctime = CURRENT_TIME; mark_inode_dirty(dir); @@ -296,36 +321,48 @@ int update_dent_inode(struct inode *inode, const struct qstr *name) return 0; } -static int make_empty_dir(struct inode *inode, - struct inode *parent, struct page *page) +void do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d) { - struct page *dentry_page; - struct f2fs_dentry_block *dentry_blk; struct f2fs_dir_entry *de; - dentry_page = get_new_data_page(inode, page, 0, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); - - - dentry_blk = kmap_atomic(dentry_page); - - de = &dentry_blk->dentry[0]; + de = &d->dentry[0]; de->name_len = cpu_to_le16(1); de->hash_code = 0; de->ino = cpu_to_le32(inode->i_ino); - memcpy(dentry_blk->filename[0], ".", 1); + memcpy(d->filename[0], ".", 1); set_de_type(de, inode); - de = &dentry_blk->dentry[1]; + de = &d->dentry[1]; de->hash_code = 0; de->name_len = cpu_to_le16(2); de->ino = cpu_to_le32(parent->i_ino); - memcpy(dentry_blk->filename[1], "..", 2); + memcpy(d->filename[1], "..", 2); set_de_type(de, inode); - test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); - test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); + test_and_set_bit_le(0, (void *)d->bitmap); + test_and_set_bit_le(1, (void *)d->bitmap); +} + +static int make_empty_dir(struct inode *inode, + struct inode *parent, struct page *page) +{ + struct page *dentry_page; + struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr d; + + if (f2fs_has_inline_dentry(inode)) + return make_empty_inline_dir(inode, parent, page); + + dentry_page = get_new_data_page(inode, page, 0, true); + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); + + dentry_blk = kmap_atomic(dentry_page); + + make_dentry_ptr(&d, (void *)dentry_blk, 1); + do_make_empty_dir(inode, parent, &d); + kunmap_atomic(dentry_blk); set_page_dirty(dentry_page); @@ -333,8 +370,8 @@ static int make_empty_dir(struct inode *inode, return 0; } -static struct page *init_inode_metadata(struct inode *inode, - struct inode *dir, const struct qstr *name) +struct page *init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *name, struct page *dpage) { struct page *page; int err; @@ -350,7 +387,7 @@ static struct page *init_inode_metadata(struct inode *inode, goto error; } - err = f2fs_init_acl(inode, dir, page); + err = f2fs_init_acl(inode, dir, page, dpage); if (err) goto put_error; @@ -395,7 +432,7 @@ error: return ERR_PTR(err); } -static void update_parent_metadata(struct inode *dir, struct inode *inode, +void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { @@ -417,27 +454,23 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(F2FS_I(inode), FI_INC_LINK); } -static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots) +int room_for_filename(const void *bitmap, int slots, int max_slots) { int bit_start = 0; int zero_start, zero_end; next: - zero_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, - bit_start); - if (zero_start >= NR_DENTRY_IN_BLOCK) - return NR_DENTRY_IN_BLOCK; + zero_start = find_next_zero_bit_le(bitmap, max_slots, bit_start); + if (zero_start >= max_slots) + return max_slots; - zero_end = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, - zero_start); + zero_end = find_next_bit_le(bitmap, max_slots, zero_start); if (zero_end - zero_start >= slots) return zero_start; bit_start = zero_end + 1; - if (zero_end + 1 >= NR_DENTRY_IN_BLOCK) - return NR_DENTRY_IN_BLOCK; + if (zero_end + 1 >= max_slots) + return max_slots; goto next; } @@ -463,6 +496,14 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, int err = 0; int i; + if (f2fs_has_inline_dentry(dir)) { + err = f2fs_add_inline_entry(dir, name, inode); + if (!err || err != -EAGAIN) + return err; + else + err = 0; + } + dentry_hash = f2fs_dentry_hash(name); level = 0; current_depth = F2FS_I(dir)->i_current_depth; @@ -491,7 +532,8 @@ start: return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); - bit_pos = room_for_filename(dentry_blk, slots); + bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, + slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; @@ -506,7 +548,7 @@ add_dentry: f2fs_wait_on_page_writeback(dentry_page, DATA); down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name); + page = init_inode_metadata(inode, dir, name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -545,7 +587,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL); + page = init_inode_metadata(inode, dir, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -560,26 +602,57 @@ fail: return err; } +void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + down_write(&F2FS_I(inode)->i_sem); + + if (S_ISDIR(inode->i_mode)) { + drop_nlink(dir); + if (page) + update_inode(dir, page); + else + update_inode_page(dir); + } + inode->i_ctime = CURRENT_TIME; + + drop_nlink(inode); + if (S_ISDIR(inode->i_mode)) { + drop_nlink(inode); + i_size_write(inode, 0); + } + up_write(&F2FS_I(inode)->i_sem); + update_inode_page(inode); + + if (inode->i_nlink == 0) + add_orphan_inode(sbi, inode->i_ino); + else + release_orphan_inode(sbi); +} + /* * It only removes the dentry from the dentry page, corresponding name * entry in name page does not need to be touched during deletion. */ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, - struct inode *inode) + struct inode *dir, struct inode *inode) { struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; - struct inode *dir = page->mapping->host; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; + if (f2fs_has_inline_dentry(dir)) + return f2fs_delete_inline_entry(dentry, page, dir, inode); + lock_page(page); f2fs_wait_on_page_writeback(page, DATA); dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) - test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, @@ -590,29 +663,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; - if (inode) { - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - - down_write(&F2FS_I(inode)->i_sem); - - if (S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - update_inode_page(dir); - } - inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); - if (S_ISDIR(inode->i_mode)) { - drop_nlink(inode); - i_size_write(inode, 0); - } - up_write(&F2FS_I(inode)->i_sem); - update_inode_page(inode); - - if (inode->i_nlink == 0) - add_orphan_inode(sbi, inode->i_ino); - else - release_orphan_inode(sbi); - } + if (inode) + f2fs_drop_nlink(dir, inode, NULL); if (bit_pos == NR_DENTRY_IN_BLOCK) { truncate_hole(dir, page->index, page->index + 1); @@ -628,9 +680,12 @@ bool f2fs_empty_dir(struct inode *dir) unsigned long bidx; struct page *dentry_page; unsigned int bit_pos; - struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_block *dentry_blk; unsigned long nblock = dir_blocks(dir); + if (f2fs_has_inline_dentry(dir)) + return f2fs_empty_inline_dir(dir); + for (bidx = 0; bidx < nblock; bidx++) { dentry_page = get_lock_data_page(dir, bidx); if (IS_ERR(dentry_page)) { @@ -640,7 +695,6 @@ bool f2fs_empty_dir(struct inode *dir) return false; } - dentry_blk = kmap_atomic(dentry_page); if (bidx == 0) bit_pos = 2; @@ -659,19 +713,48 @@ bool f2fs_empty_dir(struct inode *dir) return true; } +bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos) +{ + unsigned char d_type = DT_UNKNOWN; + unsigned int bit_pos; + struct f2fs_dir_entry *de = NULL; + + bit_pos = ((unsigned long)ctx->pos % d->max); + + while (bit_pos < d->max) { + bit_pos = find_next_bit_le(d->bitmap, d->max, bit_pos); + if (bit_pos >= d->max) + break; + + de = &d->dentry[bit_pos]; + if (de->file_type < F2FS_FT_MAX) + d_type = f2fs_filetype_table[de->file_type]; + else + d_type = DT_UNKNOWN; + if (!dir_emit(ctx, d->filename[bit_pos], + le16_to_cpu(de->name_len), + le32_to_cpu(de->ino), d_type)) + return true; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + ctx->pos = start_pos + bit_pos; + } + return false; +} + static int f2fs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); unsigned long npages = dir_blocks(inode); - unsigned int bit_pos = 0; struct f2fs_dentry_block *dentry_blk = NULL; - struct f2fs_dir_entry *de = NULL; struct page *dentry_page = NULL; struct file_ra_state *ra = &file->f_ra; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); - unsigned char d_type = DT_UNKNOWN; + struct f2fs_dentry_ptr d; - bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK); + if (f2fs_has_inline_dentry(inode)) + return f2fs_read_inline_dir(file, ctx); /* readahead for multi pages of dir */ if (npages - n > 1 && !ra_has_index(ra, n)) @@ -684,28 +767,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) continue; dentry_blk = kmap(dentry_page); - while (bit_pos < NR_DENTRY_IN_BLOCK) { - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_DENTRY_IN_BLOCK, - bit_pos); - if (bit_pos >= NR_DENTRY_IN_BLOCK) - break; - - de = &dentry_blk->dentry[bit_pos]; - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; - if (!dir_emit(ctx, - dentry_blk->filename[bit_pos], - le16_to_cpu(de->name_len), - le32_to_cpu(de->ino), d_type)) - goto stop; - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - ctx->pos = n * NR_DENTRY_IN_BLOCK + bit_pos; - } - bit_pos = 0; + make_dentry_ptr(&d, (void *)dentry_blk, 1); + + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK)) + goto stop; + ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8171e80b2ee9..7fa3313ab0e2 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -28,7 +28,7 @@ do { \ if (unlikely(condition)) { \ WARN_ON(1); \ - sbi->need_fsck = true; \ + set_sbi_flag(sbi, SBI_NEED_FSCK); \ } \ } while (0) #define f2fs_down_write(x, y) down_write(x) @@ -46,8 +46,10 @@ #define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040 #define F2FS_MOUNT_INLINE_XATTR 0x00000080 #define F2FS_MOUNT_INLINE_DATA 0x00000100 -#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 -#define F2FS_MOUNT_NOBARRIER 0x00000400 +#define F2FS_MOUNT_INLINE_DENTRY 0x00000200 +#define F2FS_MOUNT_FLUSH_MERGE 0x00000400 +#define F2FS_MOUNT_NOBARRIER 0x00000800 +#define F2FS_MOUNT_FASTBOOT 0x00001000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -98,10 +100,15 @@ enum { enum { CP_UMOUNT, + CP_FASTBOOT, CP_SYNC, CP_DISCARD, }; +#define DEF_BATCHED_TRIM_SECTIONS 32 +#define BATCHED_TRIM_SEGMENTS(sbi) \ + (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + struct cp_control { int reason; __u64 trim_start; @@ -134,8 +141,14 @@ struct ino_entry { nid_t ino; /* inode number */ }; -/* for the list of directory inodes */ -struct dir_inode_entry { +/* + * for the list of directory inodes or gc inodes. + * NOTE: there are two slab users for this structure, if we add/modify/delete + * fields in structure for one of slab users, it may affect fields or size of + * other one, in this condition, it's better to split both of slab and related + * data structure. + */ +struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ }; @@ -194,11 +207,14 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, */ #define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS #define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS +#define F2FS_IOC_GETVERSION FS_IOC_GETVERSION #define F2FS_IOCTL_MAGIC 0xf5 #define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) #define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) +#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) +#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -211,6 +227,32 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, /* * For INODE and NODE manager */ +/* for directory operations */ +struct f2fs_dentry_ptr { + const void *bitmap; + struct f2fs_dir_entry *dentry; + __u8 (*filename)[F2FS_SLOT_LEN]; + int max; +}; + +static inline void make_dentry_ptr(struct f2fs_dentry_ptr *d, + void *src, int type) +{ + if (type == 1) { + struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; + d->max = NR_DENTRY_IN_BLOCK; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; + } else { + struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; + d->max = NR_INLINE_DENTRY; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; + } +} + /* * XATTR_NODE_OFFSET stores xattrs to one node block per file keeping -1 * as its node offset to distinguish from index node blocks. @@ -267,8 +309,9 @@ struct f2fs_inode_info { nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ struct extent_info ext; /* in-memory extent cache entry */ - struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */ + struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + struct radix_tree_root inmem_root; /* radix tree for inmem pages */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ }; @@ -303,7 +346,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - rwlock_t nat_tree_lock; /* protect nat_tree_lock */ + struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ @@ -369,7 +412,8 @@ enum { CURSEG_HOT_NODE, /* direct node blocks of directory files */ CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ - NO_CHECK_TYPE + NO_CHECK_TYPE, + CURSEG_DIRECT_IO, /* to use for the direct IO path */ }; struct flush_cmd { @@ -408,6 +452,9 @@ struct f2fs_sm_info { int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ + /* for batched trimming */ + unsigned int trim_sections; /* # of sections to trim */ + struct list_head sit_entry_set; /* sit entry set list */ unsigned int ipu_policy; /* in-place-update policy */ @@ -433,6 +480,7 @@ enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_NODES, F2FS_DIRTY_META, + F2FS_INMEM_PAGES, NR_COUNT_TYPE, }; @@ -459,6 +507,7 @@ enum page_type { struct f2fs_io_info { enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ + block_t blk_addr; /* block address to be written */ }; #define is_read_io(rw) (((rw) & 1) == READ) @@ -470,13 +519,28 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +/* for inner inode cache management */ +struct inode_management { + struct radix_tree_root ino_root; /* ino entry array */ + spinlock_t ino_lock; /* for ino entry lock */ + struct list_head ino_list; /* inode list head */ + unsigned long ino_num; /* number of entries */ +}; + +/* For s_flag in struct f2fs_sb_info */ +enum { + SBI_IS_DIRTY, /* dirty flag for checkpoint */ + SBI_IS_CLOSE, /* specify unmounting */ + SBI_NEED_FSCK, /* need fsck.f2fs to fix */ + SBI_POR_DOING, /* recovery is doing or not */ +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - int s_dirty; /* dirty flag for checkpoint */ - bool need_fsck; /* need fsck.f2fs to fix */ + int s_flag; /* flags for sbi */ /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ @@ -488,7 +552,6 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ - struct completion *wait_io; /* for completion bios */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -497,16 +560,11 @@ struct f2fs_sb_info { struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ - bool por_doing; /* recovery is doing or not */ wait_queue_head_t cp_wait; - /* for inode management */ - struct radix_tree_root ino_root[MAX_INO_ENTRY]; /* ino entry array */ - spinlock_t ino_lock[MAX_INO_ENTRY]; /* for ino entry lock */ - struct list_head ino_list[MAX_INO_ENTRY]; /* inode list head */ + struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ /* for orphan inode, use 0'th array */ - unsigned int n_orphans; /* # of orphan inodes */ unsigned int max_orphans; /* max orphan inodes */ /* for directory inode management */ @@ -556,8 +614,10 @@ struct f2fs_sb_info { struct f2fs_stat_info *stat_info; /* FS status information */ unsigned int segment_count[2]; /* # of allocated segments */ unsigned int block_count[2]; /* # of allocated blocks */ + atomic_t inplace_count; /* # of inplace update */ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */ - int inline_inode; /* # of inline_data inodes */ + atomic_t inline_inode; /* # of inline_data inodes */ + atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ unsigned int n_dirty_dirs; /* # of dir inodes */ #endif @@ -652,14 +712,19 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) return sbi->node_inode->i_mapping; } -static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi) +static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) +{ + return sbi->s_flag & (0x01 << type); +} + +static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_dirty = 1; + sbi->s_flag |= (0x01 << type); } -static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi) +static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_dirty = 0; + sbi->s_flag &= ~(0x01 << type); } static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) @@ -707,6 +772,28 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) up_write(&sbi->cp_rwsem); } +static inline int __get_cp_reason(struct f2fs_sb_info *sbi) +{ + int reason = CP_SYNC; + + if (test_opt(sbi, FASTBOOT)) + reason = CP_FASTBOOT; + if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) + reason = CP_UMOUNT; + return reason; +} + +static inline bool __remain_node_summaries(int reason) +{ + return (reason == CP_UMOUNT || reason == CP_FASTBOOT); +} + +static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) +{ + return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) || + is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG)); +} + /* * Check whether the given nid is within node id range. */ @@ -771,7 +858,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - F2FS_SET_SB_DIRT(sbi); + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) @@ -988,6 +1075,13 @@ retry: return entry; } +static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, + unsigned long index, void *item) +{ + while (radix_tree_insert(root, index, item)) + cond_resched(); +} + #define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino) static inline bool IS_INODE(struct page *page) @@ -1020,7 +1114,7 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr) return mask & *addr; } -static inline int f2fs_set_bit(unsigned int nr, char *addr) +static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) { int mask; int ret; @@ -1032,7 +1126,7 @@ static inline int f2fs_set_bit(unsigned int nr, char *addr) return ret; } -static inline int f2fs_clear_bit(unsigned int nr, char *addr) +static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) { int mask; int ret; @@ -1044,6 +1138,15 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr) return ret; } +static inline void f2fs_change_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + *addr ^= mask; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1057,11 +1160,14 @@ enum { FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ + FI_INLINE_DENTRY, /* used for inline dentry */ FI_APPEND_WRITE, /* inode has appended data */ FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ FI_VOLATILE_FILE, /* indicate volatile file */ + FI_DROP_CACHE, /* drop dirty page cache */ + FI_DATA_EXIST, /* indicate data exists */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -1087,15 +1193,6 @@ static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) set_inode_flag(fi, FI_ACL_MODE); } -static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag) -{ - if (is_inode_flag_set(fi, FI_ACL_MODE)) { - clear_inode_flag(fi, FI_ACL_MODE); - return 1; - } - return 0; -} - static inline void get_inline_info(struct f2fs_inode_info *fi, struct f2fs_inode *ri) { @@ -1103,6 +1200,10 @@ static inline void get_inline_info(struct f2fs_inode_info *fi, set_inode_flag(fi, FI_INLINE_XATTR); if (ri->i_inline & F2FS_INLINE_DATA) set_inode_flag(fi, FI_INLINE_DATA); + if (ri->i_inline & F2FS_INLINE_DENTRY) + set_inode_flag(fi, FI_INLINE_DENTRY); + if (ri->i_inline & F2FS_DATA_EXIST) + set_inode_flag(fi, FI_DATA_EXIST); } static inline void set_raw_inline(struct f2fs_inode_info *fi, @@ -1114,6 +1215,10 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, ri->i_inline |= F2FS_INLINE_XATTR; if (is_inode_flag_set(fi, FI_INLINE_DATA)) ri->i_inline |= F2FS_INLINE_DATA; + if (is_inode_flag_set(fi, FI_INLINE_DENTRY)) + ri->i_inline |= F2FS_INLINE_DENTRY; + if (is_inode_flag_set(fi, FI_DATA_EXIST)) + ri->i_inline |= F2FS_DATA_EXIST; } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -1148,6 +1253,17 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); } +static inline void f2fs_clear_inline_inode(struct inode *inode) +{ + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST); +} + +static inline int f2fs_exist_data(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); +} + static inline bool f2fs_is_atomic_file(struct inode *inode) { return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); @@ -1158,12 +1274,34 @@ static inline bool f2fs_is_volatile_file(struct inode *inode) return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); } +static inline bool f2fs_is_drop_cache(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); +} + static inline void *inline_data_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); return (void *)&(ri->i_addr[1]); } +static inline int f2fs_has_inline_dentry(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); +} + +static inline void *inline_dentry_addr(struct page *page) +{ + struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[1]); +} + +static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) +{ + if (!f2fs_has_inline_dentry(dir)) + kunmap(page); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; @@ -1224,6 +1362,19 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ +extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; +void set_de_type(struct f2fs_dir_entry *, struct inode *); +struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *, + struct f2fs_dentry_ptr *); +bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, + unsigned int); +void do_make_empty_dir(struct inode *, struct inode *, + struct f2fs_dentry_ptr *); +struct page *init_inode_metadata(struct inode *, struct inode *, + const struct qstr *, struct page *); +void update_parent_metadata(struct inode *, struct inode *, unsigned int); +int room_for_filename(const void *, int, int); +void f2fs_drop_nlink(struct inode *, struct inode *, struct page *); struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); @@ -1232,7 +1383,8 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, const struct qstr *); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); -void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); +void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, + struct inode *); int f2fs_do_tmpfile(struct inode *, struct inode *); int f2fs_make_empty(struct inode *, struct inode *); bool f2fs_empty_dir(struct inode *); @@ -1307,16 +1459,16 @@ void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); void release_discard_addrs(struct f2fs_sb_info *); void discard_next_dnode(struct f2fs_sb_info *, block_t); -int npages_for_summary_flush(struct f2fs_sb_info *); +int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, - struct f2fs_io_info *, unsigned int, block_t, block_t *); -void write_data_page(struct page *, struct dnode_of_data *, block_t *, - struct f2fs_io_info *); -void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); + unsigned int, struct f2fs_io_info *); +void write_data_page(struct page *, struct dnode_of_data *, + struct f2fs_io_info *); +void rewrite_data_page(struct page *, struct f2fs_io_info *); void recover_data_page(struct f2fs_sb_info *, struct page *, struct f2fs_summary *, block_t, block_t); void allocate_data_block(struct f2fs_sb_info *, struct page *, @@ -1337,8 +1489,8 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); +void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); @@ -1363,17 +1515,20 @@ void destroy_checkpoint_caches(void); * data.c */ void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int); -void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t, +int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, + struct f2fs_io_info *); +void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, struct f2fs_io_info *); int reserve_new_block(struct dnode_of_data *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -void update_extent_cache(block_t, struct dnode_of_data *); +void update_extent_cache(struct dnode_of_data *); struct page *find_data_page(struct inode *, pgoff_t, bool); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct page *, struct f2fs_io_info *); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); +int f2fs_release_page(struct page *, gfp_t); /* * gc.c @@ -1383,8 +1538,6 @@ void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); -int __init create_gc_caches(void); -void destroy_gc_caches(void); /* * recovery.c @@ -1403,9 +1556,9 @@ struct f2fs_stat_info { int main_area_segs, main_area_sections, main_area_zones; int hit_ext, total_ext; int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; - int nats, sits, fnids; + int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inline_inode; + int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages; unsigned int valid_count, valid_node_count, valid_inode_count; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -1420,7 +1573,8 @@ struct f2fs_stat_info { unsigned int segment_count[2]; unsigned int block_count[2]; - unsigned base_mem, cache_mem; + unsigned int inplace_count; + unsigned base_mem, cache_mem, page_mem; }; static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) @@ -1438,19 +1592,29 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ - ((F2FS_I_SB(inode))->inline_inode++); \ + (atomic_inc(&F2FS_I_SB(inode)->inline_inode)); \ } while (0) #define stat_dec_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ - ((F2FS_I_SB(inode))->inline_inode--); \ + (atomic_dec(&F2FS_I_SB(inode)->inline_inode)); \ + } while (0) +#define stat_inc_inline_dir(inode) \ + do { \ + if (f2fs_has_inline_dentry(inode)) \ + (atomic_inc(&F2FS_I_SB(inode)->inline_dir)); \ + } while (0) +#define stat_dec_inline_dir(inode) \ + do { \ + if (f2fs_has_inline_dentry(inode)) \ + (atomic_dec(&F2FS_I_SB(inode)->inline_dir)); \ } while (0) - #define stat_inc_seg_type(sbi, curseg) \ ((sbi)->segment_count[(curseg)->alloc_type]++) #define stat_inc_block_count(sbi, curseg) \ ((sbi)->block_count[(curseg)->alloc_type]++) - +#define stat_inc_inplace_blocks(sbi) \ + (atomic_inc(&(sbi)->inplace_count)) #define stat_inc_seg_count(sbi, type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -1492,8 +1656,11 @@ void f2fs_destroy_root_stats(void); #define stat_inc_read_hit(sb) #define stat_inc_inline_inode(inode) #define stat_dec_inline_inode(inode) +#define stat_inc_inline_dir(inode) +#define stat_dec_inline_dir(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) +#define stat_inc_inplace_blocks(sbi) #define stat_inc_seg_count(si, type) #define stat_inc_tot_blk_count(si, blks) #define stat_inc_data_blk_count(si, blks) @@ -1514,14 +1681,25 @@ extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; +extern struct kmem_cache *inode_entry_slab; /* * inline.c */ bool f2fs_may_inline(struct inode *); +void read_inline_data(struct page *, struct page *); int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *); -int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); -void truncate_inline_data(struct inode *, u64); +int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); +int f2fs_convert_inline_inode(struct inode *); +int f2fs_write_inline_data(struct inode *, struct page *); bool recover_inline_data(struct inode *, struct page *); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *, + struct page **); +struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); +int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, + struct inode *, struct inode *); +bool f2fs_empty_inline_dir(struct inode *); +int f2fs_read_inline_dir(struct file *, struct dir_context *); #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8e68bb64f835..98dac27bc3f7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -26,6 +26,7 @@ #include "segment.h" #include "xattr.h" #include "acl.h" +#include "trace.h" #include <trace/events/f2fs.h> static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, @@ -41,18 +42,18 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, sb_start_pagefault(inode->i_sb); - /* force to convert with normal data indices */ - err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page); - if (err) - goto out; + f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); /* block allocation */ f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_reserve_block(&dn, page->index); - f2fs_unlock_op(sbi); - if (err) + if (err) { + f2fs_unlock_op(sbi); goto out; + } + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); file_update_time(vma->vm_file); lock_page(page); @@ -92,7 +93,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int get_parent_ino(struct inode *inode, nid_t *pino) @@ -130,10 +130,45 @@ static inline bool need_do_checkpoint(struct inode *inode) need_cp = true; else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) need_cp = true; + else if (test_opt(sbi, FASTBOOT)) + need_cp = true; + else if (sbi->active_logs == 2) + need_cp = true; return need_cp; } +static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct page *i = find_get_page(NODE_MAPPING(sbi), ino); + bool ret = false; + /* But we need to avoid that there are some inode updates */ + if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) + ret = true; + f2fs_put_page(i, 0); + return ret; +} + +static void try_to_fix_pino(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + nid_t pino; + + down_write(&fi->i_sem); + fi->xattr_ver = 0; + if (file_wrong_pino(inode) && inode->i_nlink == 1 && + get_parent_ino(inode, &pino)) { + fi->i_pino = pino; + file_got_pino(inode); + up_write(&fi->i_sem); + + mark_inode_dirty_sync(inode); + f2fs_write_inode(inode, NULL); + } else { + up_write(&fi->i_sem); + } +} + int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; @@ -164,19 +199,21 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return ret; } + /* if the inode is dirty, let's recover all the time */ + if (!datasync && is_inode_flag_set(fi, FI_DIRTY_INODE)) { + update_inode_page(inode); + goto go_write; + } + /* * if there is no written data, don't waste time to write recovery info. */ if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && !exist_written_data(sbi, ino, APPEND_INO)) { - struct page *i = find_get_page(NODE_MAPPING(sbi), ino); - /* But we need to avoid that there are some inode updates */ - if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) { - f2fs_put_page(i, 0); + /* it may call write_inode just prior to fsync */ + if (need_inode_page_update(sbi, ino)) goto go_write; - } - f2fs_put_page(i, 0); if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || exist_written_data(sbi, ino, UPDATE_INO)) @@ -196,51 +233,43 @@ go_write: up_read(&fi->i_sem); if (need_cp) { - nid_t pino; - /* all the dirty node pages should be flushed for POR */ ret = f2fs_sync_fs(inode->i_sb, 1); - down_write(&fi->i_sem); - F2FS_I(inode)->xattr_ver = 0; - if (file_wrong_pino(inode) && inode->i_nlink == 1 && - get_parent_ino(inode, &pino)) { - F2FS_I(inode)->i_pino = pino; - file_got_pino(inode); - up_write(&fi->i_sem); - mark_inode_dirty_sync(inode); - ret = f2fs_write_inode(inode, NULL); - if (ret) - goto out; - } else { - up_write(&fi->i_sem); - } - } else { + /* + * We've secured consistency through sync_fs. Following pino + * will be used only for fsynced inodes after checkpoint. + */ + try_to_fix_pino(inode); + goto out; + } sync_nodes: - sync_node_pages(sbi, ino, &wbc); - - if (need_inode_block_update(sbi, ino)) { - mark_inode_dirty_sync(inode); - ret = f2fs_write_inode(inode, NULL); - if (ret) - goto out; - goto sync_nodes; - } + sync_node_pages(sbi, ino, &wbc); - ret = wait_on_node_pages_writeback(sbi, ino); - if (ret) - goto out; + /* if cp_error was enabled, we should avoid infinite loop */ + if (unlikely(f2fs_cp_error(sbi))) + goto out; - /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, ino, APPEND_INO); - clear_inode_flag(fi, FI_APPEND_WRITE); -flush_out: - remove_dirty_inode(sbi, ino, UPDATE_INO); - clear_inode_flag(fi, FI_UPDATE_WRITE); - ret = f2fs_issue_flush(F2FS_I_SB(inode)); + if (need_inode_block_update(sbi, ino)) { + mark_inode_dirty_sync(inode); + f2fs_write_inode(inode, NULL); + goto sync_nodes; } + + ret = wait_on_node_pages_writeback(sbi, ino); + if (ret) + goto out; + + /* once recovery info is written, don't need to tack this */ + remove_dirty_inode(sbi, ino, APPEND_INO); + clear_inode_flag(fi, FI_APPEND_WRITE); +flush_out: + remove_dirty_inode(sbi, ino, UPDATE_INO); + clear_inode_flag(fi, FI_UPDATE_WRITE); + ret = f2fs_issue_flush(sbi); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); + f2fs_trace_ios(NULL, NULL, 1); return ret; } @@ -296,7 +325,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto fail; /* handle inline data case */ - if (f2fs_has_inline_data(inode)) { + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) { if (whence == SEEK_HOLE) data_ofs = isize; goto found; @@ -327,7 +356,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; dn.ofs_in_node++, pgofs++, - data_ofs = pgofs << PAGE_CACHE_SHIFT) { + data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { block_t blkaddr; blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); @@ -374,6 +403,15 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { + struct inode *inode = file_inode(file); + + /* we don't need to use inline_data strictly */ + if (f2fs_has_inline_data(inode)) { + int err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } + file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; return 0; @@ -394,7 +432,8 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) if (blkaddr == NULL_ADDR) continue; - update_extent_cache(NULL_ADDR, dn); + dn->data_blkaddr = NULL_ADDR; + update_extent_cache(dn); invalidate_blocks(sbi, blkaddr); nr_free++; } @@ -415,20 +454,17 @@ void truncate_data_blocks(struct dnode_of_data *dn) truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); } -static void truncate_partial_data_page(struct inode *inode, u64 from) +static int truncate_partial_data_page(struct inode *inode, u64 from) { unsigned offset = from & (PAGE_CACHE_SIZE - 1); struct page *page; - if (f2fs_has_inline_data(inode)) - return truncate_inline_data(inode, from); - if (!offset) - return; + return 0; page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false); if (IS_ERR(page)) - return; + return 0; lock_page(page); if (unlikely(!PageUptodate(page) || @@ -438,9 +474,9 @@ static void truncate_partial_data_page(struct inode *inode, u64 from) f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); set_page_dirty(page); - out: f2fs_put_page(page, 1); + return 0; } int truncate_blocks(struct inode *inode, u64 from, bool lock) @@ -450,27 +486,32 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) struct dnode_of_data dn; pgoff_t free_from; int count = 0, err = 0; + struct page *ipage; trace_f2fs_truncate_blocks_enter(inode, from); - if (f2fs_has_inline_data(inode)) - goto done; - - free_from = (pgoff_t) - ((from + blocksize - 1) >> (sbi->log_blocksize)); + free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); if (lock) f2fs_lock_op(sbi); - set_new_dnode(&dn, inode, NULL, NULL, 0); + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; + } + + if (f2fs_has_inline_data(inode)) { + f2fs_put_page(ipage, 1); + goto out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); if (err) { if (err == -ENOENT) goto free_next; - if (lock) - f2fs_unlock_op(sbi); - trace_f2fs_truncate_blocks_exit(inode, err); - return err; + goto out; } count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); @@ -486,11 +527,13 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) f2fs_put_dnode(&dn); free_next: err = truncate_inode_blocks(inode, free_from); +out: if (lock) f2fs_unlock_op(sbi); -done: + /* lastly zero out the first data page */ - truncate_partial_data_page(inode, from); + if (!err) + err = truncate_partial_data_page(inode, from); trace_f2fs_truncate_blocks_exit(inode, err); return err; @@ -504,6 +547,12 @@ void f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); + /* we should check inline_data size */ + if (f2fs_has_inline_data(inode) && !f2fs_may_inline(inode)) { + if (f2fs_convert_inline_inode(inode)) + return; + } + if (!truncate_blocks(inode, i_size_read(inode), true)) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); @@ -561,10 +610,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (attr->ia_valid & ATTR_SIZE) { - err = f2fs_convert_inline_data(inode, attr->ia_size, NULL); - if (err) - return err; - if (attr->ia_size != i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); f2fs_truncate(inode); @@ -665,9 +710,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) if (offset >= inode->i_size) return ret; - ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL); - if (ret) - return ret; + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -721,9 +768,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (ret) return ret; - ret = f2fs_convert_inline_data(inode, offset + len, NULL); - if (ret) - return ret; + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -792,6 +841,19 @@ static long f2fs_fallocate(struct file *file, int mode, return ret; } +static int f2fs_release_file(struct inode *inode, struct file *filp) +{ + /* some remained atomic pages should discarded */ + if (f2fs_is_atomic_file(inode)) + commit_inmem_pages(inode, true); + if (f2fs_is_volatile_file(inode)) { + set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + filemap_fdatawrite(inode->i_mapping); + clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + } + return 0; +} + #define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) #define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) @@ -862,19 +924,28 @@ out: return ret; } +static int f2fs_ioc_getversion(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + return put_user(inode->i_generation, (int __user *)arg); +} + static int f2fs_ioc_start_atomic_write(struct file *filp) { struct inode *inode = file_inode(filp); - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (!inode_owner_or_capable(inode)) return -EACCES; - f2fs_balance_fs(sbi); + f2fs_balance_fs(F2FS_I_SB(inode)); + + if (f2fs_is_atomic_file(inode)) + return 0; set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL); + return f2fs_convert_inline_inode(inode); } static int f2fs_ioc_commit_atomic_write(struct file *filp) @@ -897,6 +968,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); mnt_drop_write_file(filp); + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); return ret; } @@ -907,10 +979,56 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (f2fs_is_volatile_file(inode)) + return 0; + set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + + return f2fs_convert_inline_inode(inode); +} + +static int f2fs_ioc_release_volatile_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (!f2fs_is_volatile_file(inode)) + return 0; + + punch_hole(inode, 0, F2FS_BLKSIZE); return 0; } +static int f2fs_ioc_abort_volatile_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + f2fs_balance_fs(F2FS_I_SB(inode)); + + if (f2fs_is_atomic_file(inode)) { + commit_inmem_pages(inode, false); + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + } + + if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + filemap_fdatawrite(inode->i_mapping); + set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + } + mnt_drop_write_file(filp); + return ret; +} + static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -948,12 +1066,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_getflags(filp, arg); case F2FS_IOC_SETFLAGS: return f2fs_ioc_setflags(filp, arg); + case F2FS_IOC_GETVERSION: + return f2fs_ioc_getversion(filp, arg); case F2FS_IOC_START_ATOMIC_WRITE: return f2fs_ioc_start_atomic_write(filp); case F2FS_IOC_COMMIT_ATOMIC_WRITE: return f2fs_ioc_commit_atomic_write(filp); case F2FS_IOC_START_VOLATILE_WRITE: return f2fs_ioc_start_volatile_write(filp); + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + return f2fs_ioc_release_volatile_write(filp); + case F2FS_IOC_ABORT_VOLATILE_WRITE: + return f2fs_ioc_abort_volatile_write(filp); case FITRIM: return f2fs_ioc_fitrim(filp, arg); default: @@ -985,6 +1109,7 @@ const struct file_operations f2fs_file_operations = { .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .open = generic_file_open, + .release = f2fs_release_file, .mmap = f2fs_file_mmap, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 2a8f4acdb86b..76adbc3641f1 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -24,8 +24,6 @@ #include "gc.h" #include <trace/events/f2fs.h> -static struct kmem_cache *winode_slab; - static int gc_thread_func(void *data) { struct f2fs_sb_info *sbi = data; @@ -46,7 +44,7 @@ static int gc_thread_func(void *data) break; if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { - wait_ms = increase_sleep_time(gc_th, wait_ms); + increase_sleep_time(gc_th, &wait_ms); continue; } @@ -67,15 +65,15 @@ static int gc_thread_func(void *data) continue; if (!is_idle(sbi)) { - wait_ms = increase_sleep_time(gc_th, wait_ms); + increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); continue; } if (has_enough_invalid_blocks(sbi)) - wait_ms = decrease_sleep_time(gc_th, wait_ms); + decrease_sleep_time(gc_th, &wait_ms); else - wait_ms = increase_sleep_time(gc_th, wait_ms); + increase_sleep_time(gc_th, &wait_ms); stat_inc_bggc_count(sbi); @@ -96,8 +94,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - if (!test_opt(sbi, BG_GC)) - goto out; gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; @@ -340,37 +336,39 @@ static const struct victim_selection default_v_ops = { .get_victim = get_victim_by_default, }; -static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist) +static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino) { struct inode_entry *ie; - list_for_each_entry(ie, ilist, list) - if (ie->inode->i_ino == ino) - return ie->inode; + ie = radix_tree_lookup(&gc_list->iroot, ino); + if (ie) + return ie->inode; return NULL; } -static void add_gc_inode(struct inode *inode, struct list_head *ilist) +static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) { struct inode_entry *new_ie; - if (inode == find_gc_inode(inode->i_ino, ilist)) { + if (inode == find_gc_inode(gc_list, inode->i_ino)) { iput(inode); return; } - - new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS); + new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); new_ie->inode = inode; - list_add_tail(&new_ie->list, ilist); + + f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); + list_add_tail(&new_ie->list, &gc_list->ilist); } -static void put_gc_inode(struct list_head *ilist) +static void put_gc_inode(struct gc_inode_list *gc_list) { struct inode_entry *ie, *next_ie; - list_for_each_entry_safe(ie, next_ie, ilist, list) { + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { + radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); iput(ie->inode); list_del(&ie->list); - kmem_cache_free(winode_slab, ie); + kmem_cache_free(inode_entry_slab, ie); } } @@ -553,7 +551,7 @@ out: * the victim data block is ignored. */ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, - struct list_head *ilist, unsigned int segno, int gc_type) + struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; @@ -605,27 +603,27 @@ next_step: data_page = find_data_page(inode, start_bidx + ofs_in_node, false); - if (IS_ERR(data_page)) - goto next_iput; + if (IS_ERR(data_page)) { + iput(inode); + continue; + } f2fs_put_page(data_page, 0); - add_gc_inode(inode, ilist); - } else { - inode = find_gc_inode(dni.ino, ilist); - if (inode) { - start_bidx = start_bidx_of_node(nofs, - F2FS_I(inode)); - data_page = get_lock_data_page(inode, + add_gc_inode(gc_list, inode); + continue; + } + + /* phase 3 */ + inode = find_gc_inode(gc_list, dni.ino); + if (inode) { + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + data_page = get_lock_data_page(inode, start_bidx + ofs_in_node); - if (IS_ERR(data_page)) - continue; - move_data_page(inode, data_page, gc_type); - stat_inc_data_blk_count(sbi, 1); - } + if (IS_ERR(data_page)) + continue; + move_data_page(inode, data_page, gc_type); + stat_inc_data_blk_count(sbi, 1); } - continue; -next_iput: - iput(inode); } if (++phase < 4) @@ -646,18 +644,20 @@ next_iput: } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, - int gc_type, int type) + int gc_type) { struct sit_info *sit_i = SIT_I(sbi); int ret; + mutex_lock(&sit_i->sentry_lock); - ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS); + ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, + NO_CHECK_TYPE, LFS); mutex_unlock(&sit_i->sentry_lock); return ret; } static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, - struct list_head *ilist, int gc_type) + struct gc_inode_list *gc_list, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; @@ -675,7 +675,7 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, gc_node_segment(sbi, sum->entries, segno, gc_type); break; case SUM_TYPE_DATA: - gc_data_segment(sbi, sum->entries, ilist, segno, gc_type); + gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type); break; } blk_finish_plug(&plug); @@ -688,16 +688,17 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, int f2fs_gc(struct f2fs_sb_info *sbi) { - struct list_head ilist; unsigned int segno, i; int gc_type = BG_GC; int nfree = 0; int ret = -1; - struct cp_control cpc = { - .reason = CP_SYNC, + struct cp_control cpc; + struct gc_inode_list gc_list = { + .ilist = LIST_HEAD_INIT(gc_list.ilist), + .iroot = RADIX_TREE_INIT(GFP_NOFS), }; - INIT_LIST_HEAD(&ilist); + cpc.reason = __get_cp_reason(sbi); gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; @@ -709,7 +710,7 @@ gc_more: write_checkpoint(sbi, &cpc); } - if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + if (!__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; @@ -719,7 +720,7 @@ gc_more: META_SSA); for (i = 0; i < sbi->segs_per_sec; i++) - do_garbage_collect(sbi, segno + i, &ilist, gc_type); + do_garbage_collect(sbi, segno + i, &gc_list, gc_type); if (gc_type == FG_GC) { sbi->cur_victim_sec = NULL_SEGNO; @@ -735,7 +736,7 @@ gc_more: stop: mutex_unlock(&sbi->gc_mutex); - put_gc_inode(&ilist); + put_gc_inode(&gc_list); return ret; } @@ -743,17 +744,3 @@ void build_gc_manager(struct f2fs_sb_info *sbi) { DIRTY_I(sbi)->v_ops = &default_v_ops; } - -int __init create_gc_caches(void) -{ - winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", - sizeof(struct inode_entry)); - if (!winode_slab) - return -ENOMEM; - return 0; -} - -void destroy_gc_caches(void) -{ - kmem_cache_destroy(winode_slab); -} diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 16f0b2b22999..b4a65be9f7d3 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -35,9 +35,9 @@ struct f2fs_gc_kthread { unsigned int gc_idle; }; -struct inode_entry { - struct list_head list; - struct inode *inode; +struct gc_inode_list { + struct list_head ilist; + struct radix_tree_root iroot; }; /* @@ -64,26 +64,26 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; } -static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) +static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, + long *wait) { - if (wait == gc_th->no_gc_sleep_time) - return wait; + if (*wait == gc_th->no_gc_sleep_time) + return; - wait += gc_th->min_sleep_time; - if (wait > gc_th->max_sleep_time) - wait = gc_th->max_sleep_time; - return wait; + *wait += gc_th->min_sleep_time; + if (*wait > gc_th->max_sleep_time) + *wait = gc_th->max_sleep_time; } -static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) +static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, + long *wait) { - if (wait == gc_th->no_gc_sleep_time) - wait = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) + *wait = gc_th->max_sleep_time; - wait -= gc_th->min_sleep_time; - if (wait <= gc_th->min_sleep_time) - wait = gc_th->min_sleep_time; - return wait; + *wait -= gc_th->min_sleep_time; + if (*wait <= gc_th->min_sleep_time) + *wait = gc_th->min_sleep_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 88036fd75797..1484c00133cd 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -15,35 +15,50 @@ bool f2fs_may_inline(struct inode *inode) { - block_t nr_blocks; - loff_t i_size; - if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) return false; if (f2fs_is_atomic_file(inode)) return false; - nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2; - if (inode->i_blocks > nr_blocks) + if (!S_ISREG(inode->i_mode)) return false; - i_size = i_size_read(inode); - if (i_size > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA) return false; return true; } -int f2fs_read_inline_data(struct inode *inode, struct page *page) +void read_inline_data(struct page *page, struct page *ipage) { - struct page *ipage; void *src_addr, *dst_addr; - if (page->index) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - goto out; - } + if (PageUptodate(page)) + return; + + f2fs_bug_on(F2FS_P_SB(page), page->index); + + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + + /* Copy the whole inline data block */ + src_addr = inline_data_addr(ipage); + dst_addr = kmap_atomic(page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + flush_dcache_page(page); + kunmap_atomic(dst_addr); + SetPageUptodate(page); +} + +static void truncate_inline_data(struct page *ipage) +{ + f2fs_wait_on_page_writeback(ipage, NODE); + memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA); +} + +int f2fs_read_inline_data(struct inode *inode, struct page *page) +{ + struct page *ipage; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { @@ -51,112 +66,115 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) return PTR_ERR(ipage); } - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + if (!f2fs_has_inline_data(inode)) { + f2fs_put_page(ipage, 1); + return -EAGAIN; + } - /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); - dst_addr = kmap(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - kunmap(page); - f2fs_put_page(ipage, 1); + if (page->index) + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + else + read_inline_data(page, ipage); -out: SetPageUptodate(page); + f2fs_put_page(ipage, 1); unlock_page(page); - return 0; } -static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { - int err = 0; - struct page *ipage; - struct dnode_of_data dn; void *src_addr, *dst_addr; - block_t new_blk_addr; - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_io_info fio = { .type = DATA, .rw = WRITE_SYNC | REQ_PRIO, }; + int dirty, err; - f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto out; - } + f2fs_bug_on(F2FS_I_SB(dn->inode), page->index); - /* someone else converted inline_data already */ - if (!f2fs_has_inline_data(inode)) - goto out; + if (!f2fs_exist_data(dn->inode)) + goto clear_out; - /* - * i_addr[0] is not used for inline data, - * so reserving new block will not destroy inline data - */ - set_new_dnode(&dn, inode, ipage, NULL, 0); - err = f2fs_reserve_block(&dn, 0); + err = f2fs_reserve_block(dn, 0); if (err) - goto out; + return err; f2fs_wait_on_page_writeback(page, DATA); + + if (PageUptodate(page)) + goto no_update; + zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); - dst_addr = kmap(page); + src_addr = inline_data_addr(dn->inode_page); + dst_addr = kmap_atomic(page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - kunmap(page); + flush_dcache_page(page); + kunmap_atomic(dst_addr); SetPageUptodate(page); +no_update: + /* clear dirty state */ + dirty = clear_page_dirty_for_io(page); /* write data page to try to make data consistent */ set_page_writeback(page); - write_data_page(page, &dn, &new_blk_addr, &fio); - update_extent_cache(new_blk_addr, &dn); + fio.blk_addr = dn->data_blkaddr; + write_data_page(page, dn, &fio); + update_extent_cache(dn); f2fs_wait_on_page_writeback(page, DATA); + if (dirty) + inode_dec_dirty_pages(dn->inode); - /* clear inline data and flag after data writeback */ - zero_user_segment(ipage, INLINE_DATA_OFFSET, - INLINE_DATA_OFFSET + MAX_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - stat_dec_inline_inode(inode); + /* this converted inline_data should be recovered. */ + set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); - sync_inode_page(&dn); - f2fs_put_dnode(&dn); -out: - f2fs_unlock_op(sbi); - return err; + /* clear inline data and flag after data writeback */ + truncate_inline_data(dn->inode_page); +clear_out: + stat_dec_inline_inode(dn->inode); + f2fs_clear_inline_inode(dn->inode); + sync_inode_page(dn); + f2fs_put_dnode(dn); + return 0; } -int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size, - struct page *page) +int f2fs_convert_inline_inode(struct inode *inode) { - struct page *new_page = page; - int err; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + struct page *ipage, *page; + int err = 0; - if (!f2fs_has_inline_data(inode)) - return 0; - else if (to_size <= MAX_INLINE_DATA) - return 0; + page = grab_cache_page(inode->i_mapping, 0); + if (!page) + return -ENOMEM; + + f2fs_lock_op(sbi); - if (!page || page->index != 0) { - new_page = grab_cache_page(inode->i_mapping, 0); - if (!new_page) - return -ENOMEM; + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out; } - err = __f2fs_convert_inline_data(inode, new_page); - if (!page || page->index != 0) - f2fs_put_page(new_page, 1); + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) + err = f2fs_convert_inline_page(&dn, page); + + f2fs_put_dnode(&dn); +out: + f2fs_unlock_op(sbi); + + f2fs_put_page(page, 1); return err; } -int f2fs_write_inline_data(struct inode *inode, - struct page *page, unsigned size) +int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; - struct page *ipage; struct dnode_of_data dn; int err; @@ -164,49 +182,28 @@ int f2fs_write_inline_data(struct inode *inode, err = get_dnode_of_data(&dn, 0, LOOKUP_NODE); if (err) return err; - ipage = dn.inode_page; - f2fs_wait_on_page_writeback(ipage, NODE); - zero_user_segment(ipage, INLINE_DATA_OFFSET, - INLINE_DATA_OFFSET + MAX_INLINE_DATA); - src_addr = kmap(page); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, size); - kunmap(page); - - /* Release the first data block if it is allocated */ if (!f2fs_has_inline_data(inode)) { - truncate_data_blocks_range(&dn, 1); - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - stat_inc_inline_inode(inode); + f2fs_put_dnode(&dn); + return -EAGAIN; } + f2fs_bug_on(F2FS_I_SB(inode), page->index); + + f2fs_wait_on_page_writeback(dn.inode_page, NODE); + src_addr = kmap_atomic(page); + dst_addr = inline_data_addr(dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + kunmap_atomic(src_addr); + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + sync_inode_page(&dn); f2fs_put_dnode(&dn); - return 0; } -void truncate_inline_data(struct inode *inode, u64 from) -{ - struct page *ipage; - - if (from >= MAX_INLINE_DATA) - return; - - ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(ipage)) - return; - - f2fs_wait_on_page_writeback(ipage, NODE); - - zero_user_segment(ipage, INLINE_DATA_OFFSET + from, - INLINE_DATA_OFFSET + MAX_INLINE_DATA); - set_page_dirty(ipage); - f2fs_put_page(ipage, 1); -} - bool recover_inline_data(struct inode *inode, struct page *npage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -236,6 +233,10 @@ process_inline: src_addr = inline_data_addr(npage); dst_addr = inline_data_addr(ipage); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + update_inode(inode, ipage); f2fs_put_page(ipage, 1); return true; @@ -244,16 +245,279 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - f2fs_wait_on_page_writeback(ipage, NODE); - zero_user_segment(ipage, INLINE_DATA_OFFSET, - INLINE_DATA_OFFSET + MAX_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + truncate_inline_data(ipage); + f2fs_clear_inline_inode(inode); update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { truncate_blocks(inode, 0, false); - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); goto process_inline; } return false; } + +struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, + struct qstr *name, struct page **res_page) +{ + struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dir_entry *de; + struct f2fs_dentry_ptr d; + struct page *ipage; + + ipage = get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return NULL; + + inline_dentry = inline_data_addr(ipage); + + make_dentry_ptr(&d, (void *)inline_dentry, 2); + de = find_target_dentry(name, NULL, &d); + + unlock_page(ipage); + if (de) + *res_page = ipage; + else + f2fs_put_page(ipage, 0); + + /* + * For the most part, it should be a bug when name_len is zero. + * We stop here for figuring out where the bugs has occurred. + */ + f2fs_bug_on(sbi, d.max < 0); + return de; +} + +struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir, + struct page **p) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + struct f2fs_dir_entry *de; + struct f2fs_inline_dentry *dentry_blk; + + ipage = get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return NULL; + + dentry_blk = inline_data_addr(ipage); + de = &dentry_blk->dentry[1]; + *p = ipage; + unlock_page(ipage); + return de; +} + +int make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage) +{ + struct f2fs_inline_dentry *dentry_blk; + struct f2fs_dentry_ptr d; + + dentry_blk = inline_data_addr(ipage); + + make_dentry_ptr(&d, (void *)dentry_blk, 2); + do_make_empty_dir(inode, parent, &d); + + set_page_dirty(ipage); + + /* update i_size to MAX_INLINE_DATA */ + if (i_size_read(inode) < MAX_INLINE_DATA) { + i_size_write(inode, MAX_INLINE_DATA); + set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); + } + return 0; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct page *page; + struct dnode_of_data dn; + struct f2fs_dentry_block *dentry_blk; + int err; + + page = grab_cache_page(dir->i_mapping, 0); + if (!page) + return -ENOMEM; + + set_new_dnode(&dn, dir, ipage, NULL, 0); + err = f2fs_reserve_block(&dn, 0); + if (err) + goto out; + + f2fs_wait_on_page_writeback(page, DATA); + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + + dentry_blk = kmap_atomic(page); + + /* copy data from inline dentry block to new dentry block */ + memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, + INLINE_DENTRY_BITMAP_SIZE); + memcpy(dentry_blk->dentry, inline_dentry->dentry, + sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); + memcpy(dentry_blk->filename, inline_dentry->filename, + NR_INLINE_DENTRY * F2FS_SLOT_LEN); + + kunmap_atomic(dentry_blk); + SetPageUptodate(page); + set_page_dirty(page); + + /* clear inline dir and flag after data writeback */ + truncate_inline_data(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + + if (i_size_read(dir) < PAGE_CACHE_SIZE) { + i_size_write(dir, PAGE_CACHE_SIZE); + set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } + + sync_inode_page(&dn); +out: + f2fs_put_page(page, 1); + return err; +} + +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, + struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + unsigned int bit_pos; + f2fs_hash_t name_hash; + struct f2fs_dir_entry *de; + size_t namelen = name->len; + struct f2fs_inline_dentry *dentry_blk = NULL; + int slots = GET_DENTRY_SLOTS(namelen); + struct page *page; + int err = 0; + int i; + + name_hash = f2fs_dentry_hash(name); + + ipage = get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + dentry_blk = inline_data_addr(ipage); + bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, + slots, NR_INLINE_DENTRY); + if (bit_pos >= NR_INLINE_DENTRY) { + err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); + if (!err) + err = -EAGAIN; + goto out; + } + + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, name, ipage); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + + f2fs_wait_on_page_writeback(ipage, NODE); + de = &dentry_blk->dentry[bit_pos]; + de->hash_code = name_hash; + de->name_len = cpu_to_le16(namelen); + memcpy(dentry_blk->filename[bit_pos], name->name, name->len); + de->ino = cpu_to_le32(inode->i_ino); + set_de_type(de, inode); + for (i = 0; i < slots; i++) + test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + set_page_dirty(ipage); + + /* we don't need to mark_inode_dirty now */ + F2FS_I(inode)->i_pino = dir->i_ino; + update_inode(inode, page); + f2fs_put_page(page, 1); + + update_parent_metadata(dir, inode, 0); +fail: + up_write(&F2FS_I(inode)->i_sem); + + if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { + update_inode(dir, ipage); + clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); + } +out: + f2fs_put_page(ipage, 1); + return err; +} + +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode) +{ + struct f2fs_inline_dentry *inline_dentry; + int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + unsigned int bit_pos; + int i; + + lock_page(page); + f2fs_wait_on_page_writeback(page, NODE); + + inline_dentry = inline_data_addr(page); + bit_pos = dentry - inline_dentry->dentry; + for (i = 0; i < slots; i++) + test_and_clear_bit_le(bit_pos + i, + &inline_dentry->dentry_bitmap); + + set_page_dirty(page); + + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + if (inode) + f2fs_drop_nlink(dir, inode, page); + + f2fs_put_page(page, 1); +} + +bool f2fs_empty_inline_dir(struct inode *dir) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct page *ipage; + unsigned int bit_pos = 2; + struct f2fs_inline_dentry *dentry_blk; + + ipage = get_node_page(sbi, dir->i_ino); + if (IS_ERR(ipage)) + return false; + + dentry_blk = inline_data_addr(ipage); + bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, + NR_INLINE_DENTRY, + bit_pos); + + f2fs_put_page(ipage, 1); + + if (bit_pos < NR_INLINE_DENTRY) + return false; + + return true; +} + +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + struct f2fs_inline_dentry *inline_dentry = NULL; + struct page *ipage = NULL; + struct f2fs_dentry_ptr d; + + if (ctx->pos == NR_INLINE_DENTRY) + return 0; + + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + + inline_dentry = inline_data_addr(ipage); + + make_dentry_ptr(&d, (void *)inline_dentry, 2); + + if (!f2fs_fill_dentries(ctx, &d, 0)) + ctx->pos = NR_INLINE_DENTRY; + + f2fs_put_page(ipage, 1); + return 0; +} diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0deead4505e7..2d002e3738a7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -67,6 +67,25 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) } } +static void __recover_inline_status(struct inode *inode, struct page *ipage) +{ + void *inline_data = inline_data_addr(ipage); + __le32 *start = inline_data; + __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + + while (start < end) { + if (*start++) { + f2fs_wait_on_page_writeback(ipage, NODE); + + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); + set_page_dirty(ipage); + return; + } + } + return; +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -114,10 +133,18 @@ static int do_read_inode(struct inode *inode) get_extent_info(&fi->ext, ri->i_ext); get_inline_info(fi, ri); + /* check data exist */ + if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) + __recover_inline_status(inode, node_page); + /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); f2fs_put_page(node_page, 1); + + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + return 0; } @@ -156,7 +183,7 @@ make_now: inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; @@ -270,7 +297,7 @@ void f2fs_evict_inode(struct inode *inode) nid_t xnid = F2FS_I(inode)->i_xattr_nid; /* some remained atomic pages should discarded */ - if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + if (f2fs_is_atomic_file(inode)) commit_inmem_pages(inode, true); trace_f2fs_evict_inode(inode); @@ -295,11 +322,12 @@ void f2fs_evict_inode(struct inode *inode) f2fs_lock_op(sbi); remove_inode_page(inode); - stat_dec_inline_inode(inode); f2fs_unlock_op(sbi); sb_end_intwrite(inode->i_sb); no_delete: + stat_dec_inline_dir(inode); + stat_dec_inline_inode(inode); invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); @@ -325,8 +353,9 @@ void handle_failed_inode(struct inode *inode) f2fs_truncate(inode); remove_inode_page(inode); - stat_dec_inline_inode(inode); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + clear_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); alloc_nid_failed(sbi, inode->i_ino); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 0d2526e5aa11..e79639a9787a 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -54,6 +54,12 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_free = true; goto out; } + + if (f2fs_may_inline(inode)) + set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + if (test_opt(sbi, INLINE_DENTRY) && S_ISDIR(inode->i_mode)) + set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + trace_f2fs_new_inode(inode, 0); mark_inode_dirty(inode); return inode; @@ -129,8 +135,12 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, alloc_nid_done(sbi, ino); + stat_inc_inline_inode(inode); d_instantiate(dentry, inode); unlock_new_inode(inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); return 0; out: handle_failed_inode(inode); @@ -157,6 +167,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_unlock_op(sbi); d_instantiate(dentry, inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); return 0; out: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); @@ -187,14 +200,12 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, de = f2fs_find_entry(dir, &dentry->d_name, &page); if (de) { nid_t ino = le32_to_cpu(de->ino); - kunmap(page); + f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); inode = f2fs_iget(dir->i_sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); - - stat_inc_inline_inode(inode); } return d_splice_alias(inode, dentry); @@ -219,15 +230,18 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) err = acquire_orphan_inode(sbi); if (err) { f2fs_unlock_op(sbi); - kunmap(page); + f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); goto fail; } - f2fs_delete_entry(de, page, inode); + f2fs_delete_entry(de, page, dir, inode); f2fs_unlock_op(sbi); /* In order to evict this inode, we set it dirty */ mark_inode_dirty(inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); fail: trace_f2fs_unlink_exit(inode, err); return err; @@ -261,6 +275,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); unlock_new_inode(inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); return err; out: handle_failed_inode(inode); @@ -282,7 +299,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); + mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); set_inode_flag(F2FS_I(inode), FI_INC_LINK); f2fs_lock_op(sbi); @@ -291,11 +308,14 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; f2fs_unlock_op(sbi); + stat_inc_inline_dir(inode); alloc_nid_done(sbi, inode->i_ino); d_instantiate(dentry, inode); unlock_new_inode(inode); + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); return 0; out_fail: @@ -338,8 +358,12 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); + d_instantiate(dentry, inode); unlock_new_inode(inode); + + if (IS_DIRSYNC(dir)) + f2fs_sync_fs(sbi->sb, 1); return 0; out: handle_failed_inode(inode); @@ -435,7 +459,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); - f2fs_delete_entry(old_entry, old_page, NULL); + f2fs_delete_entry(old_entry, old_page, old_dir, NULL); if (old_dir_entry) { if (old_dir != new_dir) { @@ -443,7 +467,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir_page, new_dir); update_inode_page(old_inode); } else { - kunmap(old_dir_page); + f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } drop_nlink(old_dir); @@ -452,19 +476,22 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + f2fs_sync_fs(sbi->sb, 1); return 0; put_out_dir: f2fs_unlock_op(sbi); - kunmap(new_page); + f2fs_dentry_kunmap(new_dir, new_page); f2fs_put_page(new_page, 0); out_dir: if (old_dir_entry) { - kunmap(old_dir_page); + f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } out_old: - kunmap(old_page); + f2fs_dentry_kunmap(old_dir, old_page); f2fs_put_page(old_page, 0); out: return err; @@ -588,6 +615,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, update_inode_page(new_dir); f2fs_unlock_op(sbi); + + if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) + f2fs_sync_fs(sbi->sb, 1); return 0; out_undo: /* Still we may fail to recover name info of f2fs_inode here */ @@ -596,19 +626,19 @@ out_unlock: f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { - kunmap(new_dir_page); + f2fs_dentry_kunmap(new_inode, new_dir_page); f2fs_put_page(new_dir_page, 0); } out_old_dir: if (old_dir_entry) { - kunmap(old_dir_page); + f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } out_new: - kunmap(new_page); + f2fs_dentry_kunmap(new_dir, new_page); f2fs_put_page(new_page, 0); out_old: - kunmap(old_page); + f2fs_dentry_kunmap(old_dir, old_page); f2fs_put_page(old_page, 0); out: return err; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 44b8afef43d9..97bd9d3db882 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,6 +19,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "trace.h" #include <trace/events/f2fs.h> #define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) @@ -31,22 +32,39 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct sysinfo val; + unsigned long avail_ram; unsigned long mem_size = 0; bool res = false; si_meminfo(&val); - /* give 25%, 25%, 50% memory for each components respectively */ + + /* only uses low memory */ + avail_ram = val.totalram - val.totalhigh; + + /* give 25%, 25%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> 12; - res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); + mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> + PAGE_CACHE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { - mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12; - res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2); + mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> + PAGE_CACHE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { if (sbi->sb->s_bdi->dirty_exceeded) return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); - res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1); + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == INO_ENTRIES) { + int i; + + for (i = 0; i <= UPDATE_INO; i++) + mem_size += (sbi->im[i].ino_num * + sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else { + if (sbi->sb->s_bdi->dirty_exceeded) + return false; } return res; } @@ -131,7 +149,7 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, if (get_nat_flag(ne, IS_DIRTY)) return; -retry: + head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); @@ -140,11 +158,7 @@ retry: INIT_LIST_HEAD(&head->set_list); head->set = set; head->entry_cnt = 0; - - if (radix_tree_insert(&nm_i->nat_set_root, set, head)) { - cond_resched(); - goto retry; - } + f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } list_move_tail(&ne->list, &head->entry_list); nm_i->dirty_nat_cnt++; @@ -155,7 +169,7 @@ retry: static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, struct nat_entry *ne) { - nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK; + nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; head = radix_tree_lookup(&nm_i->nat_set_root, set); @@ -180,11 +194,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) struct nat_entry *e; bool is_cp = true; - read_lock(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e && !get_nat_flag(e, IS_CHECKPOINTED)) is_cp = false; - read_unlock(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return is_cp; } @@ -194,11 +208,11 @@ bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino) struct nat_entry *e; bool fsynced = false; - read_lock(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_FSYNCED_INODE)) fsynced = true; - read_unlock(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return fsynced; } @@ -208,13 +222,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) struct nat_entry *e; bool need_update = true; - read_lock(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ino); if (e && get_nat_flag(e, HAS_LAST_FSYNC) && (get_nat_flag(e, IS_CHECKPOINTED) || get_nat_flag(e, HAS_FSYNCED_INODE))) need_update = false; - read_unlock(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); return need_update; } @@ -222,13 +236,8 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) { struct nat_entry *new; - new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC); - if (!new) - return NULL; - if (radix_tree_insert(&nm_i->nat_root, nid, new)) { - kmem_cache_free(nat_entry_slab, new); - return NULL; - } + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC); + f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); nat_reset_flag(new); @@ -241,18 +250,14 @@ static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, struct f2fs_nat_entry *ne) { struct nat_entry *e; -retry: - write_lock(&nm_i->nat_tree_lock); + + down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) { e = grab_nat_entry(nm_i, nid); - if (!e) { - write_unlock(&nm_i->nat_tree_lock); - goto retry; - } node_info_from_raw_nat(&e->ni, ne); } - write_unlock(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -260,16 +265,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; -retry: - write_lock(&nm_i->nat_tree_lock); + + down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { e = grab_nat_entry(nm_i, ni->nid); - if (!e) { - write_unlock(&nm_i->nat_tree_lock); - goto retry; - } - e->ni = *ni; + copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* @@ -277,7 +278,7 @@ retry: * previous nat entry can be remained in nat cache. * So, reinitialize it with new information. */ - e->ni = *ni; + copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); } @@ -310,7 +311,7 @@ retry: set_nat_flag(e, HAS_FSYNCED_INODE, true); set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); } - write_unlock(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); } int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) @@ -320,7 +321,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) if (available_free_memory(sbi, NAT_ENTRIES)) return 0; - write_lock(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); while (nr_shrink && !list_empty(&nm_i->nat_entries)) { struct nat_entry *ne; ne = list_first_entry(&nm_i->nat_entries, @@ -328,7 +329,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) __del_from_nat_cache(nm_i, ne); nr_shrink--; } - write_unlock(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); return nr_shrink; } @@ -347,21 +348,22 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct nat_entry *e; int i; - memset(&ne, 0, sizeof(struct f2fs_nat_entry)); ni->nid = nid; /* Check nat cache */ - read_lock(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (e) { ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); } - read_unlock(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); if (e) return; + memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + /* Check current segment summary */ mutex_lock(&curseg->curseg_mutex); i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); @@ -472,7 +474,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct page *npage[4]; - struct page *parent; + struct page *parent = NULL; int offset[4]; unsigned int noffset[4]; nid_t nids[4]; @@ -489,6 +491,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) if (IS_ERR(npage[0])) return PTR_ERR(npage[0]); } + + /* if inline_data is set, should not report any block indices */ + if (f2fs_has_inline_data(dn->inode) && index) { + err = -EINVAL; + f2fs_put_page(npage[0], 1); + goto release_out; + } + parent = npage[0]; if (level != 0) nids[1] = get_nid(parent, offset[0], true); @@ -586,7 +596,7 @@ static void truncate_node(struct dnode_of_data *dn) } invalidate: clear_node_page_dirty(dn->node_page); - F2FS_SET_SB_DIRT(sbi); + set_sbi_flag(sbi, SBI_IS_DIRTY); f2fs_put_page(dn->node_page, 1); @@ -977,6 +987,10 @@ static int read_node_page(struct page *page, int rw) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; + struct f2fs_io_info fio = { + .type = NODE, + .rw = rw, + }; get_node_info(sbi, page->index, &ni); @@ -988,7 +1002,8 @@ static int read_node_page(struct page *page, int rw) if (PageUptodate(page)) return LOCKED_PAGE; - return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw); + fio.blk_addr = ni.blk_addr; + return f2fs_submit_page_bio(sbi, page, &fio); } /* @@ -1029,11 +1044,11 @@ repeat: err = read_node_page(page, READ_SYNC); if (err < 0) return ERR_PTR(err); - else if (err == LOCKED_PAGE) - goto got_it; + else if (err != LOCKED_PAGE) + lock_page(page); - lock_page(page); if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { + ClearPageUptodate(page); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } @@ -1041,7 +1056,6 @@ repeat: f2fs_put_page(page, 1); goto repeat; } -got_it: return page; } @@ -1269,7 +1283,6 @@ static int f2fs_write_node_page(struct page *page, { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; - block_t new_addr; struct node_info ni; struct f2fs_io_info fio = { .type = NODE, @@ -1278,7 +1291,7 @@ static int f2fs_write_node_page(struct page *page, trace_f2fs_writepage(page, NODE); - if (unlikely(sbi->por_doing)) + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; @@ -1298,16 +1311,24 @@ static int f2fs_write_node_page(struct page *page, return 0; } - if (wbc->for_reclaim) - goto redirty_out; + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } - down_read(&sbi->node_write); set_page_writeback(page); - write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); - set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); + fio.blk_addr = ni.blk_addr; + write_node_page(sbi, page, nid, &fio); + set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); unlock_page(page); + + if (wbc->for_reclaim) + f2fs_submit_merged_bio(sbi, NODE, WRITE); + return 0; redirty_out: @@ -1350,26 +1371,12 @@ static int f2fs_set_node_page_dirty(struct page *page) __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); + f2fs_trace_pid(page); return 1; } return 0; } -static void f2fs_invalidate_node_page(struct page *page, unsigned int offset, - unsigned int length) -{ - struct inode *inode = page->mapping->host; - if (PageDirty(page)) - dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES); - ClearPagePrivate(page); -} - -static int f2fs_release_node_page(struct page *page, gfp_t wait) -{ - ClearPagePrivate(page); - return 1; -} - /* * Structure of the f2fs node operations */ @@ -1377,8 +1384,8 @@ const struct address_space_operations f2fs_node_aops = { .writepage = f2fs_write_node_page, .writepages = f2fs_write_node_pages, .set_page_dirty = f2fs_set_node_page_dirty, - .invalidatepage = f2fs_invalidate_node_page, - .releasepage = f2fs_release_node_page, + .invalidatepage = f2fs_invalidate_page, + .releasepage = f2fs_release_page, }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, @@ -1410,13 +1417,13 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (build) { /* do not add allocated nids */ - read_lock(&nm_i->nat_tree_lock); + down_read(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; - read_unlock(&nm_i->nat_tree_lock); + up_read(&nm_i->nat_tree_lock); if (allocated) return 0; } @@ -1425,15 +1432,22 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) i->nid = nid; i->state = NID_NEW; + if (radix_tree_preload(GFP_NOFS)) { + kmem_cache_free(free_nid_slab, i); + return 0; + } + spin_lock(&nm_i->free_nid_list_lock); if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { spin_unlock(&nm_i->free_nid_list_lock); + radix_tree_preload_end(); kmem_cache_free(free_nid_slab, i); return 0; } list_add_tail(&i->list, &nm_i->free_nid_list); nm_i->fcnt++; spin_unlock(&nm_i->free_nid_list_lock); + radix_tree_preload_end(); return 1; } @@ -1714,80 +1728,41 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) return 0; } -/* - * ra_sum_pages() merge contiguous pages into one bio and submit. - * these pre-read pages are allocated in bd_inode's mapping tree. - */ -static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, - int start, int nrpages) -{ - struct inode *inode = sbi->sb->s_bdev->bd_inode; - struct address_space *mapping = inode->i_mapping; - int i, page_idx = start; - struct f2fs_io_info fio = { - .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO - }; - - for (i = 0; page_idx < start + nrpages; page_idx++, i++) { - /* alloc page in bd_inode for reading node summary info */ - pages[i] = grab_cache_page(mapping, page_idx); - if (!pages[i]) - break; - f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio); - } - - f2fs_submit_merged_bio(sbi, META, READ); - return i; -} - int restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; struct f2fs_summary *sum_entry; - struct inode *inode = sbi->sb->s_bdev->bd_inode; block_t addr; int bio_blocks = MAX_BIO_BLOCKS(sbi); - struct page *pages[bio_blocks]; - int i, idx, last_offset, nrpages, err = 0; + int i, idx, last_offset, nrpages; /* scan the node segment */ last_offset = sbi->blocks_per_seg; addr = START_BLOCK(sbi, segno); sum_entry = &sum->entries[0]; - for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { + for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { nrpages = min(last_offset - i, bio_blocks); /* readahead node pages */ - nrpages = ra_sum_pages(sbi, pages, addr, nrpages); - if (!nrpages) - return -ENOMEM; + ra_meta_pages(sbi, addr, nrpages, META_POR); - for (idx = 0; idx < nrpages; idx++) { - if (err) - goto skip; + for (idx = addr; idx < addr + nrpages; idx++) { + struct page *page = get_meta_page(sbi, idx); - lock_page(pages[idx]); - if (unlikely(!PageUptodate(pages[idx]))) { - err = -EIO; - } else { - rn = F2FS_NODE(pages[idx]); - sum_entry->nid = rn->footer.nid; - sum_entry->version = 0; - sum_entry->ofs_in_node = 0; - sum_entry++; - } - unlock_page(pages[idx]); -skip: - page_cache_release(pages[idx]); + rn = F2FS_NODE(page); + sum_entry->nid = rn->footer.nid; + sum_entry->version = 0; + sum_entry->ofs_in_node = 0; + sum_entry++; + f2fs_put_page(page, 1); } - invalidate_mapping_pages(inode->i_mapping, addr, + invalidate_mapping_pages(META_MAPPING(sbi), addr, addr + nrpages); } - return err; + return 0; } static void remove_nats_in_journal(struct f2fs_sb_info *sbi) @@ -1804,21 +1779,15 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); raw_ne = nat_in_journal(sum, i); -retry: - write_lock(&nm_i->nat_tree_lock); - ne = __lookup_nat_cache(nm_i, nid); - if (ne) - goto found; - ne = grab_nat_entry(nm_i, nid); + down_write(&nm_i->nat_tree_lock); + ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - write_unlock(&nm_i->nat_tree_lock); - goto retry; + ne = grab_nat_entry(nm_i, nid); + node_info_from_raw_nat(&ne->ni, &raw_ne); } - node_info_from_raw_nat(&ne->ni, &raw_ne); -found: __set_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); + up_write(&nm_i->nat_tree_lock); } update_nats_in_cursum(sum, -i); mutex_unlock(&curseg->curseg_mutex); @@ -1889,10 +1858,10 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); - write_lock(&NM_I(sbi)->nat_tree_lock); + down_write(&NM_I(sbi)->nat_tree_lock); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - write_unlock(&NM_I(sbi)->nat_tree_lock); + up_write(&NM_I(sbi)->nat_tree_lock); if (nat_get_blkaddr(ne) == NULL_ADDR) add_free_nid(sbi, nid, false); @@ -1903,10 +1872,10 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, else f2fs_put_page(page, 1); - if (!set->entry_cnt) { - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); - } + f2fs_bug_on(sbi, set->entry_cnt); + + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); } /* @@ -1917,12 +1886,14 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - struct nat_entry_set *setvec[NATVEC_SIZE]; + struct nat_entry_set *setvec[SETVEC_SIZE]; struct nat_entry_set *set, *tmp; unsigned int found; nid_t set_idx = 0; LIST_HEAD(sets); + if (!nm_i->dirty_nat_cnt) + return; /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them @@ -1931,11 +1902,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); - if (!nm_i->dirty_nat_cnt) - return; - while ((found = __gang_lookup_nat_set(nm_i, - set_idx, NATVEC_SIZE, setvec))) { + set_idx, SETVEC_SIZE, setvec))) { unsigned idx; set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) @@ -1973,13 +1941,13 @@ static int init_node_manager(struct f2fs_sb_info *sbi) INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); - INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); - INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC); + INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); + INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); - rwlock_init(&nm_i->nat_tree_lock); + init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP); @@ -2015,6 +1983,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *next_i; struct nat_entry *natvec[NATVEC_SIZE]; + struct nat_entry_set *setvec[SETVEC_SIZE]; nid_t nid = 0; unsigned int found; @@ -2035,16 +2004,32 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ - write_lock(&nm_i->nat_tree_lock); + down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_cache(nm_i, nid, NATVEC_SIZE, natvec))) { unsigned idx; + nid = nat_get_nid(natvec[found - 1]) + 1; for (idx = 0; idx < found; idx++) __del_from_nat_cache(nm_i, natvec[idx]); } f2fs_bug_on(sbi, nm_i->nat_cnt); - write_unlock(&nm_i->nat_tree_lock); + + /* destroy nat set cache */ + nid = 0; + while ((found = __gang_lookup_nat_set(nm_i, + nid, SETVEC_SIZE, setvec))) { + unsigned idx; + + nid = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) { + /* entry_cnt is not zero, when cp_error was occurred */ + f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list)); + radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set); + kmem_cache_free(nat_entry_set_slab, setvec[idx]); + } + } + up_write(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); sbi->nm_info = NULL; @@ -2061,17 +2046,17 @@ int __init create_node_manager_caches(void) free_nid_slab = f2fs_kmem_cache_create("free_nid", sizeof(struct free_nid)); if (!free_nid_slab) - goto destory_nat_entry; + goto destroy_nat_entry; nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set", sizeof(struct nat_entry_set)); if (!nat_entry_set_slab) - goto destory_free_nid; + goto destroy_free_nid; return 0; -destory_free_nid: +destroy_free_nid: kmem_cache_destroy(free_nid_slab); -destory_nat_entry: +destroy_nat_entry: kmem_cache_destroy(nat_entry_slab); fail: return -ENOMEM; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 8d5e6e0dd840..f405bbf2435a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -25,10 +25,19 @@ /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 +#define SETVEC_SIZE 32 /* return value for read_node_page */ #define LOCKED_PAGE 1 +/* For flag in struct node_info */ +enum { + IS_CHECKPOINTED, /* is it checkpointed before? */ + HAS_FSYNCED_INODE, /* is the inode fsynced before? */ + HAS_LAST_FSYNC, /* has the latest node fsync mark? */ + IS_DIRTY, /* this nat entry is dirty? */ +}; + /* * For node information */ @@ -37,18 +46,11 @@ struct node_info { nid_t ino; /* inode number of the node's owner */ block_t blk_addr; /* block address of the node */ unsigned char version; /* version of the node */ -}; - -enum { - IS_CHECKPOINTED, /* is it checkpointed before? */ - HAS_FSYNCED_INODE, /* is the inode fsynced before? */ - HAS_LAST_FSYNC, /* has the latest node fsync mark? */ - IS_DIRTY, /* this nat entry is dirty? */ + unsigned char flag; /* for node information bits */ }; struct nat_entry { struct list_head list; /* for clean or dirty nat list */ - unsigned char flag; /* for node information bits */ struct node_info ni; /* in-memory node information */ }; @@ -63,20 +65,30 @@ struct nat_entry { #define inc_node_version(version) (++version) +static inline void copy_node_info(struct node_info *dst, + struct node_info *src) +{ + dst->nid = src->nid; + dst->ino = src->ino; + dst->blk_addr = src->blk_addr; + dst->version = src->version; + /* should not copy flag here */ +} + static inline void set_nat_flag(struct nat_entry *ne, unsigned int type, bool set) { unsigned char mask = 0x01 << type; if (set) - ne->flag |= mask; + ne->ni.flag |= mask; else - ne->flag &= ~mask; + ne->ni.flag &= ~mask; } static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) { unsigned char mask = 0x01 << type; - return ne->flag & mask; + return ne->ni.flag & mask; } static inline void nat_reset_flag(struct nat_entry *ne) @@ -106,7 +118,9 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ - DIRTY_DENTS /* indicates dirty dentry pages */ + DIRTY_DENTS, /* indicates dirty dentry pages */ + INO_ENTRIES, /* indicates inode entries */ + BASE_CHECK, /* check kernel status */ }; struct nat_entry_set { @@ -192,21 +206,26 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) { unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); - if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) - f2fs_clear_bit(block_off, nm_i->nat_bitmap); - else - f2fs_set_bit(block_off, nm_i->nat_bitmap); + f2fs_change_bit(block_off, nm_i->nat_bitmap); } static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { struct f2fs_node *rn = F2FS_NODE(page); + unsigned int old_flag = 0; + if (reset) memset(rn, 0, sizeof(*rn)); + else + old_flag = le32_to_cpu(rn->footer.flag); + rn->footer.nid = cpu_to_le32(nid); rn->footer.ino = cpu_to_le32(ino); - rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT); + + /* should remain old flag bits such as COLD_BIT_SHIFT */ + rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) | + (old_flag & OFFSET_BIT_MASK)); } static inline void copy_node_footer(struct page *dst, struct page *src) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index ebd013225788..41afb9534bbd 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -111,7 +111,7 @@ retry: iput(einode); goto out_unmap_put; } - f2fs_delete_entry(de, page, einode); + f2fs_delete_entry(de, page, dir, einode); iput(einode); goto retry; } @@ -129,7 +129,7 @@ retry: goto out; out_unmap_put: - kunmap(page); + f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); out_err: iput(dir); @@ -170,13 +170,15 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + ra_meta_pages(sbi, blkaddr, 1, META_POR); + while (1) { struct fsync_inode_entry *entry; if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) return 0; - page = get_meta_page_ra(sbi, blkaddr); + page = get_meta_page(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) break; @@ -227,6 +229,8 @@ next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); f2fs_put_page(page, 1); + + ra_meta_pages_cond(sbi, blkaddr); } f2fs_put_page(page, 1); return err; @@ -342,6 +346,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { + /* + * Deprecated; xattr blocks should be found from cold log. + * But, we should remain this for backward compatibility. + */ recover_xattr_data(inode, page, blkaddr); goto out; } @@ -392,7 +400,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, /* write dummy data page */ recover_data_page(sbi, NULL, &sum, src, dest); - update_extent_cache(dest, &dn); + dn.data_blkaddr = dest; + update_extent_cache(&dn); recovered++; } dn.ofs_in_node++; @@ -436,7 +445,9 @@ static int recover_data(struct f2fs_sb_info *sbi, if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) break; - page = get_meta_page_ra(sbi, blkaddr); + ra_meta_pages_cond(sbi, blkaddr); + + page = get_meta_page(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) { f2fs_put_page(page, 1); @@ -497,7 +508,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) INIT_LIST_HEAD(&inode_list); /* step #1: find fsynced inode numbers */ - sbi->por_doing = true; + set_sbi_flag(sbi, SBI_POR_DOING); /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); @@ -530,7 +541,7 @@ out: truncate_inode_pages_final(META_MAPPING(sbi)); } - sbi->por_doing = false; + clear_sbi_flag(sbi, SBI_POR_DOING); if (err) { discard_next_dnode(sbi, blkaddr); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 923cb76fdc46..daee4ab913da 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,6 +20,7 @@ #include "f2fs.h" #include "segment.h" #include "node.h" +#include "trace.h" #include <trace/events/f2fs.h> #define __reverse_ffz(x) __reverse_ffs(~(x)) @@ -178,17 +179,31 @@ void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *new; + int err; + + SetPagePrivate(page); + f2fs_trace_pid(page); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); /* add atomic page indices to the list */ new->page = page; INIT_LIST_HEAD(&new->list); - +retry: /* increase reference count with clean state */ mutex_lock(&fi->inmem_lock); + err = radix_tree_insert(&fi->inmem_root, page->index, new); + if (err == -EEXIST) { + mutex_unlock(&fi->inmem_lock); + kmem_cache_free(inmem_entry_slab, new); + return; + } else if (err) { + mutex_unlock(&fi->inmem_lock); + goto retry; + } get_page(page); list_add_tail(&new->list, &fi->inmem_pages); + inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); mutex_unlock(&fi->inmem_lock); } @@ -200,32 +215,48 @@ void commit_inmem_pages(struct inode *inode, bool abort) bool submit_bio = false; struct f2fs_io_info fio = { .type = DATA, - .rw = WRITE_SYNC, + .rw = WRITE_SYNC | REQ_PRIO, }; - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); + /* + * The abort is true only when f2fs_evict_inode is called. + * Basically, the f2fs_evict_inode doesn't produce any data writes, so + * that we don't need to call f2fs_balance_fs. + * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this + * inode becomes free by iget_locked in f2fs_iget. + */ + if (!abort) { + f2fs_balance_fs(sbi); + f2fs_lock_op(sbi); + } mutex_lock(&fi->inmem_lock); list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - lock_page(cur->page); - if (!abort && cur->page->mapping == inode->i_mapping) { - f2fs_wait_on_page_writeback(cur->page, DATA); - if (clear_page_dirty_for_io(cur->page)) - inode_dec_dirty_pages(inode); - do_write_data_page(cur->page, &fio); - submit_bio = true; + if (!abort) { + lock_page(cur->page); + if (cur->page->mapping == inode->i_mapping) { + f2fs_wait_on_page_writeback(cur->page, DATA); + if (clear_page_dirty_for_io(cur->page)) + inode_dec_dirty_pages(inode); + do_write_data_page(cur->page, &fio); + submit_bio = true; + } + f2fs_put_page(cur->page, 1); + } else { + put_page(cur->page); } - f2fs_put_page(cur->page, 1); + radix_tree_delete(&fi->inmem_root, cur->page->index); list_del(&cur->list); kmem_cache_free(inmem_entry_slab, cur); + dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); } - if (submit_bio) - f2fs_submit_merged_bio(sbi, DATA, WRITE); mutex_unlock(&fi->inmem_lock); - filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX); - f2fs_unlock_op(sbi); + if (!abort) { + f2fs_unlock_op(sbi); + if (submit_bio) + f2fs_submit_merged_bio(sbi, DATA, WRITE); + } } /* @@ -248,7 +279,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) { /* check the # of cached NAT entries and prefree segments */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || - excess_prefree_segs(sbi)) + excess_prefree_segs(sbi) || + !available_free_memory(sbi, INO_ENTRIES)) f2fs_sync_fs(sbi->sb, true); } @@ -441,21 +473,45 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) } } -static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static void __add_discard_entry(struct f2fs_sb_info *sbi, + struct cp_control *cpc, unsigned int start, unsigned int end) { struct list_head *head = &SM_I(sbi)->discard_list; - struct discard_entry *new; + struct discard_entry *new, *last; + + if (!list_empty(head)) { + last = list_last_entry(head, struct discard_entry, list); + if (START_BLOCK(sbi, cpc->trim_start) + start == + last->blkaddr + last->len) { + last->len += end - start; + goto done; + } + } + + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); + INIT_LIST_HEAD(&new->list); + new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; + new->len = end - start; + list_add_tail(&new->list, head); +done: + SM_I(sbi)->nr_discards += end - start; + cpc->trimmed += end - start; +} + +static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; - unsigned long dmap[entries]; + unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; bool force = (cpc->reason == CP_DISCARD); int i; - if (!force && !test_opt(sbi, DISCARD)) + if (!force && (!test_opt(sbi, DISCARD) || + SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)) return; if (force && !se->valid_blocks) { @@ -473,13 +529,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) } mutex_unlock(&dirty_i->seglist_lock); - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start); - new->len = sbi->blocks_per_seg; - list_add_tail(&new->list, head); - SM_I(sbi)->nr_discards += sbi->blocks_per_seg; - cpc->trimmed += sbi->blocks_per_seg; + __add_discard_entry(sbi, cpc, 0, sbi->blocks_per_seg); return; } @@ -489,7 +539,8 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ for (i = 0; i < entries; i++) - dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; + dmap[i] = force ? ~ckpt_map[i] : + (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); @@ -501,14 +552,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (end - start < cpc->trim_minlen) continue; - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; - new->len = end - start; - cpc->trimmed += end - start; - - list_add_tail(&new->list, head); - SM_I(sbi)->nr_discards += end - start; + __add_discard_entry(sbi, cpc, start, end); } } @@ -620,10 +664,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_set_bit(offset, se->cur_valid_map)) + if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); } else { - if (!f2fs_clear_bit(offset, se->cur_valid_map)) + if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -683,7 +727,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, /* * Calculate the number of current summary pages for writing */ -int npages_for_summary_flush(struct f2fs_sb_info *sbi) +int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) { int valid_sum_count = 0; int i, sum_in_page; @@ -691,8 +735,13 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { if (sbi->ckpt->alloc_type[i] == SSR) valid_sum_count += sbi->blocks_per_seg; - else - valid_sum_count += curseg_blkoff(sbi, i); + else { + if (for_ra) + valid_sum_count += le16_to_cpu( + F2FS_CKPT(sbi)->cur_data_blkoff[i]); + else + valid_sum_count += curseg_blkoff(sbi, i); + } } sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - @@ -751,7 +800,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi, int go_left = 0; int i; - write_lock(&free_i->segmap_lock); + spin_lock(&free_i->segmap_lock); if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, @@ -824,7 +873,7 @@ got_it: f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); __set_inuse(sbi, segno); *newseg = segno; - write_unlock(&free_i->segmap_lock); + spin_unlock(&free_i->segmap_lock); } static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) @@ -875,7 +924,7 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi, { struct seg_entry *se = get_seg_entry(sbi, seg->segno); int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); - unsigned long target_map[entries]; + unsigned long *target_map = SIT_I(sbi)->tmp_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *cur_map = (unsigned long *)se->cur_valid_map; int i, pos; @@ -975,18 +1024,22 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } +static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int old_segno; + + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); + locate_dirty_segment(sbi, old_segno); +} + void allocate_new_segments(struct f2fs_sb_info *sbi) { - struct curseg_info *curseg; - unsigned int old_curseg; int i; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { - curseg = CURSEG_I(sbi, i); - old_curseg = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); - locate_dirty_segment(sbi, old_curseg); - } + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) + __allocate_new_segments(sbi, i); } static const struct segment_allocation default_salloc_ops = { @@ -995,8 +1048,8 @@ static const struct segment_allocation default_salloc_ops = { int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { - __u64 start = range->start >> sbi->log_blocksize; - __u64 end = start + (range->len >> sbi->log_blocksize) - 1; + __u64 start = F2FS_BYTES_TO_BLK(range->start); + __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; @@ -1004,6 +1057,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) range->len < sbi->blocksize) return -EINVAL; + cpc.trimmed = 0; if (end <= MAIN_BLKADDR(sbi)) goto out; @@ -1012,15 +1066,21 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); cpc.reason = CP_DISCARD; - cpc.trim_start = start_segno; - cpc.trim_end = end_segno; - cpc.trim_minlen = range->minlen >> sbi->log_blocksize; - cpc.trimmed = 0; + cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen); /* do checkpoint to issue discard commands safely */ - write_checkpoint(sbi, &cpc); + for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { + cpc.trim_start = start_segno; + cpc.trim_end = min_t(unsigned int, rounddown(start_segno + + BATCHED_TRIM_SEGMENTS(sbi), + sbi->segs_per_sec) - 1, end_segno); + + mutex_lock(&sbi->gc_mutex); + write_checkpoint(sbi, &cpc); + mutex_unlock(&sbi->gc_mutex); + } out: - range->len = cpc.trimmed << sbi->log_blocksize; + range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return 0; } @@ -1050,8 +1110,8 @@ static int __get_segment_type_4(struct page *page, enum page_type p_type) else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(page) && !is_cold_node(page)) - return CURSEG_HOT_NODE; + if (IS_DNODE(page) && is_cold_node(page)) + return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } @@ -1097,11 +1157,18 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; + bool direct_io = (type == CURSEG_DIRECT_IO); + + type = direct_io ? CURSEG_WARM_DATA : type; curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); + /* direct_io'ed data is aligned to the segment for better performance */ + if (direct_io && curseg->next_blkoff) + __allocate_new_segments(sbi, type); + *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* @@ -1133,39 +1200,39 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, } static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, - block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, struct f2fs_io_info *fio) + struct f2fs_summary *sum, + struct f2fs_io_info *fio) { int type = __get_segment_type(page, fio->type); - allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type); + allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio); + f2fs_submit_page_mbio(sbi, page, fio); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_io_info fio = { .type = META, - .rw = WRITE_SYNC | REQ_META | REQ_PRIO + .rw = WRITE_SYNC | REQ_META | REQ_PRIO, + .blk_addr = page->index, }; set_page_writeback(page); - f2fs_submit_page_mbio(sbi, page, page->index, &fio); + f2fs_submit_page_mbio(sbi, page, &fio); } void write_node_page(struct f2fs_sb_info *sbi, struct page *page, - struct f2fs_io_info *fio, - unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr) + unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio); + do_write_page(sbi, page, &sum, fio); } void write_data_page(struct page *page, struct dnode_of_data *dn, - block_t *new_blkaddr, struct f2fs_io_info *fio) + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; @@ -1174,14 +1241,14 @@ void write_data_page(struct page *page, struct dnode_of_data *dn, f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - - do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio); + do_write_page(sbi, page, &sum, fio); + dn->data_blkaddr = fio->blk_addr; } -void rewrite_data_page(struct page *page, block_t old_blkaddr, - struct f2fs_io_info *fio) +void rewrite_data_page(struct page *page, struct f2fs_io_info *fio) { - f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio); + stat_inc_inplace_blocks(F2FS_P_SB(page)); + f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio); } void recover_data_page(struct f2fs_sb_info *sbi, @@ -1339,7 +1406,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) segno = le32_to_cpu(ckpt->cur_data_segno[type]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); - if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) + if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); @@ -1348,7 +1415,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) CURSEG_HOT_NODE]); blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - CURSEG_HOT_NODE]); - if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) + if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, type - CURSEG_HOT_NODE); else @@ -1359,7 +1426,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { - if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) { + if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { @@ -1396,12 +1463,22 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) int err; if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { + int npages = npages_for_summary_flush(sbi, true); + + if (npages >= 2) + ra_meta_pages(sbi, start_sum_block(sbi), npages, + META_CP); + /* restore for compacted data summary */ if (read_compacted_summaries(sbi)) return -EINVAL; type = CURSEG_HOT_NODE; } + if (__exist_node_summaries(sbi)) + ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type), + NR_CURSEG_TYPE - type, META_CP); + for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); if (err) @@ -1495,8 +1572,7 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) - write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); + write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, @@ -1524,17 +1600,7 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - struct sit_info *sit_i = SIT_I(sbi); - unsigned int offset = SIT_BLOCK_OFFSET(segno); - block_t blk_addr = sit_i->sit_base_addr + offset; - - check_seg_range(sbi, segno); - - /* calculate sit block address */ - if (f2fs_test_bit(offset, sit_i->sit_bitmap)) - blk_addr += sit_i->sit_blocks; - - return get_meta_page(sbi, blk_addr); + return get_meta_page(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, @@ -1687,7 +1753,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * #2, flush sit entries to sit page. */ list_for_each_entry_safe(ses, tmp, head, set_list) { - struct page *page; + struct page *page = NULL; struct f2fs_sit_block *raw_sit = NULL; unsigned int start_segno = ses->start_segno; unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, @@ -1710,7 +1776,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) { + if (cpc->reason != CP_DISCARD) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc); } @@ -1789,6 +1855,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; } + sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->tmp_map) + return -ENOMEM; + if (sbi->segs_per_sec > 1) { sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry)); @@ -1853,7 +1923,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); free_i->free_segments = 0; free_i->free_sections = 0; - rwlock_init(&free_i->segmap_lock); + spin_lock_init(&free_i->segmap_lock); return 0; } @@ -2066,6 +2136,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->nr_discards = 0; sm_info->max_discards = 0; + sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; + INIT_LIST_HEAD(&sm_info->sit_entry_set); if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { @@ -2168,6 +2240,8 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) kfree(sit_i->sentries[start].ckpt_valid_map); } } + kfree(sit_i->tmp_map); + vfree(sit_i->sentries); vfree(sit_i->sec_entries); kfree(sit_i->dirty_sentries_bitmap); @@ -2200,7 +2274,7 @@ int __init create_segment_manager_caches(void) goto fail; sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", - sizeof(struct nat_entry_set)); + sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) goto destory_discard_entry; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 2495bec1c621..7fd35111cf62 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -189,6 +189,7 @@ struct sit_info { char *sit_bitmap; /* SIT bitmap pointer */ unsigned int bitmap_size; /* SIT bitmap size */ + unsigned long *tmp_map; /* bitmap for temporal use */ unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ unsigned int dirty_sentries; /* # of dirty sentries */ unsigned int sents_per_block; /* # of SIT entries per block */ @@ -207,7 +208,7 @@ struct free_segmap_info { unsigned int start_segno; /* start segment number logically */ unsigned int free_segments; /* # of free segments */ unsigned int free_sections; /* # of free sections */ - rwlock_t segmap_lock; /* free segmap lock */ + spinlock_t segmap_lock; /* free segmap lock */ unsigned long *free_segmap; /* free segment bitmap */ unsigned long *free_secmap; /* free section bitmap */ }; @@ -318,9 +319,9 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, unsigned int max, unsigned int segno) { unsigned int ret; - read_lock(&free_i->segmap_lock); + spin_lock(&free_i->segmap_lock); ret = find_next_bit(free_i->free_segmap, max, segno); - read_unlock(&free_i->segmap_lock); + spin_unlock(&free_i->segmap_lock); return ret; } @@ -331,7 +332,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) unsigned int start_segno = secno * sbi->segs_per_sec; unsigned int next; - write_lock(&free_i->segmap_lock); + spin_lock(&free_i->segmap_lock); clear_bit(segno, free_i->free_segmap); free_i->free_segments++; @@ -340,7 +341,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) clear_bit(secno, free_i->free_secmap); free_i->free_sections++; } - write_unlock(&free_i->segmap_lock); + spin_unlock(&free_i->segmap_lock); } static inline void __set_inuse(struct f2fs_sb_info *sbi, @@ -362,7 +363,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int start_segno = secno * sbi->segs_per_sec; unsigned int next; - write_lock(&free_i->segmap_lock); + spin_lock(&free_i->segmap_lock); if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; @@ -373,7 +374,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, free_i->free_sections++; } } - write_unlock(&free_i->segmap_lock); + spin_unlock(&free_i->segmap_lock); } static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, @@ -381,13 +382,13 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int secno = segno / sbi->segs_per_sec; - write_lock(&free_i->segmap_lock); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) free_i->free_sections--; } - write_unlock(&free_i->segmap_lock); + spin_unlock(&free_i->segmap_lock); } static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, @@ -460,7 +461,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - if (unlikely(sbi->por_doing)) + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + @@ -599,13 +600,13 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) { if (segno > TOTAL_SEGS(sbi) - 1) - sbi->need_fsck = true; + set_sbi_flag(sbi, SBI_NEED_FSCK); } static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi)) - sbi->need_fsck = true; + set_sbi_flag(sbi, SBI_NEED_FSCK); } /* @@ -616,11 +617,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, { /* check segment usage */ if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) - sbi->need_fsck = true; + set_sbi_flag(sbi, SBI_NEED_FSCK); /* check boundary of a given segment number */ if (segno > TOTAL_SEGS(sbi) - 1) - sbi->need_fsck = true; + set_sbi_flag(sbi, SBI_NEED_FSCK); } #endif @@ -657,10 +658,7 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) { unsigned int block_off = SIT_BLOCK_OFFSET(start); - if (f2fs_test_bit(block_off, sit_i->sit_bitmap)) - f2fs_clear_bit(block_off, sit_i->sit_bitmap); - else - f2fs_set_bit(block_off, sit_i->sit_bitmap); + f2fs_change_bit(block_off, sit_i->sit_bitmap); } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) @@ -714,6 +712,9 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { + if (sbi->sb->s_bdi->dirty_exceeded) + return 0; + if (type == DATA) return sbi->blocks_per_seg; else if (type == NODE) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 41d6f700f4ee..f2fe666a6ea9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -30,6 +30,7 @@ #include "segment.h" #include "xattr.h" #include "gc.h" +#include "trace.h" #define CREATE_TRACE_POINTS #include <trace/events/f2fs.h> @@ -41,6 +42,7 @@ static struct kset *f2fs_kset; enum { Opt_gc_background, Opt_disable_roll_forward, + Opt_norecovery, Opt_discard, Opt_noheap, Opt_user_xattr, @@ -51,14 +53,17 @@ enum { Opt_disable_ext_identify, Opt_inline_xattr, Opt_inline_data, + Opt_inline_dentry, Opt_flush_merge, Opt_nobarrier, + Opt_fastboot, Opt_err, }; static match_table_t f2fs_tokens = { {Opt_gc_background, "background_gc=%s"}, {Opt_disable_roll_forward, "disable_roll_forward"}, + {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, {Opt_noheap, "no_heap"}, {Opt_user_xattr, "user_xattr"}, @@ -69,8 +74,10 @@ static match_table_t f2fs_tokens = { {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, + {Opt_inline_dentry, "inline_dentry"}, {Opt_flush_merge, "flush_merge"}, {Opt_nobarrier, "nobarrier"}, + {Opt_fastboot, "fastboot"}, {Opt_err, NULL}, }; @@ -188,6 +195,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); @@ -203,6 +211,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_idle), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), + ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), @@ -282,6 +291,12 @@ static int parse_options(struct super_block *sb, char *options) case Opt_disable_roll_forward: set_opt(sbi, DISABLE_ROLL_FORWARD); break; + case Opt_norecovery: + /* this option mounts f2fs with ro */ + set_opt(sbi, DISABLE_ROLL_FORWARD); + if (!f2fs_readonly(sb)) + return -EINVAL; + break; case Opt_discard: set_opt(sbi, DISCARD); break; @@ -340,12 +355,18 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_data: set_opt(sbi, INLINE_DATA); break; + case Opt_inline_dentry: + set_opt(sbi, INLINE_DENTRY); + break; case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; + case Opt_fastboot: + set_opt(sbi, FASTBOOT); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -373,6 +394,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_advise = 0; rwlock_init(&fi->ext.ext_lock); init_rwsem(&fi->i_sem); + INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); @@ -435,8 +457,13 @@ static void f2fs_put_super(struct super_block *sb) f2fs_destroy_stats(sbi); stop_gc_thread(sbi); - /* We don't need to do checkpoint when it's clean */ - if (sbi->s_dirty) { + /* + * We don't need to do checkpoint when superblock is clean. + * But, the previous checkpoint was not done by umount, it needs to do + * clean checkpoint again. + */ + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) { struct cp_control cpc = { .reason = CP_UMOUNT, }; @@ -473,15 +500,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); if (sync) { - struct cp_control cpc = { - .reason = CP_SYNC, - }; + struct cp_control cpc; + + cpc.reason = __get_cp_reason(sbi); + mutex_lock(&sbi->gc_mutex); write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } else { f2fs_balance_fs(sbi); } + f2fs_trace_ios(NULL, NULL, 1); return 0; } @@ -562,10 +591,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",disable_ext_identify"); if (test_opt(sbi, INLINE_DATA)) seq_puts(seq, ",inline_data"); + if (test_opt(sbi, INLINE_DENTRY)) + seq_puts(seq, ",inline_dentry"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) seq_puts(seq, ",nobarrier"); + if (test_opt(sbi, FASTBOOT)) + seq_puts(seq, ",fastboot"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -654,7 +687,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) f2fs_sync_fs(sb, 1); need_restart_gc = true; } - } else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) { + } else if (!sbi->gc_thread) { err = start_gc_thread(sbi); if (err) goto restore_opts; @@ -667,7 +700,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { destroy_flush_cmd_control(sbi); - } else if (test_opt(sbi, FLUSH_MERGE) && !SM_I(sbi)->cmd_control_info) { + } else if (!SM_I(sbi)->cmd_control_info) { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; @@ -872,7 +905,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); sbi->dir_level = DEF_DIR_LEVEL; - sbi->need_fsck = false; + clear_sbi_flag(sbi, SBI_NEED_FSCK); } /* @@ -922,11 +955,12 @@ retry: static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; - struct f2fs_super_block *raw_super; + struct f2fs_super_block *raw_super = NULL; struct buffer_head *raw_super_buf; struct inode *root; long err = -EINVAL; bool retry = true; + char *options = NULL; int i; try_onemore: @@ -958,9 +992,15 @@ try_onemore: set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - err = parse_options(sb, (char *)data); - if (err) + options = kstrdup((const char *)data, GFP_KERNEL); + if (data && !options) { + err = -ENOMEM; goto free_sb_buf; + } + + err = parse_options(sb, options); + if (err) + goto free_options; sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); sb->s_max_links = F2FS_LINK_MAX; @@ -983,7 +1023,7 @@ try_onemore: mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); - sbi->por_doing = false; + clear_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); init_rwsem(&sbi->read_io.io_rwsem); @@ -1004,7 +1044,7 @@ try_onemore: if (IS_ERR(sbi->meta_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_sb_buf; + goto free_options; } err = get_valid_checkpoint(sbi); @@ -1107,10 +1147,19 @@ try_onemore: goto free_proc; if (!retry) - sbi->need_fsck = true; + set_sbi_flag(sbi, SBI_NEED_FSCK); /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { + /* + * mount should be failed, when device has readonly mode, and + * previous checkpoint was not done by clean system shutdown. + */ + if (bdev_read_only(sb->s_bdev) && + !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) { + err = -EROFS; + goto free_kobj; + } err = recover_fsync_data(sbi); if (err) { f2fs_msg(sb, KERN_ERR, @@ -1123,12 +1172,13 @@ try_onemore: * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (!f2fs_readonly(sb)) { + if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) goto free_kobj; } + kfree(options); return 0; free_kobj: @@ -1153,6 +1203,8 @@ free_cp: free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); +free_options: + kfree(options); free_sb_buf: brelse(raw_super_buf); free_sbi: @@ -1173,11 +1225,18 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); } +static void kill_f2fs_super(struct super_block *sb) +{ + if (sb->s_root) + set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); + kill_block_super(sb); +} + static struct file_system_type f2fs_fs_type = { .owner = THIS_MODULE, .name = "f2fs", .mount = f2fs_mount, - .kill_sb = kill_block_super, + .kill_sb = kill_f2fs_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("f2fs"); @@ -1205,6 +1264,8 @@ static int __init init_f2fs_fs(void) { int err; + f2fs_build_trace_ios(); + err = init_inodecache(); if (err) goto fail; @@ -1214,12 +1275,9 @@ static int __init init_f2fs_fs(void) err = create_segment_manager_caches(); if (err) goto free_node_manager_caches; - err = create_gc_caches(); - if (err) - goto free_segment_manager_caches; err = create_checkpoint_caches(); if (err) - goto free_gc_caches; + goto free_segment_manager_caches; f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); if (!f2fs_kset) { err = -ENOMEM; @@ -1236,8 +1294,6 @@ free_kset: kset_unregister(f2fs_kset); free_checkpoint_caches: destroy_checkpoint_caches(); -free_gc_caches: - destroy_gc_caches(); free_segment_manager_caches: destroy_segment_manager_caches(); free_node_manager_caches: @@ -1254,11 +1310,11 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); - destroy_gc_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); kset_unregister(f2fs_kset); + f2fs_destroy_trace_ios(); } module_init(init_f2fs_fs) diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c new file mode 100644 index 000000000000..875aa8179bc1 --- /dev/null +++ b/fs/f2fs/trace.c @@ -0,0 +1,159 @@ +/* + * f2fs IO tracer + * + * Copyright (c) 2014 Motorola Mobility + * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/sched.h> +#include <linux/radix-tree.h> + +#include "f2fs.h" +#include "trace.h" + +static RADIX_TREE(pids, GFP_ATOMIC); +static spinlock_t pids_lock; +static struct last_io_info last_io; + +static inline void __print_last_io(void) +{ + if (!last_io.len) + return; + + trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n", + last_io.major, last_io.minor, + last_io.pid, "----------------", + last_io.type, + last_io.fio.rw, last_io.fio.blk_addr, + last_io.len); + memset(&last_io, 0, sizeof(last_io)); +} + +static int __file_type(struct inode *inode, pid_t pid) +{ + if (f2fs_is_atomic_file(inode)) + return __ATOMIC_FILE; + else if (f2fs_is_volatile_file(inode)) + return __VOLATILE_FILE; + else if (S_ISDIR(inode->i_mode)) + return __DIR_FILE; + else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode))) + return __NODE_FILE; + else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode))) + return __META_FILE; + else if (pid) + return __NORMAL_FILE; + else + return __MISC_FILE; +} + +void f2fs_trace_pid(struct page *page) +{ + struct inode *inode = page->mapping->host; + pid_t pid = task_pid_nr(current); + void *p; + + page->private = pid; + + if (radix_tree_preload(GFP_NOFS)) + return; + + spin_lock(&pids_lock); + p = radix_tree_lookup(&pids, pid); + if (p == current) + goto out; + if (p) + radix_tree_delete(&pids, pid); + + f2fs_radix_tree_insert(&pids, pid, current); + + trace_printk("%3x:%3x %4x %-16s\n", + MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), + pid, current->comm); +out: + spin_unlock(&pids_lock); + radix_tree_preload_end(); +} + +void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush) +{ + struct inode *inode; + pid_t pid; + int major, minor; + + if (flush) { + __print_last_io(); + return; + } + + inode = page->mapping->host; + pid = page_private(page); + + major = MAJOR(inode->i_sb->s_dev); + minor = MINOR(inode->i_sb->s_dev); + + if (last_io.major == major && last_io.minor == minor && + last_io.pid == pid && + last_io.type == __file_type(inode, pid) && + last_io.fio.rw == fio->rw && + last_io.fio.blk_addr + last_io.len == fio->blk_addr) { + last_io.len++; + return; + } + + __print_last_io(); + + last_io.major = major; + last_io.minor = minor; + last_io.pid = pid; + last_io.type = __file_type(inode, pid); + last_io.fio = *fio; + last_io.len = 1; + return; +} + +void f2fs_build_trace_ios(void) +{ + spin_lock_init(&pids_lock); +} + +#define PIDVEC_SIZE 128 +static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, + unsigned int max_items) +{ + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; + + if (unlikely(!max_items)) + return 0; + + radix_tree_for_each_slot(slot, &pids, &iter, first_index) { + results[ret] = iter.index; + if (++ret == PIDVEC_SIZE) + break; + } + return ret; +} + +void f2fs_destroy_trace_ios(void) +{ + pid_t pid[PIDVEC_SIZE]; + pid_t next_pid = 0; + unsigned int found; + + spin_lock(&pids_lock); + while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { + unsigned idx; + + next_pid = pid[found - 1] + 1; + for (idx = 0; idx < found; idx++) + radix_tree_delete(&pids, pid[idx]); + } + spin_unlock(&pids_lock); +} diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h new file mode 100644 index 000000000000..1041dbeb52ae --- /dev/null +++ b/fs/f2fs/trace.h @@ -0,0 +1,46 @@ +/* + * f2fs IO tracer + * + * Copyright (c) 2014 Motorola Mobility + * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __F2FS_TRACE_H__ +#define __F2FS_TRACE_H__ + +#ifdef CONFIG_F2FS_IO_TRACE +#include <trace/events/f2fs.h> + +enum file_type { + __NORMAL_FILE, + __DIR_FILE, + __NODE_FILE, + __META_FILE, + __ATOMIC_FILE, + __VOLATILE_FILE, + __MISC_FILE, +}; + +struct last_io_info { + int major, minor; + pid_t pid; + enum file_type type; + struct f2fs_io_info fio; + block_t len; +}; + +extern void f2fs_trace_pid(struct page *); +extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int); +extern void f2fs_build_trace_ios(void); +extern void f2fs_destroy_trace_ios(void); +#else +#define f2fs_trace_pid(p) +#define f2fs_trace_ios(p, i, n) +#define f2fs_build_trace_ios() +#define f2fs_destroy_trace_ios() + +#endif +#endif /* __F2FS_TRACE_H__ */ diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index deca8728117b..5072bf9ae0ef 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -83,7 +83,7 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, } if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_getxattr(dentry->d_inode, type, name, buffer, size); + return f2fs_getxattr(dentry->d_inode, type, name, buffer, size, NULL); } static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, @@ -398,7 +398,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } int f2fs_getxattr(struct inode *inode, int index, const char *name, - void *buffer, size_t buffer_size) + void *buffer, size_t buffer_size, struct page *ipage) { struct f2fs_xattr_entry *entry; void *base_addr; @@ -412,7 +412,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - base_addr = read_all_xattrs(inode, NULL); + base_addr = read_all_xattrs(inode, ipage); if (!base_addr) return -ENOMEM; diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 34ab7dbcf5e3..969d792ca362 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -115,7 +115,8 @@ extern const struct xattr_handler *f2fs_xattr_handlers[]; extern int f2fs_setxattr(struct inode *, int, const char *, const void *, size_t, struct page *, int); -extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t); +extern int f2fs_getxattr(struct inode *, int, const char *, void *, + size_t, struct page *); extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #else @@ -126,7 +127,8 @@ static inline int f2fs_setxattr(struct inode *inode, int index, return -EOPNOTSUPP; } static inline int f2fs_getxattr(struct inode *inode, int index, - const char *name, void *buffer, size_t buffer_size) + const char *name, void *buffer, + size_t buffer_size, struct page *dpage) { return -EOPNOTSUPP; } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 3963ede84eb0..c5d6bb939d19 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -702,10 +702,11 @@ static int fat_readdir(struct file *file, struct dir_context *ctx) } #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ -static int func(void *__buf, const char *name, int name_len, \ +static int func(struct dir_context *ctx, const char *name, int name_len, \ loff_t offset, u64 ino, unsigned int d_type) \ { \ - struct fat_ioctl_filldir_callback *buf = __buf; \ + struct fat_ioctl_filldir_callback *buf = \ + container_of(ctx, struct fat_ioctl_filldir_callback, ctx); \ struct dirent_type __user *d1 = buf->dirent; \ struct dirent_type __user *d2 = d1 + 1; \ \ diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e0c4ba39a377..64e295e8ff38 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -370,6 +370,7 @@ extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); /* fat/inode.c */ +extern int fat_block_truncate_page(struct inode *inode, loff_t from); extern void fat_attach(struct inode *inode, loff_t i_pos); extern void fat_detach(struct inode *inode); extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); diff --git a/fs/fat/file.c b/fs/fat/file.c index 85f79a89e747..8429c68e3057 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -443,6 +443,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { + error = fat_block_truncate_page(inode, attr->ia_size); + if (error) + goto out; down_write(&MSDOS_I(inode)->truncate_lock); truncate_setsize(inode, attr->ia_size); fat_truncate_blocks(inode, attr->ia_size); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 756aead10d96..497c7c5263c7 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -294,6 +294,18 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) return blocknr; } +/* + * fat_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This is required during truncate to physically zeroout the tail end + * of that block so it doesn't yield old data if the file is later grown. + * Also, avoid causing failure from fsx for cases of "data past EOF" + */ +int fat_block_truncate_page(struct inode *inode, loff_t from) +{ + return block_truncate_page(inode->i_mapping, from, fat_get_block); +} + static const struct address_space_operations fat_aops = { .readpage = fat_readpage, .readpages = fat_readpages, @@ -568,7 +580,7 @@ static void fat_set_state(struct super_block *sb, { struct buffer_head *bh; struct fat_boot_sector *b; - struct msdos_sb_info *sbi = sb->s_fs_info; + struct msdos_sb_info *sbi = MSDOS_SB(sb); /* do not change any thing if mounted read only */ if ((sb->s_flags & MS_RDONLY) && !force) diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6df8d3d885e5..b8b92c2f9683 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -736,7 +736,12 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, } alias = d_find_alias(inode); - if (alias && !vfat_d_anon_disconn(alias)) { + /* + * Checking "alias->d_parent == dentry->d_parent" to make sure + * FS is not corrupted (especially double linked dir). + */ + if (alias && alias->d_parent == dentry->d_parent && + !vfat_d_anon_disconn(alias)) { /* * This inode has non anonymous-DCACHE_DISCONNECTED * dentry. This means, the user did ->lookup() by an @@ -755,12 +760,9 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry, out: mutex_unlock(&MSDOS_SB(sb)->s_lock); - dentry->d_time = dentry->d_parent->d_inode->i_version; - dentry = d_splice_alias(inode, dentry); - if (dentry) - dentry->d_time = dentry->d_parent->d_inode->i_version; - return dentry; - + if (!inode) + dentry->d_time = dir->i_version; + return d_splice_alias(inode, dentry); error: mutex_unlock(&MSDOS_SB(sb)->s_lock); return ERR_PTR(err); @@ -793,7 +795,6 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. */ - dentry->d_time = dentry->d_parent->d_inode->i_version; d_instantiate(dentry, inode); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -824,6 +825,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); + dentry->d_time = dir->i_version; out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -849,6 +851,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC; fat_detach(inode); + dentry->d_time = dir->i_version; out: mutex_unlock(&MSDOS_SB(sb)->s_lock); @@ -889,7 +892,6 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mtime = inode->i_atime = inode->i_ctime = ts; /* timestamp is already written, so mark_inode_dirty() is unneeded. */ - dentry->d_time = dentry->d_parent->d_inode->i_version; d_instantiate(dentry, inode); mutex_unlock(&MSDOS_SB(sb)->s_lock); diff --git a/fs/fcntl.c b/fs/fcntl.c index 99d440a4a6ba..ee85cd4e136a 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -740,14 +740,15 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC | O_PATH | __O_TMPFILE + __FMODE_EXEC | O_PATH | __O_TMPFILE | + __FMODE_NONOTIFY )); fasync_cache = kmem_cache_create("fasync_cache", diff --git a/fs/file.c b/fs/file.c index ab3eb6a88239..ee738ea028fa 100644 --- a/fs/file.c +++ b/fs/file.c @@ -869,7 +869,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes) struct file *file = fget_raw(fildes); if (file) { - ret = get_unused_fd(); + ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ef9bef118342..32a8bbd7a9ad 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -53,6 +53,18 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; +/* + * If an inode is constantly having its pages dirtied, but then the + * updates stop dirtytime_expire_interval seconds in the past, it's + * possible for the worst case time between when an inode has its + * timestamps updated and when they finally get written out to be two + * dirtytime_expire_intervals. We set the default to 12 hours (in + * seconds), which means most of the time inodes will have their + * timestamps written to disk after 12 hours, but in the worst case a + * few inodes might not their timestamps updated for 24 hours. + */ +unsigned int dirtytime_expire_interval = 12 * 60 * 60; + /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -66,15 +78,21 @@ int writeback_in_progress(struct backing_dev_info *bdi) } EXPORT_SYMBOL(writeback_in_progress); -static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +struct backing_dev_info *inode_to_bdi(struct inode *inode) { - struct super_block *sb = inode->i_sb; + struct super_block *sb; - if (sb_is_blkdev_sb(sb)) - return inode->i_mapping->backing_dev_info; + if (!inode) + return &noop_backing_dev_info; + sb = inode->i_sb; +#ifdef CONFIG_BLOCK + if (sb_is_blkdev_sb(sb)) + return blk_get_backing_dev_info(I_BDEV(inode)); +#endif return sb->s_bdi; } +EXPORT_SYMBOL_GPL(inode_to_bdi); static inline struct inode *wb_inode(struct list_head *head) { @@ -247,14 +265,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) return ret; } +#define EXPIRE_DIRTY_ATIME 0x0001 + /* * Move expired (dirtied before work->older_than_this) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, + int flags, struct wb_writeback_work *work) { + unsigned long *older_than_this = NULL; + unsigned long expire_time; LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; @@ -262,13 +285,21 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; + if ((flags & EXPIRE_DIRTY_ATIME) == 0) + older_than_this = work->older_than_this; + else if (!work->for_sync) { + expire_time = jiffies - (dirtytime_expire_interval * HZ); + older_than_this = &expire_time; + } while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (work->older_than_this && - inode_dirtied_after(inode, *work->older_than_this)) + if (older_than_this && + inode_dirtied_after(inode, *older_than_this)) break; list_move(&inode->i_wb_list, &tmp); moved++; + if (flags & EXPIRE_DIRTY_ATIME) + set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -309,9 +340,12 @@ out: static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { int moved; + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); + moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, + EXPIRE_DIRTY_ATIME, work); trace_writeback_queue_io(wb, work, moved); } @@ -435,6 +469,9 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * updates after data IO completion. */ redirty_tail(inode, wb); + } else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; + list_move(&inode->i_wb_list, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ list_del_init(&inode->i_wb_list); @@ -479,14 +516,43 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * write_inode() */ spin_lock(&inode->i_lock); - /* Clear I_DIRTY_PAGES if we've written out all dirty pages */ - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) - inode->i_state &= ~I_DIRTY_PAGES; + dirty = inode->i_state & I_DIRTY; - inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); + if (inode->i_state & I_DIRTY_TIME) { + if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || + unlikely(time_after(jiffies, + (inode->dirtied_time_when + + dirtytime_expire_interval * HZ)))) { + dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; + trace_writeback_lazytime(inode); + } + } else + inode->i_state &= ~I_DIRTY_TIME_EXPIRED; + inode->i_state &= ~dirty; + + /* + * Paired with smp_mb() in __mark_inode_dirty(). This allows + * __mark_inode_dirty() to test i_state without grabbing i_lock - + * either they see the I_DIRTY bits cleared or we see the dirtied + * inode. + * + * I_DIRTY_PAGES is always cleared together above even if @mapping + * still has dirty pages. The flag is reinstated after smp_mb() if + * necessary. This guarantees that either __mark_inode_dirty() + * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. + */ + smp_mb(); + + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + inode->i_state |= I_DIRTY_PAGES; + spin_unlock(&inode->i_lock); + + if (dirty & I_DIRTY_TIME) + mark_inode_dirty_sync(inode); /* Don't write the inode if only I_DIRTY_PAGES was set */ - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; @@ -534,7 +600,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * make sure inode is on some writeback list and leave it there unless * we have completely cleaned the inode. */ - if (!(inode->i_state & I_DIRTY) && + if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; @@ -549,7 +615,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * If inode is clean, remove it from writeback lists. Otherwise don't * touch it. See comment above for explanation. */ - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) list_del_init(&inode->i_wb_list); spin_unlock(&wb->list_lock); inode_sync_complete(inode); @@ -691,7 +757,7 @@ static long writeback_sb_inodes(struct super_block *sb, wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) wrote++; requeue_inode(inode, wb, &wbc); inode_sync_complete(inode); @@ -721,9 +787,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; - if (!grab_super_passive(sb)) { + if (!trylock_super(sb)) { /* - * grab_super_passive() may fail consistently due to + * trylock_super() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ @@ -731,7 +797,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, continue; } wrote += writeback_sb_inodes(sb, wb, work); - drop_super(sb); + up_read(&sb->s_umount); /* refer to the same tests at the end of writeback_sb_inodes */ if (wrote) { @@ -1083,6 +1149,56 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) rcu_read_unlock(); } +/* + * Wake up bdi's periodically to make sure dirtytime inodes gets + * written back periodically. We deliberately do *not* check the + * b_dirtytime list in wb_has_dirty_io(), since this would cause the + * kernel to be constantly waking up once there are any dirtytime + * inodes on the system. So instead we define a separate delayed work + * function which gets called much more rarely. (By default, only + * once every 12 hours.) + * + * If there is any other write activity going on in the file system, + * this function won't be necessary. But if the only thing that has + * happened on the file system is a dirtytime inode caused by an atime + * update, we need this infrastructure below to make sure that inode + * eventually gets pushed out to disk. + */ +static void wakeup_dirtytime_writeback(struct work_struct *w); +static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); + +static void wakeup_dirtytime_writeback(struct work_struct *w) +{ + struct backing_dev_info *bdi; + + rcu_read_lock(); + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + if (list_empty(&bdi->wb.b_dirty_time)) + continue; + bdi_wakeup_thread(bdi); + } + rcu_read_unlock(); + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); +} + +static int __init start_dirtytime_writeback(void) +{ + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); + return 0; +} +__initcall(start_dirtytime_writeback); + +int dirtytime_interval_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + mod_delayed_work(system_wq, &dirtytime_work, 0); + return ret; +} + static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { @@ -1129,16 +1245,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) * page->mapping->host, so the page-dirtying time is recorded in the internal * blockdev inode. */ +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; + int dirtytime; + + trace_writeback_mark_inode_dirty(inode, flags); /* * Don't do this for I_DIRTY_PAGES - that doesn't actually * dirty the inode itself */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) { trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) @@ -1146,24 +1266,31 @@ void __mark_inode_dirty(struct inode *inode, int flags) trace_writeback_dirty_inode(inode, flags); } + if (flags & I_DIRTY_INODE) + flags &= ~I_DIRTY_TIME; + dirtytime = flags & I_DIRTY_TIME; /* - * make sure that changes are seen by all cpus before we test i_state - * -- mikulas + * Paired with smp_mb() in __writeback_single_inode() for the + * following lockless i_state test. See there for details. */ smp_mb(); - /* avoid the locking if we can */ - if ((inode->i_state & flags) == flags) + if (((inode->i_state & flags) == flags) || + (dirtytime && (inode->i_state & I_DIRTY_INODE))) return; if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); spin_lock(&inode->i_lock); + if (dirtytime && (inode->i_state & I_DIRTY_INODE)) + goto out_unlock_inode; if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + if (flags & I_DIRTY_INODE) + inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; /* @@ -1210,8 +1337,15 @@ void __mark_inode_dirty(struct inode *inode, int flags) } inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + if (dirtytime) + inode->dirtied_time_when = jiffies; + if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) + list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + else + list_move(&inode->i_wb_list, + &bdi->wb.b_dirty_time); spin_unlock(&bdi->wb.list_lock); + trace_writeback_dirty_inode_enqueue(inode); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 9368236ca100..b06c98796afb 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -1,78 +1,102 @@ #include <linux/fs.h> +#include <linux/sched.h> #include <linux/slab.h> -#include <linux/fs_pin.h> #include "internal.h" #include "mount.h" -static void pin_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct fs_pin, rcu)); -} - static DEFINE_SPINLOCK(pin_lock); -void pin_put(struct fs_pin *p) -{ - if (atomic_long_dec_and_test(&p->count)) - call_rcu(&p->rcu, pin_free_rcu); -} - void pin_remove(struct fs_pin *pin) { spin_lock(&pin_lock); hlist_del(&pin->m_list); hlist_del(&pin->s_list); spin_unlock(&pin_lock); + spin_lock_irq(&pin->wait.lock); + pin->done = 1; + wake_up_locked(&pin->wait); + spin_unlock_irq(&pin->wait.lock); } -void pin_insert(struct fs_pin *pin, struct vfsmount *m) +void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p) { spin_lock(&pin_lock); - hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); + if (p) + hlist_add_head(&pin->s_list, p); hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); spin_unlock(&pin_lock); } +void pin_insert(struct fs_pin *pin, struct vfsmount *m) +{ + pin_insert_group(pin, m, &m->mnt_sb->s_pins); +} + +void pin_kill(struct fs_pin *p) +{ + wait_queue_t wait; + + if (!p) { + rcu_read_unlock(); + return; + } + init_wait(&wait); + spin_lock_irq(&p->wait.lock); + if (likely(!p->done)) { + p->done = -1; + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + p->kill(p); + return; + } + if (p->done > 0) { + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + return; + } + __add_wait_queue(&p->wait, &wait); + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + if (likely(list_empty(&wait.task_list))) + break; + /* OK, we know p couldn't have been freed yet */ + spin_lock_irq(&p->wait.lock); + if (p->done > 0) { + spin_unlock_irq(&p->wait.lock); + break; + } + } + rcu_read_unlock(); +} + void mnt_pin_kill(struct mount *m) { while (1) { struct hlist_node *p; - struct fs_pin *pin; rcu_read_lock(); p = ACCESS_ONCE(m->mnt_pins.first); if (!p) { rcu_read_unlock(); break; } - pin = hlist_entry(p, struct fs_pin, m_list); - if (!atomic_long_inc_not_zero(&pin->count)) { - rcu_read_unlock(); - cpu_relax(); - continue; - } - rcu_read_unlock(); - pin->kill(pin); + pin_kill(hlist_entry(p, struct fs_pin, m_list)); } } -void sb_pin_kill(struct super_block *sb) +void group_pin_kill(struct hlist_head *p) { while (1) { - struct hlist_node *p; - struct fs_pin *pin; + struct hlist_node *q; rcu_read_lock(); - p = ACCESS_ONCE(sb->s_pins.first); - if (!p) { + q = ACCESS_ONCE(p->first); + if (!q) { rcu_read_unlock(); break; } - pin = hlist_entry(p, struct fs_pin, s_list); - if (!atomic_long_inc_not_zero(&pin->count)) { - rcu_read_unlock(); - cpu_relax(); - continue; - } - rcu_read_unlock(); - pin->kill(pin); + pin_kill(hlist_entry(q, struct fs_pin, s_list)); } } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 966ace8b243f..28d0c7abba1c 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -415,7 +415,7 @@ err_unlock: err_region: unregister_chrdev_region(devt, 1); err: - fuse_conn_kill(fc); + fuse_abort_conn(fc); goto out; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ca887314aba9..39706c57ad3c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -131,6 +131,13 @@ static void fuse_req_init_context(struct fuse_req *req) req->in.h.pid = current->pid; } +void fuse_set_initialized(struct fuse_conn *fc) +{ + /* Make sure stores before this are seen on another CPU */ + smp_wmb(); + fc->initialized = 1; +} + static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { return !fc->initialized || (for_background && fc->blocked); @@ -155,6 +162,8 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (intr) goto out; } + /* Matches smp_wmb() in fuse_set_initialized() */ + smp_rmb(); err = -ENOTCONN; if (!fc->connected) @@ -253,6 +262,8 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, atomic_inc(&fc->num_waiting); wait_event(fc->blocked_waitq, fc->initialized); + /* Matches smp_wmb() in fuse_set_initialized() */ + smp_rmb(); req = fuse_request_alloc(0); if (!req) req = get_reserved_req(fc, file); @@ -511,6 +522,71 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) } EXPORT_SYMBOL_GPL(fuse_request_send); +static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) +{ + if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS) + args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE; + + if (fc->minor < 9) { + switch (args->in.h.opcode) { + case FUSE_LOOKUP: + case FUSE_CREATE: + case FUSE_MKNOD: + case FUSE_MKDIR: + case FUSE_SYMLINK: + case FUSE_LINK: + args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + break; + case FUSE_GETATTR: + case FUSE_SETATTR: + args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + break; + } + } + if (fc->minor < 12) { + switch (args->in.h.opcode) { + case FUSE_CREATE: + args->in.args[0].size = sizeof(struct fuse_open_in); + break; + case FUSE_MKNOD: + args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; + break; + } + } +} + +ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) +{ + struct fuse_req *req; + ssize_t ret; + + req = fuse_get_req(fc, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* Needs to be done after fuse_get_req() so that fc->minor is valid */ + fuse_adjust_compat(fc, args); + + req->in.h.opcode = args->in.h.opcode; + req->in.h.nodeid = args->in.h.nodeid; + req->in.numargs = args->in.numargs; + memcpy(req->in.args, args->in.args, + args->in.numargs * sizeof(struct fuse_in_arg)); + req->out.argvar = args->out.argvar; + req->out.numargs = args->out.numargs; + memcpy(req->out.args, args->out.args, + args->out.numargs * sizeof(struct fuse_arg)); + fuse_request_send(fc, req); + ret = req->out.h.error; + if (!ret && args->out.argvar) { + BUG_ON(args->out.numargs != 1); + ret = req->out.args[0].size; + } + fuse_put_request(fc, req); + + return ret; +} + static void fuse_request_send_nowait_locked(struct fuse_conn *fc, struct fuse_req *req) { @@ -814,8 +890,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newpage = buf->page; - if (WARN_ON(!PageUptodate(newpage))) - return -EIO; + if (!PageUptodate(newpage)) + SetPageUptodate(newpage); ClearPageMappedToDisk(newpage); @@ -1277,6 +1353,17 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, return err; } +static int fuse_dev_open(struct inode *inode, struct file *file) +{ + /* + * The fuse device's file's private_data is used to hold + * the fuse_conn(ection) when it is mounted, and is used to + * keep track of whether the file has been mounted already. + */ + file->private_data = NULL; + return 0; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -1721,6 +1808,9 @@ copy_finish: static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { + /* Don't try to move pages (yet) */ + cs->move_pages = 0; + switch (code) { case FUSE_NOTIFY_POLL: return fuse_notify_poll(fc, size, cs); @@ -2098,7 +2188,7 @@ void fuse_abort_conn(struct fuse_conn *fc) if (fc->connected) { fc->connected = 0; fc->blocked = 0; - fc->initialized = 1; + fuse_set_initialized(fc); end_io_requests(fc); end_queued_requests(fc); end_polls(fc); @@ -2117,7 +2207,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) spin_lock(&fc->lock); fc->connected = 0; fc->blocked = 0; - fc->initialized = 1; + fuse_set_initialized(fc); end_queued_requests(fc); end_polls(fc); wake_up_all(&fc->blocked_waitq); @@ -2141,6 +2231,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, + .open = fuse_dev_open, .llseek = no_llseek, .read = do_sync_read, .aio_read = fuse_dev_read, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index dbab798f5caf..1545b711ddcf 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -145,22 +145,19 @@ static void fuse_invalidate_entry(struct dentry *entry) fuse_invalidate_entry_cache(entry); } -static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req, +static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, u64 nodeid, struct qstr *name, struct fuse_entry_out *outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); - req->in.h.opcode = FUSE_LOOKUP; - req->in.h.nodeid = nodeid; - req->in.numargs = 1; - req->in.args[0].size = name->len + 1; - req->in.args[0].value = name->name; - req->out.numargs = 1; - if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; - else - req->out.args[0].size = sizeof(struct fuse_entry_out); - req->out.args[0].value = outarg; + args->in.h.opcode = FUSE_LOOKUP; + args->in.h.nodeid = nodeid; + args->in.numargs = 1; + args->in.args[0].size = name->len + 1; + args->in.args[0].value = name->name; + args->out.numargs = 1; + args->out.args[0].size = sizeof(struct fuse_entry_out); + args->out.args[0].value = outarg; } u64 fuse_get_attr_version(struct fuse_conn *fc) @@ -200,9 +197,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || (flags & LOOKUP_REVAL)) { - int err; struct fuse_entry_out outarg; - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; @@ -215,31 +211,23 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) goto out; fc = get_fuse_conn(inode); - req = fuse_get_req_nopages(fc); - ret = PTR_ERR(req); - if (IS_ERR(req)) - goto out; forget = fuse_alloc_forget(); - if (!forget) { - fuse_put_request(fc, req); - ret = -ENOMEM; + ret = -ENOMEM; + if (!forget) goto out; - } attr_version = fuse_get_attr_version(fc); parent = dget_parent(entry); - fuse_lookup_init(fc, req, get_node_id(parent->d_inode), + fuse_lookup_init(fc, &args, get_node_id(parent->d_inode), &entry->d_name, &outarg); - fuse_request_send(fc, req); + ret = fuse_simple_request(fc, &args); dput(parent); - err = req->out.h.error; - fuse_put_request(fc, req); /* Zero nodeid is same as -ENOENT */ - if (!err && !outarg.nodeid) - err = -ENOENT; - if (!err) { + if (!ret && !outarg.nodeid) + ret = -ENOENT; + if (!ret) { fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { fuse_queue_forget(fc, forget, outarg.nodeid, 1); @@ -250,7 +238,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) spin_unlock(&fc->lock); } kfree(forget); - if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + if (ret == -ENOMEM) + goto out; + if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) goto invalid; fuse_change_attributes(inode, &outarg.attr, @@ -296,7 +286,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { struct fuse_conn *fc = get_fuse_conn_super(sb); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; int err; @@ -306,24 +296,16 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, if (name->len > FUSE_NAME_MAX) goto out; - req = fuse_get_req_nopages(fc); - err = PTR_ERR(req); - if (IS_ERR(req)) - goto out; forget = fuse_alloc_forget(); err = -ENOMEM; - if (!forget) { - fuse_put_request(fc, req); + if (!forget) goto out; - } attr_version = fuse_get_attr_version(fc); - fuse_lookup_init(fc, req, nodeid, name, outarg); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + fuse_lookup_init(fc, &args, nodeid, name, outarg); + err = fuse_simple_request(fc, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; @@ -372,7 +354,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, if (inode && get_node_id(inode) == FUSE_ROOT_ID) goto out_iput; - newent = d_materialise_unique(entry, inode); + newent = d_splice_alias(inode, entry); err = PTR_ERR(newent); if (IS_ERR(newent)) goto out_err; @@ -405,7 +387,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int err; struct inode *inode; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; struct fuse_open_out outopen; @@ -420,15 +402,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!forget) goto out_err; - req = fuse_get_req_nopages(fc); - err = PTR_ERR(req); - if (IS_ERR(req)) - goto out_put_forget_req; - err = -ENOMEM; ff = fuse_file_alloc(fc); if (!ff) - goto out_put_request; + goto out_put_forget_req; if (!fc->dont_mask) mode &= ~current_umask(); @@ -439,24 +416,19 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); - req->in.h.opcode = FUSE_CREATE; - req->in.h.nodeid = get_node_id(dir); - req->in.numargs = 2; - req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) : - sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = entry->d_name.len + 1; - req->in.args[1].value = entry->d_name.name; - req->out.numargs = 2; - if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; - else - req->out.args[0].size = sizeof(outentry); - req->out.args[0].value = &outentry; - req->out.args[1].size = sizeof(outopen); - req->out.args[1].value = &outopen; - fuse_request_send(fc, req); - err = req->out.h.error; + args.in.h.opcode = FUSE_CREATE; + args.in.h.nodeid = get_node_id(dir); + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = entry->d_name.len + 1; + args.in.args[1].value = entry->d_name.name; + args.out.numargs = 2; + args.out.args[0].size = sizeof(outentry); + args.out.args[0].value = &outentry; + args.out.args[1].size = sizeof(outopen); + args.out.args[1].value = &outopen; + err = fuse_simple_request(fc, &args); if (err) goto out_free_ff; @@ -464,7 +436,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) goto out_free_ff; - fuse_put_request(fc, req); ff->fh = outopen.fh; ff->nodeid = outentry.nodeid; ff->open_flags = outopen.open_flags; @@ -492,8 +463,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, out_free_ff: fuse_file_free(ff); -out_put_request: - fuse_put_request(fc, req); out_put_forget_req: kfree(forget); out_err: @@ -547,7 +516,7 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, +static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -557,22 +526,15 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, struct fuse_forget_link *forget; forget = fuse_alloc_forget(); - if (!forget) { - fuse_put_request(fc, req); + if (!forget) return -ENOMEM; - } memset(&outarg, 0, sizeof(outarg)); - req->in.h.nodeid = get_node_id(dir); - req->out.numargs = 1; - if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; - else - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args->in.h.nodeid = get_node_id(dir); + args->out.numargs = 1; + args->out.args[0].size = sizeof(outarg); + args->out.args[0].value = &outarg; + err = fuse_simple_request(fc, args); if (err) goto out_put_forget_req; @@ -609,9 +571,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); if (!fc->dont_mask) mode &= ~current_umask(); @@ -620,14 +580,13 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, inarg.mode = mode; inarg.rdev = new_encode_dev(rdev); inarg.umask = current_umask(); - req->in.h.opcode = FUSE_MKNOD; - req->in.numargs = 2; - req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE : - sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = entry->d_name.len + 1; - req->in.args[1].value = entry->d_name.name; - return create_new_entry(fc, req, dir, entry, mode); + args.in.h.opcode = FUSE_MKNOD; + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = entry->d_name.len + 1; + args.in.args[1].value = entry->d_name.name; + return create_new_entry(fc, &args, dir, entry, mode); } static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, @@ -640,9 +599,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); if (!fc->dont_mask) mode &= ~current_umask(); @@ -650,13 +607,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.umask = current_umask(); - req->in.h.opcode = FUSE_MKDIR; - req->in.numargs = 2; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = entry->d_name.len + 1; - req->in.args[1].value = entry->d_name.name; - return create_new_entry(fc, req, dir, entry, S_IFDIR); + args.in.h.opcode = FUSE_MKDIR; + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = entry->d_name.len + 1; + args.in.args[1].value = entry->d_name.name; + return create_new_entry(fc, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct inode *dir, struct dentry *entry, @@ -664,17 +621,15 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); - req->in.h.opcode = FUSE_SYMLINK; - req->in.numargs = 2; - req->in.args[0].size = entry->d_name.len + 1; - req->in.args[0].value = entry->d_name.name; - req->in.args[1].size = len; - req->in.args[1].value = link; - return create_new_entry(fc, req, dir, entry, S_IFLNK); + args.in.h.opcode = FUSE_SYMLINK; + args.in.numargs = 2; + args.in.args[0].size = entry->d_name.len + 1; + args.in.args[0].value = entry->d_name.name; + args.in.args[1].size = len; + args.in.args[1].value = link; + return create_new_entry(fc, &args, dir, entry, S_IFLNK); } static inline void fuse_update_ctime(struct inode *inode) @@ -689,18 +644,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->in.h.opcode = FUSE_UNLINK; - req->in.h.nodeid = get_node_id(dir); - req->in.numargs = 1; - req->in.args[0].size = entry->d_name.len + 1; - req->in.args[0].value = entry->d_name.name; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + FUSE_ARGS(args); + + args.in.h.opcode = FUSE_UNLINK; + args.in.h.nodeid = get_node_id(dir); + args.in.numargs = 1; + args.in.args[0].size = entry->d_name.len + 1; + args.in.args[0].value = entry->d_name.name; + err = fuse_simple_request(fc, &args); if (!err) { struct inode *inode = entry->d_inode; struct fuse_inode *fi = get_fuse_inode(inode); @@ -729,18 +680,14 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->in.h.opcode = FUSE_RMDIR; - req->in.h.nodeid = get_node_id(dir); - req->in.numargs = 1; - req->in.args[0].size = entry->d_name.len + 1; - req->in.args[0].value = entry->d_name.name; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + FUSE_ARGS(args); + + args.in.h.opcode = FUSE_RMDIR; + args.in.h.nodeid = get_node_id(dir); + args.in.numargs = 1; + args.in.args[0].size = entry->d_name.len + 1; + args.in.args[0].value = entry->d_name.name; + err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(entry->d_inode); fuse_invalidate_attr(dir); @@ -757,27 +704,21 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req; - - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); inarg.flags = flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = get_node_id(olddir); - req->in.numargs = 3; - req->in.args[0].size = argsize; - req->in.args[0].value = &inarg; - req->in.args[1].size = oldent->d_name.len + 1; - req->in.args[1].value = oldent->d_name.name; - req->in.args[2].size = newent->d_name.len + 1; - req->in.args[2].value = newent->d_name.name; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = opcode; + args.in.h.nodeid = get_node_id(olddir); + args.in.numargs = 3; + args.in.args[0].size = argsize; + args.in.args[0].value = &inarg; + args.in.args[1].size = oldent->d_name.len + 1; + args.in.args[1].value = oldent->d_name.name; + args.in.args[2].size = newent->d_name.len + 1; + args.in.args[2].value = newent->d_name.name; + err = fuse_simple_request(fc, &args); if (!err) { /* ctime changes */ fuse_invalidate_attr(oldent->d_inode); @@ -849,19 +790,17 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); - req->in.h.opcode = FUSE_LINK; - req->in.numargs = 2; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = newent->d_name.len + 1; - req->in.args[1].value = newent->d_name.name; - err = create_new_entry(fc, req, newdir, newent, inode->i_mode); + args.in.h.opcode = FUSE_LINK; + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = newent->d_name.len + 1; + args.in.args[1].value = newent->d_name.name; + err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); /* Contrary to "normal" filesystems it can happen that link makes two "logical" inodes point to the same "physical" inode. We invalidate the attributes of the old one, so it @@ -929,13 +868,9 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, struct fuse_getattr_in inarg; struct fuse_attr_out outarg; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); u64 attr_version; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - attr_version = fuse_get_attr_version(fc); memset(&inarg, 0, sizeof(inarg)); @@ -947,20 +882,15 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } - req->in.h.opcode = FUSE_GETATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; - else - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_GETATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); if (!err) { if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { make_bad_inode(inode); @@ -1041,7 +971,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, err = -EBUSY; goto badentry; } - if (S_ISDIR(entry->d_inode->i_mode)) { + if (d_is_dir(entry)) { shrink_dcache_parent(entry); if (!simple_empty(entry)) { err = -ENOTEMPTY; @@ -1102,7 +1032,7 @@ int fuse_allow_current_process(struct fuse_conn *fc) static int fuse_access(struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_access_in inarg; int err; @@ -1111,20 +1041,14 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); - req->in.h.opcode = FUSE_ACCESS; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_ACCESS; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_access = 1; err = 0; @@ -1320,7 +1244,7 @@ static int fuse_direntplus_link(struct file *file, if (!inode) goto out; - alias = d_materialise_unique(dentry, inode); + alias = d_splice_alias(inode, dentry); err = PTR_ERR(alias); if (IS_ERR(alias)) goto out; @@ -1445,31 +1369,27 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_req_nopages(fc); + FUSE_ARGS(args); char *link; - - if (IS_ERR(req)) - return ERR_CAST(req); + ssize_t ret; link = (char *) __get_free_page(GFP_KERNEL); - if (!link) { - link = ERR_PTR(-ENOMEM); - goto out; - } - req->in.h.opcode = FUSE_READLINK; - req->in.h.nodeid = get_node_id(inode); - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = PAGE_SIZE - 1; - req->out.args[0].value = link; - fuse_request_send(fc, req); - if (req->out.h.error) { + if (!link) + return ERR_PTR(-ENOMEM); + + args.in.h.opcode = FUSE_READLINK; + args.in.h.nodeid = get_node_id(inode); + args.out.argvar = 1; + args.out.numargs = 1; + args.out.args[0].size = PAGE_SIZE - 1; + args.out.args[0].value = link; + ret = fuse_simple_request(fc, &args); + if (ret < 0) { free_page((unsigned long) link); - link = ERR_PTR(req->out.h.error); - } else - link[req->out.args[0].size] = '\0'; - out: - fuse_put_request(fc, req); + link = ERR_PTR(ret); + } else { + link[ret] = '\0'; + } fuse_invalidate_atime(inode); return link; } @@ -1629,22 +1549,19 @@ void fuse_release_nowrite(struct inode *inode) spin_unlock(&fc->lock); } -static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, +static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, struct inode *inode, struct fuse_setattr_in *inarg_p, struct fuse_attr_out *outarg_p) { - req->in.h.opcode = FUSE_SETATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(*inarg_p); - req->in.args[0].value = inarg_p; - req->out.numargs = 1; - if (fc->minor < 9) - req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; - else - req->out.args[0].size = sizeof(*outarg_p); - req->out.args[0].value = outarg_p; + args->in.h.opcode = FUSE_SETATTR; + args->in.h.nodeid = get_node_id(inode); + args->in.numargs = 1; + args->in.args[0].size = sizeof(*inarg_p); + args->in.args[0].value = inarg_p; + args->out.numargs = 1; + args->out.args[0].size = sizeof(*outarg_p); + args->out.args[0].value = outarg_p; } /* @@ -1653,14 +1570,9 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; - int err; - - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); @@ -1677,12 +1589,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } - fuse_setattr_fill(fc, req, inode, &inarg, &outarg); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); - return err; + return fuse_simple_request(fc, &args); } /* @@ -1698,7 +1607,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; @@ -1723,10 +1632,6 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (attr->ia_valid & ATTR_SIZE) is_truncate = true; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); @@ -1747,10 +1652,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); } - fuse_setattr_fill(fc, req, inode, &inarg, &outarg); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); + err = fuse_simple_request(fc, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); @@ -1837,32 +1740,26 @@ static int fuse_setxattr(struct dentry *entry, const char *name, { struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_setxattr_in inarg; int err; if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; - req->in.h.opcode = FUSE_SETXATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 3; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = strlen(name) + 1; - req->in.args[1].value = name; - req->in.args[2].size = size; - req->in.args[2].value = value; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_SETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 3; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; + args.in.args[2].size = size; + args.in.args[2].value = value; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_setxattr = 1; err = -EOPNOTSUPP; @@ -1879,7 +1776,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, { struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; @@ -1887,40 +1784,32 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - req->in.h.opcode = FUSE_GETXATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 2; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->in.args[1].size = strlen(name) + 1; - req->in.args[1].value = name; + args.in.h.opcode = FUSE_GETXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 2; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.in.args[1].size = strlen(name) + 1; + args.in.args[1].value = name; /* This is really two different operations rolled into one */ - req->out.numargs = 1; + args.out.numargs = 1; if (size) { - req->out.argvar = 1; - req->out.args[0].size = size; - req->out.args[0].value = value; + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = value; } else { - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; } - fuse_request_send(fc, req); - ret = req->out.h.error; - if (!ret) - ret = size ? req->out.args[0].size : outarg.size; - else { - if (ret == -ENOSYS) { - fc->no_getxattr = 1; - ret = -EOPNOTSUPP; - } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = outarg.size; + if (ret == -ENOSYS) { + fc->no_getxattr = 1; + ret = -EOPNOTSUPP; } - fuse_put_request(fc, req); return ret; } @@ -1928,7 +1817,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; @@ -1939,38 +1828,30 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - req->in.h.opcode = FUSE_LISTXATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; + args.in.h.opcode = FUSE_LISTXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; /* This is really two different operations rolled into one */ - req->out.numargs = 1; + args.out.numargs = 1; if (size) { - req->out.argvar = 1; - req->out.args[0].size = size; - req->out.args[0].value = list; + args.out.argvar = 1; + args.out.args[0].size = size; + args.out.args[0].value = list; } else { - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; } - fuse_request_send(fc, req); - ret = req->out.h.error; - if (!ret) - ret = size ? req->out.args[0].size : outarg.size; - else { - if (ret == -ENOSYS) { - fc->no_listxattr = 1; - ret = -EOPNOTSUPP; - } + ret = fuse_simple_request(fc, &args); + if (!ret && !size) + ret = outarg.size; + if (ret == -ENOSYS) { + fc->no_listxattr = 1; + ret = -EOPNOTSUPP; } - fuse_put_request(fc, req); return ret; } @@ -1978,24 +1859,18 @@ static int fuse_removexattr(struct dentry *entry, const char *name) { struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); int err; if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->in.h.opcode = FUSE_REMOVEXATTR; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = strlen(name) + 1; - req->in.args[0].value = name; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_REMOVEXATTR; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = strlen(name) + 1; + args.in.args[0].value = name; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_removexattr = 1; err = -EOPNOTSUPP; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index caa8d95b24e8..c01ec3bdcfd8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -24,30 +24,22 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { struct fuse_open_in inarg; - struct fuse_req *req; - int err; - - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if (!fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; - req->in.h.opcode = opcode; - req->in.h.nodeid = nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - req->out.args[0].size = sizeof(*outargp); - req->out.args[0].value = outargp; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = opcode; + args.in.h.nodeid = nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(*outargp); + args.out.args[0].value = outargp; - return err; + return fuse_simple_request(fc, &args); } struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) @@ -89,37 +81,9 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_async(struct work_struct *work) -{ - struct fuse_req *req; - struct fuse_conn *fc; - struct path path; - - req = container_of(work, struct fuse_req, misc.release.work); - path = req->misc.release.path; - fc = get_fuse_conn(path.dentry->d_inode); - - fuse_put_request(fc, req); - path_put(&path); -} - static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - if (fc->destroy_req) { - /* - * If this is a fuseblk mount, then it's possible that - * releasing the path will result in releasing the - * super block and sending the DESTROY request. If - * the server is single threaded, this would hang. - * For this reason do the path_put() in a separate - * thread. - */ - atomic_inc(&req->count); - INIT_WORK(&req->misc.release.work, fuse_release_async); - schedule_work(&req->misc.release.work); - } else { - path_put(&req->misc.release.path); - } + iput(req->misc.release.inode); } static void fuse_file_put(struct fuse_file *ff, bool sync) @@ -133,12 +97,12 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) * implement 'open' */ req->background = 0; - path_put(&req->misc.release.path); + iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else if (sync) { req->background = 0; fuse_request_send(ff->fc, req); - path_put(&req->misc.release.path); + iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else { req->end = fuse_release_end; @@ -297,9 +261,8 @@ void fuse_release_common(struct file *file, int opcode) inarg->lock_owner = fuse_lock_owner_id(ff->fc, (fl_owner_t) file); } - /* Hold vfsmount and dentry until release is finished */ - path_get(&file->f_path); - req->misc.release.path = file->f_path; + /* Hold inode until release is finished */ + req->misc.release.inode = igrab(file_inode(file)); /* * Normally this will send the RELEASE request, however if @@ -480,7 +443,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, struct inode *inode = file->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_fsync_in inarg; int err; @@ -506,23 +469,15 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) goto out; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) { - err = PTR_ERR(req); - goto out; - } - memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.fsync_flags = datasync ? 1 : 0; - req->in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { if (isdir) fc->no_fsyncdir = 1; @@ -1204,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ -1509,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) { struct inode *inode = req->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; list_del(&req->writepages_entry); @@ -1703,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page) req->end = fuse_writepage_end; req->inode = inode; - inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); @@ -1813,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || old_req->state == FUSE_REQ_PENDING)) { - struct backing_dev_info *bdi = page->mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); copy_highpage(old_req->pages[0], page); spin_unlock(&fc->lock); @@ -1917,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].offset = 0; req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; @@ -1988,7 +1943,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_CACHE_SHIFT; - struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode); + struct fuse_conn *fc = get_fuse_conn(file_inode(file)); struct page *page; loff_t fsize; int err = -ENOMEM; @@ -2107,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -2156,49 +2110,44 @@ static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, return 0; } -static void fuse_lk_fill(struct fuse_req *req, struct file *file, +static void fuse_lk_fill(struct fuse_args *args, struct file *file, const struct file_lock *fl, int opcode, pid_t pid, - int flock) + int flock, struct fuse_lk_in *inarg) { struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; - struct fuse_lk_in *arg = &req->misc.lk_in; - - arg->fh = ff->fh; - arg->owner = fuse_lock_owner_id(fc, fl->fl_owner); - arg->lk.start = fl->fl_start; - arg->lk.end = fl->fl_end; - arg->lk.type = fl->fl_type; - arg->lk.pid = pid; + + memset(inarg, 0, sizeof(*inarg)); + inarg->fh = ff->fh; + inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + inarg->lk.start = fl->fl_start; + inarg->lk.end = fl->fl_end; + inarg->lk.type = fl->fl_type; + inarg->lk.pid = pid; if (flock) - arg->lk_flags |= FUSE_LK_FLOCK; - req->in.h.opcode = opcode; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(*arg); - req->in.args[0].value = arg; + inarg->lk_flags |= FUSE_LK_FLOCK; + args->in.h.opcode = opcode; + args->in.h.nodeid = get_node_id(inode); + args->in.numargs = 1; + args->in.args[0].size = sizeof(*inarg); + args->in.args[0].value = inarg; } static int fuse_getlk(struct file *file, struct file_lock *fl) { struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); + struct fuse_lk_in inarg; struct fuse_lk_out outarg; int err; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - - fuse_lk_fill(req, file, fl, FUSE_GETLK, 0, 0); - req->out.numargs = 1; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); if (!err) err = convert_fuse_file_lock(&outarg.lk, fl); @@ -2209,7 +2158,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) { struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); + struct fuse_lk_in inarg; int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; int err; @@ -2223,17 +2173,13 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) if (fl->fl_flags & FL_CLOSE) return 0; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg); + err = fuse_simple_request(fc, &args); - fuse_lk_fill(req, file, fl, opcode, pid, flock); - fuse_request_send(fc, req); - err = req->out.h.error; /* locking is restartable */ if (err == -EINTR) err = -ERESTARTSYS; - fuse_put_request(fc, req); + return err; } @@ -2283,7 +2229,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_bmap_in inarg; struct fuse_bmap_out outarg; int err; @@ -2291,24 +2237,18 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) if (!inode->i_sb->s_bdev || fc->no_bmap) return 0; - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return 0; - memset(&inarg, 0, sizeof(inarg)); inarg.block = block; inarg.blocksize = inode->i_sb->s_blocksize; - req->in.h.opcode = FUSE_BMAP; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_BMAP; + args.in.h.nodeid = get_node_id(inode); + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) fc->no_bmap = 1; @@ -2776,7 +2716,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) struct fuse_conn *fc = ff->fc; struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; struct fuse_poll_out outarg; - struct fuse_req *req; + FUSE_ARGS(args); int err; if (fc->no_poll) @@ -2794,21 +2734,15 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) fuse_register_polled_file(fc, ff); } - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return POLLERR; - - req->in.h.opcode = FUSE_POLL; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - req->out.numargs = 1; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.in.h.opcode = FUSE_POLL; + args.in.h.nodeid = ff->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); if (!err) return outarg.revents; @@ -2949,10 +2883,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct fuse_file *ff = file->private_data; - struct inode *inode = file->f_inode; + struct inode *inode = file_inode(file); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = ff->fc; - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_fallocate_in inarg = { .fh = ff->fh, .offset = offset, @@ -2985,25 +2919,16 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) { - err = PTR_ERR(req); - goto out; - } - - req->in.h.opcode = FUSE_FALLOCATE; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - fuse_request_send(fc, req); - err = req->out.h.error; + args.in.h.opcode = FUSE_FALLOCATE; + args.in.h.nodeid = ff->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_fallocate = 1; err = -EOPNOTSUPP; } - fuse_put_request(fc, req); - if (err) goto out; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e8e47a6ab518..1cdfb07c1376 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -213,7 +213,7 @@ struct fuse_out { unsigned numargs; /** Array of arguments */ - struct fuse_arg args[3]; + struct fuse_arg args[2]; }; /** FUSE page descriptor */ @@ -222,6 +222,25 @@ struct fuse_page_desc { unsigned int offset; }; +struct fuse_args { + struct { + struct { + uint32_t opcode; + uint64_t nodeid; + } h; + unsigned numargs; + struct fuse_in_arg args[3]; + + } in; + struct { + unsigned argvar:1; + unsigned numargs; + struct fuse_arg args[2]; + } out; +}; + +#define FUSE_ARGS(args) struct fuse_args args = {} + /** The request state */ enum fuse_req_state { FUSE_REQ_INIT = 0, @@ -305,11 +324,8 @@ struct fuse_req { /** Data for asynchronous requests */ union { struct { - union { - struct fuse_release_in in; - struct work_struct work; - }; - struct path path; + struct fuse_release_in in; + struct inode *inode; } release; struct fuse_init_in init_in; struct fuse_init_out init_out; @@ -324,7 +340,6 @@ struct fuse_req { struct fuse_req *next; } write; struct fuse_notify_retrieve_in retrieve_in; - struct fuse_lk_in lk_in; } misc; /** page vector */ @@ -754,15 +769,6 @@ struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, void __fuse_get_request(struct fuse_req *req); /** - * Get a request, may fail with -ENOMEM, - * useful for callers who doesn't use req->pages[] - */ -static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc) -{ - return fuse_get_req(fc, 0); -} - -/** * Gets a requests for a file operation, always succeeds */ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, @@ -780,6 +786,11 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); /** + * Simple request sending that does request allocation and freeing + */ +ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); + +/** * Send a request in the background */ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); @@ -804,8 +815,6 @@ void fuse_invalidate_atime(struct inode *inode); */ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); -void fuse_conn_kill(struct fuse_conn *fc); - /** * Initialize fuse_conn */ @@ -897,4 +906,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file); +void fuse_set_initialized(struct fuse_conn *fc); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 03246cd9d47a..e8799c11424b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; - inode->i_data.backing_dev_info = &fc->bdi; fuse_init_inode(inode, attr); unlock_new_inode(inode); } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { @@ -376,28 +375,13 @@ static void fuse_bdi_destroy(struct fuse_conn *fc) bdi_destroy(&fc->bdi); } -void fuse_conn_kill(struct fuse_conn *fc) -{ - spin_lock(&fc->lock); - fc->connected = 0; - fc->blocked = 0; - fc->initialized = 1; - spin_unlock(&fc->lock); - /* Flush all readers on this fs */ - kill_fasync(&fc->fasync, SIGIO, POLL_IN); - wake_up_all(&fc->waitq); - wake_up_all(&fc->blocked_waitq); - wake_up_all(&fc->reserved_req_waitq); -} -EXPORT_SYMBOL_GPL(fuse_conn_kill); - static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); fuse_send_destroy(fc); - fuse_conn_kill(fc); + fuse_abort_conn(fc); mutex_lock(&fuse_mutex); list_del(&fc->entry); fuse_ctl_remove_conn(fc); @@ -425,7 +409,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - struct fuse_req *req; + FUSE_ARGS(args); struct fuse_statfs_out outarg; int err; @@ -434,23 +418,16 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - memset(&outarg, 0, sizeof(outarg)); - req->in.numargs = 0; - req->in.h.opcode = FUSE_STATFS; - req->in.h.nodeid = get_node_id(dentry->d_inode); - req->out.numargs = 1; - req->out.args[0].size = - fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); - req->out.args[0].value = &outarg; - fuse_request_send(fc, req); - err = req->out.h.error; + args.in.numargs = 0; + args.in.h.opcode = FUSE_STATFS; + args.in.h.nodeid = get_node_id(dentry->d_inode); + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); - fuse_put_request(fc, req); return err; } @@ -919,7 +896,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } - fc->initialized = 1; + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3088e2a38e30..7b3143064af1 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -73,7 +73,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) BUG_ON(name == NULL); - if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) return -E2BIG; if (type == ACL_TYPE_ACCESS) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 805b37fed638..4ad4f94edebe 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -289,7 +289,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - trace_wbc_writepage(wbc, mapping->backing_dev_info); + trace_wbc_writepage(wbc, inode_to_bdi(inode)); ret = __gfs2_jdata_writepage(page, wbc); if (unlikely(ret)) { diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 5d4261ff5d23..487527b42d94 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -365,23 +365,17 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { - if (is_vmalloc_addr(hc)) - vfree(hc); - else - kfree(hc); + kvfree(hc); return ERR_PTR(ret); } spin_lock(&inode->i_lock); - if (ip->i_hash_cache) { - if (is_vmalloc_addr(hc)) - vfree(hc); - else - kfree(hc); - } else { + if (likely(!ip->i_hash_cache)) { ip->i_hash_cache = hc; + hc = NULL; } spin_unlock(&inode->i_lock); + kvfree(hc); return ip->i_hash_cache; } @@ -396,10 +390,7 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip) { __be64 *hc = ip->i_hash_cache; ip->i_hash_cache = NULL; - if (is_vmalloc_addr(hc)) - vfree(hc); - else - kfree(hc); + kvfree(hc); } static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) @@ -1168,10 +1159,7 @@ fail: gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); out_kfree: - if (is_vmalloc_addr(hc2)) - vfree(hc2); - else - kfree(hc2); + kvfree(hc2); return error; } @@ -1302,14 +1290,6 @@ static void *gfs2_alloc_sort_buffer(unsigned size) return ptr; } -static void gfs2_free_sort_buffer(void *ptr) -{ - if (is_vmalloc_addr(ptr)) - vfree(ptr); - else - kfree(ptr); -} - static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, int *copied, unsigned *depth, u64 leaf_no) @@ -1393,7 +1373,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, out_free: for(i = 0; i < leaf; i++) brelse(larr[i]); - gfs2_free_sort_buffer(larr); + kvfree(larr); out: return error; } @@ -1829,7 +1809,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) gfs2_consist_inode(dip); dip->i_entries--; dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; - if (S_ISDIR(dentry->d_inode->i_mode)) + if (d_is_dir(dentry)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); @@ -1916,7 +1896,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (ht == NULL) - ht = vzalloc(size); + ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO, + PAGE_KERNEL); if (!ht) return -ENOMEM; @@ -2004,10 +1985,7 @@ out_rlist: gfs2_rlist_free(&rlist); gfs2_quota_unhold(dip); out: - if (is_vmalloc_addr(ht)) - vfree(ht); - else - kfree(ht); + kvfree(ht); return error; } diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 8b9b3775e2e7..c41d255b6a7b 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -69,10 +69,12 @@ struct get_name_filldir { char *name; }; -static int get_name_filldir(void *opaque, const char *name, int length, - loff_t offset, u64 inum, unsigned int type) +static int get_name_filldir(struct dir_context *ctx, const char *name, + int length, loff_t offset, u64 inum, + unsigned int type) { - struct get_name_filldir *gnfd = opaque; + struct get_name_filldir *gnfd = + container_of(ctx, struct get_name_filldir, ctx); if (inum != gnfd->inum.no_addr) return 0; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 80dd44dca028..3e32bb8e2d7e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -337,7 +337,8 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; int hint = min_t(size_t, INT_MAX, blks); - atomic_set(&ip->i_res->rs_sizehint, hint); + if (hint > atomic_read(&ip->i_res->rs_sizehint)) + atomic_set(&ip->i_res->rs_sizehint, hint); } /** @@ -497,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /** @@ -654,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - int sync_state = inode->i_state & I_DIRTY; + int sync_state = inode->i_state & I_DIRTY_ALL; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; @@ -667,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (!gfs2_is_jdata(ip)) sync_state &= ~I_DIRTY_PAGES; if (datasync) - sync_state &= ~I_DIRTY_SYNC; + sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME); if (sync_state) { ret = sync_inode_metadata(inode, 1); @@ -728,7 +728,6 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *dibh; int error; - loff_t size = len; unsigned int nr_blks; sector_t lblock = offset >> inode->i_blkbits; @@ -762,11 +761,6 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, goto out; } } - if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) - i_size_write(inode, offset + size); - - mark_inode_dirty(inode); - out: brelse(dibh); return error; @@ -796,8 +790,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, } } -static long gfs2_fallocate(struct file *file, int mode, loff_t offset, - loff_t len) +static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -811,14 +804,9 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; loff_t max_chunk_size = UINT_MAX & bsize_mask; - struct gfs2_holder gh; next = (next + 1) << sdp->sd_sb.sb_bsize_shift; - /* We only support the FALLOC_FL_KEEP_SIZE mode */ - if (mode & ~FALLOC_FL_KEEP_SIZE) - return -EOPNOTSUPP; - offset &= bsize_mask; len = next - offset; @@ -829,17 +817,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, if (bytes == 0) bytes = sdp->sd_sb.sb_bsize; - error = gfs2_rs_alloc(ip); - if (error) - return error; - - mutex_lock(&inode->i_mutex); - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - error = gfs2_glock_nq(&gh); - if (unlikely(error)) - goto out_uninit; - gfs2_size_hint(file, offset, len); while (len > 0) { @@ -852,8 +829,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, } error = gfs2_quota_lock_check(ip); if (error) - goto out_unlock; - + return error; retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); @@ -895,20 +871,64 @@ retry: gfs2_quota_unlock(ip); } - if (error == 0) - error = generic_write_sync(file, pos, count); - goto out_unlock; + if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) { + i_size_write(inode, pos + count); + /* Marks the inode as dirty */ + file_update_time(file); + } + + return generic_write_sync(file, pos, count); out_trans_fail: gfs2_inplace_release(ip); out_qunlock: gfs2_quota_unlock(ip); + return error; +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + mutex_lock(&inode->i_mutex); + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (offset + len) > inode->i_size) { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + goto out_unlock; + } + + ret = get_write_access(inode); + if (ret) + goto out_unlock; + + ret = gfs2_rs_alloc(ip); + if (ret) + goto out_putw; + + ret = __gfs2_fallocate(file, mode, offset, len); + if (ret) + gfs2_rs_deltree(ip->i_res); +out_putw: + put_write_access(inode); out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); mutex_unlock(&inode->i_mutex); - return error; + return ret; } #ifdef CONFIG_GFS2_FS_LOCKING_DLM diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8f0c19d1d943..f42dffba056a 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -173,19 +173,14 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl) spin_unlock(&lru_lock); } -static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { + spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); } -} - -static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) -{ - spin_lock(&lru_lock); - __gfs2_glock_remove_from_lru(gl); spin_unlock(&lru_lock); } @@ -205,9 +200,7 @@ void gfs2_glock_put(struct gfs2_glock *gl) lockref_mark_dead(&gl->gl_lockref); - spin_lock(&lru_lock); - __gfs2_glock_remove_from_lru(gl); - spin_unlock(&lru_lock); + gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); spin_lock_bucket(gl->gl_hash); hlist_bl_del_rcu(&gl->gl_list); @@ -775,7 +768,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = s->s_bdi; mapping->writeback_index = 0; } @@ -836,8 +828,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder * gh->gh_flags = flags; gh->gh_iflags = 0; gh->gh_ip = _RET_IP_; - if (gh->gh_owner_pid) - put_pid(gh->gh_owner_pid); + put_pid(gh->gh_owner_pid); gh->gh_owner_pid = get_pid(task_pid(current)); } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 1cc0bba6313f..fe91951c3361 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,6 +28,8 @@ #include "trans.h" #include "dir.h" +struct workqueue_struct *gfs2_freeze_wq; + static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) { fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n", @@ -94,11 +96,8 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) * on the stack */ tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); tr.tr_ip = _RET_IP_; - sb_start_intwrite(sdp->sd_vfs); - if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) { - sb_end_intwrite(sdp->sd_vfs); + if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) return; - } WARN_ON_ONCE(current->journal_info); current->journal_info = &tr; @@ -469,20 +468,19 @@ static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) static void freeze_go_sync(struct gfs2_glock *gl) { + int error = 0; struct gfs2_sbd *sdp = gl->gl_sbd; - DEFINE_WAIT(wait); if (gl->gl_state == LM_ST_SHARED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - atomic_set(&sdp->sd_log_freeze, 1); - wake_up(&sdp->sd_logd_waitq); - do { - prepare_to_wait(&sdp->sd_log_frozen_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&sdp->sd_log_freeze)) - io_schedule(); - } while(atomic_read(&sdp->sd_log_freeze)); - finish_wait(&sdp->sd_log_frozen_wait, &wait); + atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE); + error = freeze_super(sdp->sd_vfs); + if (error) { + printk(KERN_INFO "GFS2: couldn't freeze filesystem: %d\n", error); + gfs2_assert_withdraw(sdp, 0); + } + queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work); + gfs2_log_flush(sdp, NULL, FREEZE_FLUSH); } } diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index 7455d2629bcb..8ed1857c1a8d 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -12,6 +12,8 @@ #include "incore.h" +extern struct workqueue_struct *gfs2_freeze_wq; + extern const struct gfs2_glock_operations gfs2_meta_glops; extern const struct gfs2_glock_operations gfs2_inode_glops; extern const struct gfs2_glock_operations gfs2_rgrp_glops; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 39e7e9959b74..7a2dbbc0d634 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -97,6 +97,7 @@ struct gfs2_rgrpd { #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ +#define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ spinlock_t rd_rsspin; /* protects reservation related vars */ struct rb_root rd_rstree; /* multi-block reservation tree */ @@ -587,6 +588,12 @@ enum { SDF_SKIP_DLM_UNLOCK = 8, }; +enum gfs2_freeze_state { + SFS_UNFROZEN = 0, + SFS_STARTING_FREEZE = 1, + SFS_FROZEN = 2, +}; + #define GFS2_FSNAME_LEN 256 struct gfs2_inum_host { @@ -684,6 +691,7 @@ struct gfs2_sbd { struct gfs2_holder sd_live_gh; struct gfs2_glock *sd_rename_gl; struct gfs2_glock *sd_freeze_gl; + struct work_struct sd_freeze_work; wait_queue_head_t sd_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; @@ -788,6 +796,9 @@ struct gfs2_sbd { wait_queue_head_t sd_log_flush_wait; int sd_log_error; + atomic_t sd_reserving_log; + wait_queue_head_t sd_reserving_log_wait; + unsigned int sd_log_flush_head; u64 sd_log_flush_wrapped; @@ -797,12 +808,8 @@ struct gfs2_sbd { /* For quiescing the filesystem */ struct gfs2_holder sd_freeze_gh; - struct gfs2_holder sd_freeze_root_gh; - struct gfs2_holder sd_thaw_gh; - atomic_t sd_log_freeze; - atomic_t sd_frozen_root; - wait_queue_head_t sd_frozen_root_wait; - wait_queue_head_t sd_log_frozen_wait; + atomic_t sd_freeze_state; + struct mutex sd_freeze_mutex; char sd_fsname[GFS2_FSNAME_LEN]; char sd_table_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c4ed823d150e..73c72253faac 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -543,10 +543,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, } error = gfs2_dir_add(&dip->i_inode, name, ip, da); - if (error) - goto fail_end_trans; -fail_end_trans: gfs2_trans_end(sdp); fail_ipreserv: gfs2_inplace_release(dip); @@ -596,7 +593,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; - struct dentry *d; int error, free_vfs_inode = 0; u32 aflags = 0; unsigned blocks = 1; @@ -624,22 +620,18 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl); error = PTR_ERR(inode); if (!IS_ERR(inode)) { - d = d_splice_alias(inode, dentry); - error = PTR_ERR(d); - if (IS_ERR(d)) { - inode = ERR_CAST(d); + if (S_ISDIR(inode->i_mode)) { + iput(inode); + inode = ERR_PTR(-EISDIR); goto fail_gunlock; } + d_instantiate(dentry, inode); error = 0; if (file) { - if (S_ISREG(inode->i_mode)) { - WARN_ON(d != NULL); + if (S_ISREG(inode->i_mode)) error = finish_open(file, dentry, gfs2_open_common, opened); - } else { - error = finish_no_open(file, d); - } - } else { - dput(d); + else + error = finish_no_open(file, NULL); } gfs2_glock_dq_uninit(ghs); return error; @@ -1045,11 +1037,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (error) return error; - error = gfs2_dir_check(&dip->i_inode, name, ip); - if (error) - return error; - - return 0; + return gfs2_dir_check(&dip->i_inode, name, ip); } /** @@ -1254,11 +1242,8 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, if (d != NULL) dentry = d; if (dentry->d_inode) { - if (!(*opened & FILE_OPENED)) { - if (d == NULL) - dget(dentry); - return finish_no_open(file, dentry); - } + if (!(*opened & FILE_OPENED)) + return finish_no_open(file, d); dput(d); return 0; } @@ -1622,26 +1607,18 @@ int gfs2_permission(struct inode *inode, int mask) { struct gfs2_inode *ip; struct gfs2_holder i_gh; - struct gfs2_sbd *sdp = GFS2_SB(inode); int error; int unlock = 0; - int frozen_root = 0; ip = GFS2_I(inode); if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) && - inode == sdp->sd_root_dir->d_inode && - atomic_inc_not_zero(&sdp->sd_frozen_root))) - frozen_root = 1; - else { - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return error; - unlock = 1; - } + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + unlock = 1; } if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) @@ -1650,8 +1627,6 @@ int gfs2_permission(struct inode *inode, int mask) error = generic_permission(inode, mask); if (unlock) gfs2_glock_dq_uninit(&i_gh); - else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root)) - wake_up(&sdp->sd_frozen_root_wait); return error; } @@ -1824,29 +1799,19 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode = dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; - struct gfs2_sbd *sdp = GFS2_SB(inode); int error; int unlock = 0; - int frozen_root = 0; if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) && - inode == sdp->sd_root_dir->d_inode && - atomic_inc_not_zero(&sdp->sd_frozen_root))) - frozen_root = 1; - else { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - if (error) - return error; - unlock = 1; - } + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) + return error; + unlock = 1; } generic_fillattr(inode, stat); if (unlock) gfs2_glock_dq_uninit(&gh); - else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root)) - wake_up(&sdp->sd_frozen_root_wait); return 0; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 3966fadbcebd..536e7a6252cd 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -339,6 +339,7 @@ void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) { + int ret = 0; unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize); unsigned wanted = blks + reserved_blks; DEFINE_WAIT(wait); @@ -362,9 +363,13 @@ retry: } while(free_blocks <= wanted); finish_wait(&sdp->sd_log_waitq, &wait); } + atomic_inc(&sdp->sd_reserving_log); if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks, - free_blocks - blks) != free_blocks) + free_blocks - blks) != free_blocks) { + if (atomic_dec_and_test(&sdp->sd_reserving_log)) + wake_up(&sdp->sd_reserving_log_wait); goto retry; + } trace_gfs2_log_blocks(sdp, -blks); /* @@ -377,9 +382,11 @@ retry: down_read(&sdp->sd_log_flush_lock); if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { gfs2_log_release(sdp, blks); - return -EROFS; + ret = -EROFS; } - return 0; + if (atomic_dec_and_test(&sdp->sd_reserving_log)) + wake_up(&sdp->sd_reserving_log_wait); + return ret; } /** @@ -652,9 +659,12 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) u32 hash; int rw = WRITE_FLUSH_FUA | REQ_META; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); clear_page(lh); + gfs2_assert_withdraw(sdp, (state != SFS_FROZEN)); + tail = current_tail(sdp); lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); @@ -695,6 +705,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, enum gfs2_flush_type type) { struct gfs2_trans *tr; + enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); down_write(&sdp->sd_log_flush_lock); @@ -713,8 +724,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, INIT_LIST_HEAD(&tr->tr_ail1_list); INIT_LIST_HEAD(&tr->tr_ail2_list); tr->tr_first = sdp->sd_log_flush_head; + if (unlikely (state == SFS_FROZEN)) + gfs2_assert_withdraw(sdp, !tr->tr_num_buf_new && !tr->tr_num_databuf_new); } + if (unlikely(state == SFS_FROZEN)) + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke); @@ -745,8 +760,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, spin_unlock(&sdp->sd_ail_lock); gfs2_log_unlock(sdp); - if (atomic_read(&sdp->sd_log_freeze)) - type = FREEZE_FLUSH; if (type != NORMAL_FLUSH) { if (!sdp->sd_log_idle) { for (;;) { @@ -763,21 +776,8 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, } if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH) gfs2_log_shutdown(sdp); - if (type == FREEZE_FLUSH) { - int error; - - atomic_set(&sdp->sd_log_freeze, 0); - wake_up(&sdp->sd_log_frozen_wait); - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, - LM_ST_SHARED, 0, - &sdp->sd_thaw_gh); - if (error) { - printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error); - gfs2_assert_withdraw(sdp, 0); - } - else - gfs2_glock_dq_uninit(&sdp->sd_thaw_gh); - } + if (type == FREEZE_FLUSH) + atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); } trace_gfs2_log_flush(sdp, 0); @@ -888,7 +888,7 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp) static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) { - return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1) || atomic_read(&sdp->sd_log_freeze)); + return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1)); } static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 82b6ac829656..241a399bf83d 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -30,6 +30,7 @@ #include "quota.h" #include "recovery.h" #include "dir.h" +#include "glops.h" struct workqueue_struct *gfs2_control_wq; @@ -161,9 +162,14 @@ static int __init init_gfs2_fs(void) if (!gfs2_control_wq) goto fail_recovery; + gfs2_freeze_wq = alloc_workqueue("freeze_workqueue", 0, 0); + + if (!gfs2_freeze_wq) + goto fail_control; + gfs2_page_pool = mempool_create_page_pool(64, 0); if (!gfs2_page_pool) - goto fail_control; + goto fail_freeze; gfs2_register_debugfs(); @@ -171,6 +177,8 @@ static int __init init_gfs2_fs(void) return 0; +fail_freeze: + destroy_workqueue(gfs2_freeze_wq); fail_control: destroy_workqueue(gfs2_control_wq); fail_recovery: @@ -224,6 +232,7 @@ static void __exit exit_gfs2_fs(void) unregister_filesystem(&gfs2meta_fs_type); destroy_workqueue(gfs_recovery_wq); destroy_workqueue(gfs2_control_wq); + destroy_workqueue(gfs2_freeze_wq); list_lru_destroy(&gfs2_qd_lru); rcu_barrier(); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index d3eae244076e..efc8e254787c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = sb->s_bdi; mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); @@ -129,11 +128,11 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_rwsem(&sdp->sd_log_flush_lock); atomic_set(&sdp->sd_log_in_flight, 0); + atomic_set(&sdp->sd_reserving_log, 0); + init_waitqueue_head(&sdp->sd_reserving_log_wait); init_waitqueue_head(&sdp->sd_log_flush_wait); - init_waitqueue_head(&sdp->sd_log_frozen_wait); - atomic_set(&sdp->sd_log_freeze, 0); - atomic_set(&sdp->sd_frozen_root, 0); - init_waitqueue_head(&sdp->sd_frozen_root_wait); + atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); + mutex_init(&sdp->sd_freeze_mutex); return sdp; } @@ -760,15 +759,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); gfs2_glock_dq_uninit(&ji_gh); jindex = 0; - if (!sdp->sd_args.ar_spectator) { - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, - &sdp->sd_thaw_gh); - if (error) { - fs_err(sdp, "can't acquire freeze glock: %d\n", error); - goto fail_jinode_gh; - } - } - gfs2_glock_dq_uninit(&sdp->sd_thaw_gh); + INIT_WORK(&sdp->sd_freeze_work, gfs2_freeze_func); return 0; fail_jinode_gh: @@ -1082,6 +1073,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sb->s_export_op = &gfs2_export_ops; sb->s_xattr = gfs2_xattr_handlers; sb->s_qcop = &gfs2_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; sb->s_time_gran = 1; sb->s_maxbytes = MAX_LFS_FILESIZE; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 64b29f7f6b4c..3aa17d4d1cfc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list) } -static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) +static enum lru_status gfs2_qd_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *dispose = arg; struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); @@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, if (qd->qd_lockref.count == 0) { lockref_mark_dead(&qd->qd_lockref); - list_move(&qd->qd_lru, dispose); + list_lru_isolate_move(lru, &qd->qd_lru, dispose); } spin_unlock(&qd->qd_lockref.lock); @@ -171,8 +172,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, - &dispose, &sc->nr_to_scan); + freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, + gfs2_qd_isolate, &dispose); gfs2_qd_dispose(&dispose); @@ -182,7 +183,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); + return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } struct shrinker gfs2_qd_shrinker = { @@ -667,7 +668,7 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, s64 change, struct gfs2_quota_data *qd, - struct fs_disk_quota *fdq) + struct qc_dqblk *fdq) { struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -697,16 +698,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, be64_add_cpu(&q.qu_value, change); qd->qd_qb.qb_value = q.qu_value; if (fdq) { - if (fdq->d_fieldmask & FS_DQ_BSOFT) { - q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); + if (fdq->d_fieldmask & QC_SPC_SOFT) { + q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_warn = q.qu_warn; } - if (fdq->d_fieldmask & FS_DQ_BHARD) { - q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); + if (fdq->d_fieldmask & QC_SPC_HARD) { + q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_limit = q.qu_limit; } - if (fdq->d_fieldmask & FS_DQ_BCOUNT) { - q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + if (fdq->d_fieldmask & QC_SPACE) { + q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_value = q.qu_value; } } @@ -1360,13 +1361,8 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count)); - if (sdp->sd_quota_bitmap) { - if (is_vmalloc_addr(sdp->sd_quota_bitmap)) - vfree(sdp->sd_quota_bitmap); - else - kfree(sdp->sd_quota_bitmap); - sdp->sd_quota_bitmap = NULL; - } + kvfree(sdp->sd_quota_bitmap); + sdp->sd_quota_bitmap = NULL; } static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) @@ -1502,7 +1498,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb, } static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_lvb *qlvb; @@ -1510,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, struct gfs2_holder q_gh; int error; - memset(fdq, 0, sizeof(struct fs_disk_quota)); + memset(fdq, 0, sizeof(*fdq)); if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ @@ -1527,12 +1523,9 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, goto out; qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; - fdq->d_version = FS_DQUOT_VERSION; - fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA; - fdq->d_id = from_kqid_munged(current_user_ns(), qid); - fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; - fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; - fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; + fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift; + fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift; + fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift; gfs2_glock_dq_uninit(&q_gh); out: @@ -1541,10 +1534,10 @@ out: } /* GFS2 only supports a subset of the XFS fields */ -#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) +#define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE) static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); @@ -1588,17 +1581,17 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, goto out_i; /* If nothing has changed, this is a no-op */ - if ((fdq->d_fieldmask & FS_DQ_BSOFT) && - ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) - fdq->d_fieldmask ^= FS_DQ_BSOFT; + if ((fdq->d_fieldmask & QC_SPC_SOFT) && + ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) + fdq->d_fieldmask ^= QC_SPC_SOFT; - if ((fdq->d_fieldmask & FS_DQ_BHARD) && - ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) - fdq->d_fieldmask ^= FS_DQ_BHARD; + if ((fdq->d_fieldmask & QC_SPC_HARD) && + ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) + fdq->d_fieldmask ^= QC_SPC_HARD; - if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && - ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) - fdq->d_fieldmask ^= FS_DQ_BCOUNT; + if ((fdq->d_fieldmask & QC_SPACE) && + ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= QC_SPACE; if (fdq->d_fieldmask == 0) goto out_i; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 573bd3b758fa..1b645773c98e 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -439,7 +439,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, ls->ls_recover_jid_done = jid; ls->ls_recover_jid_status = message; - sprintf(env_jid, "JID=%d", jid); + sprintf(env_jid, "JID=%u", jid); sprintf(env_status, "RECOVERY=%s", message == LM_RD_SUCCESS ? "Done" : "Failed"); kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 7474c413ffd1..9150207f365c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -936,7 +936,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize; rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; - rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); if (rgd->rd_data > sdp->sd_max_rg_data) sdp->sd_max_rg_data = rgd->rd_data; spin_lock(&sdp->sd_rindex_spin); @@ -955,6 +955,36 @@ fail: } /** + * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use + * @sdp: the GFS2 superblock + * + * The purpose of this function is to select a subset of the resource groups + * and mark them as PREFERRED. We do it in such a way that each node prefers + * to use a unique set of rgrps to minimize glock contention. + */ +static void set_rgrp_preferences(struct gfs2_sbd *sdp) +{ + struct gfs2_rgrpd *rgd, *first; + int i; + + /* Skip an initial number of rgrps, based on this node's journal ID. + That should start each node out on its own set. */ + rgd = gfs2_rgrpd_get_first(sdp); + for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++) + rgd = gfs2_rgrpd_get_next(rgd); + first = rgd; + + do { + rgd->rd_flags |= GFS2_RDF_PREFERRED; + for (i = 0; i < sdp->sd_journals; i++) { + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == first) + break; + } + } while (rgd != first); +} + +/** * gfs2_ri_update - Pull in a new resource index from the disk * @ip: pointer to the rindex inode * @@ -973,6 +1003,8 @@ static int gfs2_ri_update(struct gfs2_inode *ip) if (error < 0) return error; + set_rgrp_preferences(sdp); + sdp->sd_rindex_uptodate = 1; return 0; } @@ -1891,6 +1923,25 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b } /** + * fast_to_acquire - determine if a resource group will be fast to acquire + * + * If this is one of our preferred rgrps, it should be quicker to acquire, + * because we tried to set ourselves up as dlm lock master. + */ +static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) +{ + struct gfs2_glock *gl = rgd->rd_gl; + + if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) && + !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + return 1; + if (rgd->rd_flags & GFS2_RDF_PREFERRED) + return 1; + return 0; +} + +/** * gfs2_inplace_reserve - Reserve space in the filesystem * @ip: the inode to reserve space for * @ap: the allocation parameters @@ -1932,10 +1983,15 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a rg_locked = 0; if (skip && skip--) goto next_rgrp; - if (!gfs2_rs_active(rs) && (loops < 2) && - gfs2_rgrp_used_recently(rs, 1000) && - gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) - goto next_rgrp; + if (!gfs2_rs_active(rs)) { + if (loops == 0 && + !fast_to_acquire(rs->rs_rbm.rgd)) + goto next_rgrp; + if ((loops < 2) && + gfs2_rgrp_used_recently(rs, 1000) && + gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + goto next_rgrp; + } error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, LM_ST_EXCLUSIVE, flags, &rs->rs_rgd_gh); @@ -2195,6 +2251,9 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, trace_gfs2_rs(rs, TRACE_RS_CLAIM); if (rs->rs_free && !ret) goto out; + /* We used up our block reservation, so we should + reserve more blocks next time. */ + atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint); } __rs_deltree(rs); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 5d8f085f7ade..b104f4af3afd 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -20,6 +20,7 @@ */ #define RGRP_RSRV_MINBYTES 8 #define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY)) +#define RGRP_RSRV_ADDBLKS 64 struct gfs2_rgrpd; struct gfs2_sbd; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index a346f56c4c6d..1666382b198d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -26,6 +26,7 @@ #include <linux/wait.h> #include <linux/writeback.h> #include <linux/backing-dev.h> +#include <linux/kernel.h> #include "gfs2.h" #include "incore.h" @@ -399,7 +400,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); struct gfs2_glock *j_gl = ip->i_gl; - struct gfs2_holder thaw_gh; + struct gfs2_holder freeze_gh; struct gfs2_log_header_host head; int error; @@ -408,7 +409,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) return error; error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, - &thaw_gh); + &freeze_gh); if (error) goto fail_threads; @@ -434,13 +435,13 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - gfs2_glock_dq_uninit(&thaw_gh); + gfs2_glock_dq_uninit(&freeze_gh); return 0; fail: - thaw_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&thaw_gh); + freeze_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&freeze_gh); fail_threads: kthread_stop(sdp->sd_quotad_process); kthread_stop(sdp->sd_logd_process); @@ -580,14 +581,15 @@ int gfs2_statfs_sync(struct super_block *sb, int type) struct buffer_head *m_bh, *l_bh; int error; + sb_start_write(sb); error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &gh); if (error) - return error; + goto out; error = gfs2_meta_inode_buffer(m_ip, &m_bh); if (error) - goto out; + goto out_unlock; spin_lock(&sdp->sd_statfs_spin); gfs2_statfs_change_in(m_sc, m_bh->b_data + @@ -615,8 +617,10 @@ out_bh2: brelse(l_bh); out_bh: brelse(m_bh); -out: +out_unlock: gfs2_glock_dq_uninit(&gh); +out: + sb_end_write(sb); return error; } @@ -643,14 +647,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, struct lfcc *lfcc; LIST_HEAD(list); struct gfs2_log_header_host lh; - struct gfs2_inode *dip = GFS2_I(sdp->sd_root_dir->d_inode); int error; - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, - &sdp->sd_freeze_root_gh); - if (error) - return error; - atomic_set(&sdp->sd_frozen_root, 1); list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); if (!lfcc) { @@ -692,11 +690,6 @@ out: gfs2_glock_dq_uninit(&lfcc->gh); kfree(lfcc); } - if (error) { - atomic_dec(&sdp->sd_frozen_root); - wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0); - gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh); - } return error; } @@ -750,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); - struct backing_dev_info *bdi = metamapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(metamapping->host); int ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) @@ -834,18 +827,14 @@ out: static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { - struct gfs2_holder thaw_gh; + struct gfs2_holder freeze_gh; int error; error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE, - &thaw_gh); + &freeze_gh); if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return error; - down_write(&sdp->sd_log_flush_lock); - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - up_write(&sdp->sd_log_flush_lock); - kthread_stop(sdp->sd_quotad_process); kthread_stop(sdp->sd_logd_process); @@ -853,11 +842,16 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); + down_write(&sdp->sd_log_flush_lock); + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + up_write(&sdp->sd_log_flush_lock); + gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH); + wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0); gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); - if (thaw_gh.gh_gl) - gfs2_glock_dq_uninit(&thaw_gh); + if (freeze_gh.gh_gl) + gfs2_glock_dq_uninit(&freeze_gh); gfs2_quota_cleanup(sdp); @@ -943,11 +937,41 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_quota_sync(sb, -1); - if (wait && sdp && !atomic_read(&sdp->sd_log_freeze)) + if (wait && sdp) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); return 0; } +void gfs2_freeze_func(struct work_struct *work) +{ + int error; + struct gfs2_holder freeze_gh; + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work); + struct super_block *sb = sdp->sd_vfs; + + atomic_inc(&sb->s_active); + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0, + &freeze_gh); + if (error) { + printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error); + gfs2_assert_withdraw(sdp, 0); + } + else { + atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); + error = thaw_super(sb); + if (error) { + printk(KERN_INFO "GFS2: couldn't thaw filesystem: %d\n", + error); + gfs2_assert_withdraw(sdp, 0); + } + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + freeze_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&freeze_gh); + } + deactivate_super(sb); + return; +} + /** * gfs2_freeze - prevent further writes to the filesystem * @sb: the VFS structure for the filesystem @@ -957,10 +981,16 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) static int gfs2_freeze(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; - int error; + int error = 0; - if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return -EINVAL; + mutex_lock(&sdp->sd_freeze_mutex); + if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) + goto out; + + if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { + error = -EINVAL; + goto out; + } for (;;) { error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); @@ -980,7 +1010,10 @@ static int gfs2_freeze(struct super_block *sb) fs_err(sdp, "retrying...\n"); msleep(1000); } - return 0; + error = 0; +out: + mutex_unlock(&sdp->sd_freeze_mutex); + return error; } /** @@ -993,10 +1026,15 @@ static int gfs2_unfreeze(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; + mutex_lock(&sdp->sd_freeze_mutex); + if (atomic_read(&sdp->sd_freeze_state) != SFS_FROZEN || + sdp->sd_freeze_gh.gh_gl == NULL) { + mutex_unlock(&sdp->sd_freeze_mutex); + return 0; + } + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - atomic_dec(&sdp->sd_frozen_root); - wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0); - gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh); + mutex_unlock(&sdp->sd_freeze_mutex); return 0; } @@ -1618,8 +1656,8 @@ const struct super_operations gfs2_super_ops = { .evict_inode = gfs2_evict_inode, .put_super = gfs2_put_super, .sync_fs = gfs2_sync_fs, - .freeze_fs = gfs2_freeze, - .unfreeze_fs = gfs2_unfreeze, + .freeze_super = gfs2_freeze, + .thaw_super = gfs2_unfreeze, .statfs = gfs2_statfs, .remount_fs = gfs2_remount_fs, .drop_inode = gfs2_drop_inode, diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 90e3322ffa10..73c97dccae21 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -45,6 +45,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); +extern void gfs2_freeze_func(struct work_struct *work); extern struct file_system_type gfs2_fs_type; extern struct file_system_type gfs2meta_fs_type; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 3ab566ba5696..ae8e8811f0e8 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -96,7 +96,7 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) struct super_block *sb = sdp->sd_vfs; int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; - return snprintf(buf, PAGE_SIZE, "%u\n", frozen); + return snprintf(buf, PAGE_SIZE, "%d\n", frozen); } static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 42bfd3361979..88bff2430669 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -89,14 +89,17 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) { struct gfs2_trans *tr = current->journal_info; s64 nbuf; + int alloced = tr->tr_alloced; + BUG_ON(!tr); current->journal_info = NULL; if (!tr->tr_touched) { gfs2_log_release(sdp, tr->tr_reserved); - if (tr->tr_alloced) + if (alloced) { kfree(tr); - sb_end_intwrite(sdp->sd_vfs); + sb_end_intwrite(sdp->sd_vfs); + } return; } @@ -109,13 +112,14 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) gfs2_print_trans(tr); gfs2_log_commit(sdp, tr); - if (tr->tr_alloced && !tr->tr_attached) + if (alloced && !tr->tr_attached) kfree(tr); up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); - sb_end_intwrite(sdp->sd_vfs); + if (alloced) + sb_end_intwrite(sdp->sd_vfs); } static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, @@ -192,6 +196,7 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) { struct gfs2_meta_header *mh; struct gfs2_trans *tr; + enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); tr = current->journal_info; tr->tr_touched = 1; @@ -205,6 +210,10 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) (unsigned long long)bd->bd_bh->b_blocknr); BUG(); } + if (unlikely(state == SFS_FROZEN)) { + printk(KERN_INFO "GFS2:adding buf while frozen\n"); + gfs2_assert_withdraw(sdp, 0); + } gfs2_pin(sdp, bd->bd_bh); mh->__pad0 = cpu_to_be64(0); mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index ff0316b925a5..db458ee3a546 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -162,14 +162,16 @@ err2: */ int hfs_cat_keycmp(const btree_key *key1, const btree_key *key2) { - int retval; + __be32 k1p, k2p; - retval = be32_to_cpu(key1->cat.ParID) - be32_to_cpu(key2->cat.ParID); - if (!retval) - retval = hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len, - key2->cat.CName.name, key2->cat.CName.len); + k1p = key1->cat.ParID; + k2p = key2->cat.ParID; - return retval; + if (k1p != k2p) + return be32_to_cpu(k1p) < be32_to_cpu(k2p) ? -1 : 1; + + return hfs_strcmp(key1->cat.CName.name, key1->cat.CName.len, + key2->cat.CName.name, key2->cat.CName.len); } /* Try to get a catalog entry for given catalog id */ diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c index 6e560d56094b..754fdf8c6356 100644 --- a/fs/hfsplus/brec.c +++ b/fs/hfsplus/brec.c @@ -131,13 +131,16 @@ skip: hfs_bnode_write(node, entry, data_off + key_len, entry_len); hfs_bnode_dump(node); - if (new_node) { - /* update parent key if we inserted a key - * at the start of the first node - */ - if (!rec && new_node != node) - hfs_brec_update_parent(fd); + /* + * update parent key if we inserted a key + * at the start of the node and it is not the new node + */ + if (!rec && new_node != node) { + hfs_bnode_read_key(node, fd->search_key, data_off + size); + hfs_brec_update_parent(fd); + } + if (new_node) { hfs_bnode_put(fd->bnode); if (!new_node->parent) { hfs_btree_inc_height(tree); @@ -168,9 +171,6 @@ skip: goto again; } - if (!rec) - hfs_brec_update_parent(fd); - return 0; } @@ -370,6 +370,8 @@ again: if (IS_ERR(parent)) return PTR_ERR(parent); __hfs_brec_find(parent, fd, hfs_find_rec_by_key); + if (fd->record < 0) + return -ENOENT; hfs_bnode_dump(parent); rec = fd->record; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 32602c667b4a..7892e6fddb66 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -38,21 +38,30 @@ int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, return hfsplus_strcmp(&k1->cat.name, &k2->cat.name); } -void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, - u32 parent, struct qstr *str) +/* Generates key for catalog file/folders record. */ +int hfsplus_cat_build_key(struct super_block *sb, + hfsplus_btree_key *key, u32 parent, struct qstr *str) { - int len; + int len, err; key->cat.parent = cpu_to_be32(parent); - if (str) { - hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN, - str->name, str->len); - len = be16_to_cpu(key->cat.name.length); - } else { - key->cat.name.length = 0; - len = 0; - } + err = hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN, + str->name, str->len); + if (unlikely(err < 0)) + return err; + + len = be16_to_cpu(key->cat.name.length); key->key_len = cpu_to_be16(6 + 2 * len); + return 0; +} + +/* Generates key for catalog thread record. */ +void hfsplus_cat_build_key_with_cnid(struct super_block *sb, + hfsplus_btree_key *key, u32 parent) +{ + key->cat.parent = cpu_to_be32(parent); + key->cat.name.length = 0; + key->key_len = cpu_to_be16(6); } static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent, @@ -167,11 +176,16 @@ static int hfsplus_fill_cat_thread(struct super_block *sb, hfsplus_cat_entry *entry, int type, u32 parentid, struct qstr *str) { + int err; + entry->type = cpu_to_be16(type); entry->thread.reserved = 0; entry->thread.parentID = cpu_to_be32(parentid); - hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN, + err = hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN, str->name, str->len); + if (unlikely(err < 0)) + return err; + return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2; } @@ -183,7 +197,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, int err; u16 type; - hfsplus_cat_build_key(sb, fd->search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd->search_key, cnid); err = hfs_brec_read(fd, &tmp, sizeof(hfsplus_cat_entry)); if (err) return err; @@ -250,11 +264,16 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, if (err) return err; - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, dir->i_ino, str); + if (unlikely(entry_size < 0)) { + err = entry_size; + goto err2; + } + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) @@ -265,7 +284,10 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, if (err) goto err2; - hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + if (unlikely(err)) + goto err1; + entry_size = hfsplus_cat_build_record(&entry, cnid, inode); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { @@ -288,7 +310,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, return 0; err1: - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); if (!hfs_brec_find(&fd, hfs_find_rec_by_key)) hfs_brec_remove(&fd); err2: @@ -313,7 +335,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) if (!str) { int len; - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -329,7 +351,9 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) off + 2, len); fd.search_key->key_len = cpu_to_be16(6 + len); } else - hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + if (unlikely(err)) + goto out; err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) @@ -360,7 +384,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) if (err) goto out; - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -405,7 +429,11 @@ int hfsplus_rename_cat(u32 cnid, dst_fd = src_fd; /* find the old dir entry and read the data */ - hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfsplus_cat_build_key(sb, src_fd.search_key, + src_dir->i_ino, src_name); + if (unlikely(err)) + goto out; + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; @@ -419,7 +447,11 @@ int hfsplus_rename_cat(u32 cnid, type = be16_to_cpu(entry.type); /* create new dir entry with the data from the old entry */ - hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); + err = hfsplus_cat_build_key(sb, dst_fd.search_key, + dst_dir->i_ino, dst_name); + if (unlikely(err)) + goto out; + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) @@ -436,7 +468,11 @@ int hfsplus_rename_cat(u32 cnid, dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; /* finally remove the old entry */ - hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfsplus_cat_build_key(sb, src_fd.search_key, + src_dir->i_ino, src_name); + if (unlikely(err)) + goto out; + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; @@ -449,7 +485,7 @@ int hfsplus_rename_cat(u32 cnid, src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; /* remove old thread entry */ - hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid); err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; @@ -459,9 +495,14 @@ int hfsplus_rename_cat(u32 cnid, goto out; /* create new thread entry */ - hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, dst_fd.search_key, cnid); entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name); + if (unlikely(entry_size < 0)) { + err = entry_size; + goto out; + } + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 610a3260bef1..f0235c1640af 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -44,7 +44,10 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return ERR_PTR(err); - hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, + &dentry->d_name); + if (unlikely(err < 0)) + goto fail; again: err = hfs_brec_read(&fd, &entry, sizeof(entry)); if (err) { @@ -97,9 +100,11 @@ again: be32_to_cpu(entry.file.permissions.dev); str.len = sprintf(name, "iNode%d", linkid); str.name = name; - hfsplus_cat_build_key(sb, fd.search_key, + err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb)->hidden_dir->i_ino, &str); + if (unlikely(err < 0)) + goto fail; goto again; } } else if (!dentry->d_fsdata) @@ -145,7 +150,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) err = -ENOMEM; goto out; } - hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, inode->i_ino); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -525,7 +530,7 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - if (S_ISDIR(new_dentry->d_inode->i_mode)) + if (d_is_dir(new_dentry)) res = hfsplus_rmdir(new_dir, new_dentry); else res = hfsplus_unlink(new_dir, new_dentry); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index eb5e059f481a..b0441d65fa54 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -443,8 +443,10 @@ int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); -void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, +int hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, u32 parent, struct qstr *str); +void hfsplus_cat_build_key_with_cnid(struct super_block *sb, + hfsplus_btree_key *key, u32 parent); void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); int hfsplus_find_cat(struct super_block *sb, u32 cnid, struct hfs_find_data *fd); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 4cf2024b87da..593af2fdcc2d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -515,7 +515,9 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = hfs_find_init(sbi->cat_tree, &fd); if (err) goto out_put_root; - hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); + err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); + if (unlikely(err < 0)) + goto out_put_root; if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 4338ff32959d..043ac9d77262 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -548,10 +548,11 @@ struct hppfs_dirent { struct dentry *dentry; }; -static int hppfs_filldir(void *d, const char *name, int size, +static int hppfs_filldir(struct dir_context *ctx, const char *name, int size, loff_t offset, u64 inode, unsigned int type) { - struct hppfs_dirent *dirent = d; + struct hppfs_dirent *dirent = + container_of(ctx, struct hppfs_dirent, ctx); if (file_removed(dirent->dentry, name)) return 0; @@ -677,10 +678,10 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) return NULL; } - if (S_ISDIR(dentry->d_inode->i_mode)) { + if (d_is_dir(dentry)) { inode->i_op = &hppfs_dir_iops; inode->i_fop = &hppfs_dir_fops; - } else if (S_ISLNK(dentry->d_inode->i_mode)) { + } else if (d_is_symlink(dentry)) { inode->i_op = &hppfs_link_iops; inode->i_fop = &hppfs_file_fops; } else { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e2872b25343..c274aca8e8dc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -62,12 +62,6 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); } -static struct backing_dev_info hugetlbfs_backing_dev_info = { - .name = "hugetlbfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - int sysctl_hugetlb_shm_group; enum { @@ -412,10 +406,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); - mutex_lock(&mapping->i_mmap_mutex); + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - mutex_unlock(&mapping->i_mmap_mutex); + i_mmap_unlock_write(mapping); truncate_hugepages(inode, offset); return 0; } @@ -472,12 +466,12 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, } /* - * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never + * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an * annotation because huge_pmd_share() does an allocation under - * i_mmap_mutex. + * i_mmap_rwsem. */ -static struct lock_class_key hugetlbfs_i_mmap_mutex_key; +static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct inode *dir, @@ -495,10 +489,9 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); - lockdep_set_class(&inode->i_mapping->i_mmap_mutex, - &hugetlbfs_i_mmap_mutex_key); + lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, + &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); @@ -1032,10 +1025,6 @@ static int __init init_hugetlbfs_fs(void) return -ENOTSUPP; } - error = bdi_init(&hugetlbfs_backing_dev_info); - if (error) - return error; - error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), @@ -1071,7 +1060,6 @@ static int __init init_hugetlbfs_fs(void) out: kmem_cache_destroy(hugetlbfs_inode_cachep); out2: - bdi_destroy(&hugetlbfs_backing_dev_info); return error; } @@ -1091,7 +1079,6 @@ static void __exit exit_hugetlbfs_fs(void) for_each_hstate(h) kern_unmount(hugetlbfs_vfsmount[i++]); unregister_filesystem(&hugetlbfs_fs_type); - bdi_destroy(&hugetlbfs_backing_dev_info); } module_init(init_hugetlbfs_fs) diff --git a/fs/inode.c b/fs/inode.c index 26753ba7b6d6..f00b16f45507 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -18,6 +18,7 @@ #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> #include <linux/list_lru.h> +#include <trace/events/writeback.h> #include "internal.h" /* @@ -30,7 +31,7 @@ * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * @@ -114,6 +115,11 @@ int proc_nr_inodes(struct ctl_table *table, int write, } #endif +static int no_open(struct inode *inode, struct file *file) +{ + return -ENXIO; +} + /** * inode_init_always - perform inode structure intialisation * @sb: superblock inode belongs to @@ -125,7 +131,7 @@ int proc_nr_inodes(struct ctl_table *table, int write, int inode_init_always(struct super_block *sb, struct inode *inode) { static const struct inode_operations empty_iops; - static const struct file_operations empty_fops; + static const struct file_operations no_open_fops = {.open = no_open}; struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; @@ -133,7 +139,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_flags = 0; atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; - inode->i_fop = &empty_fops; + inode->i_fop = &no_open_fops; inode->__i_nlink = 1; inode->i_opflags = 0; i_uid_write(inode, 0); @@ -143,9 +149,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_blocks = 0; inode->i_bytes = 0; inode->i_generation = 0; -#ifdef CONFIG_QUOTA - memset(&inode->i_dquot, 0, sizeof(inode->i_dquot)); -#endif inode->i_pipe = NULL; inode->i_bdev = NULL; inode->i_cdev = NULL; @@ -168,20 +171,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; - mapping->backing_dev_info = &default_backing_dev_info; mapping->writeback_index = 0; - - /* - * If the block_device provides a backing_dev_info for client - * inodes then use that. Otherwise the inode share the bdev's - * backing_dev_info. - */ - if (sb->s_bdev) { - struct backing_dev_info *bdi; - - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; - mapping->backing_dev_info = bdi; - } inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ @@ -192,7 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) #ifdef CONFIG_FSNOTIFY inode->i_fsnotify_mask = 0; #endif - + inode->i_flctx = NULL; this_cpu_inc(nr_inodes); return 0; @@ -235,6 +225,7 @@ void __destroy_inode(struct inode *inode) BUG_ON(inode_has_buffers(inode)); security_inode_free(inode); fsnotify_inode_delete(inode); + locks_free_lock_context(inode->i_flctx); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); @@ -349,11 +340,10 @@ void address_space_init_once(struct address_space *mapping) memset(mapping, 0, sizeof(*mapping)); INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); spin_lock_init(&mapping->tree_lock); - mutex_init(&mapping->i_mmap_mutex); + init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT; - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); @@ -414,7 +404,8 @@ static void inode_lru_list_add(struct inode *inode) */ void inode_add_lru(struct inode *inode) { - if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && + if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | + I_FREEING | I_WILL_FREE)) && !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) inode_lru_list_add(inode); } @@ -645,7 +636,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & I_DIRTY && !kill_dirty) { + if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { spin_unlock(&inode->i_lock); busy = 1; continue; @@ -683,8 +674,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) * LRU does not have strict ordering. Hence we don't want to reclaim inodes * with this flag set because they are the inodes that are out of order. */ -static enum lru_status -inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) +static enum lru_status inode_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); @@ -702,7 +693,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) */ if (atomic_read(&inode->i_count) || (inode->i_state & ~I_REFERENCED)) { - list_del_init(&inode->i_lru); + list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; @@ -736,7 +727,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; - list_move(&inode->i_lru, freeable); + list_lru_isolate_move(lru, &inode->i_lru, freeable); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); @@ -749,14 +740,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) * to trim from the LRU. Inodes to be freed are moved to a temporary list and * then are freed outside inode_lock by dispose_list(). */ -long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid) +long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(freeable); long freed; - freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, - &freeable, &nr_to_scan); + freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, + inode_lru_isolate, &freeable); dispose_list(&freeable); return freed; } @@ -1280,6 +1270,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) } EXPORT_SYMBOL(ilookup); +/** + * find_inode_nowait - find an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @match: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @match + * + * Search for the inode specified by @hashval and @data in the inode + * cache, where the helper function @match will return 0 if the inode + * does not match, 1 if the inode does match, and -1 if the search + * should be stopped. The @match function must be responsible for + * taking the i_lock spin_lock and checking i_state for an inode being + * freed or being initialized, and incrementing the reference count + * before returning 1. It also must not sleep, since it is called with + * the inode_hash_lock spinlock held. + * + * This is a even more generalized version of ilookup5() when the + * function must never block --- find_inode() can block in + * __wait_on_freeing_inode() --- or when the caller can not increment + * the reference count because the resulting iput() might cause an + * inode eviction. The tradeoff is that the @match funtion must be + * very carefully implemented. + */ +struct inode *find_inode_nowait(struct super_block *sb, + unsigned long hashval, + int (*match)(struct inode *, unsigned long, + void *), + void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode, *ret_inode = NULL; + int mval; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_sb != sb) + continue; + mval = match(inode, hashval, data); + if (mval == 0) + continue; + if (mval == 1) + ret_inode = inode; + goto out; + } +out: + spin_unlock(&inode_hash_lock); + return ret_inode; +} +EXPORT_SYMBOL(find_inode_nowait); + int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -1430,11 +1470,20 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { - if (inode) { - BUG_ON(inode->i_state & I_CLEAR); - - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) - iput_final(inode); + if (!inode) + return; + BUG_ON(inode->i_state & I_CLEAR); +retry: + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { + atomic_inc(&inode->i_count); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; + } + iput_final(inode); } } EXPORT_SYMBOL(iput); @@ -1493,14 +1542,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } -/* - * This does the actual work of updating an inodes time or version. Must have - * had called mnt_want_write() before calling this. - */ -static int update_time(struct inode *inode, struct timespec *time, int flags) +int generic_update_time(struct inode *inode, struct timespec *time, int flags) { - if (inode->i_op->update_time) - return inode->i_op->update_time(inode, time, flags); + int iflags = I_DIRTY_TIME; if (flags & S_ATIME) inode->i_atime = *time; @@ -1510,9 +1554,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) inode->i_ctime = *time; if (flags & S_MTIME) inode->i_mtime = *time; - mark_inode_dirty_sync(inode); + + if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + iflags |= I_DIRTY_SYNC; + __mark_inode_dirty(inode, iflags); return 0; } +EXPORT_SYMBOL(generic_update_time); + +/* + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +static int update_time(struct inode *inode, struct timespec *time, int flags) +{ + int (*update_time)(struct inode *, struct timespec *, int); + + update_time = inode->i_op->update_time ? inode->i_op->update_time : + generic_update_time; + + return update_time(inode, time, flags); +} /** * touch_atime - update the access time @@ -1801,7 +1863,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) } else if (S_ISFIFO(mode)) inode->i_fop = &pipefifo_fops; else if (S_ISSOCK(mode)) - inode->i_fop = &bad_sock_fops; + ; /* leave it no_open_fops */ else printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" " inode %s:%lu\n", mode, inode->i_sb->s_id, diff --git a/fs/internal.h b/fs/internal.h index 757ba2abf21e..01dce1d1476b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -14,6 +14,7 @@ struct file_system_type; struct linux_binprm; struct path; struct mount; +struct shrink_control; /* * block_dev.c @@ -83,7 +84,7 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); -extern bool grab_super_passive(struct super_block *sb); +extern bool trylock_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, int, const char *, void *); extern struct super_block *user_get_super(dev_t); @@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f); * inode.c */ extern spinlock_t inode_sb_list_lock; -extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid); +extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); /* @@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool); */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); -extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, - int nid); +extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); /* * read_write.c @@ -145,5 +144,10 @@ extern const struct file_operations pipefifo_fops; /* * fs_pin.c */ -extern void sb_pin_kill(struct super_block *sb); +extern void group_pin_kill(struct hlist_head *p); extern void mnt_pin_kill(struct mount *m); + +/* + * fs/nsfs.c + */ +extern struct dentry_operations ns_dentry_operations; diff --git a/fs/ioctl.c b/fs/ioctl.c index 8ac3fad36192..5d01d2638ca5 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -379,6 +379,11 @@ int __generic_block_fiemap(struct inode *inode, past_eof = true; } cond_resched(); + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + } while (1); /* If ret is 1 then we just hit the end of the extent array */ @@ -443,7 +448,7 @@ int ioctl_preallocate(struct file *filp, void __user *argp) return -EINVAL; } - return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); + return vfs_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } static int file_ioctl(struct file *filp, unsigned int cmd, @@ -518,10 +523,12 @@ static int ioctl_fsfreeze(struct file *filp) return -EPERM; /* If filesystem doesn't support freeze feature, return. */ - if (sb->s_op->freeze_fs == NULL) + if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL) return -EOPNOTSUPP; /* Freeze */ + if (sb->s_op->freeze_super) + return sb->s_op->freeze_super(sb); return freeze_super(sb); } @@ -533,6 +540,8 @@ static int ioctl_fsthaw(struct file *filp) return -EPERM; /* Thaw */ + if (sb->s_op->thaw_super) + return sb->s_op->thaw_super(sb); return thaw_super(sb); } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index fe839b915116..d67a16f2a45d 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -174,27 +174,6 @@ struct iso9660_options{ * Compute the hash for the isofs name corresponding to the dentry. */ static int -isofs_hash_common(struct qstr *qstr, int ms) -{ - const char *name; - int len; - - len = qstr->len; - name = qstr->name; - if (ms) { - while (len && name[len-1] == '.') - len--; - } - - qstr->hash = full_name_hash(name, len); - - return 0; -} - -/* - * Compute the hash for the isofs name corresponding to the dentry. - */ -static int isofs_hashi_common(struct qstr *qstr, int ms) { const char *name; @@ -263,6 +242,27 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, } #ifdef CONFIG_JOLIET +/* + * Compute the hash for the isofs name corresponding to the dentry. + */ +static int +isofs_hash_common(struct qstr *qstr, int ms) +{ + const char *name; + int len; + + len = qstr->len; + name = qstr->name; + if (ms) { + while (len && name[len-1] == '.') + len--; + } + + qstr->hash = full_name_hash(name, len); + + return 0; +} + static int isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) { diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index f488bbae541a..735d7522a3a9 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -30,6 +30,7 @@ struct rock_state { int cont_size; int cont_extent; int cont_offset; + int cont_loops; struct inode *inode; }; @@ -73,6 +74,9 @@ static void init_rock_state(struct rock_state *rs, struct inode *inode) rs->inode = inode; } +/* Maximum number of Rock Ridge continuation entries */ +#define RR_MAX_CE_ENTRIES 32 + /* * Returns 0 if the caller should continue scanning, 1 if the scan must end * and -ve on error. @@ -105,6 +109,8 @@ static int rock_continue(struct rock_state *rs) goto out; } ret = -EIO; + if (++rs->cont_loops >= RR_MAX_CE_ENTRIES) + goto out; bh = sb_bread(rs->inode->i_sb, rs->cont_extent); if (bh) { memcpy(rs->buffer, bh->b_data + rs->cont_offset, @@ -356,6 +362,9 @@ repeat: rs.cont_size = isonum_733(rr->u.CE.size); break; case SIG('E', 'R'): + /* Invalid length of ER tag id? */ + if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len) + goto out; ISOFS_SB(inode->i_sb)->s_rock = 1; printk(KERN_DEBUG "ISO 9660 Extensions: "); { diff --git a/fs/isofs/util.c b/fs/isofs/util.c index 01e1ee7a998b..005a15cfd30a 100644 --- a/fs/isofs/util.c +++ b/fs/isofs/util.c @@ -2,6 +2,7 @@ * linux/fs/isofs/util.c */ +#include <linux/time.h> #include "isofs.h" /* @@ -17,9 +18,9 @@ int iso_date(char * p, int flag) { int year, month, day, hour, minute, second, tz; - int crtime, days, i; + int crtime; - year = p[0] - 70; + year = p[0]; month = p[1]; day = p[2]; hour = p[3]; @@ -31,18 +32,7 @@ int iso_date(char * p, int flag) if (year < 0) { crtime = 0; } else { - int monlen[12] = {31,28,31,30,31,30,31,31,30,31,30,31}; - - days = year * 365; - if (year > 2) - days += (year+1) / 4; - for (i = 1; i < month; i++) - days += monlen[i-1]; - if (((year+2) % 4) == 0 && month > 2) - days++; - days += day - 1; - crtime = ((((days * 24) + hour) * 60 + minute) * 60) - + second; + crtime = mktime64(year+1900, month, day, hour, minute, second); /* sign extend */ if (tz & 0x80) diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index aab8549591e7..c46a79adb6ad 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1373,8 +1373,7 @@ int journal_destroy(journal_t *journal) } mutex_unlock(&journal->j_checkpoint_mutex); - if (journal->j_inode) - iput(journal->j_inode); + iput(journal->j_inode); if (journal->j_revoke) journal_destroy_revoke(journal); kfree(journal->j_wbuf); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e4dc74713a43..b96bd8076b70 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1714,8 +1714,7 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); - if (journal->j_inode) - iput(journal->j_inode); + iput(journal->j_inode); if (journal->j_revoke) jbd2_journal_destroy_revoke(journal); if (journal->j_chksum_driver) @@ -1853,13 +1852,12 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, journal->j_chksum_driver = NULL; return 0; } - } - /* Precompute checksum seed for all metadata */ - if (jbd2_journal_has_csum_v2or3(journal)) + /* Precompute checksum seed for all metadata */ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); + } } /* If enabling v1 checksums, downgrade superblock */ diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index bcbef08a4d8f..b5128c6e63ad 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -524,6 +524,9 @@ static int do_one_pass(journal_t *journal, if (descr_csum_size > 0 && !jbd2_descr_block_csum_verify(journal, bh->b_data)) { + printk(KERN_ERR "JBD2: Invalid checksum " + "recovering block %lu in log\n", + next_log_block); err = -EIO; brelse(bh); goto failed; diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c index 92e0644bf867..556de100ebd5 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c @@ -84,11 +84,6 @@ static inline int pullbit(struct pushpull *pp) return bit; } -static inline int pulledbits(struct pushpull *pp) -{ - return pp->ofs; -} - static void init_rubin(struct rubin_state *rs, int div, int *bits) { diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 938556025d64..f21b6fb5e4c4 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -252,7 +252,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de if (!f->inocache) return -EIO; - if (S_ISDIR(old_dentry->d_inode->i_mode)) + if (d_is_dir(old_dentry)) return -EPERM; /* XXX: This is ugly */ @@ -772,7 +772,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, */ if (new_dentry->d_inode) { victim_f = JFFS2_INODE_INFO(new_dentry->d_inode); - if (S_ISDIR(new_dentry->d_inode->i_mode)) { + if (d_is_dir(new_dentry)) { struct jffs2_full_dirent *fd; mutex_lock(&victim_f->sem); @@ -807,7 +807,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, if (victim_f) { /* There was a victim. Kill it off nicely */ - if (S_ISDIR(new_dentry->d_inode->i_mode)) + if (d_is_dir(new_dentry)) clear_nlink(new_dentry->d_inode); else drop_nlink(new_dentry->d_inode); @@ -815,7 +815,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, inode which didn't exist. */ if (victim_f->inocache) { mutex_lock(&victim_f->sem); - if (S_ISDIR(new_dentry->d_inode->i_mode)) + if (d_is_dir(new_dentry)) victim_f->inocache->pino_nlink = 0; else victim_f->inocache->pino_nlink--; @@ -825,7 +825,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, /* If it was a directory we moved, and there was no victim, increase i_nlink on its new parent */ - if (S_ISDIR(old_dentry->d_inode->i_mode) && !victim_f) + if (d_is_dir(old_dentry) && !victim_f) inc_nlink(new_dir_i); /* Unlink the original */ @@ -839,7 +839,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); mutex_lock(&f->sem); inc_nlink(old_dentry->d_inode); - if (f->inocache && !S_ISDIR(old_dentry->d_inode->i_mode)) + if (f->inocache && !d_is_dir(old_dentry)) f->inocache->pino_nlink++; mutex_unlock(&f->sem); @@ -852,7 +852,7 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, return ret; } - if (S_ISDIR(old_dentry->d_inode->i_mode)) + if (d_is_dir(old_dentry)) drop_nlink(old_dir_i); new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now); diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 386303dca382..dddbde4f56f4 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -224,7 +224,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw)); - /* If a node has zero dsize, we only have to keep if it if it might be the + /* If a node has zero dsize, we only have to keep it if it might be the node with highest version -- i.e. the one which will end up as f->metadata. Note that such nodes won't be REF_UNCHECKED since there are no data to check anyway. */ diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 7654e87b0428..9ad5ba4b299b 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -510,6 +510,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo sumlen = c->sector_size - je32_to_cpu(sm->offset); sumptr = buf + buf_size - sumlen; + /* sm->offset maybe wrong but MAGIC maybe right */ + if (sumlen > c->sector_size) + goto full_scan; + /* Now, make sure the summary itself is available */ if (sumlen > buf_size) { /* Need to kmalloc for this. */ @@ -544,6 +548,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo } } +full_scan: buf_ofs = jeb->offset; if (!buf_size) { diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index c522d098bb4f..bc5385471a6e 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -844,6 +844,7 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock /* Write out summary information - called from jffs2_do_reserve_space */ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c) + __must_hold(&c->erase_completion_block) { int datasize, infosize, padsize; struct jffs2_eraseblock *jeb; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 0918f0e2e266..3d76f28a2ba9 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -138,7 +138,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child) struct jffs2_inode_info *f; uint32_t pino; - BUG_ON(!S_ISDIR(child->d_inode->i_mode)); + BUG_ON(!d_is_dir(child)); f = JFFS2_INODE_INFO(child->d_inode); diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h deleted file mode 100644 index fa92f7f1d0d0..000000000000 --- a/fs/jfs/endian24.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) International Business Machines Corp., 2001 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#ifndef _H_ENDIAN24 -#define _H_ENDIAN24 - -/* - * endian24.h: - * - * Endian conversion for 24-byte data - * - */ -#define __swab24(x) \ -({ \ - __u32 __x = (x); \ - ((__u32)( \ - ((__x & (__u32)0x000000ffUL) << 16) | \ - (__x & (__u32)0x0000ff00UL) | \ - ((__x & (__u32)0x00ff0000UL) >> 16) )); \ -}) - -#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)) - #define __cpu_to_le24(x) ((__u32)(x)) - #define __le24_to_cpu(x) ((__u32)(x)) -#else - #define __cpu_to_le24(x) __swab24(x) - #define __le24_to_cpu(x) __swab24(x) -#endif - -#ifdef __KERNEL__ - #define cpu_to_le24 __cpu_to_le24 - #define le24_to_cpu __le24_to_cpu -#endif - -#endif /* !_H_ENDIAN24 */ diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 33aa0cc1f8b8..10815f8dfd8b 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) return rc; mutex_lock(&inode->i_mutex); - if (!(inode->i_state & I_DIRTY) || + if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 984c2bbf4f61..d88576e23fe4 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -1040,8 +1040,8 @@ static int dtSplitUp(tid_t tid, pxdlist.maxnpxd = 1; pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; - PXDaddress(pxd, nxaddr) - PXDlength(pxd, xlen + n); + PXDaddress(pxd, nxaddr); + PXDlength(pxd, xlen + n); split->pxdlist = &pxdlist; if ((rc = dtExtendPage(tid, ip, split, btstack))) { nxaddr = addressPXD(pxd); diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index cf47f09e8ac8..fa7e795bd8ae 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -94,6 +94,9 @@ struct jfs_inode_info { unchar _inline_ea[128]; /* 128: inline extended attr */ } link; } u; +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif u32 dev; /* will die when we get wide dev_t */ struct inode vfs_inode; }; diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h index 43ea3713c083..8f602dcb51fa 100644 --- a/fs/jfs/jfs_types.h +++ b/fs/jfs/jfs_types.h @@ -30,8 +30,6 @@ #include <linux/types.h> #include <linux/nls.h> -#include "endian24.h" - /* * transaction and lock id's * @@ -59,26 +57,42 @@ struct timestruc_t { /* * physical xd (pxd) + * + * The leftmost 24 bits of len_addr are the extent length. + * The rightmost 8 bits of len_addr are the most signficant bits of + * the extent address */ typedef struct { - unsigned len:24; - unsigned addr1:8; + __le32 len_addr; __le32 addr2; } pxd_t; /* xd_t field construction */ -#define PXDlength(pxd, length32) ((pxd)->len = __cpu_to_le24(length32)) -#define PXDaddress(pxd, address64)\ -{\ - (pxd)->addr1 = ((s64)address64) >> 32;\ - (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +static inline void PXDlength(pxd_t *pxd, __u32 len) +{ + pxd->len_addr = (pxd->len_addr & cpu_to_le32(~0xffffff)) | + cpu_to_le32(len & 0xffffff); +} + +static inline void PXDaddress(pxd_t *pxd, __u64 addr) +{ + pxd->len_addr = (pxd->len_addr & cpu_to_le32(0xffffff)) | + cpu_to_le32((addr >> 32)<<24); + pxd->addr2 = cpu_to_le32(addr & 0xffffffff); } /* xd_t field extraction */ -#define lengthPXD(pxd) __le24_to_cpu((pxd)->len) -#define addressPXD(pxd)\ - ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2)) +static inline __u32 lengthPXD(pxd_t *pxd) +{ + return le32_to_cpu((pxd)->len_addr) & 0xffffff; +} + +static inline __u64 addressPXD(pxd_t *pxd) +{ + __u64 n = le32_to_cpu(pxd->len_addr) & ~0xffffff; + return (n << 8) + le32_to_cpu(pxd->addr2); +} #define MAXTREEHEIGHT 8 /* pxd list */ @@ -93,12 +107,10 @@ struct pxdlist { * data extent descriptor (dxd) */ typedef struct { - unsigned flag:8; /* 1: flags */ - unsigned rsrvd:24; + __u8 flag; /* 1: flags */ + __u8 rsrvd[3]; __le32 size; /* 4: size in byte */ - unsigned len:24; /* 3: length in unit of fsblksize */ - unsigned addr1:8; /* 1: address in unit of fsblksize */ - __le32 addr2; /* 4: address in unit of fsblksize */ + pxd_t loc; /* 8: address and length in unit of fsblksize */ } dxd_t; /* - 16 - */ /* dxd_t flags */ @@ -109,12 +121,11 @@ typedef struct { #define DXD_CORRUPT 0x08 /* Inconsistency detected */ /* dxd_t field construction - * Conveniently, the PXD macros work for DXD */ -#define DXDlength PXDlength -#define DXDaddress PXDaddress -#define lengthDXD lengthPXD -#define addressDXD addressPXD +#define DXDlength(dxd, len) PXDlength(&(dxd)->loc, len) +#define DXDaddress(dxd, addr) PXDaddress(&(dxd)->loc, addr) +#define lengthDXD(dxd) lengthPXD(&(dxd)->loc) +#define addressDXD(dxd) addressPXD(&(dxd)->loc) #define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32)) #define sizeDXD(dxd) le32_to_cpu((dxd)->size) diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h index 08c0c749b986..1e0987986d5f 100644 --- a/fs/jfs/jfs_xtree.h +++ b/fs/jfs/jfs_xtree.h @@ -29,13 +29,11 @@ * extent allocation descriptor (xad) */ typedef struct xad { - unsigned flag:8; /* 1: flag */ - unsigned rsvrd:16; /* 2: reserved */ - unsigned off1:8; /* 1: offset in unit of fsblksize */ - __le32 off2; /* 4: offset in unit of fsblksize */ - unsigned len:24; /* 3: length in unit of fsblksize */ - unsigned addr1:8; /* 1: address in unit of fsblksize */ - __le32 addr2; /* 4: address in unit of fsblksize */ + __u8 flag; /* 1: flag */ + __u8 rsvrd[2]; /* 2: reserved */ + __u8 off1; /* 1: offset in unit of fsblksize */ + __le32 off2; /* 4: offset in unit of fsblksize */ + pxd_t loc; /* 8: length and address in unit of fsblksize */ } xad_t; /* (16) */ #define MAXXLEN ((1 << 24) - 1) @@ -49,19 +47,14 @@ typedef struct xad { (xad)->off1 = ((u64)offset64) >> 32;\ (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ } -#define XADaddress(xad, address64)\ -{\ - (xad)->addr1 = ((u64)address64) >> 32;\ - (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ -} -#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) +#define XADaddress(xad, address64) PXDaddress(&(xad)->loc, address64) +#define XADlength(xad, length32) PXDlength(&(xad)->loc, length32) /* xad_t field extraction */ #define offsetXAD(xad)\ ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) -#define addressXAD(xad)\ - ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) -#define lengthXAD(xad) __le24_to_cpu((xad)->len) +#define addressXAD(xad) addressPXD(&(xad)->loc) +#define lengthXAD(xad) lengthPXD(&(xad)->loc) /* xad list */ struct xadlist { diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index d59c7defb1ef..38fdc533f4ec 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -84,7 +84,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, struct inode *iplist[2]; struct tblock *tblk; - jfs_info("jfs_create: dip:0x%p name:%s", dip, dentry->d_name.name); + jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry); dquot_initialize(dip); @@ -216,7 +216,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) struct inode *iplist[2]; struct tblock *tblk; - jfs_info("jfs_mkdir: dip:0x%p name:%s", dip, dentry->d_name.name); + jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry); dquot_initialize(dip); @@ -352,7 +352,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) struct inode *iplist[2]; struct tblock *tblk; - jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); + jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry); /* Init inode for quota operations. */ dquot_initialize(dip); @@ -480,7 +480,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) s64 new_size = 0; int commit_flag; - jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name); + jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry); /* Init inode for quota operations. */ dquot_initialize(dip); @@ -797,8 +797,7 @@ static int jfs_link(struct dentry *old_dentry, struct btstack btstack; struct inode *iplist[2]; - jfs_info("jfs_link: %s %s", old_dentry->d_name.name, - dentry->d_name.name); + jfs_info("jfs_link: %pd %pd", old_dentry, dentry); dquot_initialize(dir); @@ -1082,8 +1081,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, int commit_flag; - jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, - new_dentry->d_name.name); + jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry); dquot_initialize(old_dir); dquot_initialize(new_dir); @@ -1355,7 +1353,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - jfs_info("jfs_mknod: %s", dentry->d_name.name); + jfs_info("jfs_mknod: %pd", dentry); dquot_initialize(dir); @@ -1444,7 +1442,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, unsig struct component_name key; int rc; - jfs_info("jfs_lookup: name = %s", dentry->d_name.name); + jfs_info("jfs_lookup: name = %pd", dentry); if ((rc = get_UCSname(&key, dentry))) return ERR_PTR(rc); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 93e897e588a8..5d30c56ae075 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -117,6 +117,9 @@ static struct inode *jfs_alloc_inode(struct super_block *sb) jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS); if (!jfs_inode) return NULL; +#ifdef CONFIG_QUOTA + memset(&jfs_inode->i_dquot, 0, sizeof(jfs_inode->i_dquot)); +#endif return &jfs_inode->vfs_inode; } @@ -537,6 +540,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; sb->s_qcop = &dquot_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif /* @@ -615,8 +619,7 @@ out_mount_failed: iput(sbi->direct_inode); sbi->direct_inode = NULL; out_unload: - if (sbi->nls_tab) - unload_nls(sbi->nls_tab); + unload_nls(sbi->nls_tab); out_kfree: kfree(sbi); return ret; @@ -836,6 +839,10 @@ out: return len - towrite; } +static struct dquot **jfs_get_dquots(struct inode *inode) +{ + return JFS_IP(inode)->i_dquot; +} #endif static const struct super_operations jfs_super_operations = { @@ -854,6 +861,7 @@ static const struct super_operations jfs_super_operations = { #ifdef CONFIG_QUOTA .quota_read = jfs_quota_read, .quota_write = jfs_quota_write, + .get_dquots = jfs_get_dquots, #endif }; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 1c771931bb60..6acc9648f986 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -201,10 +201,14 @@ static unsigned int kernfs_name_hash(const char *name, const void *ns) static int kernfs_name_compare(unsigned int hash, const char *name, const void *ns, const struct kernfs_node *kn) { - if (hash != kn->hash) - return hash - kn->hash; - if (ns != kn->ns) - return ns - kn->ns; + if (hash < kn->hash) + return -1; + if (hash > kn->hash) + return 1; + if (ns < kn->ns) + return -1; + if (ns > kn->ns) + return 1; return strcmp(name, kn->name); } @@ -407,8 +411,9 @@ void kernfs_put(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - if (!(kn->flags & KERNFS_STATIC_NAME)) - kfree(kn->name); + + kfree_const(kn->name); + if (kn->iattr) { if (kn->iattr->ia_secdata) security_release_secctx(kn->iattr->ia_secdata, @@ -502,15 +507,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, const char *name, umode_t mode, unsigned flags) { - char *dup_name = NULL; struct kernfs_node *kn; int ret; - if (!(flags & KERNFS_STATIC_NAME)) { - name = dup_name = kstrdup(name, GFP_KERNEL); - if (!name) - return NULL; - } + name = kstrdup_const(name, GFP_KERNEL); + if (!name) + return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) @@ -534,7 +536,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: - kfree(dup_name); + kfree_const(name); return NULL; } @@ -807,7 +809,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, } /* instantiate and hash dentry */ - ret = d_materialise_unique(dentry, inode); + ret = d_splice_alias(inode, dentry); out_unlock: mutex_unlock(&kernfs_mutex); return ret; @@ -1260,7 +1262,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, /* rename kernfs_node */ if (strcmp(kn->name, new_name) != 0) { error = -ENOMEM; - new_name = kstrdup(new_name, GFP_KERNEL); + new_name = kstrdup_const(new_name, GFP_KERNEL); if (!new_name) goto out; } else { @@ -1281,9 +1283,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kn->ns = new_ns; if (new_name) { - if (!(kn->flags & KERNFS_STATIC_NAME)) - old_name = kn->name; - kn->flags &= ~KERNFS_STATIC_NAME; + old_name = kn->name; kn->name = new_name; } @@ -1293,7 +1293,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, kernfs_link_sibling(kn); kernfs_put(old_parent); - kfree(old_name); + kfree_const(old_name); error = 0; out: diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 4429d6d9217f..2bacb9988566 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -106,7 +106,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) const struct kernfs_ops *ops; /* - * @of->mutex nests outside active ref and is just to ensure that + * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); @@ -189,13 +189,16 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, const struct kernfs_ops *ops; char *buf; - buf = kmalloc(len, GFP_KERNEL); + buf = of->prealloc_buf; + if (!buf) + buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* - * @of->mutex nests outside active ref and is just to ensure that - * the ops aren't called concurrently for the same open file. + * @of->mutex nests outside active ref and is used both to ensure that + * the ops aren't called concurrently for the same open file, and + * to provide exclusive access to ->prealloc_buf (when that exists). */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -204,27 +207,29 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, goto out_free; } + of->event = atomic_read(&of->kn->attr.open->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, *ppos); else len = -EINVAL; - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); - if (len < 0) - goto out_free; + goto out_unlock; if (copy_to_user(user_buf, buf, len)) { len = -EFAULT; - goto out_free; + goto out_unlock; } *ppos += len; + out_unlock: + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); out_free: - kfree(buf); + if (buf != of->prealloc_buf) + kfree(buf); return len; } @@ -278,19 +283,16 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, len = min_t(size_t, count, PAGE_SIZE); } - buf = kmalloc(len + 1, GFP_KERNEL); + buf = of->prealloc_buf; + if (!buf) + buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_free; - } - buf[len] = '\0'; /* guarantee string termination */ - /* - * @of->mutex nests outside active ref and is just to ensure that - * the ops aren't called concurrently for the same open file. + * @of->mutex nests outside active ref and is used both to ensure that + * the ops aren't called concurrently for the same open file, and + * to provide exclusive access to ->prealloc_buf (when that exists). */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { @@ -299,19 +301,27 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, goto out_free; } + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_unlock; + } + buf[len] = '\0'; /* guarantee string termination */ + ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, *ppos); else len = -EINVAL; - kernfs_put_active(of->kn); - mutex_unlock(&of->mutex); - if (len > 0) *ppos += len; + +out_unlock: + kernfs_put_active(of->kn); + mutex_unlock(&of->mutex); out_free: - kfree(buf); + if (buf != of->prealloc_buf) + kfree(buf); return len; } @@ -439,27 +449,6 @@ static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, return pol; } -static int kernfs_vma_migrate(struct vm_area_struct *vma, - const nodemask_t *from, const nodemask_t *to, - unsigned long flags) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!kernfs_get_active(of->kn)) - return 0; - - ret = 0; - if (of->vm_ops->migrate) - ret = of->vm_ops->migrate(vma, from, to, flags); - - kernfs_put_active(of->kn); - return ret; -} #endif static const struct vm_operations_struct kernfs_vm_ops = { @@ -470,7 +459,6 @@ static const struct vm_operations_struct kernfs_vm_ops = { #ifdef CONFIG_NUMA .set_policy = kernfs_vma_set_policy, .get_policy = kernfs_vma_get_policy, - .migrate = kernfs_vma_migrate, #endif }; @@ -685,6 +673,22 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) */ of->atomic_write_len = ops->atomic_write_len; + error = -EINVAL; + /* + * ->seq_show is incompatible with ->prealloc, + * as seq_read does its own allocation. + * ->read must be used instead. + */ + if (ops->prealloc && ops->seq_show) + goto err_free; + if (ops->prealloc) { + int len = of->atomic_write_len ?: PAGE_SIZE; + of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); + error = -ENOMEM; + if (!of->prealloc_buf) + goto err_free; + } + /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access @@ -715,6 +719,7 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) err_close: seq_release(inode, file); err_free: + kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); @@ -728,6 +733,7 @@ static int kernfs_fop_release(struct inode *inode, struct file *filp) kernfs_put_open_node(kn, of); seq_release(inode, filp); + kfree(of->prealloc_buf); kfree(of); return 0; @@ -896,7 +902,6 @@ const struct file_operations kernfs_file_fops = { * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file - * @name_is_static: don't copy file name * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Returns the created node on success, ERR_PTR() value on error. @@ -906,7 +911,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, - bool name_is_static, struct lock_class_key *key) { struct kernfs_node *kn; @@ -914,8 +918,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, int rc; flags = KERNFS_FILE; - if (name_is_static) - flags |= KERNFS_STATIC_NAME; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); if (!kn) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 985217626e66..9000874a945b 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -24,12 +24,6 @@ static const struct address_space_operations kernfs_aops = { .write_end = simple_write_end, }; -static struct backing_dev_info kernfs_bdi = { - .name = "kernfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - static const struct inode_operations kernfs_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, @@ -40,12 +34,6 @@ static const struct inode_operations kernfs_iops = { .listxattr = kernfs_iop_listxattr, }; -void __init kernfs_inode_init(void) -{ - if (bdi_init(&kernfs_bdi)) - panic("failed to init kernfs_bdi"); -} - static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { static DEFINE_MUTEX(iattr_mutex); @@ -298,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) kernfs_get(kn); inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; - inode->i_mapping->backing_dev_info = &kernfs_bdi; inode->i_op = &kernfs_iops; set_default_inode_attr(inode, kn->mode); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index dc84a3ef9ca2..af9fa7499919 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -88,7 +88,6 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name); ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, size_t size); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); -void kernfs_inode_init(void); /* * dir.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f973ae9b05f1..8eaf417187f1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -246,5 +246,4 @@ void __init kernfs_init(void) kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), 0, SLAB_PANIC, NULL); - kernfs_inode_init(); } diff --git a/fs/libfs.c b/fs/libfs.c index 171d2846f2a3..0ab65122ee45 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -114,18 +114,18 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_lock(&dentry->d_lock); /* d_lock not required for cursor */ - list_del(&cursor->d_u.d_child); + list_del(&cursor->d_child); p = dentry->d_subdirs.next; while (n && p != &dentry->d_subdirs) { struct dentry *next; - next = list_entry(p, struct dentry, d_u.d_child); + next = list_entry(p, struct dentry, d_child); spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(next)) n--; spin_unlock(&next->d_lock); p = p->next; } - list_add_tail(&cursor->d_u.d_child, p); + list_add_tail(&cursor->d_child, p); spin_unlock(&dentry->d_lock); } } @@ -150,7 +150,7 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *p, *q = &cursor->d_u.d_child; + struct list_head *p, *q = &cursor->d_child; if (!dir_emit_dots(file, ctx)) return 0; @@ -159,7 +159,7 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) list_move(q, &dentry->d_subdirs); for (p = q->next; p != &dentry->d_subdirs; p = p->next) { - struct dentry *next = list_entry(p, struct dentry, d_u.d_child); + struct dentry *next = list_entry(p, struct dentry, d_child); spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); if (!simple_positive(next)) { spin_unlock(&next->d_lock); @@ -287,7 +287,7 @@ int simple_empty(struct dentry *dentry) int ret = 0; spin_lock(&dentry->d_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) { + list_for_each_entry(child, &dentry->d_subdirs, d_child) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { spin_unlock(&child->d_lock); @@ -329,7 +329,7 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *inode = old_dentry->d_inode; - int they_are_dirs = S_ISDIR(old_dentry->d_inode->i_mode); + int they_are_dirs = d_is_dir(old_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; @@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, mutex_lock(&inode->i_mutex); ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 9106f42c472c..47a32b6d9b90 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -65,7 +65,7 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) return (struct sockaddr *)&nsm->sm_addr; } -static struct rpc_clnt *nsm_create(struct net *net) +static struct rpc_clnt *nsm_create(struct net *net, const char *nodename) { struct sockaddr_in sin = { .sin_family = AF_INET, @@ -77,6 +77,7 @@ static struct rpc_clnt *nsm_create(struct net *net) .address = (struct sockaddr *)&sin, .addrsize = sizeof(sin), .servername = "rpc.statd", + .nodename = nodename, .program = &nsm_program, .version = NSM_VERSION, .authflavor = RPC_AUTH_NULL, @@ -102,7 +103,7 @@ out: return clnt; } -static struct rpc_clnt *nsm_client_get(struct net *net) +static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename) { struct rpc_clnt *clnt, *new; struct lockd_net *ln = net_generic(net, lockd_net_id); @@ -111,7 +112,7 @@ static struct rpc_clnt *nsm_client_get(struct net *net) if (clnt != NULL) goto out; - clnt = new = nsm_create(net); + clnt = new = nsm_create(net, nodename); if (IS_ERR(clnt)) goto out; @@ -190,19 +191,23 @@ int nsm_monitor(const struct nlm_host *host) struct nsm_res res; int status; struct rpc_clnt *clnt; + const char *nodename = NULL; dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); if (nsm->sm_monitored) return 0; + if (host->h_rpcclnt) + nodename = host->h_rpcclnt->cl_nodename; + /* * Choose whether to record the caller_name or IP address of * this peer in the local rpc.statd's database. */ nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; - clnt = nsm_client_get(host->net); + clnt = nsm_client_get(host->net, nodename); if (IS_ERR(clnt)) { status = PTR_ERR(clnt); dprintk("lockd: failed to create NSM upcall transport, " @@ -214,7 +219,7 @@ int nsm_monitor(const struct nlm_host *host) if (unlikely(res.status != 0)) status = -EIO; if (unlikely(status < 0)) { - printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name); + pr_notice_ratelimited("lockd: cannot monitor %s\n", nsm->sm_name); return status; } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index d1bb7ecfd201..55505cbe11af 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -138,10 +138,6 @@ lockd(void *vrqstp) dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); - if (!nlm_timeout) - nlm_timeout = LOCKD_DFLT_TIMEO; - nlmsvc_timeout = nlm_timeout * HZ; - /* * The main request loop. We don't terminate until the last * NFS mount or NFS daemon has gone away. @@ -350,7 +346,11 @@ static struct svc_serv *lockd_create_svc(void) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + if (!nlm_timeout) + nlm_timeout = LOCKD_DFLT_TIMEO; + nlmsvc_timeout = nlm_timeout * HZ; + + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return ERR_PTR(-ENOMEM); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 13db95f54176..5581e020644b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -53,12 +53,12 @@ static const struct rpc_call_ops nlmsvc_grant_ops; static LIST_HEAD(nlm_blocked); static DEFINE_SPINLOCK(nlm_blocked_lock); -#ifdef LOCKD_DEBUG +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) { /* - * We can get away with a static buffer because we're only - * called with BKL held. + * We can get away with a static buffer because this is only called + * from lockd, which is single-threaded. */ static char buf[2*NLM_MAXCOOKIELEN+1]; unsigned int i, len = sizeof(buf); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index b6f3b84b6e99..665ef5a05183 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -164,12 +164,15 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; + struct file_lock_context *flctx = inode->i_flctx; struct nlm_host *lockhost; + if (!flctx || list_empty_careful(&flctx->flc_posix)) + return 0; again: file->f_locks = 0; - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl; fl = fl->fl_next) { + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; @@ -180,7 +183,7 @@ again: if (match(lockhost, host)) { struct file_lock lock = *fl; - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; @@ -192,7 +195,7 @@ again: goto again; } } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); return 0; } @@ -223,18 +226,21 @@ nlm_file_inuse(struct nlm_file *file) { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; + struct file_lock_context *flctx = inode->i_flctx; if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl; fl = fl->fl_next) { - if (fl->fl_lmops == &nlmsvc_lock_operations) { - spin_unlock(&inode->i_lock); - return 1; + if (flctx && !list_empty_careful(&flctx->flc_posix)) { + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_lmops == &nlmsvc_lock_operations) { + spin_unlock(&flctx->flc_lock); + return 1; + } } + spin_unlock(&flctx->flc_lock); } - spin_unlock(&inode->i_lock); file->f_locks = 0; return 0; } @@ -408,7 +414,7 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file) { struct super_block *sb = datap; - return sb == file->f_file->f_path.dentry->d_sb; + return sb == file_inode(file->f_file)->i_sb; } /** diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 9340e7e10ef6..5b651daad518 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f) return p + XDR_QUADLEN(NFS2_FHSIZE); } -static inline __be32 * -nlm_encode_fh(__be32 *p, struct nfs_fh *f) -{ - *p++ = htonl(NFS2_FHSIZE); - memcpy(p, f->data, NFS2_FHSIZE); - return p + XDR_QUADLEN(NFS2_FHSIZE); -} - /* * Encode and decode owner handle */ diff --git a/fs/locks.c b/fs/locks.c index 735b8d3fa78c..40bc384728c0 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -137,7 +137,7 @@ #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) -#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) +#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) static bool lease_breaking(struct file_lock *fl) @@ -157,14 +157,11 @@ static int target_leasetype(struct file_lock *fl) int leases_enable = 1; int lease_break_time = 45; -#define for_each_lock(inode, lockp) \ - for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) - /* * The global file_lock_list is only used for displaying /proc/locks, so we * keep a list on each CPU, with each list protected by its own spinlock via * the file_lock_lglock. Note that alterations to the list also require that - * the relevant i_lock is held. + * the relevant flc_lock is held. */ DEFINE_STATIC_LGLOCK(file_lock_lglock); static DEFINE_PER_CPU(struct hlist_head, file_lock_list); @@ -192,21 +189,68 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); * contrast to those that are acting as records of acquired locks). * * Note that when we acquire this lock in order to change the above fields, - * we often hold the i_lock as well. In certain cases, when reading the fields + * we often hold the flc_lock as well. In certain cases, when reading the fields * protected by this lock, we can skip acquiring it iff we already hold the - * i_lock. + * flc_lock. * * In particular, adding an entry to the fl_block list requires that you hold - * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting - * an entry from the list however only requires the file_lock_lock. + * both the flc_lock and the blocked_lock_lock (acquired in that order). + * Deleting an entry from the list however only requires the file_lock_lock. */ static DEFINE_SPINLOCK(blocked_lock_lock); +static struct kmem_cache *flctx_cache __read_mostly; static struct kmem_cache *filelock_cache __read_mostly; +static struct file_lock_context * +locks_get_lock_context(struct inode *inode) +{ + struct file_lock_context *new; + + if (likely(inode->i_flctx)) + goto out; + + new = kmem_cache_alloc(flctx_cache, GFP_KERNEL); + if (!new) + goto out; + + spin_lock_init(&new->flc_lock); + INIT_LIST_HEAD(&new->flc_flock); + INIT_LIST_HEAD(&new->flc_posix); + INIT_LIST_HEAD(&new->flc_lease); + + /* + * Assign the pointer if it's not already assigned. If it is, then + * free the context we just allocated. + */ + spin_lock(&inode->i_lock); + if (likely(!inode->i_flctx)) { + inode->i_flctx = new; + new = NULL; + } + spin_unlock(&inode->i_lock); + + if (new) + kmem_cache_free(flctx_cache, new); +out: + return inode->i_flctx; +} + +void +locks_free_lock_context(struct file_lock_context *ctx) +{ + if (ctx) { + WARN_ON_ONCE(!list_empty(&ctx->flc_flock)); + WARN_ON_ONCE(!list_empty(&ctx->flc_posix)); + WARN_ON_ONCE(!list_empty(&ctx->flc_lease)); + kmem_cache_free(flctx_cache, ctx); + } +} + static void locks_init_lock_heads(struct file_lock *fl) { INIT_HLIST_NODE(&fl->fl_link); + INIT_LIST_HEAD(&fl->fl_list); INIT_LIST_HEAD(&fl->fl_block); init_waitqueue_head(&fl->fl_wait); } @@ -243,6 +287,7 @@ EXPORT_SYMBOL_GPL(locks_release_private); void locks_free_lock(struct file_lock *fl) { BUG_ON(waitqueue_active(&fl->fl_wait)); + BUG_ON(!list_empty(&fl->fl_list)); BUG_ON(!list_empty(&fl->fl_block)); BUG_ON(!hlist_unhashed(&fl->fl_link)); @@ -257,8 +302,8 @@ locks_dispose_list(struct list_head *dispose) struct file_lock *fl; while (!list_empty(dispose)) { - fl = list_first_entry(dispose, struct file_lock, fl_block); - list_del_init(&fl->fl_block); + fl = list_first_entry(dispose, struct file_lock, fl_list); + list_del_init(&fl->fl_list); locks_free_lock(fl); } } @@ -513,7 +558,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) return fl1->fl_owner == fl2->fl_owner; } -/* Must be called with the i_lock held! */ +/* Must be called with the flc_lock held! */ static void locks_insert_global_locks(struct file_lock *fl) { lg_local_lock(&file_lock_lglock); @@ -522,12 +567,12 @@ static void locks_insert_global_locks(struct file_lock *fl) lg_local_unlock(&file_lock_lglock); } -/* Must be called with the i_lock held! */ +/* Must be called with the flc_lock held! */ static void locks_delete_global_locks(struct file_lock *fl) { /* * Avoid taking lock if already unhashed. This is safe since this check - * is done while holding the i_lock, and new insertions into the list + * is done while holding the flc_lock, and new insertions into the list * also require that it be held. */ if (hlist_unhashed(&fl->fl_link)) @@ -579,10 +624,10 @@ static void locks_delete_block(struct file_lock *waiter) * the order they blocked. The documentation doesn't require this but * it seems like the reasonable thing to do. * - * Must be called with both the i_lock and blocked_lock_lock held. The fl_block - * list itself is protected by the blocked_lock_lock, but by ensuring that the - * i_lock is also held on insertions we can avoid taking the blocked_lock_lock - * in some cases when we see that the fl_block list is empty. + * Must be called with both the flc_lock and blocked_lock_lock held. The + * fl_block list itself is protected by the blocked_lock_lock, but by ensuring + * that the flc_lock is also held on insertions we can avoid taking the + * blocked_lock_lock in some cases when we see that the fl_block list is empty. */ static void __locks_insert_block(struct file_lock *blocker, struct file_lock *waiter) @@ -594,7 +639,7 @@ static void __locks_insert_block(struct file_lock *blocker, locks_insert_global_blocked(waiter); } -/* Must be called with i_lock held. */ +/* Must be called with flc_lock held. */ static void locks_insert_block(struct file_lock *blocker, struct file_lock *waiter) { @@ -606,15 +651,15 @@ static void locks_insert_block(struct file_lock *blocker, /* * Wake up processes blocked waiting for blocker. * - * Must be called with the inode->i_lock held! + * Must be called with the inode->flc_lock held! */ static void locks_wake_up_blocks(struct file_lock *blocker) { /* * Avoid taking global lock if list is empty. This is safe since new - * blocked requests are only added to the list under the i_lock, and - * the i_lock is always held here. Note that removal from the fl_block - * list does not require the i_lock, so we must recheck list_empty() + * blocked requests are only added to the list under the flc_lock, and + * the flc_lock is always held here. Note that removal from the fl_block + * list does not require the flc_lock, so we must recheck list_empty() * after acquiring the blocked_lock_lock. */ if (list_empty(&blocker->fl_block)) @@ -635,63 +680,32 @@ static void locks_wake_up_blocks(struct file_lock *blocker) spin_unlock(&blocked_lock_lock); } -/* Insert file lock fl into an inode's lock list at the position indicated - * by pos. At the same time add the lock to the global file lock list. - * - * Must be called with the i_lock held! - */ -static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) +static void +locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) { fl->fl_nspid = get_pid(task_tgid(current)); - - /* insert into file's list */ - fl->fl_next = *pos; - *pos = fl; - + list_add_tail(&fl->fl_list, before); locks_insert_global_locks(fl); } -/** - * locks_delete_lock - Delete a lock and then free it. - * @thisfl_p: pointer that points to the fl_next field of the previous - * inode->i_flock list entry - * - * Unlink a lock from all lists and free the namespace reference, but don't - * free it yet. Wake up processes that are blocked waiting for this lock and - * notify the FS that the lock has been cleared. - * - * Must be called with the i_lock held! - */ -static void locks_unlink_lock(struct file_lock **thisfl_p) +static void +locks_unlink_lock_ctx(struct file_lock *fl) { - struct file_lock *fl = *thisfl_p; - locks_delete_global_locks(fl); - - *thisfl_p = fl->fl_next; - fl->fl_next = NULL; - + list_del_init(&fl->fl_list); if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; } - locks_wake_up_blocks(fl); } -/* - * Unlink a lock from all lists and free it. - * - * Must be called with i_lock held! - */ -static void locks_delete_lock(struct file_lock **thisfl_p, - struct list_head *dispose) +static void +locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) { - struct file_lock *fl = *thisfl_p; - - locks_unlink_lock(thisfl_p); + locks_unlink_lock_ctx(fl); if (dispose) - list_add(&fl->fl_block, dispose); + list_add(&fl->fl_list, dispose); else locks_free_lock(fl); } @@ -746,22 +760,27 @@ void posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; + struct file_lock_context *ctx; struct inode *inode = file_inode(filp); - spin_lock(&inode->i_lock); - for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { - if (!IS_POSIX(cfl)) - continue; - if (posix_locks_conflict(fl, cfl)) - break; - } - if (cfl) { - locks_copy_conflock(fl, cfl); - if (cfl->fl_nspid) - fl->fl_pid = pid_vnr(cfl->fl_nspid); - } else + ctx = inode->i_flctx; + if (!ctx || list_empty_careful(&ctx->flc_posix)) { fl->fl_type = F_UNLCK; - spin_unlock(&inode->i_lock); + return; + } + + spin_lock(&ctx->flc_lock); + list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { + if (posix_locks_conflict(fl, cfl)) { + locks_copy_conflock(fl, cfl); + if (cfl->fl_nspid) + fl->fl_pid = pid_vnr(cfl->fl_nspid); + goto out; + } + } + fl->fl_type = F_UNLCK; +out: + spin_unlock(&ctx->flc_lock); return; } EXPORT_SYMBOL(posix_test_lock); @@ -845,34 +864,34 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, static int flock_lock_file(struct file *filp, struct file_lock *request) { struct file_lock *new_fl = NULL; - struct file_lock **before; - struct inode * inode = file_inode(filp); + struct file_lock *fl; + struct file_lock_context *ctx; + struct inode *inode = file_inode(filp); int error = 0; - int found = 0; + bool found = false; LIST_HEAD(dispose); + ctx = locks_get_lock_context(inode); + if (!ctx) + return -ENOMEM; + if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { new_fl = locks_alloc_lock(); if (!new_fl) return -ENOMEM; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; - for_each_lock(inode, before) { - struct file_lock *fl = *before; - if (IS_POSIX(fl)) - break; - if (IS_LEASE(fl)) - continue; + list_for_each_entry(fl, &ctx->flc_flock, fl_list) { if (filp != fl->fl_file) continue; if (request->fl_type == fl->fl_type) goto out; - found = 1; - locks_delete_lock(before, &dispose); + found = true; + locks_delete_lock_ctx(fl, &dispose); break; } @@ -882,23 +901,8 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) goto out; } - /* - * If a higher-priority process was blocked on the old file lock, - * give it the opportunity to lock the file. - */ - if (found) { - spin_unlock(&inode->i_lock); - cond_resched(); - spin_lock(&inode->i_lock); - } - find_conflict: - for_each_lock(inode, before) { - struct file_lock *fl = *before; - if (IS_POSIX(fl)) - break; - if (IS_LEASE(fl)) - continue; + list_for_each_entry(fl, &ctx->flc_flock, fl_list) { if (!flock_locks_conflict(request, fl)) continue; error = -EAGAIN; @@ -911,12 +915,12 @@ find_conflict: if (request->fl_flags & FL_ACCESS) goto out; locks_copy_lock(new_fl, request); - locks_insert_lock(before, new_fl); + locks_insert_lock_ctx(new_fl, &ctx->flc_flock); new_fl = NULL; error = 0; out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); @@ -925,16 +929,20 @@ out: static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) { - struct file_lock *fl; + struct file_lock *fl, *tmp; struct file_lock *new_fl = NULL; struct file_lock *new_fl2 = NULL; struct file_lock *left = NULL; struct file_lock *right = NULL; - struct file_lock **before; + struct file_lock_context *ctx; int error; bool added = false; LIST_HEAD(dispose); + ctx = locks_get_lock_context(inode); + if (!ctx) + return -ENOMEM; + /* * We may need two file_lock structures for this operation, * so we get them in advance to avoid races. @@ -948,15 +956,14 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str new_fl2 = locks_alloc_lock(); } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If * there are any, either return error or put the request on the * blocker's list of waiters and the global blocked_hash. */ if (request->fl_type != F_UNLCK) { - for_each_lock(inode, before) { - fl = *before; + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (!IS_POSIX(fl)) continue; if (!posix_locks_conflict(request, fl)) @@ -986,29 +993,25 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (request->fl_flags & FL_ACCESS) goto out; - /* - * Find the first old lock with the same owner as the new lock. - */ - - before = &inode->i_flock; - - /* First skip locks owned by other processes. */ - while ((fl = *before) && (!IS_POSIX(fl) || - !posix_same_owner(request, fl))) { - before = &fl->fl_next; + /* Find the first old lock with the same owner as the new lock */ + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { + if (posix_same_owner(request, fl)) + break; } /* Process locks with this owner. */ - while ((fl = *before) && posix_same_owner(request, fl)) { - /* Detect adjacent or overlapping regions (if same lock type) - */ + list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) { + if (!posix_same_owner(request, fl)) + break; + + /* Detect adjacent or overlapping regions (if same lock type) */ if (request->fl_type == fl->fl_type) { /* In all comparisons of start vs end, use * "start - 1" rather than "end + 1". If end * is OFFSET_MAX, end + 1 will become negative. */ if (fl->fl_end < request->fl_start - 1) - goto next_lock; + continue; /* If the next lock in the list has entirely bigger * addresses than the new one, insert the lock here. */ @@ -1029,18 +1032,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str else request->fl_end = fl->fl_end; if (added) { - locks_delete_lock(before, &dispose); + locks_delete_lock_ctx(fl, &dispose); continue; } request = fl; added = true; - } - else { + } else { /* Processing for different lock types is a bit * more complex. */ if (fl->fl_end < request->fl_start) - goto next_lock; + continue; if (fl->fl_start > request->fl_end) break; if (request->fl_type == F_UNLCK) @@ -1059,7 +1061,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * one (This may happen several times). */ if (added) { - locks_delete_lock(before, &dispose); + locks_delete_lock_ctx(fl, &dispose); continue; } /* @@ -1075,15 +1077,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_copy_lock(new_fl, request); request = new_fl; new_fl = NULL; - locks_delete_lock(before, &dispose); - locks_insert_lock(before, request); + locks_insert_lock_ctx(request, &fl->fl_list); + locks_delete_lock_ctx(fl, &dispose); added = true; } } - /* Go on to next lock. - */ - next_lock: - before = &fl->fl_next; } /* @@ -1108,7 +1106,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str goto out; } locks_copy_lock(new_fl, request); - locks_insert_lock(before, new_fl); + locks_insert_lock_ctx(new_fl, &fl->fl_list); + fl = new_fl; new_fl = NULL; } if (right) { @@ -1119,7 +1118,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); - locks_insert_lock(before, left); + locks_insert_lock_ctx(left, &fl->fl_list); } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(right); @@ -1129,7 +1128,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_wake_up_blocks(left); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); /* * Free any unused locks. */ @@ -1199,22 +1198,29 @@ EXPORT_SYMBOL(posix_lock_file_wait); */ int locks_mandatory_locked(struct file *file) { + int ret; struct inode *inode = file_inode(file); + struct file_lock_context *ctx; struct file_lock *fl; + ctx = inode->i_flctx; + if (!ctx || list_empty_careful(&ctx->flc_posix)) + return 0; + /* * Search the lock list for this inode for any POSIX locks. */ - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (!IS_POSIX(fl)) - continue; + spin_lock(&ctx->flc_lock); + ret = 0; + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (fl->fl_owner != current->files && - fl->fl_owner != file) + fl->fl_owner != file) { + ret = -EAGAIN; break; + } } - spin_unlock(&inode->i_lock); - return fl ? -EAGAIN : 0; + spin_unlock(&ctx->flc_lock); + return ret; } /** @@ -1294,9 +1300,8 @@ static void lease_clear_pending(struct file_lock *fl, int arg) } /* We already had a lease on this file; just change its type */ -int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) +int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { - struct file_lock *fl = *before; int error = assign_type(fl, arg); if (error) @@ -1313,7 +1318,7 @@ int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } - locks_delete_lock(before, dispose); + locks_delete_lock_ctx(fl, dispose); } return 0; } @@ -1329,25 +1334,24 @@ static bool past_time(unsigned long then) static void time_out_leases(struct inode *inode, struct list_head *dispose) { - struct file_lock **before; - struct file_lock *fl; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl, *tmp; - lockdep_assert_held(&inode->i_lock); + lockdep_assert_held(&ctx->flc_lock); - before = &inode->i_flock; - while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) - lease_modify(before, F_RDLCK, dispose); + lease_modify(fl, F_RDLCK, dispose); if (past_time(fl->fl_break_time)) - lease_modify(before, F_UNLCK, dispose); - if (fl == *before) /* lease_modify may have freed fl */ - before = &fl->fl_next; + lease_modify(fl, F_UNLCK, dispose); } } static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) { + if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) + return false; if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) return false; return locks_conflict(breaker, lease); @@ -1356,11 +1360,12 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) static bool any_leases_conflict(struct inode *inode, struct file_lock *breaker) { + struct file_lock_context *ctx = inode->i_flctx; struct file_lock *fl; - lockdep_assert_held(&inode->i_lock); + lockdep_assert_held(&ctx->flc_lock); - for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) { + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (leases_conflict(fl, breaker)) return true; } @@ -1383,8 +1388,8 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker) int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; - struct file_lock *new_fl; - struct file_lock *fl, **before; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *new_fl, *fl, *tmp; unsigned long break_time; int want_write = (mode & O_ACCMODE) != O_RDONLY; LIST_HEAD(dispose); @@ -1394,7 +1399,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) return PTR_ERR(new_fl); new_fl->fl_flags = type; - spin_lock(&inode->i_lock); + /* typically we will check that ctx is non-NULL before calling */ + if (!ctx) { + WARN_ON_ONCE(1); + return error; + } + + spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1408,9 +1419,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) break_time++; /* so that 0 means no break time */ } - for (before = &inode->i_flock; - ((fl = *before) != NULL) && IS_LEASE(fl); - before = &fl->fl_next) { + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { if (!leases_conflict(fl, new_fl)) continue; if (want_write) { @@ -1419,17 +1428,16 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) fl->fl_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; } else { - if (lease_breaking(inode->i_flock)) + if (lease_breaking(fl)) continue; fl->fl_flags |= FL_DOWNGRADE_PENDING; fl->fl_downgrade_time = break_time; } if (fl->fl_lmops->lm_break(fl)) - locks_delete_lock(before, &dispose); + locks_delete_lock_ctx(fl, &dispose); } - fl = inode->i_flock; - if (!fl || !IS_LEASE(fl)) + if (list_empty(&ctx->flc_lease)) goto out; if (mode & O_NONBLOCK) { @@ -1439,18 +1447,19 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) } restart: - break_time = inode->i_flock->fl_break_time; + fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list); + break_time = fl->fl_break_time; if (break_time != 0) break_time -= jiffies; if (break_time == 0) break_time++; - locks_insert_block(inode->i_flock, new_fl); + locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); if (error >= 0) { @@ -1462,12 +1471,10 @@ restart: time_out_leases(inode, &dispose); if (any_leases_conflict(inode, new_fl)) goto restart; - error = 0; } - out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; @@ -1487,14 +1494,18 @@ EXPORT_SYMBOL(__break_lease); void lease_get_mtime(struct inode *inode, struct timespec *time) { bool has_lease = false; - struct file_lock *flock; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl; - if (inode->i_flock) { - spin_lock(&inode->i_lock); - flock = inode->i_flock; - if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) - has_lease = true; - spin_unlock(&inode->i_lock); + if (ctx && !list_empty_careful(&ctx->flc_lease)) { + spin_lock(&ctx->flc_lock); + if (!list_empty(&ctx->flc_lease)) { + fl = list_first_entry(&ctx->flc_lease, + struct file_lock, fl_list); + if (fl->fl_type == F_WRLCK) + has_lease = true; + } + spin_unlock(&ctx->flc_lock); } if (has_lease) @@ -1532,20 +1543,22 @@ int fcntl_getlease(struct file *filp) { struct file_lock *fl; struct inode *inode = file_inode(filp); + struct file_lock_context *ctx = inode->i_flctx; int type = F_UNLCK; LIST_HEAD(dispose); - spin_lock(&inode->i_lock); - time_out_leases(file_inode(filp), &dispose); - for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); - fl = fl->fl_next) { - if (fl->fl_file == filp) { + if (ctx && !list_empty_careful(&ctx->flc_lease)) { + spin_lock(&ctx->flc_lock); + time_out_leases(file_inode(filp), &dispose); + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_file != filp) + continue; type = target_leasetype(fl); break; } + spin_unlock(&ctx->flc_lock); + locks_dispose_list(&dispose); } - spin_unlock(&inode->i_lock); - locks_dispose_list(&dispose); return type; } @@ -1560,11 +1573,14 @@ int fcntl_getlease(struct file *filp) * conflict with the lease we're trying to set. */ static int -check_conflicting_open(const struct dentry *dentry, const long arg) +check_conflicting_open(const struct dentry *dentry, const long arg, int flags) { int ret = 0; struct inode *inode = dentry->d_inode; + if (flags & FL_LAYOUT) + return 0; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) return -EAGAIN; @@ -1578,9 +1594,10 @@ check_conflicting_open(const struct dentry *dentry, const long arg) static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) { - struct file_lock *fl, **before, **my_before = NULL, *lease; + struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; LIST_HEAD(dispose); @@ -1588,6 +1605,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr lease = *flp; trace_generic_add_lease(inode, lease); + ctx = locks_get_lock_context(inode); + if (!ctx) + return -ENOMEM; + /* * In the delegation case we need mutual exclusion with * a number of operations that take the i_mutex. We trylock @@ -1606,9 +1627,9 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr return -EINVAL; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - error = check_conflicting_open(dentry, arg); + error = check_conflicting_open(dentry, arg, lease->fl_flags); if (error) goto out; @@ -1621,13 +1642,13 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * except for this filp. */ error = -EAGAIN; - for (before = &inode->i_flock; - ((fl = *before) != NULL) && IS_LEASE(fl); - before = &fl->fl_next) { - if (fl->fl_file == filp) { - my_before = before; + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_file == filp && + fl->fl_owner == lease->fl_owner) { + my_fl = fl; continue; } + /* * No exclusive leases if someone else has a lease on * this file: @@ -1642,9 +1663,9 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr goto out; } - if (my_before != NULL) { - lease = *my_before; - error = lease->fl_lmops->lm_change(my_before, arg, &dispose); + if (my_fl != NULL) { + lease = my_fl; + error = lease->fl_lmops->lm_change(lease, arg, &dispose); if (error) goto out; goto out_setup; @@ -1654,7 +1675,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr if (!leases_enable) goto out; - locks_insert_lock(before, lease); + locks_insert_lock_ctx(lease, &ctx->flc_lease); /* * The check in break_lease() is lockless. It's possible for another * open to race in after we did the earlier check for a conflicting @@ -1665,46 +1686,51 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * precedes these checks. */ smp_mb(); - error = check_conflicting_open(dentry, arg); - if (error) - goto out_unlink; + error = check_conflicting_open(dentry, arg, lease->fl_flags); + if (error) { + locks_unlink_lock_ctx(lease); + goto out; + } out_setup: if (lease->fl_lmops->lm_setup) lease->fl_lmops->lm_setup(lease, priv); out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); if (is_deleg) mutex_unlock(&inode->i_mutex); - if (!error && !my_before) + if (!error && !my_fl) *flp = NULL; return error; -out_unlink: - locks_unlink_lock(before); - goto out; } -static int generic_delete_lease(struct file *filp) +static int generic_delete_lease(struct file *filp, void *owner) { int error = -EAGAIN; - struct file_lock *fl, **before; + struct file_lock *fl, *victim = NULL; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + struct file_lock_context *ctx = inode->i_flctx; LIST_HEAD(dispose); - spin_lock(&inode->i_lock); - time_out_leases(inode, &dispose); - for (before = &inode->i_flock; - ((fl = *before) != NULL) && IS_LEASE(fl); - before = &fl->fl_next) { - if (fl->fl_file == filp) + if (!ctx) { + trace_generic_delete_lease(inode, NULL); + return error; + } + + spin_lock(&ctx->flc_lock); + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_file == filp && + fl->fl_owner == owner) { + victim = fl; break; + } } - trace_generic_delete_lease(inode, fl); - if (fl) - error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose); - spin_unlock(&inode->i_lock); + trace_generic_delete_lease(inode, victim); + if (victim) + error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); return error; } @@ -1737,13 +1763,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp, switch (arg) { case F_UNLCK: - return generic_delete_lease(filp); + return generic_delete_lease(filp, *priv); case F_RDLCK: case F_WRLCK: if (!(*flp)->fl_lmops->lm_break) { WARN_ON_ONCE(1); return -ENOLCK; } + return generic_add_lease(filp, arg, flp, priv); default: return -EINVAL; @@ -1816,7 +1843,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) int fcntl_setlease(unsigned int fd, struct file *filp, long arg) { if (arg == F_UNLCK) - return vfs_setlease(filp, F_UNLCK, NULL, NULL); + return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); return do_fcntl_add_lease(fd, filp, arg); } @@ -2171,7 +2198,7 @@ again: */ /* * we need that spin_lock here - it prevents reordering between - * update of inode->i_flock and check for it done in close(). + * update of i_flctx->flc_posix and check for it done in close(). * rcu_read_lock() wouldn't do. */ spin_lock(¤t->files->file_lock); @@ -2331,13 +2358,14 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { struct file_lock lock; + struct file_lock_context *ctx = file_inode(filp)->i_flctx; /* * If there are no locks held on this file, we don't need to call * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ - if (!file_inode(filp)->i_flock) + if (!ctx || list_empty(&ctx->flc_posix)) return; lock.fl_type = F_UNLCK; @@ -2358,67 +2386,68 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) EXPORT_SYMBOL(locks_remove_posix); +/* The i_flctx must be valid when calling into here */ +static void +locks_remove_flock(struct file *filp) +{ + struct file_lock fl = { + .fl_owner = filp, + .fl_pid = current->tgid, + .fl_file = filp, + .fl_flags = FL_FLOCK, + .fl_type = F_UNLCK, + .fl_end = OFFSET_MAX, + }; + struct file_lock_context *flctx = file_inode(filp)->i_flctx; + + if (list_empty(&flctx->flc_flock)) + return; + + if (filp->f_op->flock) + filp->f_op->flock(filp, F_SETLKW, &fl); + else + flock_lock_file(filp, &fl); + + if (fl.fl_ops && fl.fl_ops->fl_release_private) + fl.fl_ops->fl_release_private(&fl); +} + +/* The i_flctx must be valid when calling into here */ +static void +locks_remove_lease(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl, *tmp; + LIST_HEAD(dispose); + + if (list_empty(&ctx->flc_lease)) + return; + + spin_lock(&ctx->flc_lock); + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) + if (filp == fl->fl_file) + lease_modify(fl, F_UNLCK, &dispose); + spin_unlock(&ctx->flc_lock); + locks_dispose_list(&dispose); +} + /* * This function is called on the last close of an open file. */ void locks_remove_file(struct file *filp) { - struct inode * inode = file_inode(filp); - struct file_lock *fl; - struct file_lock **before; - LIST_HEAD(dispose); - - if (!inode->i_flock) + if (!file_inode(filp)->i_flctx) return; + /* remove any OFD locks */ locks_remove_posix(filp, filp); - if (filp->f_op->flock) { - struct file_lock fl = { - .fl_owner = filp, - .fl_pid = current->tgid, - .fl_file = filp, - .fl_flags = FL_FLOCK, - .fl_type = F_UNLCK, - .fl_end = OFFSET_MAX, - }; - filp->f_op->flock(filp, F_SETLKW, &fl); - if (fl.fl_ops && fl.fl_ops->fl_release_private) - fl.fl_ops->fl_release_private(&fl); - } - - spin_lock(&inode->i_lock); - before = &inode->i_flock; + /* remove flock locks */ + locks_remove_flock(filp); - while ((fl = *before) != NULL) { - if (fl->fl_file == filp) { - if (IS_LEASE(fl)) { - lease_modify(before, F_UNLCK, &dispose); - continue; - } - - /* - * There's a leftover lock on the list of a type that - * we didn't expect to see. Most likely a classic - * POSIX lock that ended up not getting released - * properly, or that raced onto the list somehow. Log - * some info about it and then just remove it from - * the list. - */ - WARN(!IS_FLOCK(fl), - "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), inode->i_ino, - fl->fl_type, fl->fl_flags, - fl->fl_start, fl->fl_end); - - locks_delete_lock(before, &dispose); - continue; - } - before = &fl->fl_next; - } - spin_unlock(&inode->i_lock); - locks_dispose_list(&dispose); + /* remove any leases */ + locks_remove_lease(filp); } /** @@ -2621,6 +2650,9 @@ static int __init filelock_init(void) { int i; + flctx_cache = kmem_cache_create("file_lock_ctx", + sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL); + filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); diff --git a/fs/mount.h b/fs/mount.h index f82c62840905..6a61c2b3e385 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -1,10 +1,12 @@ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/poll.h> +#include <linux/ns_common.h> +#include <linux/fs_pin.h> struct mnt_namespace { atomic_t count; - unsigned int proc_inum; + struct ns_common ns; struct mount * root; struct list_head list; struct user_namespace *user_ns; @@ -61,7 +63,8 @@ struct mount { int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; - struct path mnt_ex_mountpoint; + struct fs_pin mnt_umount; + struct dentry *mnt_ex_mountpoint; }; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ diff --git a/fs/namei.c b/fs/namei.c index db5fe86319e6..c83145af4bfc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -118,19 +118,10 @@ * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ -void final_putname(struct filename *name) -{ - if (name->separate) { - __putname(name->name); - kfree(name); - } else { - __putname(name); - } -} #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) -static struct filename * +struct filename * getname_flags(const char __user *filename, int flags, int *empty) { struct filename *result, *err; @@ -145,6 +136,7 @@ getname_flags(const char __user *filename, int flags, int *empty) result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); + result->refcnt = 1; /* * First, try to embed the struct filename inside the names_cache @@ -179,6 +171,7 @@ recopy: } result->name = kname; result->separate = true; + result->refcnt = 1; max = PATH_MAX; goto recopy; } @@ -202,7 +195,7 @@ recopy: return result; error: - final_putname(result); + putname(result); return err; } @@ -212,43 +205,56 @@ getname(const char __user * filename) return getname_flags(filename, 0, NULL); } -/* - * The "getname_kernel()" interface doesn't do pathnames longer - * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user. - */ struct filename * getname_kernel(const char * filename) { struct filename *result; - char *kname; - int len; - - len = strlen(filename); - if (len >= EMBEDDED_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + int len = strlen(filename) + 1; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); - kname = (char *)result + sizeof(*result); - result->name = kname; + if (len <= EMBEDDED_NAME_MAX) { + result->name = (char *)(result) + sizeof(*result); + result->separate = false; + } else if (len <= PATH_MAX) { + struct filename *tmp; + + tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + if (unlikely(!tmp)) { + __putname(result); + return ERR_PTR(-ENOMEM); + } + tmp->name = (char *)result; + tmp->separate = true; + result = tmp; + } else { + __putname(result); + return ERR_PTR(-ENAMETOOLONG); + } + memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; - result->separate = false; + result->refcnt = 1; + audit_getname(result); - strlcpy(kname, filename, EMBEDDED_NAME_MAX); return result; } -#ifdef CONFIG_AUDITSYSCALL void putname(struct filename *name) { - if (unlikely(!audit_dummy_context())) - return audit_putname(name); - final_putname(name); + BUG_ON(name->refcnt <= 0); + + if (--name->refcnt > 0) + return; + + if (name->separate) { + __putname(name->name); + kfree(name); + } else + __putname(name); } -#endif static int check_acl(struct inode *inode, int mask) { @@ -487,6 +493,19 @@ void path_put(const struct path *path) } EXPORT_SYMBOL(path_put); +struct nameidata { + struct path path; + struct qstr last; + struct path root; + struct inode *inode; /* path.dentry.d_inode */ + unsigned int flags; + unsigned seq, m_seq; + int last_type; + unsigned depth; + struct file *base; + char *saved_names[MAX_NESTED_LINKS + 1]; +}; + /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't @@ -695,6 +714,18 @@ void nd_jump_link(struct nameidata *nd, struct path *path) nd->flags |= LOOKUP_JUMPED; } +void nd_set_link(struct nameidata *nd, char *path) +{ + nd->saved_names[nd->depth] = path; +} +EXPORT_SYMBOL(nd_set_link); + +char *nd_get_link(struct nameidata *nd) +{ + return nd->saved_names[nd->depth]; +} +EXPORT_SYMBOL(nd_get_link); + static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) { struct inode *inode = link->dentry->d_inode; @@ -1821,13 +1852,14 @@ static int link_path_walk(const char *name, struct nameidata *nd) } static int path_init(int dfd, const char *name, unsigned int flags, - struct nameidata *nd, struct file **fp) + struct nameidata *nd) { int retval = 0; nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_JUMPED; + nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; + nd->base = NULL; if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; @@ -1847,7 +1879,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, } else { path_get(&nd->path); } - return 0; + goto done; } nd->root.mnt = NULL; @@ -1897,7 +1929,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { if (f.flags & FDPUT_FPUT) - *fp = f.file; + nd->base = f.file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); rcu_read_lock(); } else { @@ -1908,13 +1940,26 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->inode = nd->path.dentry->d_inode; if (!(flags & LOOKUP_RCU)) - return 0; + goto done; if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) - return 0; + goto done; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); return -ECHILD; +done: + current->total_link_count = 0; + return link_path_walk(name, nd); +} + +static void path_cleanup(struct nameidata *nd) +{ + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + if (unlikely(nd->base)) + fput(nd->base); } static inline int lookup_last(struct nameidata *nd, struct path *path) @@ -1930,7 +1975,6 @@ static inline int lookup_last(struct nameidata *nd, struct path *path) static int path_lookupat(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - struct file *base = NULL; struct path path; int err; @@ -1948,14 +1992,7 @@ static int path_lookupat(int dfd, const char *name, * be handled by restarting a traditional ref-walk (which will always * be able to complete). */ - err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); - - if (unlikely(err)) - goto out; - - current->total_link_count = 0; - err = link_path_walk(name, nd); - + err = path_init(dfd, name, flags, nd); if (!err && !(flags & LOOKUP_PARENT)) { err = lookup_last(nd, &path); while (err > 0) { @@ -1983,14 +2020,7 @@ static int path_lookupat(int dfd, const char *name, } } -out: - if (base) - fput(base); - - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - path_put(&nd->root); - nd->root.mnt = NULL; - } + path_cleanup(nd); return err; } @@ -2012,31 +2042,47 @@ static int filename_lookup(int dfd, struct filename *name, static int do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - struct filename filename = { .name = name }; + struct filename *filename = getname_kernel(name); + int retval = PTR_ERR(filename); - return filename_lookup(dfd, &filename, flags, nd); + if (!IS_ERR(filename)) { + retval = filename_lookup(dfd, filename, flags, nd); + putname(filename); + } + return retval; } /* does lookup, returns the object with parent locked */ struct dentry *kern_path_locked(const char *name, struct path *path) { + struct filename *filename = getname_kernel(name); struct nameidata nd; struct dentry *d; - int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd); - if (err) - return ERR_PTR(err); + int err; + + if (IS_ERR(filename)) + return ERR_CAST(filename); + + err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd); + if (err) { + d = ERR_PTR(err); + goto out; + } if (nd.last_type != LAST_NORM) { path_put(&nd.path); - return ERR_PTR(-EINVAL); + d = ERR_PTR(-EINVAL); + goto out; } mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); d = __lookup_hash(&nd.last, nd.path.dentry, 0); if (IS_ERR(d)) { mutex_unlock(&nd.path.dentry->d_inode->i_mutex); path_put(&nd.path); - return d; + goto out; } *path = nd.path; +out: + putname(filename); return d; } @@ -2297,19 +2343,13 @@ out: static int path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) { - struct file *base = NULL; struct nameidata nd; int err; - err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base); + err = path_init(dfd, name, flags, &nd); if (unlikely(err)) goto out; - current->total_link_count = 0; - err = link_path_walk(name, &nd); - if (err) - goto out; - err = mountpoint_last(&nd, path); while (err > 0) { void *cookie; @@ -2325,12 +2365,7 @@ path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags put_link(&nd, &link, cookie); } out: - if (base) - fput(base); - - if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT)) - path_put(&nd.root); - + path_cleanup(&nd); return err; } @@ -2338,13 +2373,17 @@ static int filename_mountpoint(int dfd, struct filename *s, struct path *path, unsigned int flags) { - int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); + int error; + if (IS_ERR(s)) + return PTR_ERR(s); + error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); if (unlikely(error == -ECHILD)) error = path_mountpoint(dfd, s->name, path, flags); if (unlikely(error == -ESTALE)) error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); if (likely(!error)) audit_inode(s, path->dentry, 0); + putname(s); return error; } @@ -2366,21 +2405,14 @@ int user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, struct path *path) { - struct filename *s = getname(name); - int error; - if (IS_ERR(s)) - return PTR_ERR(s); - error = filename_mountpoint(dfd, s, path, flags); - putname(s); - return error; + return filename_mountpoint(dfd, getname(name), path, flags); } int kern_path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) { - struct filename s = {.name = name}; - return filename_mountpoint(dfd, &s, path, flags); + return filename_mountpoint(dfd, getname_kernel(name), path, flags); } EXPORT_SYMBOL(kern_path_mountpoint); @@ -2782,7 +2814,7 @@ no_open: } else if (!dentry->d_inode) { goto out; } else if ((open_flag & O_TRUNC) && - S_ISREG(dentry->d_inode->i_mode)) { + d_is_reg(dentry)) { goto out; } /* will fail later, go on to get the right error */ @@ -3181,7 +3213,6 @@ out: static struct file *path_openat(int dfd, struct filename *pathname, struct nameidata *nd, const struct open_flags *op, int flags) { - struct file *base = NULL; struct file *file; struct path path; int opened = 0; @@ -3198,12 +3229,7 @@ static struct file *path_openat(int dfd, struct filename *pathname, goto out; } - error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base); - if (unlikely(error)) - goto out; - - current->total_link_count = 0; - error = link_path_walk(pathname->name, nd); + error = path_init(dfd, pathname->name, flags, nd); if (unlikely(error)) goto out; @@ -3229,10 +3255,7 @@ static struct file *path_openat(int dfd, struct filename *pathname, put_link(nd, &link, cookie); } out: - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) - path_put(&nd->root); - if (base) - fput(base); + path_cleanup(nd); if (!(opened & FILE_OPENED)) { BUG_ON(!error); put_filp(file); @@ -3269,7 +3292,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, { struct nameidata nd; struct file *file; - struct filename filename = { .name = name }; + struct filename *filename; int flags = op->lookup_flags | LOOKUP_ROOT; nd.root.mnt = mnt; @@ -3278,15 +3301,20 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); - file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); + filename = getname_kernel(name); + if (unlikely(IS_ERR(filename))) + return ERR_CAST(filename); + + file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) - file = path_openat(-1, &filename, &nd, op, flags); + file = path_openat(-1, filename, &nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) - file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL); + file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL); + putname(filename); return file; } -struct dentry *kern_path_create(int dfd, const char *pathname, +static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); @@ -3301,7 +3329,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, */ lookup_flags &= LOOKUP_REVAL; - error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd); + error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd); if (error) return ERR_PTR(error); @@ -3355,6 +3383,19 @@ out: path_put(&nd.path); return dentry; } + +struct dentry *kern_path_create(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) +{ + struct filename *filename = getname_kernel(pathname); + struct dentry *res; + + if (IS_ERR(filename)) + return ERR_CAST(filename); + res = filename_create(dfd, filename, path, lookup_flags); + putname(filename); + return res; +} EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) @@ -3373,7 +3414,7 @@ struct dentry *user_path_create(int dfd, const char __user *pathname, struct dentry *res; if (IS_ERR(tmp)) return ERR_CAST(tmp); - res = kern_path_create(dfd, tmp->name, path, lookup_flags); + res = filename_create(dfd, tmp, path, lookup_flags); putname(tmp); return res; } diff --git a/fs/namespace.c b/fs/namespace.c index 5b66b2b3624d..82ef1405260e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -190,6 +190,14 @@ unsigned int mnt_get_count(struct mount *mnt) #endif } +static void drop_mountpoint(struct fs_pin *p) +{ + struct mount *m = container_of(p, struct mount, mnt_umount); + dput(m->mnt_ex_mountpoint); + pin_remove(p); + mntput(&m->mnt); +} + static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -201,7 +209,7 @@ static struct mount *alloc_vfsmnt(const char *name) goto out_free_cache; if (name) { - mnt->mnt_devname = kstrdup(name, GFP_KERNEL); + mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); if (!mnt->mnt_devname) goto out_free_id; } @@ -229,12 +237,13 @@ static struct mount *alloc_vfsmnt(const char *name) #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif + init_fs_pin(&mnt->mnt_umount, drop_mountpoint); } return mnt; #ifdef CONFIG_SMP out_free_devname: - kfree(mnt->mnt_devname); + kfree_const(mnt->mnt_devname); #endif out_free_id: mnt_free_id(mnt); @@ -568,7 +577,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { - kfree(mnt->mnt_devname); + kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif @@ -963,7 +972,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } /* Don't allow unprivileged users to reveal what is under a mount */ - if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) + if ((flag & CL_UNPRIVILEGED) && + (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire))) mnt->mnt.mnt_flags |= MNT_LOCKED; atomic_inc(&sb->s_active); @@ -1288,7 +1298,6 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static void namespace_unlock(void) { - struct mount *mnt; struct hlist_head head = unmounted; if (likely(hlist_empty(&head))) { @@ -1298,23 +1307,11 @@ static void namespace_unlock(void) head.first->pprev = &head.first; INIT_HLIST_HEAD(&unmounted); - - /* undo decrements we'd done in umount_tree() */ - hlist_for_each_entry(mnt, &head, mnt_hash) - if (mnt->mnt_ex_mountpoint.mnt) - mntget(mnt->mnt_ex_mountpoint.mnt); - up_write(&namespace_sem); synchronize_rcu(); - while (!hlist_empty(&head)) { - mnt = hlist_entry(head.first, struct mount, mnt_hash); - hlist_del_init(&mnt->mnt_hash); - if (mnt->mnt_ex_mountpoint.mnt) - path_put(&mnt->mnt_ex_mountpoint); - mntput(&mnt->mnt); - } + group_pin_kill(&head); } static inline void namespace_lock(void) @@ -1333,7 +1330,6 @@ void umount_tree(struct mount *mnt, int how) { HLIST_HEAD(tmp_list); struct mount *p; - struct mount *last = NULL; for (p = mnt; p; p = next_mnt(p, mnt)) { hlist_del_init_rcu(&p->mnt_hash); @@ -1346,31 +1342,28 @@ void umount_tree(struct mount *mnt, int how) if (how) propagate_umount(&tmp_list); - hlist_for_each_entry(p, &tmp_list, mnt_hash) { + while (!hlist_empty(&tmp_list)) { + p = hlist_entry(tmp_list.first, struct mount, mnt_hash); + hlist_del_init_rcu(&p->mnt_hash); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; if (how < 2) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; + + pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted); if (mnt_has_parent(p)) { hlist_del_init(&p->mnt_mp_list); put_mountpoint(p->mnt_mp); mnt_add_count(p->mnt_parent, -1); - /* move the reference to mountpoint into ->mnt_ex_mountpoint */ - p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; - p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; + /* old mountpoint will be dropped when we can do that */ + p->mnt_ex_mountpoint = p->mnt_mountpoint; p->mnt_mountpoint = p->mnt.mnt_root; p->mnt_parent = p; p->mnt_mp = NULL; } change_mnt_propagation(p, MS_PRIVATE); - last = p; - } - if (last) { - last->mnt_hash.next = unmounted.first; - unmounted.first = tmp_list.first; - unmounted.first->pprev = &unmounted.first; } } @@ -1544,6 +1537,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; if (mnt->mnt.mnt_flags & MNT_LOCKED) goto dput_and_out; + retval = -EPERM; + if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + goto dput_and_out; retval = do_umount(mnt, flags); dput_and_out: @@ -1569,17 +1565,13 @@ SYSCALL_DEFINE1(oldumount, char __user *, name) static bool is_mnt_ns_file(struct dentry *dentry) { /* Is this a proxy for a mount namespace? */ - struct inode *inode = dentry->d_inode; - struct proc_ns *ei; - - if (!proc_ns_inode(inode)) - return false; - - ei = get_proc_ns(inode); - if (ei->ns_ops != &mntns_operations) - return false; + return dentry->d_op == &ns_dentry_operations && + dentry->d_fsdata == &mntns_operations; +} - return true; +struct mnt_namespace *to_mnt_ns(struct ns_common *ns) +{ + return container_of(ns, struct mnt_namespace, ns); } static bool mnt_ns_loop(struct dentry *dentry) @@ -1591,7 +1583,7 @@ static bool mnt_ns_loop(struct dentry *dentry) if (!is_mnt_ns_file(dentry)) return false; - mnt_ns = get_proc_ns(dentry->d_inode)->ns; + mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } @@ -1610,7 +1602,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, if (IS_ERR(q)) return q; - q->mnt.mnt_flags &= ~MNT_LOCKED; q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; @@ -1916,8 +1907,8 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) if (mnt->mnt.mnt_sb->s_flags & MS_NOUSER) return -EINVAL; - if (S_ISDIR(mp->m_dentry->d_inode->i_mode) != - S_ISDIR(mnt->mnt.mnt_root->d_inode->i_mode)) + if (d_is_dir(mp->m_dentry) != + d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; return attach_recursive_mnt(mnt, p, mp, NULL); @@ -2020,7 +2011,10 @@ static int do_loopback(struct path *path, const char *old_name, if (IS_MNT_UNBINDABLE(old)) goto out2; - if (!check_mnt(parent) || !check_mnt(old)) + if (!check_mnt(parent)) + goto out2; + + if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations) goto out2; if (!recurse && has_locked_children(old, old_path.dentry)) @@ -2098,7 +2092,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags, } if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) { - return -EPERM; + /* Was the nodev implicitly added in mount? */ + if ((mnt->mnt_ns->user_ns != &init_user_ns) && + !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) { + mnt_flags |= MNT_NODEV; + } else { + return -EPERM; + } } if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) { @@ -2180,8 +2180,8 @@ static int do_move_mount(struct path *path, const char *old_name) if (!mnt_has_parent(old)) goto out1; - if (S_ISDIR(path->dentry->d_inode->i_mode) != - S_ISDIR(old_path.dentry->d_inode->i_mode)) + if (d_is_dir(path->dentry) != + d_is_dir(old_path.dentry)) goto out1; /* * Don't move a mount residing in a shared parent. @@ -2271,7 +2271,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) goto unlock; err = -EINVAL; - if (S_ISLNK(newmnt->mnt.mnt_root->d_inode->i_mode)) + if (d_is_symlink(newmnt->mnt.mnt_root)) goto unlock; newmnt->mnt.mnt_flags = mnt_flags; @@ -2640,7 +2640,7 @@ dput_out: static void free_mnt_ns(struct mnt_namespace *ns) { - proc_free_inum(ns->proc_inum); + ns_free_inum(&ns->ns); put_user_ns(ns->user_ns); kfree(ns); } @@ -2662,11 +2662,12 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) return ERR_PTR(-ENOMEM); - ret = proc_alloc_inum(&new_ns->proc_inum); + ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); return ERR_PTR(ret); } + new_ns->ns.ops = &mntns_operations; new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); new_ns->root = NULL; @@ -2958,6 +2959,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* mount new_root on / */ attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); touch_mnt_namespace(current->nsproxy->mnt_ns); + /* A moved mount should not expire automatically */ + list_del_init(&new_mnt->mnt_expire); unlock_mount_hash(); chroot_fs_refs(&root, &new); put_mountpoint(root_mp); @@ -3002,6 +3005,7 @@ static void __init init_mount_tree(void) root.mnt = mnt; root.dentry = mnt->mnt_root; + mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); @@ -3144,31 +3148,31 @@ found: return visible; } -static void *mntns_get(struct task_struct *task) +static struct ns_common *mntns_get(struct task_struct *task) { - struct mnt_namespace *ns = NULL; + struct ns_common *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { - ns = nsproxy->mnt_ns; - get_mnt_ns(ns); + ns = &nsproxy->mnt_ns->ns; + get_mnt_ns(to_mnt_ns(ns)); } task_unlock(task); return ns; } -static void mntns_put(void *ns) +static void mntns_put(struct ns_common *ns) { - put_mnt_ns(ns); + put_mnt_ns(to_mnt_ns(ns)); } -static int mntns_install(struct nsproxy *nsproxy, void *ns) +static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct fs_struct *fs = current->fs; - struct mnt_namespace *mnt_ns = ns; + struct mnt_namespace *mnt_ns = to_mnt_ns(ns); struct path root; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || @@ -3198,17 +3202,10 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) return 0; } -static unsigned int mntns_inum(void *ns) -{ - struct mnt_namespace *mnt_ns = ns; - return mnt_ns->proc_inum; -} - const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, - .inum = mntns_inum, }; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 7cb751dfbeef..e7ca827d7694 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -77,6 +77,7 @@ static int ncp_hash_dentry(const struct dentry *, struct qstr *); static int ncp_compare_dentry(const struct dentry *, const struct dentry *, unsigned int, const char *, const struct qstr *); static int ncp_delete_dentry(const struct dentry *); +static void ncp_d_prune(struct dentry *dentry); const struct dentry_operations ncp_dentry_operations = { @@ -84,6 +85,7 @@ const struct dentry_operations ncp_dentry_operations = .d_hash = ncp_hash_dentry, .d_compare = ncp_compare_dentry, .d_delete = ncp_delete_dentry, + .d_prune = ncp_d_prune, }; #define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) @@ -198,8 +200,8 @@ ncp_single_volume(struct ncp_server *server) static inline int ncp_is_server_root(struct inode *inode) { - return (!ncp_single_volume(NCP_SERVER(inode)) && - inode == inode->i_sb->s_root->d_inode); + return !ncp_single_volume(NCP_SERVER(inode)) && + is_root_inode(inode); } @@ -384,42 +386,6 @@ finished: return val; } -static struct dentry * -ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) -{ - struct dentry *dent = dentry; - - if (d_validate(dent, parent)) { - if (dent->d_name.len <= NCP_MAXPATHLEN && - (unsigned long)dent->d_fsdata == fpos) { - if (!dent->d_inode) { - dput(dent); - dent = NULL; - } - return dent; - } - dput(dent); - } - - /* If a pointer is invalid, we search the dentry. */ - spin_lock(&parent->d_lock); - list_for_each_entry(dent, &parent->d_subdirs, d_u.d_child) { - if ((unsigned long)dent->d_fsdata == fpos) { - if (dent->d_inode) - dget(dent); - else - dent = NULL; - spin_unlock(&parent->d_lock); - goto out; - } - } - spin_unlock(&parent->d_lock); - return NULL; - -out: - return dent; -} - static time_t ncp_obtain_mtime(struct dentry *dentry) { struct inode *inode = dentry->d_inode; @@ -435,6 +401,20 @@ static time_t ncp_obtain_mtime(struct dentry *dentry) return ncp_date_dos2unix(i.modifyTime, i.modifyDate); } +static inline void +ncp_invalidate_dircache_entries(struct dentry *parent) +{ + struct ncp_server *server = NCP_SERVER(parent->d_inode); + struct dentry *dentry; + + spin_lock(&parent->d_lock); + list_for_each_entry(dentry, &parent->d_subdirs, d_child) { + dentry->d_fsdata = NULL; + ncp_age_dentry(server, dentry); + } + spin_unlock(&parent->d_lock); +} + static int ncp_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; @@ -500,10 +480,21 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx) struct dentry *dent; bool over; - dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], - dentry, ctx->pos); - if (!dent) + spin_lock(&dentry->d_lock); + if (!(NCP_FINFO(inode)->flags & NCPI_DIR_CACHE)) { + spin_unlock(&dentry->d_lock); + goto invalid_cache; + } + dent = ctl.cache->dentry[ctl.idx]; + if (unlikely(!lockref_get_not_dead(&dent->d_lockref))) { + spin_unlock(&dentry->d_lock); goto invalid_cache; + } + spin_unlock(&dentry->d_lock); + if (!dent->d_inode) { + dput(dent); + goto invalid_cache; + } over = !dir_emit(ctx, dent->d_name.name, dent->d_name.len, dent->d_inode->i_ino, DT_UNKNOWN); @@ -548,6 +539,9 @@ init_cache: ctl.filled = 0; ctl.valid = 1; read_really: + spin_lock(&dentry->d_lock); + NCP_FINFO(inode)->flags |= NCPI_DIR_CACHE; + spin_unlock(&dentry->d_lock); if (ncp_is_server_root(inode)) { ncp_read_volume_list(file, ctx, &ctl); } else { @@ -573,6 +567,13 @@ out: return result; } +static void ncp_d_prune(struct dentry *dentry) +{ + if (!dentry->d_fsdata) /* not referenced from page cache */ + return; + NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE; +} + static int ncp_fill_cache(struct file *file, struct dir_context *ctx, struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, @@ -630,6 +631,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, d_instantiate(newdent, inode); if (!hashed) d_rehash(newdent); + } else { + spin_lock(&dentry->d_lock); + NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE; + spin_unlock(&dentry->d_lock); } } else { struct inode *inode = newdent->d_inode; @@ -639,12 +644,6 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, mutex_unlock(&inode->i_mutex); } - if (newdent->d_inode) { - ino = newdent->d_inode->i_ino; - newdent->d_fsdata = (void *) ctl.fpos; - ncp_new_dentry(newdent); - } - if (ctl.idx >= NCP_DIRCACHE_SIZE) { if (ctl.page) { kunmap(ctl.page); @@ -660,8 +659,13 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, ctl.cache = kmap(ctl.page); } if (ctl.cache) { - ctl.cache->dentry[ctl.idx] = newdent; - valid = 1; + if (newdent->d_inode) { + newdent->d_fsdata = newdent; + ctl.cache->dentry[ctl.idx] = newdent; + ino = newdent->d_inode->i_ino; + ncp_new_dentry(newdent); + } + valid = 1; } dput(newdent); end_advance: @@ -685,8 +689,7 @@ static void ncp_read_volume_list(struct file *file, struct dir_context *ctx, struct ncp_cache_control *ctl) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(file); struct ncp_server *server = NCP_SERVER(inode); struct ncp_volume_info info; struct ncp_entry_info entry; @@ -721,8 +724,7 @@ static void ncp_do_readdir(struct file *file, struct dir_context *ctx, struct ncp_cache_control *ctl) { - struct dentry *dentry = file->f_path.dentry; - struct inode *dir = dentry->d_inode; + struct inode *dir = file_inode(file); struct ncp_server *server = NCP_SERVER(dir); struct nw_search_sequence seq; struct ncp_entry_info entry; diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 77640a8bfb87..1dd7007f974d 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -100,8 +100,7 @@ out: static ssize_t ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(file); size_t already_read = 0; off_t pos; size_t bufsize; @@ -109,7 +108,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) void* freepage; size_t freelen; - ncp_dbg(1, "enter %pd2\n", dentry); + ncp_dbg(1, "enter %pD2\n", file); pos = *ppos; @@ -167,7 +166,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) file_accessed(file); - ncp_dbg(1, "exit %pd2\n", dentry); + ncp_dbg(1, "exit %pD2\n", file); outrel: ncp_inode_close(inode); return already_read ? already_read : error; @@ -176,15 +175,14 @@ outrel: static ssize_t ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(file); size_t already_written = 0; off_t pos; size_t bufsize; int errno; void* bouncebuffer; - ncp_dbg(1, "enter %pd2\n", dentry); + ncp_dbg(1, "enter %pD2\n", file); if ((ssize_t) count < 0) return -EINVAL; pos = *ppos; @@ -263,7 +261,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * i_size_write(inode, pos); mutex_unlock(&inode->i_mutex); } - ncp_dbg(1, "exit %pd2\n", dentry); + ncp_dbg(1, "exit %pD2\n", file); outrel: ncp_inode_close(inode); return already_written ? already_written : errno; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index e31e589369a4..01a9e16e9782 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) if (inode) { atomic_set(&NCP_FINFO(inode)->opened, info->opened); - inode->i_mapping->backing_dev_info = sb->s_bdi; inode->i_ino = info->ino; ncp_set_attr(inode, info); if (S_ISREG(inode->i_mode)) { @@ -560,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) server = NCP_SBP(sb); memset(server, 0, sizeof(*server)); - error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); + error = bdi_setup_and_register(&server->bdi, "ncpfs"); if (error) goto out_fput; diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index d5659d96ee7f..cf7e043a9447 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -447,7 +447,6 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg result = -EIO; } } - result = 0; } mutex_unlock(&server->root_setup_lock); diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index b359d12eb359..33b873b259a8 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -30,9 +30,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, struct vm_fault *vmf) { - struct file *file = area->vm_file; - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = file_inode(area->vm_file); char *pg_addr; unsigned int already_read; unsigned int count; diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h index 4b0bec477846..c4794504f843 100644 --- a/fs/ncpfs/ncp_fs_i.h +++ b/fs/ncpfs/ncp_fs_i.h @@ -22,6 +22,7 @@ struct ncp_inode_info { int access; int flags; #define NCPI_KLUDGE_SYMLINK 0x0001 +#define NCPI_DIR_CACHE 0x0002 __u8 file_handle[6]; struct inode vfs_inode; }; diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 52cb19d66ecb..250e443a07f3 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -184,36 +184,6 @@ ncp_new_dentry(struct dentry* dentry) dentry->d_time = jiffies; } -static inline void -ncp_renew_dentries(struct dentry *parent) -{ - struct ncp_server *server = NCP_SERVER(parent->d_inode); - struct dentry *dentry; - - spin_lock(&parent->d_lock); - list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) { - if (dentry->d_fsdata == NULL) - ncp_age_dentry(server, dentry); - else - ncp_new_dentry(dentry); - } - spin_unlock(&parent->d_lock); -} - -static inline void -ncp_invalidate_dircache_entries(struct dentry *parent) -{ - struct ncp_server *server = NCP_SERVER(parent->d_inode); - struct dentry *dentry; - - spin_lock(&parent->d_lock); - list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) { - dentry->d_fsdata = NULL; - ncp_age_dentry(server, dentry); - } - spin_unlock(&parent->d_lock); -} - struct ncp_cache_head { time_t mtime; unsigned long time; /* cache age */ diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 3dece03f2fc8..c7abc10279af 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -128,6 +128,11 @@ config PNFS_OBJLAYOUT depends on NFS_V4_1 && SCSI_OSD_ULD default NFS_V4 +config PNFS_FLEXFILE_LAYOUT + tristate + depends on NFS_V4_1 && NFS_V3 + default m + config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN string "NFSv4.1 Implementation ID Domain" depends on NFS_V4_1 diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 04cb830fa09f..1e987acf20c9 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -27,9 +27,10 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o dns_resolve.o nfs4trace.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o -nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o +nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ +obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/ diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 4f46f7a05289..1cac3c175d18 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -812,7 +812,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) /* Optimize common case that writes from 0 to end of file */ end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); - if (end != NFS_I(inode)->npages) { + if (end != inode->i_mapping->nrpages) { rcu_read_lock(); end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX); rcu_read_unlock(); @@ -860,12 +860,14 @@ static const struct nfs_pageio_ops bl_pg_read_ops = { .pg_init = bl_pg_init_read, .pg_test = bl_pg_test_read, .pg_doio = pnfs_generic_pg_readpages, + .pg_cleanup = pnfs_generic_pg_cleanup, }; static const struct nfs_pageio_ops bl_pg_write_ops = { .pg_init = bl_pg_init_write, .pg_test = bl_pg_test_write, .pg_doio = pnfs_generic_pg_writepages, + .pg_cleanup = pnfs_generic_pg_cleanup, }; static struct pnfs_layoutdriver_type blocklayout_type = { diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index acbf9ca4018c..dbe5839cdeba 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -112,7 +112,7 @@ out_unlock: static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { - struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, + struct nfs_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfs_net_id); if (mlen != sizeof (struct bl_dev_msg)) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index b8fb3a4ef649..351be9205bf8 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -128,22 +128,24 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, struct rpc_rqst, rq_bc_list); list_del(&req->rq_bc_list); spin_unlock_bh(&serv->sv_cb_lock); + finish_wait(&serv->sv_cb_waitq, &wq); dprintk("Invoking bc_svc_process()\n"); error = bc_svc_process(serv, req, rqstp); dprintk("bc_svc_process() returned w/ error code= %d\n", error); } else { spin_unlock_bh(&serv->sv_cb_lock); - schedule(); + /* schedule_timeout to game the hung task watchdog */ + schedule_timeout(60 * HZ); + finish_wait(&serv->sv_cb_waitq, &wq); } - finish_wait(&serv->sv_cb_waitq, &wq); } return 0; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 73466b934090..197806fb87ff 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -49,7 +49,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, goto out_iput; res->size = i_size_read(inode); res->change_attr = delegation->change_attr; - if (nfsi->npages != 0) + if (nfsi->nrequests != 0) res->change_attr++; res->ctime = inode->i_ctime; res->mtime = inode->i_mtime; @@ -427,6 +427,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, if (clp == NULL) goto out; + if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) + goto out; tbl = &clp->cl_session->bc_slot_table; spin_lock(&tbl->slot_tbl_lock); diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index f4ccfe6521ec..19ca95cdfd9b 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -313,7 +313,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, goto out; } - args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); + args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL); if (!args->devs) { status = htonl(NFS4ERR_DELAY); goto out; @@ -415,7 +415,7 @@ static __be32 decode_rc_list(struct xdr_stream *xdr, rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); if (unlikely(p == NULL)) goto out; - rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls * + rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls, sizeof(*rc_list->rcl_refcalls), GFP_KERNEL); if (unlikely(rc_list->rcl_refcalls == NULL)) @@ -464,8 +464,10 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, for (i = 0; i < args->csa_nrclists; i++) { status = decode_rc_list(xdr, &args->csa_rclists[i]); - if (status) + if (status) { + args->csa_nrclists = i; goto out_free; + } } } status = 0; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index f9f4845db989..19874151e95c 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -433,7 +433,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat static bool nfs_client_init_is_complete(const struct nfs_client *clp) { - return clp->cl_cons_state != NFS_CS_INITING; + return clp->cl_cons_state <= NFS_CS_READY; } int nfs_wait_client_init_complete(const struct nfs_client *clp) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 7f3f60641344..a6ad68865880 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -85,25 +85,30 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ { struct inode *inode = state->inode; struct file_lock *fl; + struct file_lock_context *flctx = inode->i_flctx; + struct list_head *list; int status = 0; - if (inode->i_flock == NULL) + if (flctx == NULL) goto out; - /* Protect inode->i_flock using the i_lock */ - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) - continue; + list = &flctx->flc_posix; + spin_lock(&flctx->flc_lock); +restart: + list_for_each_entry(fl, list, fl_list) { if (nfs_file_open_context(fl->fl_file) != ctx) continue; - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); if (status < 0) goto out; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); } - spin_unlock(&inode->i_lock); + if (list == &flctx->flc_posix) { + list = &flctx->flc_flock; + goto restart; + } + spin_unlock(&flctx->flc_lock); out: return status; } @@ -175,10 +180,9 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, delegation->cred = get_rpccred(cred); clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); - NFS_I(inode)->delegation_state = delegation->type; spin_unlock(&delegation->lock); - put_rpccred(oldcred); rcu_read_unlock(); + put_rpccred(oldcred); trace_nfs4_reclaim_delegation(inode, res->delegation_type); } else { /* We appear to have raced with a delegation return. */ @@ -270,7 +274,6 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi, set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); list_del_rcu(&delegation->super_list); delegation->inode = NULL; - nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); spin_unlock(&delegation->lock); return delegation; @@ -301,6 +304,17 @@ nfs_inode_detach_delegation(struct inode *inode) return nfs_detach_delegation(nfsi, delegation, server); } +static void +nfs_update_inplace_delegation(struct nfs_delegation *delegation, + const struct nfs_delegation *update) +{ + if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) { + delegation->stateid.seqid = update->stateid.seqid; + smp_wmb(); + delegation->type = update->type; + } +} + /** * nfs_inode_set_delegation - set up a delegation on an inode * @inode: inode to which delegation applies @@ -334,9 +348,11 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct old_delegation = rcu_dereference_protected(nfsi->delegation, lockdep_is_held(&clp->cl_lock)); if (old_delegation != NULL) { - if (nfs4_stateid_match(&delegation->stateid, - &old_delegation->stateid) && - delegation->type == old_delegation->type) { + /* Is this an update of the existing delegation? */ + if (nfs4_stateid_match_other(&old_delegation->stateid, + &delegation->stateid)) { + nfs_update_inplace_delegation(old_delegation, + delegation); goto out; } /* @@ -354,13 +370,15 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, + if (test_and_set_bit(NFS_DELEGATION_RETURNING, + &old_delegation->flags)) + goto out; + freeme = nfs_detach_delegation_locked(nfsi, old_delegation, clp); if (freeme == NULL) goto out; } list_add_rcu(&delegation->super_list, &server->delegations); - nfsi->delegation_state = delegation->type; rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -418,6 +436,8 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) { bool ret = false; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + goto out; if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { @@ -429,6 +449,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) ret = true; spin_unlock(&delegation->lock); } +out: return ret; } @@ -456,14 +477,20 @@ restart: super_list) { if (!nfs_delegation_need_return(delegation)) continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) + if (!nfs_sb_active(server->super)) continue; + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); err = nfs_end_delegation_return(inode, delegation, 0); iput(inode); + nfs_sb_deactive(server->super); if (!err) goto restart; set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); @@ -794,19 +821,30 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { + if (test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags)) + continue; if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) continue; - inode = nfs_delegation_grab_inode(delegation); - if (inode == NULL) + if (!nfs_sb_active(server->super)) continue; - delegation = nfs_detach_delegation(NFS_I(inode), - delegation, server); + inode = nfs_delegation_grab_inode(delegation); + if (inode == NULL) { + rcu_read_unlock(); + nfs_sb_deactive(server->super); + goto restart; + } + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); - - if (delegation != NULL) - nfs_free_delegation(delegation); + if (delegation != NULL) { + delegation = nfs_detach_delegation(NFS_I(inode), + delegation, server); + if (delegation != NULL) + nfs_free_delegation(delegation); + } iput(inode); + nfs_sb_deactive(server->super); goto restart; } } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6e62155abf26..c19e16f0b2d0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -133,7 +133,7 @@ out: static int nfs_closedir(struct inode *inode, struct file *filp) { - put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data); + put_nfs_open_dir_context(file_inode(filp), filp->private_data); return 0; } @@ -408,14 +408,22 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc, return 0; } +/* Match file and dirent using either filehandle or fileid + * Note: caller is responsible for checking the fsid + */ static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { + struct nfs_inode *nfsi; + if (dentry->d_inode == NULL) goto different; - if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) - goto different; - return 1; + + nfsi = NFS_I(dentry->d_inode); + if (entry->fattr->fileid == nfsi->fileid) + return 1; + if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0) + return 1; different: return 0; } @@ -469,6 +477,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) struct inode *inode; int status; + if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID)) + return; + if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID)) + return; if (filename.name[0] == '.') { if (filename.len == 1) return; @@ -479,6 +491,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) dentry = d_lookup(parent, &filename); if (dentry != NULL) { + /* Is there a mountpoint here? If so, just exit */ + if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid, + &entry->fattr->fsid)) + goto out; if (nfs_same_file(dentry, entry)) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); status = nfs_refresh_inode(dentry->d_inode, entry->fattr); @@ -499,7 +515,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) if (IS_ERR(inode)) goto out; - alias = d_materialise_unique(dentry, inode); + alias = d_splice_alias(inode, dentry); if (IS_ERR(alias)) goto out; else if (alias) { @@ -1393,7 +1409,7 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in nfs_advise_use_readdirplus(dir); no_entry: - res = d_materialise_unique(dentry, inode); + res = d_splice_alias(inode, dentry); if (res != NULL) { if (IS_ERR(res)) goto out_unblock_sillyrename; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 10bf07280f4a..e907c8cf732e 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -66,6 +66,10 @@ static struct kmem_cache *nfs_direct_cachep; /* * This represents a set of asynchronous requests that we're waiting on */ +struct nfs_direct_mirror { + ssize_t count; +}; + struct nfs_direct_req { struct kref kref; /* release manager */ @@ -78,8 +82,13 @@ struct nfs_direct_req { /* completion state */ atomic_t io_count; /* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ + + struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX]; + int mirror_count; + ssize_t count, /* bytes actually processed */ bytes_left, /* bytes left to be sent */ + io_start, /* start of IO */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ @@ -108,26 +117,56 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } +void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq) +{ + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; +} +EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes); + +static void +nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) +{ + int i; + ssize_t count; + + WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count); + + count = dreq->mirrors[hdr->pgio_mirror_idx].count; + if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { + count = hdr->io_start + hdr->good_bytes - dreq->io_start; + dreq->mirrors[hdr->pgio_mirror_idx].count = count; + } + + /* update the dreq->count by finding the minimum agreed count from all + * mirrors */ + count = dreq->mirrors[0].count; + + for (i = 1; i < dreq->mirror_count; i++) + count = min(count, dreq->mirrors[i].count); + + dreq->count = count; +} + /* * nfs_direct_select_verf - select the right verifier * @dreq - direct request possibly spanning multiple servers * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs - * @ds_idx - index of data server in data server list, only valid if ds_clp set + * @commit_idx - commit bucket index for the DS * * returns the correct verifier to use given the role of the server */ static struct nfs_writeverf * nfs_direct_select_verf(struct nfs_direct_req *dreq, struct nfs_client *ds_clp, - int ds_idx) + int commit_idx) { struct nfs_writeverf *verfp = &dreq->verf; #ifdef CONFIG_NFS_V4_1 if (ds_clp) { /* pNFS is in use, use the DS verf */ - if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) - verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; + if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets) + verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf; else WARN_ON_ONCE(1); } @@ -148,8 +187,7 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq, { struct nfs_writeverf *verfp; - verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, - hdr->ds_idx); + verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); WARN_ON_ONCE(verfp->committed >= 0); memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); WARN_ON_ONCE(verfp->committed < 0); @@ -169,8 +207,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, { struct nfs_writeverf *verfp; - verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, - hdr->ds_idx); + verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx); if (verfp->committed < 0) { nfs_direct_set_hdr_verf(dreq, hdr); return 0; @@ -193,7 +230,11 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, verfp = nfs_direct_select_verf(dreq, data->ds_clp, data->ds_commit_index); - WARN_ON_ONCE(verfp->committed < 0); + + /* verifier not set so always fail */ + if (verfp->committed < 0) + return 1; + return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); } @@ -212,6 +253,12 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, */ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { + struct inode *inode = iocb->ki_filp->f_mapping->host; + + /* we only support swap file calling nfs_direct_IO */ + if (!IS_SWAPFILE(inode)) + return 0; + #ifndef CONFIG_NFS_SWAP dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", iocb->ki_filp, (long long) pos, iter->nr_segs); @@ -236,13 +283,25 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages) void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq) { - cinfo->lock = &dreq->lock; + cinfo->lock = &dreq->inode->i_lock; cinfo->mds = &dreq->mds_cinfo; cinfo->ds = &dreq->ds_cinfo; cinfo->dreq = dreq; cinfo->completion_ops = &nfs_direct_commit_completion_ops; } +static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq, + struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + int mirror_count = 1; + + if (pgio->pg_ops->pg_get_mirror_count) + mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + + dreq->mirror_count = mirror_count; +} + static inline struct nfs_direct_req *nfs_direct_req_alloc(void) { struct nfs_direct_req *dreq; @@ -257,6 +316,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) INIT_LIST_HEAD(&dreq->mds_cinfo.list); dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); + dreq->mirror_count = 1; spin_lock_init(&dreq->lock); return dreq; @@ -363,7 +423,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) dreq->error = hdr->error; else - dreq->count += hdr->good_bytes; + nfs_direct_good_bytes(dreq, hdr); + spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { @@ -541,6 +602,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, dreq->inode = inode; dreq->bytes_left = count; + dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { @@ -573,6 +635,20 @@ out: return result; } +static void +nfs_direct_write_scan_commit_list(struct inode *inode, + struct list_head *list, + struct nfs_commit_info *cinfo) +{ + spin_lock(cinfo->lock); +#ifdef CONFIG_NFS_V4_1 + if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) + NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); +#endif + nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); + spin_unlock(cinfo->lock); +} + static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; @@ -580,20 +656,23 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) LIST_HEAD(reqs); struct nfs_commit_info cinfo; LIST_HEAD(failed); + int i; nfs_init_cinfo_from_dreq(&cinfo, dreq); - pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo); - spin_lock(cinfo.lock); - nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0); - spin_unlock(cinfo.lock); + nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); dreq->count = 0; + for (i = 0; i < dreq->mirror_count; i++) + dreq->mirrors[i].count = 0; get_dreq(dreq); nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; + req = nfs_list_entry(reqs.next); + nfs_direct_setup_mirroring(dreq, &desc, req); + list_for_each_entry_safe(req, tmp, &reqs, wb_list) { if (!nfs_pageio_add_request(&desc, req)) { nfs_list_remove_request(req); @@ -640,7 +719,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) nfs_list_remove_request(req); if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { /* Note the rewrite will go through mds */ - nfs_mark_request_commit(req, NULL, &cinfo); + nfs_mark_request_commit(req, NULL, &cinfo, 0); } else nfs_release_request(req); nfs_unlock_and_release_request(req); @@ -715,7 +794,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) dreq->error = hdr->error; } if (dreq->error == 0) { - dreq->count += hdr->good_bytes; + nfs_direct_good_bytes(dreq, hdr); if (nfs_write_need_commit(hdr)) { if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) request_commit = true; @@ -739,7 +818,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) nfs_list_remove_request(req); if (request_commit) { kref_get(&req->wb_kref); - nfs_mark_request_commit(req, hdr->lseg, &cinfo); + nfs_mark_request_commit(req, hdr->lseg, &cinfo, + hdr->ds_commit_idx); } nfs_unlock_and_release_request(req); } @@ -820,6 +900,9 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, result = PTR_ERR(req); break; } + + nfs_direct_setup_mirroring(dreq, &desc, req); + nfs_lock_request(req); req->wb_index = pos >> PAGE_SHIFT; req->wb_offset = pos & ~PAGE_MASK; @@ -928,6 +1011,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, dreq->inode = inode; dreq->bytes_left = count; + dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2ab6f00dba5b..e679d24c39d3 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -178,7 +178,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); + result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) @@ -199,7 +199,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", filp, (unsigned long) count, (unsigned long long) *ppos); - res = nfs_revalidate_mapping(inode, filp->f_mapping); + res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); if (!res) { res = generic_file_splice_read(filp, ppos, pipe, count, flags); if (res > 0) @@ -372,6 +372,10 @@ start: nfs_wait_bit_killable, TASK_KILLABLE); if (ret) return ret; + /* + * Wait for O_DIRECT to complete + */ + nfs_inode_dio_wait(mapping->host); page = grab_cache_page_write_begin(mapping, index, flags); if (!page) @@ -619,6 +623,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* make sure the cache has finished storing the page */ nfs_fscache_wait_on_page_write(NFS_I(inode), page); + wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, + nfs_wait_bit_killable, TASK_KILLABLE); + lock_page(page); mapping = page_file_mapping(page); if (mapping != inode->i_mapping) @@ -646,7 +653,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 7afb52f6a25a..91e88a7ecef0 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -118,13 +118,6 @@ static void filelayout_reset_read(struct nfs_pgio_header *hdr) } } -static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo) -{ - if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) - return; - pnfs_return_layout(inode); -} - static int filelayout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, @@ -207,7 +200,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_mark_deviceid_unavailable(devid); - set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); + pnfs_error_mark_layout_for_return(inode, lseg); rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: @@ -339,16 +332,6 @@ static void filelayout_read_count_stats(struct rpc_task *task, void *data) rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); } -static void filelayout_read_release(void *data) -{ - struct nfs_pgio_header *hdr = data; - struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout; - - filelayout_fenceme(lo->plh_inode, lo); - nfs_put_client(hdr->ds_clp); - hdr->mds_ops->rpc_release(data); -} - static int filelayout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { @@ -371,17 +354,6 @@ static int filelayout_write_done_cb(struct rpc_task *task, return 0; } -/* Fake up some data that will cause nfs_commit_release to retry the writes. */ -static void prepare_to_resend_writes(struct nfs_commit_data *data) -{ - struct nfs_page *first = nfs_list_entry(data->pages.next); - - data->task.tk_status = 0; - memcpy(&data->verf.verifier, &first->wb_verf, - sizeof(data->verf.verifier)); - data->verf.verifier.data[0]++; /* ensure verifier mismatch */ -} - static int filelayout_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { @@ -393,7 +365,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, switch (err) { case -NFS4ERR_RESET_TO_MDS: - prepare_to_resend_writes(data); + pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -EAGAIN: rpc_restart_call_prepare(task); @@ -451,16 +423,6 @@ static void filelayout_write_count_stats(struct rpc_task *task, void *data) rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); } -static void filelayout_write_release(void *data) -{ - struct nfs_pgio_header *hdr = data; - struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout; - - filelayout_fenceme(lo->plh_inode, lo); - nfs_put_client(hdr->ds_clp); - hdr->mds_ops->rpc_release(data); -} - static void filelayout_commit_prepare(struct rpc_task *task, void *data) { struct nfs_commit_data *wdata = data; @@ -471,14 +433,6 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data) task); } -static void filelayout_write_commit_done(struct rpc_task *task, void *data) -{ - struct nfs_commit_data *wdata = data; - - /* Note this may cause RPC to be resent */ - wdata->mds_ops->rpc_call_done(task, data); -} - static void filelayout_commit_count_stats(struct rpc_task *task, void *data) { struct nfs_commit_data *cdata = data; @@ -486,35 +440,25 @@ static void filelayout_commit_count_stats(struct rpc_task *task, void *data) rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); } -static void filelayout_commit_release(void *calldata) -{ - struct nfs_commit_data *data = calldata; - - data->completion_ops->completion(data); - pnfs_put_lseg(data->lseg); - nfs_put_client(data->ds_clp); - nfs_commitdata_release(data); -} - static const struct rpc_call_ops filelayout_read_call_ops = { .rpc_call_prepare = filelayout_read_prepare, .rpc_call_done = filelayout_read_call_done, .rpc_count_stats = filelayout_read_count_stats, - .rpc_release = filelayout_read_release, + .rpc_release = pnfs_generic_rw_release, }; static const struct rpc_call_ops filelayout_write_call_ops = { .rpc_call_prepare = filelayout_write_prepare, .rpc_call_done = filelayout_write_call_done, .rpc_count_stats = filelayout_write_count_stats, - .rpc_release = filelayout_write_release, + .rpc_release = pnfs_generic_rw_release, }; static const struct rpc_call_ops filelayout_commit_call_ops = { .rpc_call_prepare = filelayout_commit_prepare, - .rpc_call_done = filelayout_write_commit_done, + .rpc_call_done = pnfs_generic_write_commit_done, .rpc_count_stats = filelayout_commit_count_stats, - .rpc_release = filelayout_commit_release, + .rpc_release = pnfs_generic_commit_release, }; static enum pnfs_try_status @@ -548,7 +492,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) /* No multipath support. Use first DS */ atomic_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - hdr->ds_idx = idx; + hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) hdr->args.fh = fh; @@ -557,8 +501,9 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) hdr->mds_offset = offset; /* Perform an asynchronous read to ds */ - nfs_initiate_pgio(ds_clnt, hdr, - &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); + nfs_initiate_pgio(ds_clnt, hdr, hdr->cred, + NFS_PROTO(hdr->inode), &filelayout_read_call_ops, + 0, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; } @@ -591,16 +536,16 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->pgio_done_cb = filelayout_write_done_cb; atomic_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - hdr->ds_idx = idx; + hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); if (fh) hdr->args.fh = fh; hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); /* Perform an asynchronous write */ - nfs_initiate_pgio(ds_clnt, hdr, - &filelayout_write_call_ops, sync, - RPC_TASK_SOFTCONN); + nfs_initiate_pgio(ds_clnt, hdr, hdr->cred, + NFS_PROTO(hdr->inode), &filelayout_write_call_ops, + sync, RPC_TASK_SOFTCONN); return PNFS_ATTEMPTED; } @@ -988,12 +933,14 @@ static const struct nfs_pageio_ops filelayout_pg_read_ops = { .pg_init = filelayout_pg_init_read, .pg_test = filelayout_pg_test, .pg_doio = pnfs_generic_pg_readpages, + .pg_cleanup = pnfs_generic_pg_cleanup, }; static const struct nfs_pageio_ops filelayout_pg_write_ops = { .pg_init = filelayout_pg_init_write, .pg_test = filelayout_pg_test, .pg_doio = pnfs_generic_pg_writepages, + .pg_cleanup = pnfs_generic_pg_cleanup, }; static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) @@ -1004,87 +951,28 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) return j; } -/* The generic layer is about to remove the req from the commit list. - * If this will make the bucket empty, it will need to put the lseg reference. - * Note this is must be called holding the inode (/cinfo) lock - */ -static void -filelayout_clear_request_commit(struct nfs_page *req, - struct nfs_commit_info *cinfo) -{ - struct pnfs_layout_segment *freeme = NULL; - - if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) - goto out; - cinfo->ds->nwritten--; - if (list_is_singular(&req->wb_list)) { - struct pnfs_commit_bucket *bucket; - - bucket = list_first_entry(&req->wb_list, - struct pnfs_commit_bucket, - written); - freeme = bucket->wlseg; - bucket->wlseg = NULL; - } -out: - nfs_request_remove_commit_list(req, cinfo); - pnfs_put_lseg_locked(freeme); -} - static void filelayout_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo) + struct nfs_commit_info *cinfo, + u32 ds_commit_idx) { struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); u32 i, j; - struct list_head *list; - struct pnfs_commit_bucket *buckets; if (fl->commit_through_mds) { - list = &cinfo->mds->list; - spin_lock(cinfo->lock); - goto mds_commit; - } - - /* Note that we are calling nfs4_fl_calc_j_index on each page - * that ends up being committed to a data server. An attractive - * alternative is to add a field to nfs_write_data and nfs_page - * to store the value calculated in filelayout_write_pagelist - * and just use that here. - */ - j = nfs4_fl_calc_j_index(lseg, req_offset(req)); - i = select_bucket_index(fl, j); - spin_lock(cinfo->lock); - buckets = cinfo->ds->buckets; - list = &buckets[i].written; - if (list_empty(list)) { - /* Non-empty buckets hold a reference on the lseg. That ref - * is normally transferred to the COMMIT call and released - * there. It could also be released if the last req is pulled - * off due to a rewrite, in which case it will be done in - * filelayout_clear_request_commit + nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + } else { + /* Note that we are calling nfs4_fl_calc_j_index on each page + * that ends up being committed to a data server. An attractive + * alternative is to add a field to nfs_write_data and nfs_page + * to store the value calculated in filelayout_write_pagelist + * and just use that here. */ - buckets[i].wlseg = pnfs_get_lseg(lseg); - } - set_bit(PG_COMMIT_TO_DS, &req->wb_flags); - cinfo->ds->nwritten++; - -mds_commit: - /* nfs_request_add_commit_list(). We need to add req to list without - * dropping cinfo lock. - */ - set_bit(PG_CLEAN, &(req)->wb_flags); - nfs_list_add_request(req, list); - cinfo->mds->ncommit++; - spin_unlock(cinfo->lock); - if (!cinfo->dreq) { - inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, - BDI_RECLAIMABLE); - __mark_inode_dirty(req->wb_context->dentry->d_inode, - I_DIRTY_DATASYNC); + j = nfs4_fl_calc_j_index(lseg, req_offset(req)); + i = select_bucket_index(fl, j); + pnfs_layout_mark_request_commit(req, lseg, cinfo, i); } } @@ -1138,101 +1026,15 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) data->args.fh = fh; - return nfs_initiate_commit(ds_clnt, data, + return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode), &filelayout_commit_call_ops, how, RPC_TASK_SOFTCONN); out_err: - prepare_to_resend_writes(data); - filelayout_commit_release(data); + pnfs_generic_prepare_to_resend_writes(data); + pnfs_generic_commit_release(data); return -EAGAIN; } -static int -transfer_commit_list(struct list_head *src, struct list_head *dst, - struct nfs_commit_info *cinfo, int max) -{ - struct nfs_page *req, *tmp; - int ret = 0; - - list_for_each_entry_safe(req, tmp, src, wb_list) { - if (!nfs_lock_request(req)) - continue; - kref_get(&req->wb_kref); - if (cond_resched_lock(cinfo->lock)) - list_safe_reset_next(req, tmp, wb_list); - nfs_request_remove_commit_list(req, cinfo); - clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); - nfs_list_add_request(req, dst); - ret++; - if ((ret == max) && !cinfo->dreq) - break; - } - return ret; -} - -/* Note called with cinfo->lock held. */ -static int -filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, - struct nfs_commit_info *cinfo, - int max) -{ - struct list_head *src = &bucket->written; - struct list_head *dst = &bucket->committing; - int ret; - - ret = transfer_commit_list(src, dst, cinfo, max); - if (ret) { - cinfo->ds->nwritten -= ret; - cinfo->ds->ncommitting += ret; - bucket->clseg = bucket->wlseg; - if (list_empty(src)) - bucket->wlseg = NULL; - else - pnfs_get_lseg(bucket->clseg); - } - return ret; -} - -/* Move reqs from written to committing lists, returning count of number moved. - * Note called with cinfo->lock held. - */ -static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo, - int max) -{ - int i, rv = 0, cnt; - - for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { - cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i], - cinfo, max); - max -= cnt; - rv += cnt; - } - return rv; -} - -/* Pull everything off the committing lists and dump into @dst */ -static void filelayout_recover_commit_reqs(struct list_head *dst, - struct nfs_commit_info *cinfo) -{ - struct pnfs_commit_bucket *b; - struct pnfs_layout_segment *freeme; - int i; - -restart: - spin_lock(cinfo->lock); - for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { - if (transfer_commit_list(&b->written, dst, cinfo, 0)) { - freeme = b->wlseg; - b->wlseg = NULL; - spin_unlock(cinfo->lock); - pnfs_put_lseg(freeme); - goto restart; - } - } - cinfo->ds->nwritten = 0; - spin_unlock(cinfo->lock); -} - /* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest * for @page * @cinfo - commit info for current inode @@ -1263,108 +1065,14 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) return NULL; } -static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx) -{ - struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - struct pnfs_commit_bucket *bucket; - struct pnfs_layout_segment *freeme; - int i; - - for (i = idx; i < fl_cinfo->nbuckets; i++) { - bucket = &fl_cinfo->buckets[i]; - if (list_empty(&bucket->committing)) - continue; - nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); - spin_lock(cinfo->lock); - freeme = bucket->clseg; - bucket->clseg = NULL; - spin_unlock(cinfo->lock); - pnfs_put_lseg(freeme); - } -} - -static unsigned int -alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) -{ - struct pnfs_ds_commit_info *fl_cinfo; - struct pnfs_commit_bucket *bucket; - struct nfs_commit_data *data; - int i; - unsigned int nreq = 0; - - fl_cinfo = cinfo->ds; - bucket = fl_cinfo->buckets; - for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { - if (list_empty(&bucket->committing)) - continue; - data = nfs_commitdata_alloc(); - if (!data) - break; - data->ds_commit_index = i; - spin_lock(cinfo->lock); - data->lseg = bucket->clseg; - bucket->clseg = NULL; - spin_unlock(cinfo->lock); - list_add(&data->pages, list); - nreq++; - } - - /* Clean up on error */ - filelayout_retry_commit(cinfo, i); - /* Caller will clean up entries put on list */ - return nreq; -} - -/* This follows nfs_commit_list pretty closely */ static int filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, int how, struct nfs_commit_info *cinfo) { - struct nfs_commit_data *data, *tmp; - LIST_HEAD(list); - unsigned int nreq = 0; - - if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(); - if (data != NULL) { - data->lseg = NULL; - list_add(&data->pages, &list); - nreq++; - } else { - nfs_retry_commit(mds_pages, NULL, cinfo); - filelayout_retry_commit(cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); - return -ENOMEM; - } - } - - nreq += alloc_ds_commits(cinfo, &list); - - if (nreq == 0) { - cinfo->completion_ops->error_cleanup(NFS_I(inode)); - goto out; - } - - atomic_add(nreq, &cinfo->mds->rpcs_out); - - list_for_each_entry_safe(data, tmp, &list, pages) { - list_del_init(&data->pages); - if (!data->lseg) { - nfs_init_commit(data, mds_pages, NULL, cinfo); - nfs_initiate_commit(NFS_CLIENT(inode), data, - data->mds_ops, how, 0); - } else { - struct pnfs_commit_bucket *buckets; - - buckets = cinfo->ds->buckets; - nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo); - filelayout_initiate_commit(data, how); - } - } -out: - cinfo->ds->ncommitting = 0; - return PNFS_ATTEMPTED; + return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo, + filelayout_initiate_commit); } + static struct nfs4_deviceid_node * filelayout_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, gfp_t gfp_flags) @@ -1421,9 +1129,9 @@ static struct pnfs_layoutdriver_type filelayout_type = { .pg_write_ops = &filelayout_pg_write_ops, .get_ds_info = &filelayout_get_ds_info, .mark_request_commit = filelayout_mark_request_commit, - .clear_request_commit = filelayout_clear_request_commit, - .scan_commit_lists = filelayout_scan_commit_lists, - .recover_commit_reqs = filelayout_recover_commit_reqs, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, .search_commit_reqs = filelayout_search_commit_reqs, .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index 7c9f800c49d7..2896cb833a11 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -33,13 +33,6 @@ #include "../pnfs.h" /* - * Default data server connection timeout and retrans vaules. - * Set by module paramters dataserver_timeo and dataserver_retrans. - */ -#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ -#define NFS4_DEF_DS_RETRANS 5 - -/* * Field testing shows we need to support up to 4096 stripe indices. * We store each index as a u8 (u32 on the wire) to keep the memory footprint * reasonable. This in turn means we support a maximum of 256 @@ -48,32 +41,11 @@ #define NFS4_PNFS_MAX_STRIPE_CNT 4096 #define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ -/* error codes for internal use */ -#define NFS4ERR_RESET_TO_MDS 12001 - enum stripetype4 { STRIPE_SPARSE = 1, STRIPE_DENSE = 2 }; -/* Individual ip address */ -struct nfs4_pnfs_ds_addr { - struct sockaddr_storage da_addr; - size_t da_addrlen; - struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */ - char *da_remotestr; /* human readable addr+port */ -}; - -struct nfs4_pnfs_ds { - struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ - char *ds_remotestr; /* comma sep list of addrs */ - struct list_head ds_addrs; - struct nfs_client *ds_clp; - atomic_t ds_count; - unsigned long ds_state; -#define NFS4DS_CONNECTING 0 /* ds is establishing connection */ -}; - struct nfs4_file_layout_dsaddr { struct nfs4_deviceid_node id_node; u32 stripe_count; @@ -119,17 +91,6 @@ FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg) return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; } -static inline void -filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node) -{ - u32 *p = (u32 *)&node->deviceid; - - printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n", - p[0], p[1], p[2], p[3]); - - set_bit(NFS_DEVICEID_INVALID, &node->flags); -} - static inline bool filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) { @@ -142,7 +103,6 @@ filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node); extern struct nfs_fh * nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); -extern void print_ds(struct nfs4_pnfs_ds *ds); u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 9bb806a76d99..4f372e224603 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -31,7 +31,6 @@ #include <linux/nfs_fs.h> #include <linux/vmalloc.h> #include <linux/module.h> -#include <linux/sunrpc/addr.h> #include "../internal.h" #include "../nfs4session.h" @@ -42,184 +41,6 @@ static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; -/* - * Data server cache - * - * Data servers can be mapped to different device ids. - * nfs4_pnfs_ds reference counting - * - set to 1 on allocation - * - incremented when a device id maps a data server already in the cache. - * - decremented when deviceid is removed from the cache. - */ -static DEFINE_SPINLOCK(nfs4_ds_cache_lock); -static LIST_HEAD(nfs4_data_server_cache); - -/* Debug routines */ -void -print_ds(struct nfs4_pnfs_ds *ds) -{ - if (ds == NULL) { - printk("%s NULL device\n", __func__); - return; - } - printk(" ds %s\n" - " ref count %d\n" - " client %p\n" - " cl_exchange_flags %x\n", - ds->ds_remotestr, - atomic_read(&ds->ds_count), ds->ds_clp, - ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); -} - -static bool -same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) -{ - struct sockaddr_in *a, *b; - struct sockaddr_in6 *a6, *b6; - - if (addr1->sa_family != addr2->sa_family) - return false; - - switch (addr1->sa_family) { - case AF_INET: - a = (struct sockaddr_in *)addr1; - b = (struct sockaddr_in *)addr2; - - if (a->sin_addr.s_addr == b->sin_addr.s_addr && - a->sin_port == b->sin_port) - return true; - break; - - case AF_INET6: - a6 = (struct sockaddr_in6 *)addr1; - b6 = (struct sockaddr_in6 *)addr2; - - /* LINKLOCAL addresses must have matching scope_id */ - if (ipv6_addr_src_scope(&a6->sin6_addr) == - IPV6_ADDR_SCOPE_LINKLOCAL && - a6->sin6_scope_id != b6->sin6_scope_id) - return false; - - if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && - a6->sin6_port == b6->sin6_port) - return true; - break; - - default: - dprintk("%s: unhandled address family: %u\n", - __func__, addr1->sa_family); - return false; - } - - return false; -} - -static bool -_same_data_server_addrs_locked(const struct list_head *dsaddrs1, - const struct list_head *dsaddrs2) -{ - struct nfs4_pnfs_ds_addr *da1, *da2; - - /* step through both lists, comparing as we go */ - for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), - da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); - da1 != NULL && da2 != NULL; - da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), - da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { - if (!same_sockaddr((struct sockaddr *)&da1->da_addr, - (struct sockaddr *)&da2->da_addr)) - return false; - } - if (da1 == NULL && da2 == NULL) - return true; - - return false; -} - -/* - * Lookup DS by addresses. nfs4_ds_cache_lock is held - */ -static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct list_head *dsaddrs) -{ - struct nfs4_pnfs_ds *ds; - - list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) - if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) - return ds; - return NULL; -} - -/* - * Create an rpc connection to the nfs4_pnfs_ds data server - * Currently only supports IPv4 and IPv6 addresses - */ -static int -nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) -{ - struct nfs_client *clp = ERR_PTR(-EIO); - struct nfs4_pnfs_ds_addr *da; - int status = 0; - - dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, - mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); - - list_for_each_entry(da, &ds->ds_addrs, da_node) { - dprintk("%s: DS %s: trying address %s\n", - __func__, ds->ds_remotestr, da->da_remotestr); - - clp = nfs4_set_ds_client(mds_srv->nfs_client, - (struct sockaddr *)&da->da_addr, - da->da_addrlen, IPPROTO_TCP, - dataserver_timeo, dataserver_retrans); - if (!IS_ERR(clp)) - break; - } - - if (IS_ERR(clp)) { - status = PTR_ERR(clp); - goto out; - } - - status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); - if (status) - goto out_put; - - smp_wmb(); - ds->ds_clp = clp; - dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); -out: - return status; -out_put: - nfs_put_client(clp); - goto out; -} - -static void -destroy_ds(struct nfs4_pnfs_ds *ds) -{ - struct nfs4_pnfs_ds_addr *da; - - dprintk("--> %s\n", __func__); - ifdebug(FACILITY) - print_ds(ds); - - if (ds->ds_clp) - nfs_put_client(ds->ds_clp); - - while (!list_empty(&ds->ds_addrs)) { - da = list_first_entry(&ds->ds_addrs, - struct nfs4_pnfs_ds_addr, - da_node); - list_del_init(&da->da_node); - kfree(da->da_remotestr); - kfree(da); - } - - kfree(ds->ds_remotestr); - kfree(ds); -} - void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) { @@ -230,259 +51,13 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) for (i = 0; i < dsaddr->ds_num; i++) { ds = dsaddr->ds_list[i]; - if (ds != NULL) { - if (atomic_dec_and_lock(&ds->ds_count, - &nfs4_ds_cache_lock)) { - list_del_init(&ds->ds_node); - spin_unlock(&nfs4_ds_cache_lock); - destroy_ds(ds); - } - } + if (ds != NULL) + nfs4_pnfs_ds_put(ds); } kfree(dsaddr->stripe_indices); kfree(dsaddr); } -/* - * Create a string with a human readable address and port to avoid - * complicated setup around many dprinks. - */ -static char * -nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) -{ - struct nfs4_pnfs_ds_addr *da; - char *remotestr; - size_t len; - char *p; - - len = 3; /* '{', '}' and eol */ - list_for_each_entry(da, dsaddrs, da_node) { - len += strlen(da->da_remotestr) + 1; /* string plus comma */ - } - - remotestr = kzalloc(len, gfp_flags); - if (!remotestr) - return NULL; - - p = remotestr; - *(p++) = '{'; - len--; - list_for_each_entry(da, dsaddrs, da_node) { - size_t ll = strlen(da->da_remotestr); - - if (ll > len) - goto out_err; - - memcpy(p, da->da_remotestr, ll); - p += ll; - len -= ll; - - if (len < 1) - goto out_err; - (*p++) = ','; - len--; - } - if (len < 2) - goto out_err; - *(p++) = '}'; - *p = '\0'; - return remotestr; -out_err: - kfree(remotestr); - return NULL; -} - -static struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) -{ - struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; - char *remotestr; - - if (list_empty(dsaddrs)) { - dprintk("%s: no addresses defined\n", __func__); - goto out; - } - - ds = kzalloc(sizeof(*ds), gfp_flags); - if (!ds) - goto out; - - /* this is only used for debugging, so it's ok if its NULL */ - remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); - - spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(dsaddrs); - if (tmp_ds == NULL) { - INIT_LIST_HEAD(&ds->ds_addrs); - list_splice_init(dsaddrs, &ds->ds_addrs); - ds->ds_remotestr = remotestr; - atomic_set(&ds->ds_count, 1); - INIT_LIST_HEAD(&ds->ds_node); - ds->ds_clp = NULL; - list_add(&ds->ds_node, &nfs4_data_server_cache); - dprintk("%s add new data server %s\n", __func__, - ds->ds_remotestr); - } else { - kfree(remotestr); - kfree(ds); - atomic_inc(&tmp_ds->ds_count); - dprintk("%s data server %s found, inc'ed ds_count to %d\n", - __func__, tmp_ds->ds_remotestr, - atomic_read(&tmp_ds->ds_count)); - ds = tmp_ds; - } - spin_unlock(&nfs4_ds_cache_lock); -out: - return ds; -} - -/* - * Currently only supports ipv4, ipv6 and one multi-path address. - */ -static struct nfs4_pnfs_ds_addr * -decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags) -{ - struct nfs4_pnfs_ds_addr *da = NULL; - char *buf, *portstr; - __be16 port; - int nlen, rlen; - int tmp[2]; - __be32 *p; - char *netid, *match_netid; - size_t len, match_netid_len; - char *startsep = ""; - char *endsep = ""; - - - /* r_netid */ - p = xdr_inline_decode(streamp, 4); - if (unlikely(!p)) - goto out_err; - nlen = be32_to_cpup(p++); - - p = xdr_inline_decode(streamp, nlen); - if (unlikely(!p)) - goto out_err; - - netid = kmalloc(nlen+1, gfp_flags); - if (unlikely(!netid)) - goto out_err; - - netid[nlen] = '\0'; - memcpy(netid, p, nlen); - - /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ - p = xdr_inline_decode(streamp, 4); - if (unlikely(!p)) - goto out_free_netid; - rlen = be32_to_cpup(p); - - p = xdr_inline_decode(streamp, rlen); - if (unlikely(!p)) - goto out_free_netid; - - /* port is ".ABC.DEF", 8 chars max */ - if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { - dprintk("%s: Invalid address, length %d\n", __func__, - rlen); - goto out_free_netid; - } - buf = kmalloc(rlen + 1, gfp_flags); - if (!buf) { - dprintk("%s: Not enough memory\n", __func__); - goto out_free_netid; - } - buf[rlen] = '\0'; - memcpy(buf, p, rlen); - - /* replace port '.' with '-' */ - portstr = strrchr(buf, '.'); - if (!portstr) { - dprintk("%s: Failed finding expected dot in port\n", - __func__); - goto out_free_buf; - } - *portstr = '-'; - - /* find '.' between address and port */ - portstr = strrchr(buf, '.'); - if (!portstr) { - dprintk("%s: Failed finding expected dot between address and " - "port\n", __func__); - goto out_free_buf; - } - *portstr = '\0'; - - da = kzalloc(sizeof(*da), gfp_flags); - if (unlikely(!da)) - goto out_free_buf; - - INIT_LIST_HEAD(&da->da_node); - - if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr, - sizeof(da->da_addr))) { - dprintk("%s: error parsing address %s\n", __func__, buf); - goto out_free_da; - } - - portstr++; - sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]); - port = htons((tmp[0] << 8) | (tmp[1])); - - switch (da->da_addr.ss_family) { - case AF_INET: - ((struct sockaddr_in *)&da->da_addr)->sin_port = port; - da->da_addrlen = sizeof(struct sockaddr_in); - match_netid = "tcp"; - match_netid_len = 3; - break; - - case AF_INET6: - ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; - da->da_addrlen = sizeof(struct sockaddr_in6); - match_netid = "tcp6"; - match_netid_len = 4; - startsep = "["; - endsep = "]"; - break; - - default: - dprintk("%s: unsupported address family: %u\n", - __func__, da->da_addr.ss_family); - goto out_free_da; - } - - if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { - dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", - __func__, netid, match_netid); - goto out_free_da; - } - - /* save human readable address */ - len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; - da->da_remotestr = kzalloc(len, gfp_flags); - - /* NULL is ok, only used for dprintk */ - if (da->da_remotestr) - snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep, - buf, endsep, ntohs(port)); - - dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); - kfree(buf); - kfree(netid); - return da; - -out_free_da: - kfree(da); -out_free_buf: - dprintk("%s: Error parsing DS addr: %s\n", __func__, buf); - kfree(buf); -out_free_netid: - kfree(netid); -out_err: - return NULL; -} - /* Decode opaque device data and return the result */ struct nfs4_file_layout_dsaddr * nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, @@ -585,8 +160,8 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = decode_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, + &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -682,22 +257,7 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) return flseg->fh_array[i]; } -static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) -{ - might_sleep(); - wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING, - nfs_wait_bit_killable, TASK_KILLABLE); -} - -static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) -{ - smp_mb__before_atomic(); - clear_bit(NFS4DS_CONNECTING, &ds->ds_state); - smp_mb__after_atomic(); - wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); -} - - +/* Upon return, either ds is connected, or ds is NULL */ struct nfs4_pnfs_ds * nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) { @@ -705,29 +265,23 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); struct nfs4_pnfs_ds *ret = ds; + struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); if (ds == NULL) { printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", __func__, ds_idx); - filelayout_mark_devid_invalid(devid); + pnfs_generic_mark_devid_invalid(devid); goto out; } smp_rmb(); if (ds->ds_clp) goto out_test_devid; - if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { - struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); - int err; - - err = nfs4_ds_connect(s, ds); - if (err) - nfs4_mark_deviceid_unavailable(devid); - nfs4_clear_ds_conn_bit(ds); - } else { - /* Either ds is connected, or ds is NULL */ - nfs4_wait_ds_connect(ds); - } + nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, + dataserver_retrans, 4, + s->nfs_client->cl_minorversion, + s->nfs_client->cl_rpcclient->cl_auth->au_flavor); + out_test_devid: if (filelayout_test_devid_unavailable(devid)) ret = NULL; diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile new file mode 100644 index 000000000000..1d2c9f6bbcd4 --- /dev/null +++ b/fs/nfs/flexfilelayout/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the pNFS Flexfile Layout Driver kernel module +# +obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o +nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c new file mode 100644 index 000000000000..315cc68945b9 --- /dev/null +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -0,0 +1,1533 @@ +/* + * Module for pnfs flexfile layout driver. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. + * + * Tao Peng <bergwolf@primarydata.com> + */ + +#include <linux/nfs_fs.h> +#include <linux/nfs_page.h> +#include <linux/module.h> + +#include <linux/sunrpc/metrics.h> +#include <linux/nfs_idmap.h> + +#include "flexfilelayout.h" +#include "../nfs4session.h" +#include "../internal.h" +#include "../delegation.h" +#include "../nfs4trace.h" +#include "../iostat.h" +#include "../nfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) + +static struct pnfs_layout_hdr * +ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) +{ + struct nfs4_flexfile_layout *ffl; + + ffl = kzalloc(sizeof(*ffl), gfp_flags); + if (ffl) { + INIT_LIST_HEAD(&ffl->error_list); + return &ffl->generic_hdr; + } else + return NULL; +} + +static void +ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct nfs4_ff_layout_ds_err *err, *n; + + list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list, + list) { + list_del(&err->list); + kfree(err); + } + kfree(FF_LAYOUT_FROM_HDR(lo)); +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); + if (unlikely(p == NULL)) + return -ENOBUFS; + memcpy(stateid, p, NFS4_STATEID_SIZE); + dprintk("%s: stateid id= [%x%x%x%x]\n", __func__, + p[0], p[1], p[2], p[3]); + return 0; +} + +static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE); + if (unlikely(!p)) + return -ENOBUFS; + memcpy(devid, p, NFS4_DEVICEID4_SIZE); + nfs4_print_deviceid(devid); + return 0; +} + +static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -ENOBUFS; + fh->size = be32_to_cpup(p++); + if (fh->size > sizeof(struct nfs_fh)) { + printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n", + fh->size); + return -EOVERFLOW; + } + /* fh.data */ + p = xdr_inline_decode(xdr, fh->size); + if (unlikely(!p)) + return -ENOBUFS; + memcpy(&fh->data, p, fh->size); + dprintk("%s: fh len %d\n", __func__, fh->size); + + return 0; +} + +/* + * Currently only stringified uids and gids are accepted. + * I.e., kerberos is not supported to the DSes, so no pricipals. + * + * That means that one common function will suffice, but when + * principals are added, this should be split to accomodate + * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid(). + */ +static int +decode_name(struct xdr_stream *xdr, u32 *id) +{ + __be32 *p; + int len; + + /* opaque_length(4)*/ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -ENOBUFS; + len = be32_to_cpup(p++); + if (len < 0) + return -EINVAL; + + dprintk("%s: len %u\n", __func__, len); + + /* opaque body */ + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + return -ENOBUFS; + + if (!nfs_map_string_to_numeric((char *)p, len, id)) + return -EINVAL; + + return 0; +} + +static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) +{ + int i; + + if (fls->mirror_array) { + for (i = 0; i < fls->mirror_array_cnt; i++) { + /* normally mirror_ds is freed in + * .free_deviceid_node but we still do it here + * for .alloc_lseg error path */ + if (fls->mirror_array[i]) { + kfree(fls->mirror_array[i]->fh_versions); + nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); + kfree(fls->mirror_array[i]); + } + } + kfree(fls->mirror_array); + fls->mirror_array = NULL; + } +} + +static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr) +{ + int ret = 0; + + dprintk("--> %s\n", __func__); + + /* FIXME: remove this check when layout segment support is added */ + if (lgr->range.offset != 0 || + lgr->range.length != NFS4_MAX_UINT64) { + dprintk("%s Only whole file layouts supported. Use MDS i/o\n", + __func__); + ret = -EINVAL; + } + + dprintk("--> %s returns %d\n", __func__, ret); + return ret; +} + +static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) +{ + if (fls) { + ff_layout_free_mirror_array(fls); + kfree(fls); + } +} + +static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) +{ + struct nfs4_ff_layout_mirror *tmp; + int i, j; + + for (i = 0; i < fls->mirror_array_cnt - 1; i++) { + for (j = i + 1; j < fls->mirror_array_cnt; j++) + if (fls->mirror_array[i]->efficiency < + fls->mirror_array[j]->efficiency) { + tmp = fls->mirror_array[i]; + fls->mirror_array[i] = fls->mirror_array[j]; + fls->mirror_array[j] = tmp; + } + } +} + +static struct pnfs_layout_segment * +ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, + struct nfs4_layoutget_res *lgr, + gfp_t gfp_flags) +{ + struct pnfs_layout_segment *ret; + struct nfs4_ff_layout_segment *fls = NULL; + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + u64 stripe_unit; + u32 mirror_array_cnt; + __be32 *p; + int i, rc; + + dprintk("--> %s\n", __func__); + scratch = alloc_page(gfp_flags); + if (!scratch) + return ERR_PTR(-ENOMEM); + + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, + lgr->layoutp->len); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* stripe unit and mirror_array_cnt */ + rc = -EIO; + p = xdr_inline_decode(&stream, 8 + 4); + if (!p) + goto out_err_free; + + p = xdr_decode_hyper(p, &stripe_unit); + mirror_array_cnt = be32_to_cpup(p++); + dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__, + stripe_unit, mirror_array_cnt); + + if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT || + mirror_array_cnt == 0) + goto out_err_free; + + rc = -ENOMEM; + fls = kzalloc(sizeof(*fls), gfp_flags); + if (!fls) + goto out_err_free; + + fls->mirror_array_cnt = mirror_array_cnt; + fls->stripe_unit = stripe_unit; + fls->mirror_array = kcalloc(fls->mirror_array_cnt, + sizeof(fls->mirror_array[0]), gfp_flags); + if (fls->mirror_array == NULL) + goto out_err_free; + + for (i = 0; i < fls->mirror_array_cnt; i++) { + struct nfs4_deviceid devid; + struct nfs4_deviceid_node *idnode; + u32 ds_count; + u32 fh_count; + int j; + + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + ds_count = be32_to_cpup(p); + + /* FIXME: allow for striping? */ + if (ds_count != 1) + goto out_err_free; + + fls->mirror_array[i] = + kzalloc(sizeof(struct nfs4_ff_layout_mirror), + gfp_flags); + if (fls->mirror_array[i] == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + spin_lock_init(&fls->mirror_array[i]->lock); + fls->mirror_array[i]->ds_count = ds_count; + + /* deviceid */ + rc = decode_deviceid(&stream, &devid); + if (rc) + goto out_err_free; + + idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), + &devid, lh->plh_lc_cred, + gfp_flags); + /* + * upon success, mirror_ds is allocated by previous + * getdeviceinfo, or newly by .alloc_deviceid_node + * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure + */ + if (idnode) + fls->mirror_array[i]->mirror_ds = + FF_LAYOUT_MIRROR_DS(idnode); + else + goto out_err_free; + + /* efficiency */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fls->mirror_array[i]->efficiency = be32_to_cpup(p); + + /* stateid */ + rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid); + if (rc) + goto out_err_free; + + /* fh */ + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fh_count = be32_to_cpup(p); + + fls->mirror_array[i]->fh_versions = + kzalloc(fh_count * sizeof(struct nfs_fh), + gfp_flags); + if (fls->mirror_array[i]->fh_versions == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + for (j = 0; j < fh_count; j++) { + rc = decode_nfs_fh(&stream, + &fls->mirror_array[i]->fh_versions[j]); + if (rc) + goto out_err_free; + } + + fls->mirror_array[i]->fh_versions_cnt = fh_count; + + /* user */ + rc = decode_name(&stream, &fls->mirror_array[i]->uid); + if (rc) + goto out_err_free; + + /* group */ + rc = decode_name(&stream, &fls->mirror_array[i]->gid); + if (rc) + goto out_err_free; + + dprintk("%s: uid %d gid %d\n", __func__, + fls->mirror_array[i]->uid, + fls->mirror_array[i]->gid); + } + + ff_layout_sort_mirrors(fls); + rc = ff_layout_check_layout(lgr); + if (rc) + goto out_err_free; + + ret = &fls->generic_hdr; + dprintk("<-- %s (success)\n", __func__); +out_free_page: + __free_page(scratch); + return ret; +out_err_free: + _ff_layout_free_lseg(fls); + ret = ERR_PTR(rc); + dprintk("<-- %s (%d)\n", __func__, rc); + goto out_free_page; +} + +static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &layout->plh_segs, pls_list) + if (lseg->pls_range.iomode == IOMODE_RW) + return true; + + return false; +} + +static void +ff_layout_free_lseg(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + int i; + + dprintk("--> %s\n", __func__); + + for (i = 0; i < fls->mirror_array_cnt; i++) { + if (fls->mirror_array[i]) { + nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); + fls->mirror_array[i]->mirror_ds = NULL; + if (fls->mirror_array[i]->cred) { + put_rpccred(fls->mirror_array[i]->cred); + fls->mirror_array[i]->cred = NULL; + } + } + } + + if (lseg->pls_range.iomode == IOMODE_RW) { + struct nfs4_flexfile_layout *ffl; + struct inode *inode; + + ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout); + inode = ffl->generic_hdr.plh_inode; + spin_lock(&inode->i_lock); + if (!ff_layout_has_rw_segments(lseg->pls_layout)) { + ffl->commit_info.nbuckets = 0; + kfree(ffl->commit_info.buckets); + ffl->commit_info.buckets = NULL; + } + spin_unlock(&inode->i_lock); + } + _ff_layout_free_lseg(fls); +} + +/* Return 1 until we have multiple lsegs support */ +static int +ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) +{ + return 1; +} + +static int +ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + gfp_t gfp_flags) +{ + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + struct pnfs_commit_bucket *buckets; + int size; + + if (cinfo->ds->nbuckets != 0) { + /* This assumes there is only one RW lseg per file. + * To support multiple lseg per file, we need to + * change struct pnfs_commit_bucket to allow dynamic + * increasing nbuckets. + */ + return 0; + } + + size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg); + + buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), + gfp_flags); + if (!buckets) + return -ENOMEM; + else { + int i; + + spin_lock(cinfo->lock); + if (cinfo->ds->nbuckets != 0) + kfree(buckets); + else { + cinfo->ds->buckets = buckets; + cinfo->ds->nbuckets = size; + for (i = 0; i < size; i++) { + INIT_LIST_HEAD(&buckets[i].written); + INIT_LIST_HEAD(&buckets[i].committing); + /* mark direct verifier as unset */ + buckets[i].direct_verf.committed = + NFS_INVALID_STABLE_HOW; + } + } + spin_unlock(cinfo->lock); + return 0; + } +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio, + int *best_idx) +{ + struct nfs4_ff_layout_segment *fls; + struct nfs4_pnfs_ds *ds; + int idx; + + fls = FF_LAYOUT_LSEG(pgio->pg_lseg); + /* mirrors are sorted by efficiency */ + for (idx = 0; idx < fls->mirror_array_cnt; idx++) { + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false); + if (ds) { + *best_idx = idx; + return ds; + } + } + + return NULL; +} + +static void +ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + struct nfs_pgio_mirror *pgm; + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_pnfs_ds *ds; + int ds_idx; + + /* Use full layout for now */ + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_READ, + GFP_KERNEL); + /* If no lseg, fall back to read through mds */ + if (pgio->pg_lseg == NULL) + goto out_mds; + + ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx); + if (!ds) + goto out_mds; + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); + + pgio->pg_mirror_idx = ds_idx; + + /* read always uses only one mirror - idx 0 for pgio layer */ + pgm = &pgio->pg_mirrors[0]; + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + + return; +out_mds: + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + nfs_pageio_reset_read_mds(pgio); +} + +static void +ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs_pgio_mirror *pgm; + struct nfs_commit_info cinfo; + struct nfs4_pnfs_ds *ds; + int i; + int status; + + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + /* If no lseg, fall back to write through mds */ + if (pgio->pg_lseg == NULL) + goto out_mds; + + nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); + status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); + if (status < 0) + goto out_mds; + + /* Use a direct mapping of ds_idx to pgio mirror_idx */ + if (WARN_ON_ONCE(pgio->pg_mirror_count != + FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))) + goto out_mds; + + for (i = 0; i < pgio->pg_mirror_count; i++) { + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); + if (!ds) + goto out_mds; + pgm = &pgio->pg_mirrors[i]; + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; + } + + return; + +out_mds: + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + nfs_pageio_reset_write_mds(pgio); +} + +static unsigned int +ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + if (!pgio->pg_lseg) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + 0, + NFS4_MAX_UINT64, + IOMODE_RW, + GFP_NOFS); + if (pgio->pg_lseg) + return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); + + /* no lseg means that pnfs is not in use, so no mirroring here */ + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; + nfs_pageio_reset_write_mds(pgio); + return 1; +} + +static const struct nfs_pageio_ops ff_layout_pg_read_ops = { + .pg_init = ff_layout_pg_init_read, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_readpages, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static const struct nfs_pageio_ops ff_layout_pg_write_ops = { + .pg_init = ff_layout_pg_init_write, + .pg_test = pnfs_generic_pg_test, + .pg_doio = pnfs_generic_pg_writepages, + .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, + .pg_cleanup = pnfs_generic_pg_cleanup, +}; + +static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) +{ + struct rpc_task *task = &hdr->task; + + pnfs_layoutcommit_inode(hdr->inode, false); + + if (retry_pnfs) { + dprintk("%s Reset task %5u for i/o through pNFS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + if (!hdr->dreq) { + struct nfs_open_context *ctx; + + ctx = nfs_list_entry(hdr->pages.next)->wb_context; + set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); + hdr->completion_ops->error_cleanup(&hdr->pages); + } else { + nfs_direct_set_resched_writes(hdr->dreq); + /* fake unstable write to let common nfs resend pages */ + hdr->verf.committed = NFS_UNSTABLE; + hdr->good_bytes = 0; + } + return; + } + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + task->tk_status = pnfs_write_done_resend_to_mds(hdr); + } +} + +static void ff_layout_reset_read(struct nfs_pgio_header *hdr) +{ + struct rpc_task *task = &hdr->task; + + pnfs_layoutcommit_inode(hdr->inode, false); + + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + dprintk("%s Reset task %5u for i/o through MDS " + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, + hdr->task.tk_pid, + hdr->inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(hdr->inode), + hdr->args.count, + (unsigned long long)hdr->args.offset); + + task->tk_status = pnfs_read_done_resend_to_mds(hdr); + } +} + +static int ff_layout_async_handle_error_v4(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + struct pnfs_layout_segment *lseg, + int idx) +{ + struct pnfs_layout_hdr *lo = lseg->pls_layout; + struct inode *inode = lo->plh_inode; + struct nfs_server *mds_server = NFS_SERVER(inode); + + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs_client *mds_client = mds_server->nfs_client; + struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; + + if (task->tk_status >= 0) + return 0; + + switch (task->tk_status) { + /* MDS state errors */ + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + if (state == NULL) + break; + nfs_remove_bad_delegation(state->inode); + case -NFS4ERR_OPENMODE: + if (state == NULL) + break; + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + goto wait_on_recovery; + case -NFS4ERR_EXPIRED: + if (state != NULL) { + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) + goto out_bad_stateid; + } + nfs4_schedule_lease_recovery(mds_client); + goto wait_on_recovery; + /* DS session errors */ + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); + break; + case -NFS4ERR_RETRY_UNCACHED_REP: + break; + /* Invalidate Layout errors */ + case -NFS4ERR_PNFS_NO_LAYOUT: + case -ESTALE: /* mapped NFS4ERR_STALE */ + case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ + case -EISDIR: /* mapped NFS4ERR_ISDIR */ + case -NFS4ERR_FHEXPIRED: + case -NFS4ERR_WRONG_TYPE: + dprintk("%s Invalid layout error %d\n", __func__, + task->tk_status); + /* + * Destroy layout so new i/o will get a new layout. + * Layout will not be destroyed until all current lseg + * references are put. Mark layout as invalid to resend failed + * i/o and all i/o waiting on the slot table to the MDS until + * layout is destroyed and a new valid layout is obtained. + */ + pnfs_destroy_layout(NFS_I(inode)); + rpc_wake_up(&tbl->slot_tbl_waitq); + goto reset; + /* RPC connection errors */ + case -ECONNREFUSED: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EIO: + case -ETIMEDOUT: + case -EPIPE: + dprintk("%s DS connection error %d\n", __func__, + task->tk_status); + nfs4_mark_deviceid_unavailable(devid); + rpc_wake_up(&tbl->slot_tbl_waitq); + /* fall through */ + default: + if (ff_layout_has_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; +reset: + dprintk("%s Retry through MDS. Error %d\n", __func__, + task->tk_status); + return -NFS4ERR_RESET_TO_MDS; + } +out: + task->tk_status = 0; + return -EAGAIN; +out_bad_stateid: + task->tk_status = -EIO; + return 0; +wait_on_recovery: + rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) + rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task); + goto out; +} + +/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ +static int ff_layout_async_handle_error_v3(struct rpc_task *task, + struct pnfs_layout_segment *lseg, + int idx) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + + if (task->tk_status >= 0) + return 0; + + if (task->tk_status != -EJUKEBOX) { + dprintk("%s DS connection error %d\n", __func__, + task->tk_status); + nfs4_mark_deviceid_unavailable(devid); + if (ff_layout_has_available_ds(lseg)) + return -NFS4ERR_RESET_TO_PNFS; + else + return -NFS4ERR_RESET_TO_MDS; + } + + if (task->tk_status == -EJUKEBOX) + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return -EAGAIN; +} + +static int ff_layout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + struct pnfs_layout_segment *lseg, + int idx) +{ + int vers = clp->cl_nfs_mod->rpc_vers->number; + + switch (vers) { + case 3: + return ff_layout_async_handle_error_v3(task, lseg, idx); + case 4: + return ff_layout_async_handle_error_v4(task, state, clp, + lseg, idx); + default: + /* should never happen */ + WARN_ON_ONCE(1); + return 0; + } +} + +static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, + int idx, u64 offset, u64 length, + u32 status, int opnum) +{ + struct nfs4_ff_layout_mirror *mirror; + int err; + + mirror = FF_LAYOUT_COMP(lseg, idx); + err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, offset, length, status, opnum, + GFP_NOIO); + dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); +} + +/* NFS_PROTO call done callback routines */ + +static int ff_layout_read_done_cb(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + struct inode *inode; + int err; + + trace_nfs4_pnfs_read(hdr, task->tk_status); + if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) + hdr->res.op_status = NFS4ERR_NXIO; + if (task->tk_status < 0 && hdr->res.op_status) + ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + hdr->args.offset, hdr->args.count, + hdr->res.op_status, OP_READ); + err = ff_layout_async_handle_error(task, hdr->args.context->state, + hdr->ds_clp, hdr->lseg, + hdr->pgio_mirror_idx); + + switch (err) { + case -NFS4ERR_RESET_TO_PNFS: + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &hdr->lseg->pls_layout->plh_flags); + pnfs_read_resend_pnfs(hdr); + return task->tk_status; + case -NFS4ERR_RESET_TO_MDS: + inode = hdr->lseg->pls_layout->plh_inode; + pnfs_error_mark_layout_for_return(inode, hdr->lseg); + ff_layout_reset_read(hdr); + return task->tk_status; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + return 0; +} + +/* + * We reference the rpc_cred of the first WRITE that triggers the need for + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. + * rfc5661 is not clear about which credential should be used. + * + * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so + * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751 + * we always send layoutcommit after DS writes. + */ +static void +ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) +{ + pnfs_set_layoutcommit(hdr); + dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, + (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); +} + +static bool +ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) +{ + /* No mirroring for now */ + struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); + + return ff_layout_test_devid_unavailable(node); +} + +static int ff_layout_read_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { + rpc_exit(task, -EIO); + return -EIO; + } + if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); + if (ff_layout_has_available_ds(hdr->lseg)) + pnfs_read_resend_pnfs(hdr); + else + ff_layout_reset_read(hdr); + rpc_exit(task, 0); + return -EAGAIN; + } + hdr->pgio_done_cb = ff_layout_read_done_cb; + + return 0; +} + +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_read_prepare_common(task, hdr)) + return; + + rpc_call_start(task); +} + +static int ff_layout_setup_sequence(struct nfs_client *ds_clp, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) +{ + if (ds_clp->cl_session) + return nfs41_setup_sequence(ds_clp->cl_session, + args, + res, + task); + return nfs40_setup_sequence(ds_clp->cl_slot_tbl, + args, + res, + task); +} + +static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_read_prepare_common(task, hdr)) + return; + + if (ff_layout_setup_sequence(hdr->ds_clp, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) + return; + + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, FMODE_READ) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void ff_layout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && + task->tk_status == 0) { + nfs4_sequence_done(task, &hdr->res.seq_res); + return; + } + + /* Note this may cause RPC to be resent */ + hdr->mds_ops->rpc_call_done(task, hdr); +} + +static void ff_layout_read_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + rpc_count_iostats_metrics(task, + &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]); +} + +static int ff_layout_write_done_cb(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + struct inode *inode; + int err; + + trace_nfs4_pnfs_write(hdr, task->tk_status); + if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) + hdr->res.op_status = NFS4ERR_NXIO; + if (task->tk_status < 0 && hdr->res.op_status) + ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + hdr->args.offset, hdr->args.count, + hdr->res.op_status, OP_WRITE); + err = ff_layout_async_handle_error(task, hdr->args.context->state, + hdr->ds_clp, hdr->lseg, + hdr->pgio_mirror_idx); + + switch (err) { + case -NFS4ERR_RESET_TO_PNFS: + case -NFS4ERR_RESET_TO_MDS: + inode = hdr->lseg->pls_layout->plh_inode; + pnfs_error_mark_layout_for_return(inode, hdr->lseg); + if (err == -NFS4ERR_RESET_TO_PNFS) { + pnfs_set_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, true); + } else { + pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); + ff_layout_reset_write(hdr, false); + } + return task->tk_status; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + if (hdr->res.verf->committed == NFS_FILE_SYNC || + hdr->res.verf->committed == NFS_DATA_SYNC) + ff_layout_set_layoutcommit(hdr); + + return 0; +} + +static int ff_layout_commit_done_cb(struct rpc_task *task, + struct nfs_commit_data *data) +{ + struct inode *inode; + int err; + + trace_nfs4_pnfs_commit_ds(data, task->tk_status); + if (task->tk_status == -ETIMEDOUT && !data->res.op_status) + data->res.op_status = NFS4ERR_NXIO; + if (task->tk_status < 0 && data->res.op_status) + ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, + data->args.offset, data->args.count, + data->res.op_status, OP_COMMIT); + err = ff_layout_async_handle_error(task, NULL, data->ds_clp, + data->lseg, data->ds_commit_index); + + switch (err) { + case -NFS4ERR_RESET_TO_PNFS: + case -NFS4ERR_RESET_TO_MDS: + inode = data->lseg->pls_layout->plh_inode; + pnfs_error_mark_layout_for_return(inode, data->lseg); + if (err == -NFS4ERR_RESET_TO_PNFS) + pnfs_set_retry_layoutget(data->lseg->pls_layout); + else + pnfs_clear_retry_layoutget(data->lseg->pls_layout); + pnfs_generic_prepare_to_resend_writes(data); + return -EAGAIN; + case -EAGAIN: + rpc_restart_call_prepare(task); + return -EAGAIN; + } + + if (data->verf.committed == NFS_UNSTABLE) + pnfs_commit_set_layoutcommit(data); + + return 0; +} + +static int ff_layout_write_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { + rpc_exit(task, -EIO); + return -EIO; + } + + if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { + bool retry_pnfs; + + retry_pnfs = ff_layout_has_available_ds(hdr->lseg); + dprintk("%s task %u reset io to %s\n", __func__, + task->tk_pid, retry_pnfs ? "pNFS" : "MDS"); + ff_layout_reset_write(hdr, retry_pnfs); + rpc_exit(task, 0); + return -EAGAIN; + } + + return 0; +} + +static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_write_prepare_common(task, hdr)) + return; + + rpc_call_start(task); +} + +static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (ff_layout_write_prepare_common(task, hdr)) + return; + + if (ff_layout_setup_sequence(hdr->ds_clp, + &hdr->args.seq_args, + &hdr->res.seq_res, + task)) + return; + + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, + hdr->args.lock_context, FMODE_WRITE) == -EIO) + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ +} + +static void ff_layout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && + task->tk_status == 0) { + nfs4_sequence_done(task, &hdr->res.seq_res); + return; + } + + /* Note this may cause RPC to be resent */ + hdr->mds_ops->rpc_call_done(task, hdr); +} + +static void ff_layout_write_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_pgio_header *hdr = data; + + rpc_count_iostats_metrics(task, + &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); +} + +static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) +{ + rpc_call_start(task); +} + +static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; + + ff_layout_setup_sequence(wdata->ds_clp, + &wdata->args.seq_args, + &wdata->res.seq_res, + task); +} + +static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + + rpc_count_iostats_metrics(task, + &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]); +} + +static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { + .rpc_call_prepare = ff_layout_read_prepare_v3, + .rpc_call_done = ff_layout_read_call_done, + .rpc_count_stats = ff_layout_read_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { + .rpc_call_prepare = ff_layout_read_prepare_v4, + .rpc_call_done = ff_layout_read_call_done, + .rpc_count_stats = ff_layout_read_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { + .rpc_call_prepare = ff_layout_write_prepare_v3, + .rpc_call_done = ff_layout_write_call_done, + .rpc_count_stats = ff_layout_write_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { + .rpc_call_prepare = ff_layout_write_prepare_v4, + .rpc_call_done = ff_layout_write_call_done, + .rpc_count_stats = ff_layout_write_count_stats, + .rpc_release = pnfs_generic_rw_release, +}; + +static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { + .rpc_call_prepare = ff_layout_commit_prepare_v3, + .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_count_stats = ff_layout_commit_count_stats, + .rpc_release = pnfs_generic_commit_release, +}; + +static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { + .rpc_call_prepare = ff_layout_commit_prepare_v4, + .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_count_stats = ff_layout_commit_count_stats, + .rpc_release = pnfs_generic_commit_release, +}; + +static enum pnfs_try_status +ff_layout_read_pagelist(struct nfs_pgio_header *hdr) +{ + struct pnfs_layout_segment *lseg = hdr->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + struct rpc_cred *ds_cred; + loff_t offset = hdr->args.offset; + u32 idx = hdr->pgio_mirror_idx; + int vers; + struct nfs_fh *fh; + + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + __func__, hdr->inode->i_ino, + hdr->args.pgbase, (size_t)hdr->args.count, offset); + + ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); + if (!ds) + goto out_failed; + + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + hdr->inode); + if (IS_ERR(ds_clnt)) + goto out_failed; + + ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + if (IS_ERR(ds_cred)) + goto out_failed; + + vers = nfs4_ff_layout_ds_version(lseg, idx); + + dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, + ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + + atomic_inc(&ds->ds_clp->cl_count); + hdr->ds_clp = ds->ds_clp; + fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + if (fh) + hdr->args.fh = fh; + + /* + * Note that if we ever decide to split across DSes, + * then we may need to handle dense-like offsets. + */ + hdr->args.offset = offset; + hdr->mds_offset = offset; + + /* Perform an asynchronous read to ds */ + nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, + vers == 3 ? &ff_layout_read_call_ops_v3 : + &ff_layout_read_call_ops_v4, + 0, RPC_TASK_SOFTCONN); + + return PNFS_ATTEMPTED; + +out_failed: + if (ff_layout_has_available_ds(lseg)) + return PNFS_TRY_AGAIN; + return PNFS_NOT_ATTEMPTED; +} + +/* Perform async writes. */ +static enum pnfs_try_status +ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) +{ + struct pnfs_layout_segment *lseg = hdr->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + struct rpc_cred *ds_cred; + loff_t offset = hdr->args.offset; + int vers; + struct nfs_fh *fh; + int idx = hdr->pgio_mirror_idx; + + ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + if (!ds) + return PNFS_NOT_ATTEMPTED; + + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + hdr->inode); + if (IS_ERR(ds_clnt)) + return PNFS_NOT_ATTEMPTED; + + ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + if (IS_ERR(ds_cred)) + return PNFS_NOT_ATTEMPTED; + + vers = nfs4_ff_layout_ds_version(lseg, idx); + + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n", + __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, + offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), + vers); + + hdr->pgio_done_cb = ff_layout_write_done_cb; + atomic_inc(&ds->ds_clp->cl_count); + hdr->ds_clp = ds->ds_clp; + hdr->ds_commit_idx = idx; + fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + if (fh) + hdr->args.fh = fh; + + /* + * Note that if we ever decide to split across DSes, + * then we may need to handle dense-like offsets. + */ + hdr->args.offset = offset; + + /* Perform an asynchronous write */ + nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, + vers == 3 ? &ff_layout_write_call_ops_v3 : + &ff_layout_write_call_ops_v4, + sync, RPC_TASK_SOFTCONN); + return PNFS_ATTEMPTED; +} + +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + return i; +} + +static struct nfs_fh * +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + + /* FIXME: Assume that there is only one NFS version available + * for the DS. + */ + return &flseg->mirror_array[i]->fh_versions[0]; +} + +static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + struct rpc_clnt *ds_clnt; + struct rpc_cred *ds_cred; + u32 idx; + int vers; + struct nfs_fh *fh; + + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + if (!ds) + goto out_err; + + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + data->inode); + if (IS_ERR(ds_clnt)) + goto out_err; + + ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); + if (IS_ERR(ds_cred)) + goto out_err; + + vers = nfs4_ff_layout_ds_version(lseg, idx); + + dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, + data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count), + vers); + data->commit_done_cb = ff_layout_commit_done_cb; + data->cred = ds_cred; + atomic_inc(&ds->ds_clp->cl_count); + data->ds_clp = ds->ds_clp; + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + if (fh) + data->args.fh = fh; + return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, + vers == 3 ? &ff_layout_commit_call_ops_v3 : + &ff_layout_commit_call_ops_v4, + how, RPC_TASK_SOFTCONN); +out_err: + pnfs_generic_prepare_to_resend_writes(data); + pnfs_generic_commit_release(data); + return -EAGAIN; +} + +static int +ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how, struct nfs_commit_info *cinfo) +{ + return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo, + ff_layout_initiate_commit); +} + +static struct pnfs_ds_commit_info * +ff_layout_get_ds_info(struct inode *inode) +{ + struct pnfs_layout_hdr *layout = NFS_I(inode)->layout; + + if (layout == NULL) + return NULL; + + return &FF_LAYOUT_FROM_HDR(layout)->commit_info; +} + +static void +ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d) +{ + nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, + id_node)); +} + +static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct pnfs_layout_hdr *hdr = &flo->generic_hdr; + __be32 *start; + int count = 0, ret = 0; + + start = xdr_reserve_space(xdr, 4); + if (unlikely(!start)) + return -E2BIG; + + /* This assume we always return _ALL_ layouts */ + spin_lock(&hdr->plh_inode->i_lock); + ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range); + spin_unlock(&hdr->plh_inode->i_lock); + + *start = cpu_to_be32(count); + + return ret; +} + +/* report nothing for now */ +static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4); + if (likely(p)) + *p = cpu_to_be32(0); +} + +static struct nfs4_deviceid_node * +ff_layout_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags) +{ + struct nfs4_ff_layout_ds *dsaddr; + + dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags); + if (!dsaddr) + return NULL; + return &dsaddr->id_node; +} + +static void +ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, + struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args) +{ + struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); + __be32 *start; + + dprintk("%s: Begin\n", __func__); + start = xdr_reserve_space(xdr, 4); + BUG_ON(!start); + + if (ff_layout_encode_ioerr(flo, xdr, args)) + goto out; + + ff_layout_encode_iostats(flo, xdr, args); +out: + *start = cpu_to_be32((xdr->p - start - 1) * 4); + dprintk("%s: Return\n", __func__); +} + +static struct pnfs_layoutdriver_type flexfilelayout_type = { + .id = LAYOUT_FLEX_FILES, + .name = "LAYOUT_FLEX_FILES", + .owner = THIS_MODULE, + .alloc_layout_hdr = ff_layout_alloc_layout_hdr, + .free_layout_hdr = ff_layout_free_layout_hdr, + .alloc_lseg = ff_layout_alloc_lseg, + .free_lseg = ff_layout_free_lseg, + .pg_read_ops = &ff_layout_pg_read_ops, + .pg_write_ops = &ff_layout_pg_write_ops, + .get_ds_info = ff_layout_get_ds_info, + .free_deviceid_node = ff_layout_free_deveiceid_node, + .mark_request_commit = pnfs_layout_mark_request_commit, + .clear_request_commit = pnfs_generic_clear_request_commit, + .scan_commit_lists = pnfs_generic_scan_commit_lists, + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, + .commit_pagelist = ff_layout_commit_pagelist, + .read_pagelist = ff_layout_read_pagelist, + .write_pagelist = ff_layout_write_pagelist, + .alloc_deviceid_node = ff_layout_alloc_deviceid_node, + .encode_layoutreturn = ff_layout_encode_layoutreturn, +}; + +static int __init nfs4flexfilelayout_init(void) +{ + printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n", + __func__); + return pnfs_register_layoutdriver(&flexfilelayout_type); +} + +static void __exit nfs4flexfilelayout_exit(void) +{ + printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n", + __func__); + pnfs_unregister_layoutdriver(&flexfilelayout_type); +} + +MODULE_ALIAS("nfs-layouttype4-4"); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("The NFSv4 flexfile layout driver"); + +module_init(nfs4flexfilelayout_init); +module_exit(nfs4flexfilelayout_exit); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h new file mode 100644 index 000000000000..070f20445b2d --- /dev/null +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -0,0 +1,155 @@ +/* + * NFSv4 flexfile layout driver data structures. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. + * + * Tao Peng <bergwolf@primarydata.com> + */ + +#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H +#define FS_NFS_NFS4FLEXFILELAYOUT_H + +#include "../pnfs.h" + +/* XXX: Let's filter out insanely large mirror count for now to avoid oom + * due to network error etc. */ +#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 + +struct nfs4_ff_ds_version { + u32 version; + u32 minor_version; + u32 rsize; + u32 wsize; + bool tightly_coupled; +}; + +/* chained in global deviceid hlist */ +struct nfs4_ff_layout_ds { + struct nfs4_deviceid_node id_node; + u32 ds_versions_cnt; + struct nfs4_ff_ds_version *ds_versions; + struct nfs4_pnfs_ds *ds; +}; + +struct nfs4_ff_layout_ds_err { + struct list_head list; /* linked in mirror error_list */ + u64 offset; + u64 length; + int status; + enum nfs_opnum4 opnum; + nfs4_stateid stateid; + struct nfs4_deviceid deviceid; +}; + +struct nfs4_ff_layout_mirror { + u32 ds_count; + u32 efficiency; + struct nfs4_ff_layout_ds *mirror_ds; + u32 fh_versions_cnt; + struct nfs_fh *fh_versions; + nfs4_stateid stateid; + struct nfs4_string user_name; + struct nfs4_string group_name; + u32 uid; + u32 gid; + struct rpc_cred *cred; + spinlock_t lock; +}; + +struct nfs4_ff_layout_segment { + struct pnfs_layout_segment generic_hdr; + u64 stripe_unit; + u32 mirror_array_cnt; + struct nfs4_ff_layout_mirror **mirror_array; +}; + +struct nfs4_flexfile_layout { + struct pnfs_layout_hdr generic_hdr; + struct pnfs_ds_commit_info commit_info; + struct list_head error_list; /* nfs4_ff_layout_ds_err */ +}; + +static inline struct nfs4_flexfile_layout * +FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) +{ + return container_of(lo, struct nfs4_flexfile_layout, generic_hdr); +} + +static inline struct nfs4_ff_layout_segment * +FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg) +{ + return container_of(lseg, + struct nfs4_ff_layout_segment, + generic_hdr); +} + +static inline struct nfs4_deviceid_node * +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +{ + if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt || + FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL || + FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL) + return NULL; + return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node; +} + +static inline struct nfs4_ff_layout_ds * +FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) +{ + return container_of(node, struct nfs4_ff_layout_ds, id_node); +} + +static inline struct nfs4_ff_layout_mirror * +FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) +{ + if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt) + return NULL; + return FF_LAYOUT_LSEG(lseg)->mirror_array[idx]; +} + +static inline u32 +FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg) +{ + return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt; +} + +static inline bool +ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) +{ + return nfs4_test_deviceid_unavailable(node); +} + +static inline int +nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ + return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version; +} + +struct nfs4_ff_layout_ds * +nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags); +void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds); +void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds); +int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, + struct nfs4_ff_layout_mirror *mirror, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + gfp_t gfp_flags); +int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, int *count, + const struct pnfs_layout_range *range); +struct nfs_fh * +nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); + +struct nfs4_pnfs_ds * +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, + bool fail_return); + +struct rpc_clnt * +nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, + u32 ds_idx, + struct nfs_client *ds_clp, + struct inode *inode); +struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, + u32 ds_idx, struct rpc_cred *mdscred); +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); +#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */ diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c new file mode 100644 index 000000000000..e2c01f204a95 --- /dev/null +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -0,0 +1,552 @@ +/* + * Device operations for the pnfs nfs4 file layout driver. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. + * + * Tao Peng <bergwolf@primarydata.com> + */ + +#include <linux/nfs_fs.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/sunrpc/addr.h> + +#include "../internal.h" +#include "../nfs4session.h" +#include "flexfilelayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; +static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; + +void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) +{ + if (mirror_ds) + nfs4_put_deviceid_node(&mirror_ds->id_node); +} + +void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) +{ + nfs4_print_deviceid(&mirror_ds->id_node.deviceid); + nfs4_pnfs_ds_put(mirror_ds->ds); + kfree(mirror_ds); +} + +/* Decode opaque device data and construct new_ds using it */ +struct nfs4_ff_layout_ds * +nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) +{ + struct xdr_stream stream; + struct xdr_buf buf; + struct page *scratch; + struct list_head dsaddrs; + struct nfs4_pnfs_ds_addr *da; + struct nfs4_ff_layout_ds *new_ds = NULL; + struct nfs4_ff_ds_version *ds_versions = NULL; + u32 mp_count; + u32 version_count; + __be32 *p; + int i, ret = -ENOMEM; + + /* set up xdr stream */ + scratch = alloc_page(gfp_flags); + if (!scratch) + goto out_err; + + new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags); + if (!new_ds) + goto out_scratch; + + nfs4_init_deviceid_node(&new_ds->id_node, + server, + &pdev->dev_id); + INIT_LIST_HEAD(&dsaddrs); + + xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); + + /* multipath count */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_drain_dsaddrs; + mp_count = be32_to_cpup(p); + dprintk("%s: multipath ds count %d\n", __func__, mp_count); + + for (i = 0; i < mp_count; i++) { + /* multipath ds */ + da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, + &stream, gfp_flags); + if (da) + list_add_tail(&da->da_node, &dsaddrs); + } + if (list_empty(&dsaddrs)) { + dprintk("%s: no suitable DS addresses found\n", + __func__); + ret = -ENOMEDIUM; + goto out_err_drain_dsaddrs; + } + + /* version count */ + p = xdr_inline_decode(&stream, 4); + if (unlikely(!p)) + goto out_err_drain_dsaddrs; + version_count = be32_to_cpup(p); + dprintk("%s: version count %d\n", __func__, version_count); + + ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version), + gfp_flags); + if (!ds_versions) + goto out_scratch; + + for (i = 0; i < version_count; i++) { + /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) + + * tightly_coupled(4) */ + p = xdr_inline_decode(&stream, 20); + if (unlikely(!p)) + goto out_err_drain_dsaddrs; + ds_versions[i].version = be32_to_cpup(p++); + ds_versions[i].minor_version = be32_to_cpup(p++); + ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL); + ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL); + ds_versions[i].tightly_coupled = be32_to_cpup(p); + + if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE) + ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE; + if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE) + ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE; + + if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) { + dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__, + i, ds_versions[i].version, + ds_versions[i].minor_version); + ret = -EPROTONOSUPPORT; + goto out_err_drain_dsaddrs; + } + + dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n", + __func__, i, ds_versions[i].version, + ds_versions[i].minor_version, + ds_versions[i].rsize, + ds_versions[i].wsize, + ds_versions[i].tightly_coupled); + } + + new_ds->ds_versions = ds_versions; + new_ds->ds_versions_cnt = version_count; + + new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + if (!new_ds->ds) + goto out_err_drain_dsaddrs; + + /* If DS was already in cache, free ds addrs */ + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + __free_page(scratch); + return new_ds; + +out_err_drain_dsaddrs: + while (!list_empty(&dsaddrs)) { + da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + kfree(ds_versions); +out_scratch: + __free_page(scratch); +out_err: + kfree(new_ds); + + dprintk("%s ERROR: returning %d\n", __func__, ret); + return NULL; +} + +static u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end : NFS4_MAX_UINT64; +} + +static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, + u64 offset, u64 length) +{ + u64 end; + + end = max_t(u64, end_offset(err->offset, err->length), + end_offset(offset, length)); + err->offset = min_t(u64, err->offset, offset); + err->length = end - err->offset; +} + +static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + nfs4_stateid *stateid, + struct nfs4_deviceid *deviceid) +{ + return err->status == status && err->opnum == opnum && + nfs4_stateid_match(&err->stateid, stateid) && + !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) && + end_offset(err->offset, err->length) >= offset && + err->offset <= end_offset(offset, length); +} + +static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old, + struct nfs4_ff_layout_ds_err *new) +{ + if (!ds_error_can_merge(old, new->offset, new->length, new->status, + new->opnum, &new->stateid, &new->deviceid)) + return false; + + extend_ds_error(old, new->offset, new->length); + return true; +} + +static bool +ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, + struct nfs4_ff_layout_ds_err *dserr) +{ + struct nfs4_ff_layout_ds_err *err; + + list_for_each_entry(err, &flo->error_list, list) { + if (merge_ds_error(err, dserr)) { + return true; + } + } + + list_add(&dserr->list, &flo->error_list); + return false; +} + +static bool +ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + nfs4_stateid *stateid, struct nfs4_deviceid *deviceid) +{ + bool found = false; + struct nfs4_ff_layout_ds_err *err; + + list_for_each_entry(err, &flo->error_list, list) { + if (ds_error_can_merge(err, offset, length, status, opnum, + stateid, deviceid)) { + found = true; + extend_ds_error(err, offset, length); + break; + } + } + + return found; +} + +int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, + struct nfs4_ff_layout_mirror *mirror, u64 offset, + u64 length, int status, enum nfs_opnum4 opnum, + gfp_t gfp_flags) +{ + struct nfs4_ff_layout_ds_err *dserr; + bool needfree; + + if (status == 0) + return 0; + + if (mirror->mirror_ds == NULL) + return -EINVAL; + + spin_lock(&flo->generic_hdr.plh_inode->i_lock); + if (ff_layout_update_ds_error(flo, offset, length, status, opnum, + &mirror->stateid, + &mirror->mirror_ds->id_node.deviceid)) { + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); + return 0; + } + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); + dserr = kmalloc(sizeof(*dserr), gfp_flags); + if (!dserr) + return -ENOMEM; + + INIT_LIST_HEAD(&dserr->list); + dserr->offset = offset; + dserr->length = length; + dserr->status = status; + dserr->opnum = opnum; + nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); + memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, + NFS4_DEVICEID4_SIZE); + + spin_lock(&flo->generic_hdr.plh_inode->i_lock); + needfree = ff_layout_add_ds_error_locked(flo, dserr); + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); + if (needfree) + kfree(dserr); + + return 0; +} + +/* currently we only support AUTH_NONE and AUTH_SYS */ +static rpc_authflavor_t +nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror->uid == (u32)-1) + return RPC_AUTH_NULL; + return RPC_AUTH_UNIX; +} + +/* fetch cred for NFSv3 DS */ +static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, + struct nfs4_pnfs_ds *ds) +{ + if (ds->ds_clp && !mirror->cred && + mirror->mirror_ds->ds_versions[0].version == 3) { + struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth; + struct rpc_cred *cred; + struct auth_cred acred = { + .uid = make_kuid(&init_user_ns, mirror->uid), + .gid = make_kgid(&init_user_ns, mirror->gid), + }; + + /* AUTH_NULL ignores acred */ + cred = auth->au_ops->lookup_cred(auth, &acred, 0); + if (IS_ERR(cred)) { + dprintk("%s: lookup_cred failed with %ld\n", + __func__, PTR_ERR(cred)); + return PTR_ERR(cred); + } else { + mirror->cred = cred; + } + } + return 0; +} + +struct nfs_fh * +nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); + struct nfs_fh *fh = NULL; + struct nfs4_deviceid_node *devid; + + if (mirror == NULL || mirror->mirror_ds == NULL || + mirror->mirror_ds->ds == NULL) { + printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n", + __func__, mirror_idx); + if (mirror && mirror->mirror_ds) { + devid = &mirror->mirror_ds->id_node; + pnfs_generic_mark_devid_invalid(devid); + } + goto out; + } + + /* FIXME: For now assume there is only 1 version available for the DS */ + fh = &mirror->fh_versions[0]; +out: + return fh; +} + +/* Upon return, either ds is connected, or ds is NULL */ +struct nfs4_pnfs_ds * +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, + bool fail_return) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + struct nfs4_pnfs_ds *ds = NULL; + struct nfs4_deviceid_node *devid; + struct inode *ino = lseg->pls_layout->plh_inode; + struct nfs_server *s = NFS_SERVER(ino); + unsigned int max_payload; + rpc_authflavor_t flavor; + + if (mirror == NULL || mirror->mirror_ds == NULL || + mirror->mirror_ds->ds == NULL) { + printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", + __func__, ds_idx); + if (mirror && mirror->mirror_ds) { + devid = &mirror->mirror_ds->id_node; + pnfs_generic_mark_devid_invalid(devid); + } + goto out; + } + + devid = &mirror->mirror_ds->id_node; + if (ff_layout_test_devid_unavailable(devid)) + goto out; + + ds = mirror->mirror_ds->ds; + /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ + smp_rmb(); + if (ds->ds_clp) + goto out; + + flavor = nfs4_ff_layout_choose_authflavor(mirror); + + /* FIXME: For now we assume the server sent only one version of NFS + * to use for the DS. + */ + nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, + dataserver_retrans, + mirror->mirror_ds->ds_versions[0].version, + mirror->mirror_ds->ds_versions[0].minor_version, + flavor); + + /* connect success, check rsize/wsize limit */ + if (ds->ds_clp) { + max_payload = + nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), + NULL); + if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) + mirror->mirror_ds->ds_versions[0].rsize = max_payload; + if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) + mirror->mirror_ds->ds_versions[0].wsize = max_payload; + } else { + ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), + mirror, lseg->pls_range.offset, + lseg->pls_range.length, NFS4ERR_NXIO, + OP_ILLEGAL, GFP_NOIO); + if (fail_return) { + pnfs_error_mark_layout_for_return(ino, lseg); + if (ff_layout_has_available_ds(lseg)) + pnfs_set_retry_layoutget(lseg->pls_layout); + else + pnfs_clear_retry_layoutget(lseg->pls_layout); + + } else { + if (ff_layout_has_available_ds(lseg)) + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lseg->pls_layout->plh_flags); + else { + pnfs_error_mark_layout_for_return(ino, lseg); + pnfs_clear_retry_layoutget(lseg->pls_layout); + } + } + } + + if (ff_layout_update_mirror_cred(mirror, ds)) + ds = NULL; +out: + return ds; +} + +struct rpc_cred * +ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, + struct rpc_cred *mdscred) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + struct rpc_cred *cred = ERR_PTR(-EINVAL); + + if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true)) + goto out; + + if (mirror && mirror->cred) + cred = mirror->cred; + else + cred = mdscred; +out: + return cred; +} + +/** +* Find or create a DS rpc client with th MDS server rpc client auth flavor +* in the nfs_client cl_ds_clients list. +*/ +struct rpc_clnt * +nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, + struct nfs_client *ds_clp, struct inode *inode) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); + + switch (mirror->mirror_ds->ds_versions[0].version) { + case 3: + /* For NFSv3 DS, flavor is set when creating DS connections */ + return ds_clp->cl_rpcclient; + case 4: + return nfs4_find_or_create_ds_client(ds_clp, inode); + default: + BUG(); + } +} + +static bool is_range_intersecting(u64 offset1, u64 length1, + u64 offset2, u64 length2) +{ + u64 end1 = end_offset(offset1, length1); + u64 end2 = end_offset(offset2, length2); + + return (end1 == NFS4_MAX_UINT64 || end1 > offset2) && + (end2 == NFS4_MAX_UINT64 || end2 > offset1); +} + +/* called with inode i_lock held */ +int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, + struct xdr_stream *xdr, int *count, + const struct pnfs_layout_range *range) +{ + struct nfs4_ff_layout_ds_err *err, *n; + __be32 *p; + + list_for_each_entry_safe(err, n, &flo->error_list, list) { + if (!is_range_intersecting(err->offset, err->length, + range->offset, range->length)) + continue; + /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) + * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4) + */ + p = xdr_reserve_space(xdr, + 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); + if (unlikely(!p)) + return -ENOBUFS; + p = xdr_encode_hyper(p, err->offset); + p = xdr_encode_hyper(p, err->length); + p = xdr_encode_opaque_fixed(p, &err->stateid, + NFS4_STATEID_SIZE); + p = xdr_encode_opaque_fixed(p, &err->deviceid, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(err->status); + *p++ = cpu_to_be32(err->opnum); + *count += 1; + list_del(&err->list); + dprintk("%s: offset %llu length %llu status %d op %d count %d\n", + __func__, err->offset, err->length, err->status, + err->opnum, *count); + kfree(err); + } + + return 0; +} + +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_deviceid_node *devid; + int idx; + + for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { + mirror = FF_LAYOUT_COMP(lseg, idx); + if (mirror && mirror->mirror_ds) { + devid = &mirror->mirror_ds->id_node; + if (!ff_layout_test_devid_unavailable(devid)) + return true; + } + } + + return false; +} + +module_param(dataserver_retrans, uint, 0644); +MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " + "retries a request before it attempts further " + " recovery action."); +module_param(dataserver_timeo, uint, 0644); +MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the " + "NFSv4.1 client waits for a response from a " + " data server before it retries an NFS request."); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 3ef01f0ba0bc..d63bea8bbfbb 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -269,8 +269,8 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp) if (!fscache_maybe_release_page(cookie, page, gfp)) return 0; - nfs_add_fscache_stats(page->mapping->host, - NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + nfs_inc_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED); } return 1; @@ -293,8 +293,8 @@ void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode) BUG_ON(!PageLocked(page)); fscache_uncache_page(cookie, page); - nfs_add_fscache_stats(page->mapping->host, - NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + nfs_inc_fscache_stats(page->mapping->host, + NFSIOS_FSCACHE_PAGES_UNCACHED); } /* @@ -343,19 +343,19 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, case 0: /* read BIO submitted (page in fscache) */ dfprintk(FSCACHE, "NFS: readpage_from_fscache: BIO submitted\n"); - nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK, 1); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_OK); return ret; case -ENOBUFS: /* inode not in cache */ case -ENODATA: /* page not in cache */ - nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); return 1; default: dfprintk(FSCACHE, "NFS: readpage_from_fscache %d\n", ret); - nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL, 1); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_READ_FAIL); } return ret; } @@ -429,11 +429,11 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync) if (ret != 0) { fscache_uncache_page(nfs_i_fscache(inode), page); - nfs_add_fscache_stats(inode, - NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1); - nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1); + nfs_inc_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL); + nfs_inc_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED); } else { - nfs_add_fscache_stats(inode, - NFSIOS_FSCACHE_PAGES_WRITTEN_OK, 1); + nfs_inc_fscache_stats(inode, + NFSIOS_FSCACHE_PAGES_WRITTEN_OK); } } diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 880618a8b048..9ac3846cb59e 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -51,14 +51,14 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i /* * Ensure that this dentry is invisible to d_find_alias(). * Otherwise, it may be spliced into the tree by - * d_materialise_unique if a parent directory from the same + * d_splice_alias if a parent directory from the same * filesystem gets mounted at a later time. * This again causes shrink_dcache_for_umount_subtree() to * Oops, since the test for IS_ROOT() will fail. */ spin_lock(&sb->s_root->d_inode->i_lock); spin_lock(&sb->s_root->d_lock); - hlist_del_init(&sb->s_root->d_alias); + hlist_del_init(&sb->s_root->d_u.d_alias); spin_unlock(&sb->s_root->d_lock); spin_unlock(&sb->s_root->d_inode->i_lock); } diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 2f5db844c172..857e2a99acc8 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -152,7 +152,7 @@ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *f nfs_fattr_free_group_name(fattr); } -static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) { unsigned long val; char buf[16]; @@ -166,6 +166,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re *res = val; return 1; } +EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric); static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 00689a8a85e4..d42dff6d5e98 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -192,6 +192,7 @@ void nfs_zap_caches(struct inode *inode) nfs_zap_caches_locked(inode); spin_unlock(&inode->i_lock); } +EXPORT_SYMBOL_GPL(nfs_zap_caches); void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { @@ -351,8 +352,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfs_attr_check_mountpoint(sb, fattr); - if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) && - !nfs_attr_use_mounted_on_fileid(fattr)) + if (nfs_attr_use_mounted_on_fileid(fattr)) + fattr->fileid = fattr->mounted_on_fileid; + else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) goto out_no_inode; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; @@ -386,7 +388,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (S_ISREG(inode->i_mode)) { inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; - inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; @@ -505,10 +506,15 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~ATTR_MODE; if (attr->ia_valid & ATTR_SIZE) { + loff_t i_size; + BUG_ON(!S_ISREG(inode->i_mode)); - if (attr->ia_size == i_size_read(inode)) + i_size = i_size_read(inode); + if (attr->ia_size == i_size) attr->ia_valid &= ~ATTR_SIZE; + else if (attr->ia_size < i_size && IS_SWAPFILE(inode)) + return -ETXTBSY; } /* Optimization: if the end result is no change, don't RPC */ @@ -550,6 +556,7 @@ EXPORT_SYMBOL_GPL(nfs_setattr); * This is a copy of the common vmtruncate, but with the locking * corrected to take into account the fact that NFS requires * inode->i_size to be updated under the inode->i_lock. + * Note: must be called with inode->i_lock held! */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { @@ -559,14 +566,14 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset) if (err) goto out; - spin_lock(&inode->i_lock); i_size_write(inode, offset); /* Optimisation */ if (offset == 0) NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA; - spin_unlock(&inode->i_lock); + spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); + spin_lock(&inode->i_lock); out: return err; } @@ -579,10 +586,15 @@ out: * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. */ -void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) +void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, + struct nfs_fattr *fattr) { + /* Barrier: bump the attribute generation count. */ + nfs_fattr_set_barrier(fattr); + + spin_lock(&inode->i_lock); + NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { - spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -594,12 +606,13 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_gid = attr->ia_gid; nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL); - spin_unlock(&inode->i_lock); } if ((attr->ia_valid & ATTR_SIZE) != 0) { nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } + nfs_update_inode(inode, fattr); + spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); @@ -1022,6 +1035,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { + unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; @@ -1054,11 +1068,14 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) } /** - * nfs_revalidate_mapping - Revalidate the pagecache + * __nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode * @mapping - pointer to mapping + * @may_lock - take inode->i_mutex? */ -int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +static int __nfs_revalidate_mapping(struct inode *inode, + struct address_space *mapping, + bool may_lock) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; @@ -1107,7 +1124,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) nfsi->cache_validity &= ~NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); - ret = nfs_invalidate_mapping(inode, mapping); + if (may_lock) { + mutex_lock(&inode->i_mutex); + ret = nfs_invalidate_mapping(inode, mapping); + mutex_unlock(&inode->i_mutex); + } else + ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); @@ -1117,6 +1139,29 @@ out: return ret; } +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, false); +} + +/** + * nfs_revalidate_mapping_protected - Revalidate the pagecache + * @inode - pointer to host inode + * @mapping - pointer to mapping + * + * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex + * while invalidating the mapping. + */ +int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping) +{ + return __nfs_revalidate_mapping(inode, mapping, true); +} + static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1149,7 +1194,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) - && nfsi->npages == 0) { + && nfsi->nrequests == 0) { i_size_write(inode, nfs_size_to_loff_t(fattr->size)); ret |= NFS_INO_INVALID_ATTR; } @@ -1192,7 +1237,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->npages == 0) + if (cur_size != new_isize && nfsi->nrequests == 0) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; } @@ -1225,13 +1270,6 @@ static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fat return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; } -static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) -{ - if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) - return 0; - return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); -} - static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) @@ -1243,6 +1281,7 @@ unsigned long nfs_inc_attr_generation_counter(void) { return atomic_long_inc_return(&nfs_attr_generation_counter); } +EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter); void nfs_fattr_init(struct nfs_fattr *fattr) { @@ -1254,6 +1293,22 @@ void nfs_fattr_init(struct nfs_fattr *fattr) } EXPORT_SYMBOL_GPL(nfs_fattr_init); +/** + * nfs_fattr_set_barrier + * @fattr: attributes + * + * Used to set a barrier after an attribute was updated. This + * barrier ensures that older attributes from RPC calls that may + * have raced with our update cannot clobber these new values. + * Note that you are still responsible for ensuring that other + * operations which change the attribute on the server do not + * collide. + */ +void nfs_fattr_set_barrier(struct nfs_fattr *fattr) +{ + fattr->gencount = nfs_inc_attr_generation_counter(); +} + struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; @@ -1364,7 +1419,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || nfs_ctime_need_update(inode, fattr) || - nfs_size_need_update(inode, fattr) || ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } @@ -1454,6 +1508,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) int status; spin_lock(&inode->i_lock); + nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); @@ -1462,7 +1517,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** - * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache * @inode - pointer to inode * @fattr - updated attributes * @@ -1472,11 +1527,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); * * This function is mainly designed to be used by the ->write_done() functions. */ -int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) { int status; - spin_lock(&inode->i_lock); /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { @@ -1508,6 +1562,27 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr); + return status; +} + +/** + * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache + * @inode - pointer to inode + * @fattr - updated attributes + * + * After an operation that has changed the inode metadata, mark the + * attribute cache as being invalid, then try to update it. Fake up + * weak cache consistency data, if none exist. + * + * This function is mainly designed to be used by the ->write_done() functions. + */ +int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) +{ + int status; + + spin_lock(&inode->i_lock); + nfs_fattr_set_barrier(fattr); + status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } @@ -1619,7 +1694,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if ((nfsi->npages == 0) || new_isize > cur_isize) { + if ((nfsi->nrequests == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; invalid &= ~NFS_INO_REVAL_PAGECACHE; @@ -1709,6 +1784,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; + /* Set barrier to be more recent than all outstanding updates */ nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { @@ -1716,6 +1792,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } + /* Set the barrier to be more recent than this fattr */ + if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0) + nfsi->attr_gencount = fattr->gencount; } invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ @@ -1769,7 +1848,6 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) #if IS_ENABLED(CONFIG_NFS_V4) INIT_LIST_HEAD(&nfsi->open_states); nfsi->delegation = NULL; - nfsi->delegation_state = 0; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; #endif @@ -1784,7 +1862,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_LIST_HEAD(&nfsi->commit_info.list); - nfsi->npages = 0; + nfsi->nrequests = 0; nfsi->commit_info.ncommit = 0; atomic_set(&nfsi->commit_info.rpcs_out, 0); atomic_set(&nfsi->silly_count, 1); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index efaa31c70fbe..9e6475bc5ba2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -6,6 +6,7 @@ #include <linux/mount.h> #include <linux/security.h> #include <linux/crc32.h> +#include <linux/nfs_page.h> #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) @@ -31,8 +32,6 @@ static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) return 0; - - fattr->fileid = fattr->mounted_on_fileid; return 1; } @@ -189,9 +188,15 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, - unsigned int ds_retrans); + unsigned int ds_retrans, + u32 minor_version, + rpc_authflavor_t au_flavor); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); +extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, + const struct sockaddr *ds_addr, int ds_addrlen, + int ds_proto, unsigned int ds_timeo, + unsigned int ds_retrans, rpc_authflavor_t au_flavor); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -244,9 +249,12 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); void nfs_pgio_data_destroy(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); -int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *, - const struct rpc_call_ops *, int, int); +int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, + struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, + const struct rpc_call_ops *call_ops, int how, int flags); void nfs_free_request(struct nfs_page *req); +struct nfs_pgio_mirror * +nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); static inline void nfs_iocounter_init(struct nfs_io_counter *c) { @@ -254,6 +262,12 @@ static inline void nfs_iocounter_init(struct nfs_io_counter *c) atomic_set(&c->io_count, 0); } +static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc) +{ + WARN_ON_ONCE(desc->pg_mirror_count < 1); + return desc->pg_mirror_count > 1; +} + /* nfs2xdr.c */ extern struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, @@ -377,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat; extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); -extern void nfs_sb_active(struct super_block *sb); +extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); /* namespace.c */ @@ -416,7 +430,6 @@ int nfs_show_options(struct seq_file *, struct dentry *); int nfs_show_devname(struct seq_file *, struct dentry *); int nfs_show_path(struct seq_file *, struct dentry *); int nfs_show_stats(struct seq_file *, struct dentry *); -void nfs_put_super(struct super_block *); int nfs_remount(struct super_block *sb, int *flags, char *raw_data); /* write.c */ @@ -429,6 +442,7 @@ extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, + const struct nfs_rpc_ops *nfs_ops, const struct rpc_call_ops *call_ops, int how, int flags); extern void nfs_init_commit(struct nfs_commit_data *data, @@ -442,13 +456,16 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo); + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo); + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); void nfs_commitdata_release(struct nfs_commit_data *data); void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo); @@ -459,6 +476,7 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx); +void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, @@ -482,6 +500,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode) inode_dio_wait(inode); } extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); +extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); @@ -495,6 +514,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp, struct nfs_client **result, struct rpc_cred *cred); +static inline struct inode *nfs_igrab_and_active(struct inode *inode) +{ + inode = igrab(inode); + if (inode != NULL && !nfs_sb_active(inode->i_sb)) { + iput(inode); + inode = NULL; + } + return inode; +} + +static inline void nfs_iput_and_deactive(struct inode *inode) +{ + if (inode != NULL) { + struct super_block *sb = inode->i_sb; + + iput(inode); + nfs_sb_deactive(sb); + } +} + /* * Determine the device name as a string */ @@ -560,6 +599,19 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) } /* + * Record the page as unstable and mark its inode as dirty. + */ +static inline +void nfs_mark_page_unstable(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + + inc_zone_page_state(page, NR_UNSTABLE_NFS); + inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE); + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); +} + +/* * Determine the number of bytes of data the page contains */ static inline diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index c5832487c456..0cb806fbd4c4 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -55,6 +55,11 @@ static inline void nfs_add_fscache_stats(struct inode *inode, { this_cpu_add(NFS_SERVER(inode)->io_stats->fscache[stat], addend); } +static inline void nfs_inc_fscache_stats(struct inode *inode, + enum nfs_stat_fscachecounters stat) +{ + this_cpu_inc(NFS_SERVER(inode)->io_stats->fscache[stat]); +} #endif static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void) diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 5f61b83f4a1c..b4e03ed8599d 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -481,7 +481,8 @@ out_overflow: * void; * }; */ -static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) +static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result, + __u32 *op_status) { enum nfs_stat status; int error; @@ -489,6 +490,8 @@ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result) error = decode_stat(xdr, &status); if (unlikely(error)) goto out; + if (op_status) + *op_status = status; if (status != NFS_OK) goto out_default; error = decode_fattr(xdr, result); @@ -808,7 +811,7 @@ out_default: static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr, struct nfs_fattr *result) { - return decode_attrstat(xdr, result); + return decode_attrstat(xdr, result, NULL); } static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -865,6 +868,7 @@ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_stat(xdr, &status); if (unlikely(error)) goto out; + result->op_status = status; if (status != NFS_OK) goto out_default; error = decode_fattr(xdr, result->fattr); @@ -882,7 +886,7 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, { /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC; - return decode_attrstat(xdr, result->fattr); + return decode_attrstat(xdr, result->fattr, &result->op_status); } /** diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index 333ae4068506..e134d6548ab7 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -30,5 +30,7 @@ struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subver struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); +/* nfs3super.c */ +extern struct nfs_subversion nfs_v3; #endif /* __LINUX_FS_NFS_NFS3_FS_H */ diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 8c1b437c5403..9e9fa347a948 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -1,5 +1,6 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> +#include <linux/sunrpc/addr.h> #include "internal.h" #include "nfs3_fs.h" @@ -64,3 +65,43 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, nfs_init_server_aclclient(server); return server; } + +/* + * Set up a pNFS Data Server client over NFSv3. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp, + const struct sockaddr *ds_addr, int ds_addrlen, + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, + rpc_authflavor_t au_flavor) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .nfs_mod = &nfs_v3, + .proto = ds_proto, + .net = mds_clp->cl_net, + }; + struct rpc_timeout ds_timeout; + struct nfs_client *clp; + char buf[INET6_ADDRSTRLEN + 1]; + + /* fake a hostname because lockd wants it */ + if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) + return ERR_PTR(-EINVAL); + cl_init.hostname = buf; + + /* Use the MDS nfs_client cl_ipaddr. */ + nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + au_flavor); + + return clp; +} +EXPORT_SYMBOL_GPL(nfs3_set_ds_client); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 524f9f837408..1f11d2533ee4 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -138,7 +138,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); dprintk("NFS reply setattr: %d\n", status); return status; } @@ -800,6 +800,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct inode *inode = hdr->inode; + if (hdr->pgio_done_cb != NULL) + return hdr->pgio_done_cb(task, hdr); + if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; @@ -825,10 +828,13 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct inode *inode = hdr->inode; + if (hdr->pgio_done_cb != NULL) + return hdr->pgio_done_cb(task, hdr); + if (nfs3_async_handle_jukebox(task, inode)) return -EAGAIN; if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); + nfs_writeback_update_inode(hdr); return 0; } @@ -845,6 +851,9 @@ static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commi static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data) { + if (data->commit_done_cb != NULL) + return data->commit_done_cb(task, data); + if (nfs3_async_handle_jukebox(task, data->inode)) return -EAGAIN; nfs_refresh_inode(data->inode, data->res.fattr); diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index 6af29c2da352..5c4394e4656b 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -7,7 +7,7 @@ #include "nfs3_fs.h" #include "nfs.h" -static struct nfs_subversion nfs_v3 = { +struct nfs_subversion nfs_v3 = { .owner = THIS_MODULE, .nfs_fs = &nfs_fs_type, .rpc_vers = &nfs_version3, diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 8f4cbe7f4aa8..53852a4bd88b 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1636,6 +1636,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_post_op_attr(xdr, result->fattr); if (unlikely(error)) goto out; + result->op_status = status; if (status != NFS3_OK) goto out_status; error = decode_read3resok(xdr, result); @@ -1708,6 +1709,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, error = decode_wcc_data(xdr, result->fattr); if (unlikely(error)) goto out; + result->op_status = status; if (status != NFS3_OK) goto out_status; error = decode_write3resok(xdr, result); @@ -1985,6 +1987,11 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_V3) entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + if (entry->fattr->fileid != entry->ino) { + entry->fattr->mounted_on_fileid = entry->ino; + entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID; + } + /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) @@ -2323,6 +2330,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req, error = decode_wcc_data(xdr, result->fattr); if (unlikely(error)) goto out; + result->op_status = status; if (status != NFS3_OK) goto out_status; error = decode_writeverf3(xdr, &result->verf->verifier); diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index d10333a197bf..7afb8947dfdf 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -6,6 +6,8 @@ #define __LINUX_FS_NFS_NFS4_2_H /* nfs4.2proc.c */ +int nfs42_proc_allocate(struct file *, loff_t, loff_t); +int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); /* nfs4.2xdr.h */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 0886f1db5917..cb170722769c 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -32,6 +32,81 @@ static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, return ret; } +static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, + loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(filep); + struct nfs42_falloc_args args = { + .falloc_fh = NFS_FH(inode), + .falloc_offset = offset, + .falloc_length = len, + }; + struct nfs42_falloc_res res; + struct nfs_server *server = NFS_SERVER(inode); + int status; + + msg->rpc_argp = &args; + msg->rpc_resp = &res; + + status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE); + if (status) + return status; + + return nfs4_call_sync(server->client, server, msg, + &args.seq_args, &res.seq_res, 0); +} + +static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, + loff_t offset, loff_t len) +{ + struct nfs_server *server = NFS_SERVER(file_inode(filep)); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_fallocate(msg, filep, offset, len); + if (err == -ENOTSUPP) + return -EOPNOTSUPP; + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + + return err; +} + +int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ALLOCATE], + }; + struct inode *inode = file_inode(filep); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) + return -EOPNOTSUPP; + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + return err; +} + +int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DEALLOCATE], + }; + struct inode *inode = file_inode(filep); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) + return -EOPNOTSUPP; + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + return err; +} + loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct inode *inode = file_inode(filep); @@ -50,7 +125,7 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) struct nfs_server *server = NFS_SERVER(inode); int status; - if (!(server->caps & NFS_CAP_SEEK)) + if (!nfs_server_capable(inode, NFS_CAP_SEEK)) return -ENOTSUPP; status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index c90469b604b8..038a7e1521fa 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -4,6 +4,15 @@ #ifndef __LINUX_FS_NFS_NFS4_2XDR_H #define __LINUX_FS_NFS_NFS4_2XDR_H +#define encode_fallocate_maxsz (encode_stateid_maxsz + \ + 2 /* offset */ + \ + 2 /* length */) +#define encode_allocate_maxsz (op_encode_hdr_maxsz + \ + encode_fallocate_maxsz) +#define decode_allocate_maxsz (op_decode_hdr_maxsz) +#define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ + encode_fallocate_maxsz) +#define decode_deallocate_maxsz (op_decode_hdr_maxsz) #define encode_seek_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ 2 /* offset */ + \ @@ -14,6 +23,18 @@ 2 /* offset */ + \ 2 /* length */) +#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_allocate_maxsz) +#define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_allocate_maxsz) +#define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_deallocate_maxsz) +#define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_deallocate_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_seek_maxsz) @@ -22,6 +43,30 @@ decode_seek_maxsz) +static void encode_fallocate(struct xdr_stream *xdr, + struct nfs42_falloc_args *args) +{ + encode_nfs4_stateid(xdr, &args->falloc_stateid); + encode_uint64(xdr, args->falloc_offset); + encode_uint64(xdr, args->falloc_length); +} + +static void encode_allocate(struct xdr_stream *xdr, + struct nfs42_falloc_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_ALLOCATE, decode_allocate_maxsz, hdr); + encode_fallocate(xdr, args); +} + +static void encode_deallocate(struct xdr_stream *xdr, + struct nfs42_falloc_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_DEALLOCATE, decode_deallocate_maxsz, hdr); + encode_fallocate(xdr, args); +} + static void encode_seek(struct xdr_stream *xdr, struct nfs42_seek_args *args, struct compound_hdr *hdr) @@ -33,6 +78,42 @@ static void encode_seek(struct xdr_stream *xdr, } /* + * Encode ALLOCATE request + */ +static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_falloc_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_allocate(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* + * Encode DEALLOCATE request + */ +static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_falloc_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_deallocate(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* * Encode SEEK request */ static void nfs4_xdr_enc_seek(struct rpc_rqst *req, @@ -50,6 +131,16 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req, encode_nops(&hdr); } +static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) +{ + return decode_op_hdr(xdr, OP_ALLOCATE); +} + +static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) +{ + return decode_op_hdr(xdr, OP_DEALLOCATE); +} + static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) { int status; @@ -73,6 +164,54 @@ out_overflow: } /* + * Decode ALLOCATE request + */ +static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_falloc_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_allocate(xdr, res); +out: + return status; +} + +/* + * Decode DEALLOCATE request + */ +static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_falloc_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_deallocate(xdr, res); +out: + return status; +} + +/* * Decode SEEK request */ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index be6cac37ea10..fdef424b0cd3 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -44,6 +44,7 @@ enum nfs4_client_state { #define NFS4_RENEW_TIMEOUT 0x01 #define NFS4_RENEW_DELEGATION_CB 0x02 +struct nfs_seqid_counter; struct nfs4_minor_version_ops { u32 minor_version; unsigned init_caps; @@ -56,6 +57,8 @@ struct nfs4_minor_version_ops { struct nfs_fsinfo *); void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); + struct nfs_seqid * + (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); const struct rpc_call_ops *call_sync_ops; const struct nfs4_state_recovery_ops *reboot_recovery_ops; const struct nfs4_state_recovery_ops *nograce_recovery_ops; @@ -226,6 +229,7 @@ int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); /* nfs4proc.c */ +extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception *); extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, struct rpc_message *, struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); @@ -442,6 +446,12 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); +extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task); +extern int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res); extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index ffdb28d86cf8..86d6214ea022 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -228,6 +228,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp) kfree(clp->cl_serverowner); kfree(clp->cl_serverscope); kfree(clp->cl_implid); + kfree(clp->cl_owner_id); } void nfs4_free_client(struct nfs_client *clp) @@ -241,28 +242,25 @@ void nfs4_free_client(struct nfs_client *clp) */ static int nfs4_init_callback(struct nfs_client *clp) { + struct rpc_xprt *xprt; int error; - if (clp->rpc_ops->version == 4) { - struct rpc_xprt *xprt; + xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt); - xprt = rcu_dereference_raw(clp->cl_rpcclient->cl_xprt); - - if (nfs4_has_session(clp)) { - error = xprt_setup_backchannel(xprt, - NFS41_BC_MIN_CALLBACKS); - if (error < 0) - return error; - } - - error = nfs_callback_up(clp->cl_mvops->minor_version, xprt); - if (error < 0) { - dprintk("%s: failed to start callback. Error = %d\n", - __func__, error); + if (nfs4_has_session(clp)) { + error = xprt_setup_backchannel(xprt, NFS41_BC_MIN_CALLBACKS); + if (error < 0) return error; - } - __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); } + + error = nfs_callback_up(clp->cl_mvops->minor_version, xprt); + if (error < 0) { + dprintk("%s: failed to start callback. Error = %d\n", + __func__, error); + return error; + } + __set_bit(NFS_CS_CALLBACK, &clp->cl_res_state); + return 0; } @@ -455,6 +453,14 @@ static void nfs4_swap_callback_idents(struct nfs_client *keep, spin_unlock(&nn->nfs_client_lock); } +static bool nfs4_match_client_owner_id(const struct nfs_client *clp1, + const struct nfs_client *clp2) +{ + if (clp1->cl_owner_id == NULL || clp2->cl_owner_id == NULL) + return true; + return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0; +} + /** * nfs40_walk_client_list - Find server that recognizes a client ID * @@ -486,9 +492,6 @@ int nfs40_walk_client_list(struct nfs_client *new, if (pos->rpc_ops != new->rpc_ops) continue; - if (pos->cl_proto != new->cl_proto) - continue; - if (pos->cl_minorversion != new->cl_minorversion) continue; @@ -498,8 +501,7 @@ int nfs40_walk_client_list(struct nfs_client *new, atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); - if (prev) - nfs_put_client(prev); + nfs_put_client(prev); prev = pos; status = nfs_wait_client_init_complete(pos); @@ -514,11 +516,13 @@ int nfs40_walk_client_list(struct nfs_client *new, if (pos->cl_clientid != new->cl_clientid) continue; + if (!nfs4_match_client_owner_id(pos, new)) + continue; + atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); - if (prev) - nfs_put_client(prev); + nfs_put_client(prev); prev = pos; status = nfs4_proc_setclientid_confirm(pos, &clid, cred); @@ -549,8 +553,7 @@ int nfs40_walk_client_list(struct nfs_client *new, /* No match found. The server lost our clientid */ out: - if (prev) - nfs_put_client(prev); + nfs_put_client(prev); dprintk("NFS: <-- %s status = %d\n", __func__, status); return status; } @@ -572,20 +575,14 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b) } /* - * Returns true if the server owners match + * Returns true if the server major ids match */ static bool -nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b) +nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b) { struct nfs41_server_owner *o1 = a->cl_serverowner; struct nfs41_server_owner *o2 = b->cl_serverowner; - if (o1->minor_id != o2->minor_id) { - dprintk("NFS: --> %s server owner minor IDs do not match\n", - __func__); - return false; - } - if (o1->major_id_sz != o2->major_id_sz) goto out_major_mismatch; if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) @@ -624,10 +621,10 @@ int nfs41_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { - if (pos->rpc_ops != new->rpc_ops) - continue; + if (pos == new) + goto found; - if (pos->cl_proto != new->cl_proto) + if (pos->rpc_ops != new->rpc_ops) continue; if (pos->cl_minorversion != new->cl_minorversion) @@ -641,15 +638,10 @@ int nfs41_walk_client_list(struct nfs_client *new, atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); - if (prev) - nfs_put_client(prev); + nfs_put_client(prev); prev = pos; status = nfs_wait_client_init_complete(pos); - if (status == 0) { - nfs4_schedule_lease_recovery(pos); - status = nfs4_wait_clnt_recover(pos); - } spin_lock(&nn->nfs_client_lock); if (status < 0) break; @@ -661,9 +653,21 @@ int nfs41_walk_client_list(struct nfs_client *new, if (!nfs4_match_clientids(pos, new)) continue; - if (!nfs4_match_serverowners(pos, new)) + /* + * Note that session trunking is just a special subcase of + * client id trunking. In either case, we want to fall back + * to using the existing nfs_client. + */ + if (!nfs4_check_clientid_trunking(pos, new)) continue; + /* Unlike NFSv4.0, we know that NFSv4.1 always uses the + * uniform string, however someone might switch the + * uniquifier string on us. + */ + if (!nfs4_match_client_owner_id(pos, new)) + continue; +found: atomic_inc(&pos->cl_count); *result = pos; status = 0; @@ -675,8 +679,7 @@ int nfs41_walk_client_list(struct nfs_client *new, /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s status = %d\n", __func__, status); - if (prev) - nfs_put_client(prev); + nfs_put_client(prev); return status; } #endif /* CONFIG_NFS_V4_1 */ @@ -845,14 +848,15 @@ error: */ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, const struct sockaddr *ds_addr, int ds_addrlen, - int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, + u32 minor_version, rpc_authflavor_t au_flavor) { struct nfs_client_initdata cl_init = { .addr = ds_addr, .addrlen = ds_addrlen, .nfs_mod = &nfs_v4, .proto = ds_proto, - .minorversion = mds_clp->cl_minorversion, + .minorversion = minor_version, .net = mds_clp->cl_net, }; struct rpc_timeout ds_timeout; @@ -870,7 +874,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, - mds_clp->cl_rpcclient->cl_auth->au_flavor); + au_flavor); dprintk("<-- %s %p\n", __func__, clp); return clp; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index c51fb4db9bfe..8b46389c4c5b 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -3,6 +3,8 @@ * * Copyright (C) 1992 Rick Sladkey */ +#include <linux/fs.h> +#include <linux/falloc.h> #include <linux/nfs_fs.h> #include "internal.h" #include "fscache.h" @@ -134,6 +136,32 @@ static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) return nfs_file_llseek(filep, offset, whence); } } + +static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(filep); + long ret; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE))) + return -EOPNOTSUPP; + + ret = inode_newsize_ok(inode, offset + len); + if (ret < 0) + return ret; + + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = nfs42_proc_deallocate(filep, offset, len); + else + ret = nfs42_proc_allocate(filep, offset, len); + mutex_unlock(&inode->i_mutex); + + nfs_zap_caches(inode); + return ret; +} #endif /* CONFIG_NFS_V4_2 */ const struct file_operations nfs4_file_operations = { @@ -155,6 +183,9 @@ const struct file_operations nfs4_file_operations = { .flock = nfs_flock, .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, +#ifdef CONFIG_NFS_V4_2 + .fallocate = nfs42_fallocate, +#endif /* CONFIG_NFS_V4_2 */ .check_flags = nfs_check_flags, .setlease = simple_nosetlease, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 69dc20a743f9..627f37c44456 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -158,8 +158,6 @@ static int nfs4_map_errors(int err) return -EACCES; case -NFS4ERR_MINOR_VERS_MISMATCH: return -EPROTONOSUPPORT; - case -NFS4ERR_ACCESS: - return -EACCES; case -NFS4ERR_FILE_OPEN: return -EBUSY; default: @@ -344,7 +342,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) /* This is the error handling routine for processes that are allowed * to sleep. */ -static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; @@ -497,12 +495,11 @@ static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args) args->sa_privileged = 1; } -static int nfs40_setup_sequence(const struct nfs_server *server, - struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, - struct rpc_task *task) +int nfs40_setup_sequence(struct nfs4_slot_table *tbl, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct rpc_task *task) { - struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl; struct nfs4_slot *slot; /* slot already allocated? */ @@ -537,6 +534,7 @@ out_sleep: spin_unlock(&tbl->slot_tbl_lock); return -EAGAIN; } +EXPORT_SYMBOL_GPL(nfs40_setup_sequence); static int nfs40_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -696,8 +694,7 @@ out_retry: } EXPORT_SYMBOL_GPL(nfs41_sequence_done); -static int nfs4_sequence_done(struct rpc_task *task, - struct nfs4_sequence_res *res) +int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) { if (res->sr_slot == NULL) return 1; @@ -705,6 +702,7 @@ static int nfs4_sequence_done(struct rpc_task *task, return nfs40_sequence_done(task, res); return nfs41_sequence_done(task, res); } +EXPORT_SYMBOL_GPL(nfs4_sequence_done); int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, @@ -779,7 +777,8 @@ static int nfs4_setup_sequence(const struct nfs_server *server, int ret = 0; if (!session) - return nfs40_setup_sequence(server, args, res, task); + return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, + args, res, task); dprintk("--> %s clp %p session %p sr_slot %u\n", __func__, session->clp, session, res->sr_slot ? @@ -820,14 +819,16 @@ static int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_res *res, struct rpc_task *task) { - return nfs40_setup_sequence(server, args, res, task); + return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, + args, res, task); } -static int nfs4_sequence_done(struct rpc_task *task, - struct nfs4_sequence_res *res) +int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) { return nfs40_sequence_done(task, res); } +EXPORT_SYMBOL_GPL(nfs4_sequence_done); #endif /* !CONFIG_NFS_V4_1 */ @@ -900,6 +901,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) if (!cinfo->atomic || cinfo->before != dir->i_version) nfs_force_lookup_revalidate(dir); dir->i_version = cinfo->after; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfs_fscache_invalidate(dir); spin_unlock(&dir->i_lock); } @@ -939,6 +941,31 @@ static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, return true; } +static u32 +nfs4_map_atomic_open_share(struct nfs_server *server, + fmode_t fmode, int openflags) +{ + u32 res = 0; + + switch (fmode & (FMODE_READ | FMODE_WRITE)) { + case FMODE_READ: + res = NFS4_SHARE_ACCESS_READ; + break; + case FMODE_WRITE: + res = NFS4_SHARE_ACCESS_WRITE; + break; + case FMODE_READ|FMODE_WRITE: + res = NFS4_SHARE_ACCESS_BOTH; + } + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) + goto out; + /* Want no delegation if we're using O_DIRECT */ + if (openflags & O_DIRECT) + res |= NFS4_SHARE_WANT_NO_DELEG; +out: + return res; +} + static enum open_claim_type4 nfs4_map_atomic_open_claim(struct nfs_server *server, enum open_claim_type4 claim) @@ -979,6 +1006,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct dentry *parent = dget_parent(dentry); struct inode *dir = parent->d_inode; struct nfs_server *server = NFS_SERVER(dir); + struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); struct nfs4_opendata *p; p = kzalloc(sizeof(*p), gfp_mask); @@ -989,8 +1017,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, if (IS_ERR(p->f_label)) goto err_free_p; - p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); - if (p->o_arg.seqid == NULL) + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; + p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask); + if (IS_ERR(p->o_arg.seqid)) goto err_free_label; nfs_sb_active(dentry->d_sb); p->dentry = dget(dentry); @@ -999,6 +1028,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, atomic_inc(&sp->so_count); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + p->o_arg.share_access = nfs4_map_atomic_open_share(server, + fmode, flags); /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS * will return permission denied for all bits until close */ if (!(flags & O_EXCL)) { @@ -1119,8 +1150,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) return 0; if ((delegation->type & fmode) != fmode) return 0; - if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) - return 0; if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) return 0; nfs_mark_delegation_referenced(delegation); @@ -1171,6 +1200,16 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state, return false; } +static void nfs_resync_open_stateid_locked(struct nfs4_state *state) +{ + if (state->n_wronly) + set_bit(NFS_O_WRONLY_STATE, &state->flags); + if (state->n_rdonly) + set_bit(NFS_O_RDONLY_STATE, &state->flags); + if (state->n_rdwr) + set_bit(NFS_O_RDWR_STATE, &state->flags); +} + static void nfs_clear_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode) { @@ -1189,8 +1228,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, } if (stateid == NULL) return; - if (!nfs_need_update_open_stateid(state, stateid)) + /* Handle races with OPEN */ + if (!nfs4_stateid_match_other(stateid, &state->open_stateid) || + !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { + nfs_resync_open_stateid_locked(state); return; + } if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); @@ -1285,6 +1328,23 @@ no_delegation: return ret; } +static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp, + const nfs4_stateid *stateid) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret = false; + + spin_lock(&state->state_lock); + if (!nfs4_stateid_match_other(stateid, &lsp->ls_stateid)) + goto out_noupdate; + if (!nfs4_stateid_is_newer(stateid, &lsp->ls_stateid)) + goto out_noupdate; + nfs4_stateid_copy(&lsp->ls_stateid, stateid); + ret = true; +out_noupdate: + spin_unlock(&state->state_lock); + return ret; +} static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode) { @@ -1493,6 +1553,9 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; + opendata->o_arg.share_access = nfs4_map_atomic_open_share( + NFS_SB(opendata->dentry->d_sb), + fmode, 0); memset(&opendata->o_res, 0, sizeof(opendata->o_res)); memset(&opendata->c_res, 0, sizeof(opendata->c_res)); nfs4_init_opendata_res(opendata); @@ -1683,8 +1746,8 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { struct nfs4_opendata *data = calldata; - nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, - &data->c_res.seq_res, task); + nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl, + &data->c_arg.seq_args, &data->c_res.seq_res, task); } static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) @@ -2354,8 +2417,8 @@ static int _nfs4_do_open(struct inode *dir, opendata->o_res.f_attr, sattr, state, label, olabel); if (status == 0) { - nfs_setattr_update_inode(state->inode, sattr); - nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr); + nfs_setattr_update_inode(state->inode, sattr, + opendata->o_res.f_attr); nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel); } } @@ -2591,6 +2654,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) case -NFS4ERR_OLD_STATEID: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_EXPIRED: + if (!nfs4_stateid_match(&calldata->arg.stateid, + &state->open_stateid)) { + rpc_restart_call_prepare(task); + goto out_release; + } if (calldata->arg.fmode == 0) break; default: @@ -2623,6 +2691,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); + nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid); /* Calculate the change in open mode */ calldata->arg.fmode = 0; if (state->n_rdwr == 0) { @@ -2657,6 +2726,9 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_wait; } } + calldata->arg.share_access = + nfs4_map_atomic_open_share(NFS_SERVER(inode), + calldata->arg.fmode, 0); nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; @@ -2679,45 +2751,10 @@ static const struct rpc_call_ops nfs4_close_ops = { .rpc_release = nfs4_free_closedata, }; -static bool nfs4_state_has_opener(struct nfs4_state *state) -{ - /* first check existing openers */ - if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 && - state->n_rdonly != 0) - return true; - - if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 && - state->n_wronly != 0) - return true; - - if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 && - state->n_rdwr != 0) - return true; - - return false; -} - static bool nfs4_roc(struct inode *inode) { - struct nfs_inode *nfsi = NFS_I(inode); - struct nfs_open_context *ctx; - struct nfs4_state *state; - - spin_lock(&inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { - state = ctx->state; - if (state == NULL) - continue; - if (nfs4_state_has_opener(state)) { - spin_unlock(&inode->i_lock); - return false; - } - } - spin_unlock(&inode->i_lock); - - if (nfs4_check_delegation(inode, FMODE_READ)) + if (!nfs_have_layout(inode)) return false; - return pnfs_roc(inode); } @@ -2735,6 +2772,7 @@ static bool nfs4_roc(struct inode *inode) int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) { struct nfs_server *server = NFS_SERVER(state->inode); + struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); struct nfs4_closedata *calldata; struct nfs4_state_owner *sp = state->owner; struct rpc_task *task; @@ -2761,10 +2799,10 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); - calldata->arg.stateid = &state->open_stateid; /* Serialization for the sequence id */ - calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); - if (calldata->arg.seqid == NULL) + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; + calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask); + if (IS_ERR(calldata->arg.seqid)) goto out_free_calldata; calldata->arg.fmode = 0; calldata->arg.bitmask = server->cache_consistency_bitmask; @@ -3254,7 +3292,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); if (status == 0) { - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); nfs_setsecurity(inode, fattr, label); } nfs4_label_free(label); @@ -4200,7 +4238,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, } if (task->tk_status >= 0) { renew_lease(NFS_SERVER(inode), hdr->timestamp); - nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr); + nfs_writeback_update_inode(hdr); } return 0; } @@ -4919,11 +4957,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, } static unsigned int -nfs4_init_nonuniform_client_string(const struct nfs_client *clp, +nfs4_init_nonuniform_client_string(struct nfs_client *clp, char *buf, size_t len) { unsigned int result; + if (clp->cl_owner_id != NULL) + return strlcpy(buf, clp->cl_owner_id, len); + rcu_read_lock(); result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", clp->cl_ipaddr, @@ -4932,24 +4973,32 @@ nfs4_init_nonuniform_client_string(const struct nfs_client *clp, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); rcu_read_unlock(); + clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); return result; } static unsigned int -nfs4_init_uniform_client_string(const struct nfs_client *clp, +nfs4_init_uniform_client_string(struct nfs_client *clp, char *buf, size_t len) { const char *nodename = clp->cl_rpcclient->cl_nodename; + unsigned int result; + + if (clp->cl_owner_id != NULL) + return strlcpy(buf, clp->cl_owner_id, len); if (nfs4_client_id_uniquifier[0] != '\0') - return scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", + result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", clp->rpc_ops->version, clp->cl_minorversion, nfs4_client_id_uniquifier, nodename); - return scnprintf(buf, len, "Linux NFSv%u.%u %s", + else + result = scnprintf(buf, len, "Linux NFSv%u.%u %s", clp->rpc_ops->version, clp->cl_minorversion, nodename); + clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); + return result; } /* @@ -5130,9 +5179,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) static void nfs4_delegreturn_release(void *calldata) { struct nfs4_delegreturndata *data = calldata; + struct inode *inode = data->inode; - if (data->roc) - pnfs_roc_release(data->inode); + if (inode) { + if (data->roc) + pnfs_roc_release(inode); + nfs_iput_and_deactive(inode); + } kfree(calldata); } @@ -5189,9 +5242,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co nfs_fattr_init(data->res.fattr); data->timestamp = jiffies; data->rpc_status = 0; - data->inode = inode; - data->roc = list_empty(&NFS_I(inode)->open_files) ? - pnfs_roc(inode) : false; + data->inode = nfs_igrab_and_active(inode); + if (data->inode) + data->roc = nfs4_roc(inode); task_setup_data.callback_data = data; msg.rpc_argp = &data->args; @@ -5346,7 +5399,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.fl = &p->fl; p->arg.seqid = seqid; p->res.seqid = seqid; - p->arg.stateid = &lsp->ls_stateid; p->lsp = lsp; atomic_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ @@ -5373,14 +5425,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) return; switch (task->tk_status) { case 0: - nfs4_stateid_copy(&calldata->lsp->ls_stateid, - &calldata->res.stateid); renew_lease(calldata->server, calldata->timestamp); - break; + do_vfs_lock(calldata->fl.fl_file, &calldata->fl); + if (nfs4_update_lock_stateid(calldata->lsp, + &calldata->res.stateid)) + break; case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: + if (!nfs4_stateid_match(&calldata->arg.stateid, + &calldata->lsp->ls_stateid)) + rpc_restart_call_prepare(task); break; default: if (nfs4_async_handle_error(task, calldata->server, @@ -5396,6 +5452,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; + nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ goto out_no_action; @@ -5466,6 +5523,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * struct nfs_seqid *seqid; struct nfs4_lock_state *lsp; struct rpc_task *task; + struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); int status = 0; unsigned char fl_flags = request->fl_flags; @@ -5489,9 +5547,10 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * lsp = request->fl_u.nfs4_fl.owner; if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0) goto out; - seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); + alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid; + seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL); status = -ENOMEM; - if (seqid == NULL) + if (IS_ERR(seqid)) goto out; task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid); status = PTR_ERR(task); @@ -5524,6 +5583,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, struct nfs4_lockdata *p; struct inode *inode = lsp->ls_state->inode; struct nfs_server *server = NFS_SERVER(inode); + struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); p = kzalloc(sizeof(*p), gfp_mask); if (p == NULL) @@ -5532,12 +5592,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask); - if (p->arg.open_seqid == NULL) + if (IS_ERR(p->arg.open_seqid)) goto out_free; - p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask); - if (p->arg.lock_seqid == NULL) + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; + p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask); + if (IS_ERR(p->arg.lock_seqid)) goto out_free_seqid; - p->arg.lock_stateid = &lsp->ls_stateid; p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_seqid.owner_id; p->arg.lock_owner.s_dev = server->s_dev; @@ -5564,15 +5624,19 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) goto out_wait; /* Do we need to do an open_to_lock_owner? */ - if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { + if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) { if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { goto out_release_lock_seqid; } - data->arg.open_stateid = &state->open_stateid; + nfs4_stateid_copy(&data->arg.open_stateid, + &state->open_stateid); data->arg.new_lock_owner = 1; data->res.open_seqid = data->arg.open_seqid; - } else + } else { data->arg.new_lock_owner = 0; + nfs4_stateid_copy(&data->arg.lock_stateid, + &data->lsp->ls_stateid); + } if (!nfs4_valid_open_stateid(state)) { data->rpc_status = -EBADF; task->tk_action = NULL; @@ -5596,6 +5660,7 @@ out_wait: static void nfs4_lock_done(struct rpc_task *task, void *calldata) { struct nfs4_lockdata *data = calldata; + struct nfs4_lock_state *lsp = data->lsp; dprintk("%s: begin!\n", __func__); @@ -5603,18 +5668,36 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) return; data->rpc_status = task->tk_status; - if (data->arg.new_lock_owner != 0) { - if (data->rpc_status == 0) - nfs_confirm_seqid(&data->lsp->ls_seqid, 0); - else - goto out; - } - if (data->rpc_status == 0) { - nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid); - set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags); - renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); + switch (task->tk_status) { + case 0: + renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), + data->timestamp); + if (data->arg.new_lock) { + data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); + if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) { + rpc_restart_call_prepare(task); + break; + } + } + if (data->arg.new_lock_owner != 0) { + nfs_confirm_seqid(&lsp->ls_seqid, 0); + nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid); + set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags); + } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid)) + rpc_restart_call_prepare(task); + break; + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + if (data->arg.new_lock_owner != 0) { + if (!nfs4_stateid_match(&data->arg.open_stateid, + &lsp->ls_state->open_stateid)) + rpc_restart_call_prepare(task); + } else if (!nfs4_stateid_match(&data->arg.lock_stateid, + &lsp->ls_stateid)) + rpc_restart_call_prepare(task); } -out: dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status); } @@ -5695,7 +5778,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f if (recovery_type == NFS_LOCK_RECLAIM) data->arg.reclaim = NFS_LOCK_RECLAIM; nfs4_set_sequence_privileged(&data->arg.seq_args); - } + } else + data->arg.new_lock = 1; task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); @@ -5819,10 +5903,8 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_state_owner *sp = state->owner; struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; - unsigned int seq; int status = -ENOLCK; if ((fl_flags & FL_POSIX) && @@ -5842,25 +5924,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* ...but avoid races with delegation recall... */ request->fl_flags = fl_flags & ~FL_SLEEP; status = do_vfs_lock(request->fl_file, request); - goto out_unlock; - } - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); - up_read(&nfsi->rwsem); - status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); - if (status != 0) + up_read(&nfsi->rwsem); goto out; - down_read(&nfsi->rwsem); - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { - status = -NFS4ERR_DELAY; - goto out_unlock; } - /* Note: we always want to sleep here! */ - request->fl_flags = fl_flags | FL_SLEEP; - if (do_vfs_lock(request->fl_file, request) < 0) - printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock " - "manager!\n", __func__); -out_unlock: up_read(&nfsi->rwsem); + status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); out: request->fl_flags = fl_flags; return status; @@ -5967,8 +6035,8 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata { struct nfs_release_lockowner_data *data = calldata; struct nfs_server *server = data->server; - nfs40_setup_sequence(server, &data->args.seq_args, - &data->res.seq_res, task); + nfs40_setup_sequence(server->nfs_client->cl_slot_tbl, + &data->args.seq_args, &data->res.seq_res, task); data->args.lock_owner.clientid = server->nfs_client->cl_clientid; data->timestamp = jiffies; } @@ -6584,47 +6652,47 @@ nfs41_same_server_scope(struct nfs41_server_scope *a, int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred) { int status; + struct nfs41_bind_conn_to_session_args args = { + .client = clp, + .dir = NFS4_CDFC4_FORE_OR_BOTH, + }; struct nfs41_bind_conn_to_session_res res; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION], - .rpc_argp = clp, + .rpc_argp = &args, .rpc_resp = &res, .rpc_cred = cred, }; dprintk("--> %s\n", __func__); - res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS); - if (unlikely(res.session == NULL)) { - status = -ENOMEM; - goto out; - } + nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id); + if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) + args.dir = NFS4_CDFC4_FORE; status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_bind_conn_to_session(clp, status); if (status == 0) { - if (memcmp(res.session->sess_id.data, + if (memcmp(res.sessionid.data, clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s: Session ID mismatch\n", __func__); status = -EIO; - goto out_session; + goto out; } - if (res.dir != NFS4_CDFS4_BOTH) { + if ((res.dir & args.dir) != res.dir || res.dir == 0) { dprintk("NFS: %s: Unexpected direction from server\n", __func__); status = -EIO; - goto out_session; + goto out; } - if (res.use_conn_in_rdma_mode) { + if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) { dprintk("NFS: %s: Server returned RDMA mode = true\n", __func__); status = -EIO; - goto out_session; + goto out; } } -out_session: - kfree(res.session); out: dprintk("<-- %s status= %d\n", __func__, status); return status; @@ -6829,9 +6897,13 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (status == 0) { clp->cl_clientid = res.clientid; - clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R); - if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) + clp->cl_exchange_flags = res.flags; + /* Client ID is not confirmed */ + if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) { + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); clp->cl_seqid = res.seqid; + } kfree(clp->cl_serverowner); clp->cl_serverowner = res.server_owner; @@ -7102,10 +7174,11 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->bc_attrs.max_reqs); } -static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, + struct nfs41_create_session_res *res) { struct nfs4_channel_attrs *sent = &args->fc_attrs; - struct nfs4_channel_attrs *rcvd = &session->fc_attrs; + struct nfs4_channel_attrs *rcvd = &res->fc_attrs; if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; @@ -7124,11 +7197,14 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args return 0; } -static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session) +static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, + struct nfs41_create_session_res *res) { struct nfs4_channel_attrs *sent = &args->bc_attrs; - struct nfs4_channel_attrs *rcvd = &session->bc_attrs; + struct nfs4_channel_attrs *rcvd = &res->bc_attrs; + if (!(res->flags & SESSION4_BACK_CHAN)) + goto out; if (rcvd->max_rqst_sz > sent->max_rqst_sz) return -EINVAL; if (rcvd->max_resp_sz < sent->max_resp_sz) @@ -7140,18 +7216,33 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args return -EINVAL; if (rcvd->max_reqs != sent->max_reqs) return -EINVAL; +out: return 0; } static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args, - struct nfs4_session *session) + struct nfs41_create_session_res *res) { int ret; - ret = nfs4_verify_fore_channel_attrs(args, session); + ret = nfs4_verify_fore_channel_attrs(args, res); if (ret) return ret; - return nfs4_verify_back_channel_attrs(args, session); + return nfs4_verify_back_channel_attrs(args, res); +} + +static void nfs4_update_session(struct nfs4_session *session, + struct nfs41_create_session_res *res) +{ + nfs4_copy_sessionid(&session->sess_id, &res->sessionid); + /* Mark client id and session as being confirmed */ + session->clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; + set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state); + session->flags = res->flags; + memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs)); + if (res->flags & SESSION4_BACK_CHAN) + memcpy(&session->bc_attrs, &res->bc_attrs, + sizeof(session->bc_attrs)); } static int _nfs4_proc_create_session(struct nfs_client *clp, @@ -7160,11 +7251,12 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, struct nfs4_session *session = clp->cl_session; struct nfs41_create_session_args args = { .client = clp, + .clientid = clp->cl_clientid, + .seqid = clp->cl_seqid, .cb_program = NFS4_CALLBACK, }; - struct nfs41_create_session_res res = { - .client = clp, - }; + struct nfs41_create_session_res res; + struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION], .rpc_argp = &args, @@ -7181,11 +7273,15 @@ static int _nfs4_proc_create_session(struct nfs_client *clp, if (!status) { /* Verify the session's negotiated channel_attrs values */ - status = nfs4_verify_channel_attrs(&args, session); + status = nfs4_verify_channel_attrs(&args, &res); /* Increment the clientid slot sequence id */ - clp->cl_seqid++; + if (clp->cl_seqid == res.seqid) + clp->cl_seqid++; + if (status) + goto out; + nfs4_update_session(session, &res); } - +out: return status; } @@ -7237,8 +7333,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, dprintk("--> nfs4_proc_destroy_session\n"); /* session is still being setup */ - if (session->clp->cl_cons_state != NFS_CS_READY) - return status; + if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state)) + return 0; status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); trace_nfs4_destroy_session(session->clp, status); @@ -7530,6 +7626,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) return; if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, NFS_I(lgp->args.inode)->layout, + &lgp->args.range, lgp->args.ctx->state)) { rpc_exit(task, NFS4_OK); } @@ -7704,6 +7801,9 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) dprintk("--> %s\n", __func__); + /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ + pnfs_get_layout_hdr(NFS_I(inode)->layout); + lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); if (!lgp->args.layout.pages) { nfs4_layoutget_release(lgp); @@ -7716,9 +7816,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) lgp->res.seq_res.sr_slot = NULL; nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); - /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ - pnfs_get_layout_hdr(NFS_I(inode)->layout); - task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return ERR_CAST(task); @@ -7785,9 +7882,13 @@ static void nfs4_layoutreturn_release(void *calldata) spin_lock(&lo->plh_inode->i_lock); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + pnfs_clear_layoutreturn_waitbit(lo); + clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); pnfs_put_layout_hdr(lrp->args.layout); + nfs_iput_and_deactive(lrp->inode); kfree(calldata); dprintk("<-- %s\n", __func__); } @@ -7798,7 +7899,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = { .rpc_release = nfs4_layoutreturn_release, }; -int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) +int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) { struct rpc_task *task; struct rpc_message msg = { @@ -7813,14 +7914,23 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) .callback_ops = &nfs4_layoutreturn_call_ops, .callback_data = lrp, }; - int status; + int status = 0; dprintk("--> %s\n", __func__); + if (!sync) { + lrp->inode = nfs_igrab_and_active(lrp->args.inode); + if (!lrp->inode) { + nfs4_layoutreturn_release(lrp); + return -EAGAIN; + } + task_setup_data.flags |= RPC_TASK_ASYNC; + } nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - status = task->tk_status; + if (sync) + status = task->tk_status; trace_nfs4_layoutreturn(lrp->args.inode, status); dprintk("<-- %s status=%d\n", __func__, status); rpc_put_task(task); @@ -7914,6 +8024,7 @@ static void nfs4_layoutcommit_release(void *calldata) nfs_post_op_update_inode_force_wcc(data->args.inode, data->res.fattr); put_rpccred(data->cred); + nfs_iput_and_deactive(data->inode); kfree(data); } @@ -7938,7 +8049,6 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutcommit_ops, .callback_data = data, - .flags = RPC_TASK_ASYNC, }; struct rpc_task *task; int status = 0; @@ -7949,18 +8059,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) data->args.lastbytewritten, data->args.inode->i_ino); + if (!sync) { + data->inode = nfs_igrab_and_active(data->args.inode); + if (data->inode == NULL) { + nfs4_layoutcommit_release(data); + return -EAGAIN; + } + task_setup_data.flags = RPC_TASK_ASYNC; + } nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - if (sync == false) - goto out; - status = nfs4_wait_for_completion_rpc_task(task); - if (status != 0) - goto out; - status = task->tk_status; + if (sync) + status = task->tk_status; trace_nfs4_layoutcommit(data->args.inode, status); -out: dprintk("%s: status %d\n", __func__, status); rpc_put_task(task); return status; @@ -8388,6 +8501,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .match_stateid = nfs4_match_stateid, .find_root_sec = nfs4_find_root_sec, .free_lock_state = nfs4_release_lockowner, + .alloc_seqid = nfs_alloc_seqid, .call_sync_ops = &nfs40_call_sync_ops, .reboot_recovery_ops = &nfs40_reboot_recovery_ops, .nograce_recovery_ops = &nfs40_nograce_recovery_ops, @@ -8396,6 +8510,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { }; #if defined(CONFIG_NFS_V4_1) +static struct nfs_seqid * +nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2) +{ + return NULL; +} + static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .init_caps = NFS_CAP_READDIRPLUS @@ -8409,6 +8529,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, + .alloc_seqid = nfs_alloc_no_seqid, .call_sync_ops = &nfs41_call_sync_ops, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, @@ -8426,6 +8547,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 + | NFS_CAP_ALLOCATE + | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, @@ -8433,6 +8556,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .find_root_sec = nfs41_find_root_sec, .free_lock_state = nfs41_free_lock_state, .call_sync_ops = &nfs41_call_sync_ops, + .alloc_seqid = nfs_alloc_no_seqid, .reboot_recovery_ops = &nfs41_reboot_recovery_ops, .nograce_recovery_ops = &nfs41_nograce_recovery_ops, .state_renewal_ops = &nfs41_state_renewal_ops, diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index e799dc3c3b1d..e23366effcfb 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -450,7 +450,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses) tbl = &ses->fc_slot_table; tbl->session = ses; status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1); - if (status) /* -ENOMEM */ + if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */ return status; /* Back channel */ tbl = &ses->bc_slot_table; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index b34ada9bc6a2..e3ea2c5324d6 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -70,6 +70,7 @@ struct nfs4_session { enum nfs4_session_state { NFS4_SESSION_INITING, + NFS4_SESSION_ESTABLISHED, }; extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl, @@ -118,6 +119,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp) return 0; } +static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst, + const struct nfs4_sessionid *src) +{ + memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN); +} + #ifdef CONFIG_CRC32 /* * nfs_session_id_hash - calculate the crc32 hash for the session id diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5194933ed419..f95e3b58bbc3 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -346,9 +346,23 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, status = nfs4_proc_exchange_id(clp, cred); if (status != NFS4_OK) return status; - set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - return nfs41_walk_client_list(clp, result, cred); + status = nfs41_walk_client_list(clp, result, cred); + if (status < 0) + return status; + if (clp != *result) + return 0; + + /* Purge state if the client id was established in a prior instance */ + if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R) + set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state); + else + set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); + nfs4_schedule_state_manager(clp); + status = nfs_wait_client_init_complete(clp); + if (status < 0) + nfs_put_client(clp); + return status; } #endif /* CONFIG_NFS_V4_1 */ @@ -1003,11 +1017,11 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m struct nfs_seqid *new; new = kmalloc(sizeof(*new), gfp_mask); - if (new != NULL) { - new->sequence = counter; - INIT_LIST_HEAD(&new->list); - new->task = NULL; - } + if (new == NULL) + return ERR_PTR(-ENOMEM); + new->sequence = counter; + INIT_LIST_HEAD(&new->list); + new->task = NULL; return new; } @@ -1015,7 +1029,7 @@ void nfs_release_seqid(struct nfs_seqid *seqid) { struct nfs_seqid_counter *sequence; - if (list_empty(&seqid->list)) + if (seqid == NULL || list_empty(&seqid->list)) return; sequence = seqid->sequence; spin_lock(&sequence->lock); @@ -1071,13 +1085,15 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) { - struct nfs4_state_owner *sp = container_of(seqid->sequence, - struct nfs4_state_owner, so_seqid); - struct nfs_server *server = sp->so_server; + struct nfs4_state_owner *sp; + if (seqid == NULL) + return; + + sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid); if (status == -NFS4ERR_BAD_SEQID) nfs4_drop_state_owner(sp); - if (!nfs4_has_session(server->nfs_client)) + if (!nfs4_has_session(sp->so_server->nfs_client)) nfs_increment_seqid(status, seqid); } @@ -1088,14 +1104,18 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) */ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) { - nfs_increment_seqid(status, seqid); + if (seqid != NULL) + nfs_increment_seqid(status, seqid); } int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) { - struct nfs_seqid_counter *sequence = seqid->sequence; + struct nfs_seqid_counter *sequence; int status = 0; + if (seqid == NULL) + goto out; + sequence = seqid->sequence; spin_lock(&sequence->lock); seqid->task = task; if (list_empty(&seqid->list)) @@ -1106,6 +1126,7 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) status = -EAGAIN; unlock: spin_unlock(&sequence->lock); +out: return status; } @@ -1366,49 +1387,55 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct nfs_inode *nfsi = NFS_I(inode); struct file_lock *fl; int status = 0; + struct file_lock_context *flctx = inode->i_flctx; + struct list_head *list; - if (inode->i_flock == NULL) + if (flctx == NULL) return 0; + list = &flctx->flc_posix; + /* Guard against delegation returns and new lock/unlock calls */ down_write(&nfsi->rwsem); - /* Protect inode->i_flock using the BKL */ - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) - continue; + spin_lock(&flctx->flc_lock); +restart: + list_for_each_entry(fl, list, fl_list) { if (nfs_file_open_context(fl->fl_file)->state != state) continue; - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); status = ops->recover_lock(state, fl); switch (status) { - case 0: - break; - case -ESTALE: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_NO_GRACE: - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - goto out; - default: - printk(KERN_ERR "NFS: %s: unhandled error %d\n", - __func__, status); - case -ENOMEM: - case -NFS4ERR_DENIED: - case -NFS4ERR_RECLAIM_BAD: - case -NFS4ERR_RECLAIM_CONFLICT: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ - status = 0; + case 0: + break; + case -ESTALE: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto out; + default: + pr_err("NFS: %s: unhandled error %d\n", + __func__, status); + case -ENOMEM: + case -NFS4ERR_DENIED: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + /* kill_proc(fl->fl_pid, SIGLOST, 1); */ + status = 0; } - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); } - spin_unlock(&inode->i_lock); + if (list == &flctx->flc_posix) { + list = &flctx->flc_flock; + goto restart; + } + spin_unlock(&flctx->flc_lock); out: up_write(&nfsi->rwsem); return status; diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 6f340f02f2ba..75090feeafad 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -53,7 +53,6 @@ static const struct super_operations nfs4_sops = { .destroy_inode = nfs_destroy_inode, .write_inode = nfs4_write_inode, .drop_inode = nfs_drop_inode, - .put_super = nfs_put_super, .statfs = nfs_statfs, .evict_inode = nfs4_evict_inode, .umount_begin = nfs_umount_begin, @@ -346,6 +345,9 @@ out: static void __exit exit_nfs_v4(void) { + /* Not called in the _init(), conditionally loaded */ + nfs4_pnfs_v3_ds_connect_unload(); + unregister_nfs_version(&nfs_v4); nfs4_unregister_sysctl(); nfs_idmap_quit(); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 206c08a60c7f..5c399ec41079 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -141,13 +141,15 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \ XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \ 1 /* sc_prog */ + \ - XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ - XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \ + 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ + 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \ 1) /* sc_cb_ident */ #define decode_setclientid_maxsz \ (op_decode_hdr_maxsz + \ - 2 + \ - 1024) /* large value for CLID_INUSE */ + 2 /* clientid */ + \ + XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \ + 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ + 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN)) #define encode_setclientid_confirm_maxsz \ (op_encode_hdr_maxsz + \ 3 + (NFS4_VERIFIER_SIZE >> 2)) @@ -944,7 +946,10 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n) static void encode_nfs4_seqid(struct xdr_stream *xdr, const struct nfs_seqid *seqid) { - encode_uint32(xdr, seqid->sequence->counter); + if (seqid != NULL) + encode_uint32(xdr, seqid->sequence->counter); + else + encode_uint32(xdr, 0); } static void encode_compound_hdr(struct xdr_stream *xdr, @@ -1123,7 +1128,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg { encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr); encode_nfs4_seqid(xdr, arg->seqid); - encode_nfs4_stateid(xdr, arg->stateid); + encode_nfs4_stateid(xdr, &arg->stateid); } static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr) @@ -1299,12 +1304,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ encode_nfs4_seqid(xdr, args->open_seqid); - encode_nfs4_stateid(xdr, args->open_stateid); + encode_nfs4_stateid(xdr, &args->open_stateid); encode_nfs4_seqid(xdr, args->lock_seqid); encode_lockowner(xdr, &args->lock_owner); } else { - encode_nfs4_stateid(xdr, args->lock_stateid); + encode_nfs4_stateid(xdr, &args->lock_stateid); encode_nfs4_seqid(xdr, args->lock_seqid); } } @@ -1328,7 +1333,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr); encode_uint32(xdr, nfs4_lock_type(args->fl, 0)); encode_nfs4_seqid(xdr, args->seqid); - encode_nfs4_stateid(xdr, args->stateid); + encode_nfs4_stateid(xdr, &args->stateid); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->fl->fl_start); xdr_encode_hyper(p, nfs4_lock_length(args->fl)); @@ -1346,24 +1351,12 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc encode_string(xdr, name->len, name->name); } -static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) +static void encode_share_access(struct xdr_stream *xdr, u32 share_access) { __be32 *p; p = reserve_space(xdr, 8); - switch (fmode & (FMODE_READ|FMODE_WRITE)) { - case FMODE_READ: - *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); - break; - case FMODE_WRITE: - *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); - break; - case FMODE_READ|FMODE_WRITE: - *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); - break; - default: - *p++ = cpu_to_be32(0); - } + *p++ = cpu_to_be32(share_access); *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ } @@ -1375,7 +1368,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena * owner 4 = 32 */ encode_nfs4_seqid(xdr, arg->seqid); - encode_share_access(xdr, arg->fmode); + encode_share_access(xdr, arg->share_access); p = reserve_space(xdr, 36); p = xdr_encode_hyper(p, arg->clientid); *p++ = cpu_to_be32(24); @@ -1528,9 +1521,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr) { encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr); - encode_nfs4_stateid(xdr, arg->stateid); + encode_nfs4_stateid(xdr, &arg->stateid); encode_nfs4_seqid(xdr, arg->seqid); - encode_share_access(xdr, arg->fmode); + encode_share_access(xdr, arg->share_access); } static void @@ -1722,17 +1715,17 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru #if defined(CONFIG_NFS_V4_1) /* NFSv4.1 operations */ static void encode_bind_conn_to_session(struct xdr_stream *xdr, - struct nfs4_session *session, + struct nfs41_bind_conn_to_session_args *args, struct compound_hdr *hdr) { __be32 *p; encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION, decode_bind_conn_to_session_maxsz, hdr); - encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN); p = xdr_reserve_space(xdr, 8); - *p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH); - *p = 0; /* use_conn_in_rdma_mode = False */ + *p++ = cpu_to_be32(args->dir); + *p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0); } static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) @@ -1799,9 +1792,8 @@ static void encode_create_session(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - char machine_name[NFS4_MAX_MACHINE_NAME_LEN]; - uint32_t len; struct nfs_client *clp = args->client; + struct rpc_clnt *clnt = clp->cl_rpcclient; struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id); u32 max_resp_sz_cached; @@ -1812,13 +1804,10 @@ static void encode_create_session(struct xdr_stream *xdr, max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; - len = scnprintf(machine_name, sizeof(machine_name), "%s", - clp->cl_ipaddr); - encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); - p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12); - p = xdr_encode_hyper(p, clp->cl_clientid); - *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12); + p = xdr_encode_hyper(p, args->clientid); + *p++ = cpu_to_be32(args->seqid); /*Sequence id */ *p++ = cpu_to_be32(args->flags); /*flags */ /* Fore Channel */ @@ -1845,7 +1834,7 @@ static void encode_create_session(struct xdr_stream *xdr, /* authsys_parms rfc1831 */ *p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */ - p = xdr_encode_opaque(p, machine_name, len); + p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); *p++ = cpu_to_be32(0); /* UID */ *p++ = cpu_to_be32(0); /* GID */ *p = cpu_to_be32(0); /* No more gids */ @@ -2010,11 +1999,11 @@ encode_layoutreturn(struct xdr_stream *xdr, p = reserve_space(xdr, 16); *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */ *p++ = cpu_to_be32(args->layout_type); - *p++ = cpu_to_be32(IOMODE_ANY); + *p++ = cpu_to_be32(args->range.iomode); *p = cpu_to_be32(RETURN_FILE); p = reserve_space(xdr, 16); - p = xdr_encode_hyper(p, 0); - p = xdr_encode_hyper(p, NFS4_MAX_UINT64); + p = xdr_encode_hyper(p, args->range.offset); + p = xdr_encode_hyper(p, args->range.length); spin_lock(&args->inode->i_lock); encode_nfs4_stateid(xdr, &args->stateid); spin_unlock(&args->inode->i_lock); @@ -2745,14 +2734,14 @@ static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req, */ static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req, struct xdr_stream *xdr, - struct nfs_client *clp) + struct nfs41_bind_conn_to_session_args *args) { struct compound_hdr hdr = { - .minorversion = clp->cl_mvops->minor_version, + .minorversion = args->client->cl_mvops->minor_version, }; encode_compound_hdr(xdr, req, &hdr); - encode_bind_conn_to_session(xdr, clp->cl_session, &hdr); + encode_bind_conn_to_session(xdr, args, &hdr); encode_nops(&hdr); } @@ -4934,20 +4923,13 @@ out_overflow: return -EIO; } -static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +static int decode_rw_delegation(struct xdr_stream *xdr, + uint32_t delegation_type, + struct nfs_openres *res) { __be32 *p; - uint32_t delegation_type; int status; - p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) - goto out_overflow; - delegation_type = be32_to_cpup(p); - if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { - res->delegation_type = 0; - return 0; - } status = decode_stateid(xdr, &res->delegation); if (unlikely(status)) return status; @@ -4971,6 +4953,52 @@ out_overflow: return -EIO; } +static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t why_no_delegation; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + why_no_delegation = be32_to_cpup(p); + switch (why_no_delegation) { + case WND4_CONTENTION: + case WND4_RESOURCE: + xdr_inline_decode(xdr, 4); + /* Ignore for now */ + } + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) +{ + __be32 *p; + uint32_t delegation_type; + + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + delegation_type = be32_to_cpup(p); + res->delegation_type = 0; + switch (delegation_type) { + case NFS4_OPEN_DELEGATE_NONE: + return 0; + case NFS4_OPEN_DELEGATE_READ: + case NFS4_OPEN_DELEGATE_WRITE: + return decode_rw_delegation(xdr, delegation_type, res); + case NFS4_OPEN_DELEGATE_NONE_EXT: + return decode_no_delegation(xdr, res); + } + return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) { __be32 *p; @@ -5585,7 +5613,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION); if (!status) - status = decode_sessionid(xdr, &res->session->sess_id); + status = decode_sessionid(xdr, &res->sessionid); if (unlikely(status)) return status; @@ -5613,12 +5641,10 @@ static int decode_create_session(struct xdr_stream *xdr, { __be32 *p; int status; - struct nfs_client *clp = res->client; - struct nfs4_session *session = clp->cl_session; status = decode_op_hdr(xdr, OP_CREATE_SESSION); if (!status) - status = decode_sessionid(xdr, &session->sess_id); + status = decode_sessionid(xdr, &res->sessionid); if (unlikely(status)) return status; @@ -5626,13 +5652,13 @@ static int decode_create_session(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - clp->cl_seqid = be32_to_cpup(p++); - session->flags = be32_to_cpup(p); + res->seqid = be32_to_cpup(p++); + res->flags = be32_to_cpup(p); /* Channel attributes */ - status = decode_chan_attrs(xdr, &session->fc_attrs); + status = decode_chan_attrs(xdr, &res->fc_attrs); if (!status) - status = decode_chan_attrs(xdr, &session->bc_attrs); + status = decode_chan_attrs(xdr, &res->bc_attrs); return status; out_overflow: print_overflow_msg(__func__, xdr); @@ -6565,6 +6591,7 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; status = decode_compound_hdr(xdr, &hdr); + res->op_status = hdr.status; if (status) goto out; status = decode_sequence(xdr, &res->seq_res, rqstp); @@ -6590,6 +6617,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; status = decode_compound_hdr(xdr, &hdr); + res->op_status = hdr.status; if (status) goto out; status = decode_sequence(xdr, &res->seq_res, rqstp); @@ -6619,6 +6647,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; status = decode_compound_hdr(xdr, &hdr); + res->op_status = hdr.status; if (status) goto out; status = decode_sequence(xdr, &res->seq_res, rqstp); @@ -7394,6 +7423,8 @@ struct rpc_procinfo nfs4_procedures[] = { #endif /* CONFIG_NFS_V4_1 */ #ifdef CONFIG_NFS_V4_2 PROC(SEEK, enc_seek, dec_seek), + PROC(ALLOCATE, enc_allocate, dec_allocate), + PROC(DEALLOCATE, enc_deallocate, dec_deallocate), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index cd3c910d2d12..9bc9f04fb7f6 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -261,11 +261,11 @@ static int __init root_nfs_data(char *cmdline) */ len = snprintf(nfs_export_path, sizeof(nfs_export_path), tmp, utsname()->nodename); - if (len > (int)sizeof(nfs_export_path)) + if (len >= (int)sizeof(nfs_export_path)) goto out_devnametoolong; len = snprintf(nfs_root_device, sizeof(nfs_root_device), "%pI4:%s", &servaddr, nfs_export_path); - if (len > (int)sizeof(nfs_root_device)) + if (len >= (int)sizeof(nfs_root_device)) goto out_devnametoolong; retval = 0; diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 9e5bc42180e4..24e1d7403c0b 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -537,11 +537,12 @@ int objio_write_pagelist(struct nfs_pgio_header *hdr, int how) static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio); unsigned int size; size = pnfs_generic_pg_test(pgio, prev, req); - if (!size || pgio->pg_count + req->wb_bytes > + if (!size || mirror->pg_count + req->wb_bytes > (unsigned long)pgio->pg_layout_private) return 0; @@ -607,12 +608,14 @@ static const struct nfs_pageio_ops objio_pg_read_ops = { .pg_init = objio_init_read, .pg_test = objio_pg_test, .pg_doio = pnfs_generic_pg_readpages, + .pg_cleanup = pnfs_generic_pg_cleanup, }; static const struct nfs_pageio_ops objio_pg_write_ops = { .pg_init = objio_init_write, .pg_test = objio_pg_test, .pg_doio = pnfs_generic_pg_writepages, + .pg_cleanup = pnfs_generic_pg_cleanup, }; static struct pnfs_layoutdriver_type objlayout_type = { diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ed0db61f8543..d57190a0d533 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -42,21 +42,35 @@ static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) return p->pagevec != NULL; } +struct nfs_pgio_mirror * +nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) +{ + return nfs_pgio_has_mirroring(desc) ? + &desc->pg_mirrors[desc->pg_mirror_idx] : + &desc->pg_mirrors[0]; +} +EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror); + void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)) { - hdr->req = nfs_list_entry(desc->pg_list.next); + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + + hdr->req = nfs_list_entry(mirror->pg_list.next); hdr->inode = desc->pg_inode; hdr->cred = hdr->req->wb_context->cred; hdr->io_start = req_offset(hdr->req); - hdr->good_bytes = desc->pg_count; + hdr->good_bytes = mirror->pg_count; hdr->dreq = desc->pg_dreq; hdr->layout_private = desc->pg_layout_private; hdr->release = release; hdr->completion_ops = desc->pg_completion_ops; if (hdr->completion_ops->init_hdr) hdr->completion_ops->init_hdr(hdr); + + hdr->pgio_mirror_idx = desc->pg_mirror_idx; } EXPORT_SYMBOL_GPL(nfs_pgheader_init); @@ -258,6 +272,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) static inline void nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) { + struct inode *inode; WARN_ON_ONCE(prev == req); if (!prev) { @@ -276,12 +291,16 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) * nfs_page_group_destroy is called */ kref_get(&req->wb_head->wb_kref); - /* grab extra ref if head request has extra ref from - * the write/commit path to handle handoff between write - * and commit lists */ + /* grab extra ref and bump the request count if head request + * has extra ref from the write/commit path to handle handoff + * between write and commit lists. */ if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) { + inode = page_file_mapping(req->wb_page)->host; set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); + spin_lock(&inode->i_lock); + NFS_I(inode)->nrequests++; + spin_unlock(&inode->i_lock); } } } @@ -475,7 +494,10 @@ nfs_wait_on_request(struct nfs_page *req) size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) { - if (desc->pg_count > desc->pg_bsize) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + + if (mirror->pg_count > mirror->pg_bsize) { /* should never happen */ WARN_ON_ONCE(1); return 0; @@ -485,11 +507,11 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, * Limit the request size so that we can still allocate a page array * for it without upsetting the slab allocator. */ - if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) * + if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) * sizeof(struct page) > PAGE_SIZE) return 0; - return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); + return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes); } EXPORT_SYMBOL_GPL(nfs_generic_pg_test); @@ -592,13 +614,14 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) } int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, + struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, const struct rpc_call_ops *call_ops, int how, int flags) { struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &hdr->args, .rpc_resp = &hdr->res, - .rpc_cred = hdr->cred, + .rpc_cred = cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, @@ -611,7 +634,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, }; int ret = 0; - hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how); + hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); dprintk("NFS: %5u initiated pgio call " "(req %s/%llu, %u bytes @ offset %llu)\n", @@ -645,10 +668,18 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { + struct nfs_pgio_mirror *mirror; + u32 midx; + set_bit(NFS_IOHDR_REDO, &hdr->flags); nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); - desc->pg_completion_ops->error_cleanup(&desc->pg_list); + /* TODO: Make sure it's right to clean up all mirrors here + * and not just hdr->pgio_mirror_idx */ + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + } return -ENOMEM; } @@ -665,6 +696,17 @@ static void nfs_pgio_release(void *calldata) hdr->completion_ops->completion(hdr); } +static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror, + unsigned int bsize) +{ + INIT_LIST_HEAD(&mirror->pg_list); + mirror->pg_bytes_written = 0; + mirror->pg_count = 0; + mirror->pg_bsize = bsize; + mirror->pg_base = 0; + mirror->pg_recoalesce = 0; +} + /** * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor @@ -681,13 +723,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, size_t bsize, int io_flags) { - INIT_LIST_HEAD(&desc->pg_list); - desc->pg_bytes_written = 0; - desc->pg_count = 0; - desc->pg_bsize = bsize; - desc->pg_base = 0; + struct nfs_pgio_mirror *new; + int i; + desc->pg_moreio = 0; - desc->pg_recoalesce = 0; desc->pg_inode = inode; desc->pg_ops = pg_ops; desc->pg_completion_ops = compl_ops; @@ -697,6 +736,26 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_lseg = NULL; desc->pg_dreq = NULL; desc->pg_layout_private = NULL; + desc->pg_bsize = bsize; + + desc->pg_mirror_count = 1; + desc->pg_mirror_idx = 0; + + if (pg_ops->pg_get_mirror_count) { + /* until we have a request, we don't have an lseg and no + * idea how many mirrors there will be */ + new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, + sizeof(struct nfs_pgio_mirror), GFP_KERNEL); + desc->pg_mirrors_dynamic = new; + desc->pg_mirrors = new; + + for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++) + nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize); + } else { + desc->pg_mirrors_dynamic = NULL; + desc->pg_mirrors = desc->pg_mirrors_static; + nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); + } } EXPORT_SYMBOL_GPL(nfs_pageio_init); @@ -732,14 +791,16 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata) int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + struct nfs_page *req; struct page **pages, *last_page; - struct list_head *head = &desc->pg_list; + struct list_head *head = &mirror->pg_list; struct nfs_commit_info cinfo; unsigned int pagecount, pageused; - pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); + pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); if (!nfs_pgarray_set(&hdr->page_array, pagecount)) return nfs_pgio_error(desc, hdr); @@ -767,7 +828,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, desc->pg_ioflags &= ~FLUSH_COND_STABLE; /* Set up the argument struct */ - nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo); + nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo); desc->pg_rpc_callops = &nfs_pgio_common_ops; return 0; } @@ -775,23 +836,74 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio); static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { + struct nfs_pgio_mirror *mirror; struct nfs_pgio_header *hdr; int ret; + mirror = nfs_pgio_current_mirror(desc); + hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); + /* TODO: make sure this is right with mirroring - or + * should it back out all mirrors? */ + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); return -ENOMEM; } nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = nfs_generic_pgio(desc, hdr); if (ret == 0) ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), - hdr, desc->pg_rpc_callops, + hdr, + hdr->cred, + NFS_PROTO(hdr->inode), + desc->pg_rpc_callops, desc->pg_ioflags, 0); return ret; } +/* + * nfs_pageio_setup_mirroring - determine if mirroring is to be used + * by calling the pg_get_mirror_count op + */ +static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req) +{ + int mirror_count = 1; + + if (!pgio->pg_ops->pg_get_mirror_count) + return 0; + + mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) + return -EINVAL; + + if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic)) + return -EINVAL; + + pgio->pg_mirror_count = mirror_count; + + return 0; +} + +/* + * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1) + */ +void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_mirror_count = 1; + pgio->pg_mirror_idx = 0; +} + +static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) +{ + pgio->pg_mirror_count = 1; + pgio->pg_mirror_idx = 0; + pgio->pg_mirrors = pgio->pg_mirrors_static; + kfree(pgio->pg_mirrors_dynamic); + pgio->pg_mirrors_dynamic = NULL; +} + static bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { @@ -821,11 +933,15 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, struct nfs_pageio_descriptor *pgio) { size_t size; + struct file_lock_context *flctx; if (prev) { if (!nfs_match_open_context(req->wb_context, prev->wb_context)) return false; - if (req->wb_context->dentry->d_inode->i_flock != NULL && + flctx = req->wb_context->dentry->d_inode->i_flctx; + if (flctx != NULL && + !(list_empty_careful(&flctx->flc_posix) && + list_empty_careful(&flctx->flc_flock)) && !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) return false; @@ -858,19 +974,22 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + struct nfs_page *prev = NULL; - if (desc->pg_count != 0) { - prev = nfs_list_entry(desc->pg_list.prev); + + if (mirror->pg_count != 0) { + prev = nfs_list_entry(mirror->pg_list.prev); } else { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); - desc->pg_base = req->wb_pgbase; + mirror->pg_base = req->wb_pgbase; } if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; nfs_list_remove_request(req); - nfs_list_add_request(req, &desc->pg_list); - desc->pg_count += req->wb_bytes; + nfs_list_add_request(req, &mirror->pg_list); + mirror->pg_count += req->wb_bytes; return 1; } @@ -879,16 +998,19 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, */ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { - if (!list_empty(&desc->pg_list)) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + + + if (!list_empty(&mirror->pg_list)) { int error = desc->pg_ops->pg_doio(desc); if (error < 0) desc->pg_error = error; else - desc->pg_bytes_written += desc->pg_count; + mirror->pg_bytes_written += mirror->pg_count; } - if (list_empty(&desc->pg_list)) { - desc->pg_count = 0; - desc->pg_base = 0; + if (list_empty(&mirror->pg_list)) { + mirror->pg_count = 0; + mirror->pg_base = 0; } } @@ -906,6 +1028,8 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + struct nfs_page *subreq; unsigned int bytes_left = 0; unsigned int offset, pgbase; @@ -929,7 +1053,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_pageio_doio(desc); if (desc->pg_error < 0) return 0; - if (desc->pg_recoalesce) + if (mirror->pg_recoalesce) return 0; /* retry add_request for this subreq */ nfs_page_group_lock(req, false); @@ -967,14 +1091,16 @@ err_ptr: static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); LIST_HEAD(head); do { - list_splice_init(&desc->pg_list, &head); - desc->pg_bytes_written -= desc->pg_count; - desc->pg_count = 0; - desc->pg_base = 0; - desc->pg_recoalesce = 0; + list_splice_init(&mirror->pg_list, &head); + mirror->pg_bytes_written -= mirror->pg_count; + mirror->pg_count = 0; + mirror->pg_base = 0; + mirror->pg_recoalesce = 0; + desc->pg_moreio = 0; while (!list_empty(&head)) { @@ -988,11 +1114,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) return 0; break; } - } while (desc->pg_recoalesce); + } while (mirror->pg_recoalesce); return 1; } -int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, +static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc, struct nfs_page *req) { int ret; @@ -1005,9 +1131,80 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, break; ret = nfs_do_recoalesce(desc); } while (ret); + return ret; } +int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + u32 midx; + unsigned int pgbase, offset, bytes; + struct nfs_page *dupreq, *lastreq; + + pgbase = req->wb_pgbase; + offset = req->wb_offset; + bytes = req->wb_bytes; + + nfs_pageio_setup_mirroring(desc, req); + + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + if (midx) { + nfs_page_group_lock(req, false); + + /* find the last request */ + for (lastreq = req->wb_head; + lastreq->wb_this_page != req->wb_head; + lastreq = lastreq->wb_this_page) + ; + + dupreq = nfs_create_request(req->wb_context, + req->wb_page, lastreq, pgbase, bytes); + + if (IS_ERR(dupreq)) { + nfs_page_group_unlock(req); + return 0; + } + + nfs_lock_request(dupreq); + nfs_page_group_unlock(req); + dupreq->wb_offset = offset; + dupreq->wb_index = req->wb_index; + } else + dupreq = req; + + if (nfs_pgio_has_mirroring(desc)) + desc->pg_mirror_idx = midx; + if (!nfs_pageio_add_request_mirror(desc, dupreq)) + return 0; + } + + return 1; +} + +/* + * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an + * nfs_pageio_descriptor + * @desc: pointer to io descriptor + */ +static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, + u32 mirror_idx) +{ + struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx]; + u32 restore_idx = desc->pg_mirror_idx; + + if (nfs_pgio_has_mirroring(desc)) + desc->pg_mirror_idx = mirror_idx; + for (;;) { + nfs_pageio_doio(desc); + if (!mirror->pg_recoalesce) + break; + if (!nfs_do_recoalesce(desc)) + break; + } + desc->pg_mirror_idx = restore_idx; +} + /* * nfs_pageio_resend - Transfer requests to new descriptor and resend * @hdr - the pgio header to move request from @@ -1041,18 +1238,19 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, EXPORT_SYMBOL_GPL(nfs_pageio_resend); /** - * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor + * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor * @desc: pointer to io descriptor */ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) { - for (;;) { - nfs_pageio_doio(desc); - if (!desc->pg_recoalesce) - break; - if (!nfs_do_recoalesce(desc)) - break; - } + u32 midx; + + for (midx = 0; midx < desc->pg_mirror_count; midx++) + nfs_pageio_complete_mirror(desc, midx); + + if (desc->pg_ops->pg_cleanup) + desc->pg_ops->pg_cleanup(desc); + nfs_pageio_cleanup_mirroring(desc); } /** @@ -1068,10 +1266,17 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) */ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) { - if (!list_empty(&desc->pg_list)) { - struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); - if (index != prev->wb_index + 1) - nfs_pageio_complete(desc); + struct nfs_pgio_mirror *mirror; + struct nfs_page *prev; + u32 midx; + + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + if (!list_empty(&mirror->pg_list)) { + prev = nfs_list_entry(mirror->pg_list.prev); + if (index != prev->wb_index + 1) + nfs_pageio_complete_mirror(desc, midx); + } } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0a5dda4d85c2..4f802b02fbb9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -34,6 +34,7 @@ #include "pnfs.h" #include "iostat.h" #include "nfs4trace.h" +#include "delegation.h" #define NFSDBG_FACILITY NFSDBG_PNFS #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) @@ -50,6 +51,10 @@ static DEFINE_SPINLOCK(pnfs_spinlock); */ static LIST_HEAD(pnfs_modules_tbl); +static int +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, + enum pnfs_iomode iomode, bool sync); + /* Return the registered pnfs layout driver module matching given id */ static struct pnfs_layoutdriver_type * find_pnfs_driver_locked(u32 id) @@ -238,6 +243,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) struct inode *inode = lo->plh_inode; if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + if (!list_empty(&lo->plh_segs)) + WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); spin_unlock(&inode->i_lock); pnfs_free_layout_hdr(lo); @@ -337,6 +344,48 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } +/* Return true if layoutreturn is needed */ +static bool +pnfs_layout_need_return(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_segment *s; + + if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + return false; + + list_for_each_entry(s, &lo->plh_segs, pls_list) + if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) + return false; + + return true; +} + +static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, + struct pnfs_layout_hdr *lo, struct inode *inode) +{ + lo = lseg->pls_layout; + inode = lo->plh_inode; + + spin_lock(&inode->i_lock); + if (pnfs_layout_need_return(lo, lseg)) { + nfs4_stateid stateid; + enum pnfs_iomode iomode; + + stateid = lo->plh_stateid; + iomode = lo->plh_return_iomode; + /* decreased in pnfs_send_layoutreturn() */ + lo->plh_block_lgets++; + lo->plh_return_iomode = 0; + spin_unlock(&inode->i_lock); + pnfs_get_layout_hdr(lo); + + /* Send an async layoutreturn so we dont deadlock */ + pnfs_send_layoutreturn(lo, stateid, iomode, false); + } else + spin_unlock(&inode->i_lock); +} + void pnfs_put_lseg(struct pnfs_layout_segment *lseg) { @@ -349,8 +398,17 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + + /* Handle the case where refcount != 1 */ + if (atomic_add_unless(&lseg->pls_refcount, -1, 1)) + return; + lo = lseg->pls_layout; inode = lo->plh_inode; + /* Do we need a layoutreturn? */ + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); + if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); @@ -543,6 +601,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_get_layout_hdr(lo); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); + pnfs_clear_retry_layoutget(lo); spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -740,25 +799,37 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); } +static bool +pnfs_layout_returning(const struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) +{ + return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && + (lo->plh_return_iomode == IOMODE_ANY || + lo->plh_return_iomode == range->iomode); +} + /* lget is set to 1 if called from inside send_layoutget call chain */ static bool -pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) +pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range, int lget) { return lo->plh_block_lgets || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || (list_empty(&lo->plh_segs) && - (atomic_read(&lo->plh_outstanding) > lget)); + (atomic_read(&lo->plh_outstanding) > lget)) || + pnfs_layout_returning(lo, range); } int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range, struct nfs4_state *open_state) { int status = 0; dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); - if (pnfs_layoutgets_blocked(lo, 1)) { + if (pnfs_layoutgets_blocked(lo, range, 1)) { status = -EAGAIN; } else if (!nfs4_valid_open_stateid(open_state)) { status = -EBADF; @@ -825,7 +896,9 @@ send_layoutget(struct pnfs_layout_hdr *lo, pnfs_layout_io_set_failed(lo, range->iomode); } return NULL; - } + } else + pnfs_layout_clear_fail_bit(lo, + pnfs_iomode_to_fail_bit(range->iomode)); return lseg; } @@ -845,6 +918,49 @@ static void pnfs_clear_layoutcommit(struct inode *inode, } } +void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) +{ + clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); + smp_mb__after_atomic(); + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); +} + +static int +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, + enum pnfs_iomode iomode, bool sync) +{ + struct inode *ino = lo->plh_inode; + struct nfs4_layoutreturn *lrp; + int status = 0; + + lrp = kzalloc(sizeof(*lrp), GFP_NOFS); + if (unlikely(lrp == NULL)) { + status = -ENOMEM; + spin_lock(&ino->i_lock); + lo->plh_block_lgets--; + pnfs_clear_layoutreturn_waitbit(lo); + rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); + spin_unlock(&ino->i_lock); + pnfs_put_layout_hdr(lo); + goto out; + } + + lrp->args.stateid = stateid; + lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; + lrp->args.inode = ino; + lrp->args.range.iomode = iomode; + lrp->args.range.offset = 0; + lrp->args.range.length = NFS4_MAX_UINT64; + lrp->args.layout = lo; + lrp->clp = NFS_SERVER(ino)->nfs_client; + lrp->cred = lo->plh_lc_cred; + + status = nfs4_proc_layoutreturn(lrp, sync); +out: + dprintk("<-- %s status: %d\n", __func__, status); + return status; +} + /* * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr * when the layout segment list is empty. @@ -859,7 +975,6 @@ _pnfs_return_layout(struct inode *ino) struct pnfs_layout_hdr *lo = NULL; struct nfs_inode *nfsi = NFS_I(ino); LIST_HEAD(tmp_list); - struct nfs4_layoutreturn *lrp; nfs4_stateid stateid; int status = 0, empty; @@ -901,24 +1016,7 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); - if (unlikely(lrp == NULL)) { - status = -ENOMEM; - spin_lock(&ino->i_lock); - lo->plh_block_lgets--; - spin_unlock(&ino->i_lock); - pnfs_put_layout_hdr(lo); - goto out; - } - - lrp->args.stateid = stateid; - lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; - lrp->args.inode = ino; - lrp->args.layout = lo; - lrp->clp = NFS_SERVER(ino)->nfs_client; - lrp->cred = lo->plh_lc_cred; - - status = nfs4_proc_layoutreturn(lrp); + status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); out: dprintk("<-- %s status: %d\n", __func__, status); return status; @@ -954,31 +1052,60 @@ pnfs_commit_and_return_layout(struct inode *inode) bool pnfs_roc(struct inode *ino) { + struct nfs_inode *nfsi = NFS_I(ino); + struct nfs_open_context *ctx; + struct nfs4_state *state; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg, *tmp; + nfs4_stateid stateid; LIST_HEAD(tmp_list); - bool found = false; + bool found = false, layoutreturn = false; spin_lock(&ino->i_lock); - lo = NFS_I(ino)->layout; + lo = nfsi->layout; if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) - goto out_nolayout; + goto out_noroc; + + /* Don't return layout if we hold a delegation */ + if (nfs4_check_delegation(ino, FMODE_READ)) + goto out_noroc; + + list_for_each_entry(ctx, &nfsi->open_files, list) { + state = ctx->state; + /* Don't return layout if there is open file state */ + if (state != NULL && state->state != 0) + goto out_noroc; + } + + pnfs_clear_retry_layoutget(lo); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { mark_lseg_invalid(lseg, &tmp_list); found = true; } if (!found) - goto out_nolayout; + goto out_noroc; lo->plh_block_lgets++; pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); return true; -out_nolayout: +out_noroc: + if (lo) { + stateid = lo->plh_stateid; + layoutreturn = + test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags); + if (layoutreturn) { + lo->plh_block_lgets++; + pnfs_get_layout_hdr(lo); + } + } spin_unlock(&ino->i_lock); + if (layoutreturn) + pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); return false; } @@ -1013,8 +1140,9 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) struct nfs_inode *nfsi = NFS_I(ino); struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg; + nfs4_stateid stateid; u32 current_seqid; - bool found = false; + bool found = false, layoutreturn = false; spin_lock(&ino->i_lock); list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) @@ -1031,7 +1159,21 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) */ *barrier = current_seqid + atomic_read(&lo->plh_outstanding); out: + if (!found) { + stateid = lo->plh_stateid; + layoutreturn = + test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags); + if (layoutreturn) { + lo->plh_block_lgets++; + pnfs_get_layout_hdr(lo); + } + } spin_unlock(&ino->i_lock); + if (layoutreturn) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false); + } return found; } @@ -1178,6 +1320,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && + !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && pnfs_lseg_range_match(&lseg->pls_range, range)) { ret = pnfs_get_lseg(lseg); break; @@ -1266,6 +1409,35 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, return ret; } +/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ +static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) +{ + if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) + return 1; + return nfs_wait_bit_killable(key); +} + +static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) +{ + /* + * send layoutcommit as it can hold up layoutreturn due to lseg + * reference + */ + pnfs_layoutcommit_inode(lo->plh_inode, false); + return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, + pnfs_layoutget_retry_bit_wait, + TASK_UNINTERRUPTIBLE); +} + +static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo) +{ + unsigned long *bitlock = &lo->plh_flags; + + clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET); +} + /* * Layout segment is retreived from the server if not cached. * The appropriate layout segment is referenced and returned to the caller. @@ -1296,6 +1468,8 @@ pnfs_update_layout(struct inode *ino, if (pnfs_within_mdsthreshold(ctx, ino, iomode)) goto out; +lookup_again: + first = false; spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { @@ -1310,27 +1484,62 @@ pnfs_update_layout(struct inode *ino, } /* if LAYOUTGET already failed once we don't try again */ - if (pnfs_layout_io_test_failed(lo, iomode)) + if (pnfs_layout_io_test_failed(lo, iomode) && + !pnfs_should_retry_layoutget(lo)) goto out_unlock; - /* Check to see if the layout for the given range already exists */ - lseg = pnfs_find_lseg(lo, &arg); - if (lseg) - goto out_unlock; + first = list_empty(&lo->plh_segs); + if (first) { + /* The first layoutget for the file. Need to serialize per + * RFC 5661 Errata 3208. + */ + if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, + &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET, + TASK_UNINTERRUPTIBLE); + pnfs_put_layout_hdr(lo); + goto lookup_again; + } + } else { + /* Check to see if the layout for the given range + * already exists + */ + lseg = pnfs_find_lseg(lo, &arg); + if (lseg) + goto out_unlock; + } + + /* + * Because we free lsegs before sending LAYOUTRETURN, we need to wait + * for LAYOUTRETURN even if first is true. + */ + if (!lseg && pnfs_should_retry_layoutget(lo) && + test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + dprintk("%s wait for layoutreturn\n", __func__); + if (pnfs_prepare_to_retry_layoutget(lo)) { + if (first) + pnfs_clear_first_layoutget(lo); + pnfs_put_layout_hdr(lo); + dprintk("%s retrying\n", __func__); + goto lookup_again; + } + goto out_put_layout_hdr; + } - if (pnfs_layoutgets_blocked(lo, 0)) + if (pnfs_layoutgets_blocked(lo, &arg, 0)) goto out_unlock; atomic_inc(&lo->plh_outstanding); - - first = list_empty(&lo->plh_layouts) ? true : false; spin_unlock(&ino->i_lock); - if (first) { + if (list_empty(&lo->plh_layouts)) { /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. */ spin_lock(&clp->cl_lock); - list_add_tail(&lo->plh_layouts, &server->layouts); + if (list_empty(&lo->plh_layouts)) + list_add_tail(&lo->plh_layouts, &server->layouts); spin_unlock(&clp->cl_lock); } @@ -1343,8 +1552,11 @@ pnfs_update_layout(struct inode *ino, arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); + pnfs_clear_retry_layoutget(lo); atomic_dec(&lo->plh_outstanding); out_put_layout_hdr: + if (first) + pnfs_clear_first_layoutget(lo); pnfs_put_layout_hdr(lo); out: dprintk("%s: inode %s/%llu pNFS layout segment %s for " @@ -1393,7 +1605,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget_reply; } - if (pnfs_layoutgets_blocked(lo, 1)) { + if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) { dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } @@ -1440,24 +1652,79 @@ out_forget_reply: goto out; } +static void +pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *return_range) +{ + struct pnfs_layout_segment *lseg, *next; + + dprintk("%s:Begin lo %p\n", __func__, lo); + + if (list_empty(&lo->plh_segs)) + return; + + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + if (should_free_lseg(&lseg->pls_range, return_range)) { + dprintk("%s: marking lseg %p iomode %d " + "offset %llu length %llu\n", __func__, + lseg, lseg->pls_range.iomode, + lseg->pls_range.offset, + lseg->pls_range.length); + set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + mark_lseg_invalid(lseg, tmp_list); + } +} + +void pnfs_error_mark_layout_for_return(struct inode *inode, + struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; + int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode); + struct pnfs_layout_range range = { + .iomode = lseg->pls_range.iomode, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(free_me); + + spin_lock(&inode->i_lock); + /* set failure bit so that pnfs path will be retried later */ + pnfs_layout_set_fail_bit(lo, iomode); + set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); + if (lo->plh_return_iomode == 0) + lo->plh_return_iomode = range.iomode; + else if (lo->plh_return_iomode != range.iomode) + lo->plh_return_iomode = IOMODE_ANY; + /* + * mark all matching lsegs so that we are sure to have no live + * segments at hand when sending layoutreturn. See pnfs_put_lseg() + * for how it works. + */ + pnfs_mark_matching_lsegs_return(lo, &free_me, &range); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&free_me); +} +EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); + void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { u64 rd_size = req->wb_bytes; - WARN_ON_ONCE(pgio->pg_lseg != NULL); - - if (pgio->pg_dreq == NULL) - rd_size = i_size_read(pgio->pg_inode) - req_offset(req); - else - rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - req_offset(req), - rd_size, - IOMODE_READ, - GFP_KERNEL); + if (pgio->pg_lseg == NULL) { + if (pgio->pg_dreq == NULL) + rd_size = i_size_read(pgio->pg_inode) - req_offset(req); + else + rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + rd_size, + IOMODE_READ, + GFP_KERNEL); + } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_read_mds(pgio); @@ -1469,27 +1736,36 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - WARN_ON_ONCE(pgio->pg_lseg != NULL); - - pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, - req->wb_context, - req_offset(req), - wb_size, - IOMODE_RW, - GFP_NOFS); + if (pgio->pg_lseg == NULL) + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + req->wb_context, + req_offset(req), + wb_size, + IOMODE_RW, + GFP_NOFS); /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_write_mds(pgio); } EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); +void +pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc) +{ + if (desc->pg_lseg) { + pnfs_put_lseg(desc->pg_lseg); + desc->pg_lseg = NULL; + } +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup); + /* * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number * of bytes (maximum @req->wb_bytes) that can be coalesced. */ size_t -pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, - struct nfs_page *req) +pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, struct nfs_page *req) { unsigned int size; u64 seg_end, req_start, seg_left; @@ -1513,10 +1789,16 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, seg_end = end_offset(pgio->pg_lseg->pls_range.offset, pgio->pg_lseg->pls_range.length); req_start = req_offset(req); - WARN_ON_ONCE(req_start > seg_end); + WARN_ON_ONCE(req_start >= seg_end); /* start of request is past the last byte of this segment */ - if (req_start >= seg_end) + if (req_start >= seg_end) { + /* reference the new lseg */ + if (pgio->pg_ops->pg_cleanup) + pgio->pg_ops->pg_cleanup(pgio); + if (pgio->pg_ops->pg_init) + pgio->pg_ops->pg_init(pgio, req); return 0; + } /* adjust 'size' iff there are fewer bytes left in the * segment than what nfs_generic_pg_test returned */ @@ -1571,10 +1853,12 @@ static void pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { - list_splice_tail_init(&hdr->pages, &desc->pg_list); + list_splice_tail_init(&hdr->pages, &mirror->pg_list); nfs_pageio_reset_write_mds(desc); - desc->pg_recoalesce = 1; + mirror->pg_recoalesce = 1; } nfs_pgio_data_destroy(hdr); } @@ -1608,11 +1892,9 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, struct pnfs_layout_segment *lseg = desc->pg_lseg; enum pnfs_try_status trypnfs; - desc->pg_lseg = NULL; trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); if (trypnfs == PNFS_NOT_ATTEMPTED) pnfs_write_through_mds(desc, hdr); - pnfs_put_lseg(lseg); } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) @@ -1625,24 +1907,23 @@ EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); - pnfs_put_lseg(desc->pg_lseg); - desc->pg_lseg = NULL; + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); return -ENOMEM; } nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); + hdr->lseg = pnfs_get_lseg(desc->pg_lseg); ret = nfs_generic_pgio(desc, hdr); - if (ret != 0) { - pnfs_put_lseg(desc->pg_lseg); - desc->pg_lseg = NULL; - } else + if (!ret) pnfs_do_write(desc, hdr, desc->pg_ioflags); + return ret; } EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); @@ -1687,10 +1968,12 @@ static void pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { - list_splice_tail_init(&hdr->pages, &desc->pg_list); + list_splice_tail_init(&hdr->pages, &mirror->pg_list); nfs_pageio_reset_read_mds(desc); - desc->pg_recoalesce = 1; + mirror->pg_recoalesce = 1; } nfs_pgio_data_destroy(hdr); } @@ -1719,18 +2002,29 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr, return trypnfs; } +/* Resend all requests through pnfs. */ +int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) +{ + struct nfs_pageio_descriptor pgio; + + nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); + return nfs_pageio_resend(&pgio, hdr); +} +EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs); + static void pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) { const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; struct pnfs_layout_segment *lseg = desc->pg_lseg; enum pnfs_try_status trypnfs; + int err = 0; - desc->pg_lseg = NULL; trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); - if (trypnfs == PNFS_NOT_ATTEMPTED) + if (trypnfs == PNFS_TRY_AGAIN) + err = pnfs_read_resend_pnfs(hdr); + if (trypnfs == PNFS_NOT_ATTEMPTED || err) pnfs_read_through_mds(desc, hdr); - pnfs_put_lseg(lseg); } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) @@ -1743,24 +2037,20 @@ EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&desc->pg_list); - ret = -ENOMEM; - pnfs_put_lseg(desc->pg_lseg); - desc->pg_lseg = NULL; - return ret; + desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + return -ENOMEM; } nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); ret = nfs_generic_pgio(desc, hdr); - if (ret != 0) { - pnfs_put_lseg(desc->pg_lseg); - desc->pg_lseg = NULL; - } else + if (!ret) pnfs_do_read(desc, hdr); return ret; } @@ -1966,6 +2256,7 @@ clear_layoutcommitting: pnfs_clear_layoutcommitting(inode); goto out; } +EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 9ae5b765b073..635f0865671c 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -38,6 +38,25 @@ enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ NFS_LSEG_ROC, /* roc bit received from server */ NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ + NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */ +}; + +/* Individual ip address */ +struct nfs4_pnfs_ds_addr { + struct sockaddr_storage da_addr; + size_t da_addrlen; + struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + char *da_remotestr; /* human readable addr+port */ +}; + +struct nfs4_pnfs_ds { + struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ + char *ds_remotestr; /* comma sep list of addrs */ + struct list_head ds_addrs; + struct nfs_client *ds_clp; + atomic_t ds_count; + unsigned long ds_state; +#define NFS4DS_CONNECTING 0 /* ds is establishing connection */ }; struct pnfs_layout_segment { @@ -53,19 +72,34 @@ struct pnfs_layout_segment { enum pnfs_try_status { PNFS_ATTEMPTED = 0, PNFS_NOT_ATTEMPTED = 1, + PNFS_TRY_AGAIN = 2, }; #ifdef CONFIG_NFS_V4_1 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" +/* + * Default data server connection timeout and retrans vaules. + * Set by module parameters dataserver_timeo and dataserver_retrans. + */ +#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */ +#define NFS4_DEF_DS_RETRANS 5 + +/* error codes for internal use */ +#define NFS4ERR_RESET_TO_MDS 12001 +#define NFS4ERR_RESET_TO_PNFS 12002 + enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_RETURN, /* Return this layout ASAP */ + NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ + NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ + NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */ }; enum layoutdriver_policy_flags { @@ -106,7 +140,8 @@ struct pnfs_layoutdriver_type { struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); void (*mark_request_commit) (struct nfs_page *req, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo); + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); void (*clear_request_commit) (struct nfs_page *req, struct nfs_commit_info *cinfo); int (*scan_commit_lists) (struct nfs_commit_info *cinfo, @@ -154,6 +189,7 @@ struct pnfs_layout_hdr { u32 plh_barrier; /* ignore lower seqids */ unsigned long plh_retry_timestamp; unsigned long plh_flags; + enum pnfs_iomode plh_return_iomode; loff_t plh_lwb; /* last write byte for layoutcommit */ struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ struct inode *plh_inode; @@ -185,7 +221,7 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, struct rpc_cred *cred); extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); -extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); +extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); /* pnfs.c */ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); @@ -198,6 +234,7 @@ void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page * int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size); +void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req); @@ -217,6 +254,7 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, bool update_barrier); int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range, struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, @@ -233,17 +271,21 @@ int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); void pnfs_ld_read_done(struct nfs_pgio_header *); +int pnfs_read_resend_pnfs(struct nfs_pgio_header *); struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, loff_t pos, u64 count, enum pnfs_iomode iomode, gfp_t gfp_flags); +void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); +void pnfs_error_mark_layout_for_return(struct inode *inode, + struct pnfs_layout_segment *lseg); /* nfs4_deviceid_flags */ enum { @@ -275,6 +317,43 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); +/* pnfs_nfs.c */ +void pnfs_generic_clear_request_commit(struct nfs_page *req, + struct nfs_commit_info *cinfo); +void pnfs_generic_commit_release(void *calldata); +void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data); +void pnfs_generic_rw_release(void *data); +void pnfs_generic_recover_commit_reqs(struct list_head *dst, + struct nfs_commit_info *cinfo); +int pnfs_generic_commit_pagelist(struct inode *inode, + struct list_head *mds_pages, + int how, + struct nfs_commit_info *cinfo, + int (*initiate_commit)(struct nfs_commit_data *data, + int how)); +int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max); +void pnfs_generic_write_commit_done(struct rpc_task *task, void *data); +void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, + gfp_t gfp_flags); +void nfs4_pnfs_v3_ds_connect_unload(void); +void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, + struct nfs4_deviceid_node *devid, unsigned int timeo, + unsigned int retrans, u32 version, u32 minor_version, + rpc_authflavor_t au_flavor); +struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net, + struct xdr_stream *xdr, + gfp_t gfp_flags); +void pnfs_layout_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx); + +static inline bool nfs_have_layout(struct inode *inode) +{ + return NFS_I(inode)->layout != NULL; +} + static inline struct nfs4_deviceid_node * nfs4_get_deviceid(struct nfs4_deviceid_node *d) { @@ -282,6 +361,26 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d) return d; } +static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo) +{ + if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) + atomic_inc(&lo->plh_refcount); +} + +static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo) +{ + if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) { + atomic_dec(&lo->plh_refcount); + /* wake up waiters for LAYOUTRETURN as that is not needed */ + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + } +} + +static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo) +{ + return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags); +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { @@ -317,16 +416,22 @@ pnfs_get_ds_info(struct inode *inode) return ld->get_ds_info(inode); } +static inline void +pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node) +{ + set_bit(NFS_DEVICEID_INVALID, &node->flags); +} + static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo) + struct nfs_commit_info *cinfo, u32 ds_commit_idx) { struct inode *inode = req->wb_context->dentry->d_inode; struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (lseg == NULL || ld->mark_request_commit == NULL) return false; - ld->mark_request_commit(req, lseg, cinfo); + ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx); return true; } @@ -352,15 +457,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); } -static inline void -pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list, - struct nfs_commit_info *cinfo) -{ - if (cinfo->ds == NULL || cinfo->ds->nwritten == 0) - return; - NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); -} - static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, struct page *page) @@ -427,6 +523,11 @@ static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id) #endif /* NFS_DEBUG */ #else /* CONFIG_NFS_V4_1 */ +static inline bool nfs_have_layout(struct inode *inode) +{ + return false; +} + static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) { } @@ -513,7 +614,7 @@ pnfs_get_ds_info(struct inode *inode) static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo) + struct nfs_commit_info *cinfo, u32 ds_commit_idx) { return false; } @@ -531,12 +632,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo, return 0; } -static inline void -pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list, - struct nfs_commit_info *cinfo) -{ -} - static inline struct nfs_page * pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, struct page *page) @@ -568,6 +663,10 @@ static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) return NULL; } +static inline void nfs4_pnfs_v3_ds_connect_unload(void) +{ +} + #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c new file mode 100644 index 000000000000..54e36b38fb5f --- /dev/null +++ b/fs/nfs/pnfs_nfs.c @@ -0,0 +1,870 @@ +/* + * Common NFS I/O operations for the pnfs file based + * layout drivers. + * + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. + * + * Tom Haynes <loghyr@primarydata.com> + */ + +#include <linux/nfs_fs.h> +#include <linux/nfs_page.h> +#include <linux/sunrpc/addr.h> +#include <linux/module.h> + +#include "nfs4session.h" +#include "internal.h" +#include "pnfs.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS + +void pnfs_generic_rw_release(void *data) +{ + struct nfs_pgio_header *hdr = data; + + nfs_put_client(hdr->ds_clp); + hdr->mds_ops->rpc_release(data); +} +EXPORT_SYMBOL_GPL(pnfs_generic_rw_release); + +/* Fake up some data that will cause nfs_commit_release to retry the writes. */ +void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data) +{ + struct nfs_page *first = nfs_list_entry(data->pages.next); + + data->task.tk_status = 0; + memcpy(&data->verf.verifier, &first->wb_verf, + sizeof(data->verf.verifier)); + data->verf.verifier.data[0]++; /* ensure verifier mismatch */ +} +EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes); + +void pnfs_generic_write_commit_done(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *wdata = data; + + /* Note this may cause RPC to be resent */ + wdata->mds_ops->rpc_call_done(task, data); +} +EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done); + +void pnfs_generic_commit_release(void *calldata) +{ + struct nfs_commit_data *data = calldata; + + data->completion_ops->completion(data); + pnfs_put_lseg(data->lseg); + nfs_put_client(data->ds_clp); + nfs_commitdata_release(data); +} +EXPORT_SYMBOL_GPL(pnfs_generic_commit_release); + +/* The generic layer is about to remove the req from the commit list. + * If this will make the bucket empty, it will need to put the lseg reference. + * Note this must be called holding the inode (/cinfo) lock + */ +void +pnfs_generic_clear_request_commit(struct nfs_page *req, + struct nfs_commit_info *cinfo) +{ + struct pnfs_layout_segment *freeme = NULL; + + if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) + goto out; + cinfo->ds->nwritten--; + if (list_is_singular(&req->wb_list)) { + struct pnfs_commit_bucket *bucket; + + bucket = list_first_entry(&req->wb_list, + struct pnfs_commit_bucket, + written); + freeme = bucket->wlseg; + bucket->wlseg = NULL; + } +out: + nfs_request_remove_commit_list(req, cinfo); + pnfs_put_lseg_locked(freeme); +} +EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit); + +static int +pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, + struct nfs_commit_info *cinfo, int max) +{ + struct nfs_page *req, *tmp; + int ret = 0; + + list_for_each_entry_safe(req, tmp, src, wb_list) { + if (!nfs_lock_request(req)) + continue; + kref_get(&req->wb_kref); + if (cond_resched_lock(cinfo->lock)) + list_safe_reset_next(req, tmp, wb_list); + nfs_request_remove_commit_list(req, cinfo); + clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); + nfs_list_add_request(req, dst); + ret++; + if ((ret == max) && !cinfo->dreq) + break; + } + return ret; +} + +static int +pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, + struct nfs_commit_info *cinfo, + int max) +{ + struct list_head *src = &bucket->written; + struct list_head *dst = &bucket->committing; + int ret; + + lockdep_assert_held(cinfo->lock); + ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); + if (ret) { + cinfo->ds->nwritten -= ret; + cinfo->ds->ncommitting += ret; + bucket->clseg = bucket->wlseg; + if (list_empty(src)) + bucket->wlseg = NULL; + else + pnfs_get_lseg(bucket->clseg); + } + return ret; +} + +/* Move reqs from written to committing lists, returning count + * of number moved. + */ +int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, + int max) +{ + int i, rv = 0, cnt; + + lockdep_assert_held(cinfo->lock); + for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { + cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], + cinfo, max); + max -= cnt; + rv += cnt; + } + return rv; +} +EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists); + +/* Pull everything off the committing lists and dump into @dst. */ +void pnfs_generic_recover_commit_reqs(struct list_head *dst, + struct nfs_commit_info *cinfo) +{ + struct pnfs_commit_bucket *b; + struct pnfs_layout_segment *freeme; + int i; + + lockdep_assert_held(cinfo->lock); +restart: + for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { + if (pnfs_generic_transfer_commit_list(&b->written, dst, + cinfo, 0)) { + freeme = b->wlseg; + b->wlseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); + spin_lock(cinfo->lock); + goto restart; + } + } + cinfo->ds->nwritten = 0; +} +EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); + +static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) +{ + struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; + struct pnfs_commit_bucket *bucket; + struct pnfs_layout_segment *freeme; + int i; + + for (i = idx; i < fl_cinfo->nbuckets; i++) { + bucket = &fl_cinfo->buckets[i]; + if (list_empty(&bucket->committing)) + continue; + nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i); + spin_lock(cinfo->lock); + freeme = bucket->clseg; + bucket->clseg = NULL; + spin_unlock(cinfo->lock); + pnfs_put_lseg(freeme); + } +} + +static unsigned int +pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, + struct list_head *list) +{ + struct pnfs_ds_commit_info *fl_cinfo; + struct pnfs_commit_bucket *bucket; + struct nfs_commit_data *data; + int i; + unsigned int nreq = 0; + + fl_cinfo = cinfo->ds; + bucket = fl_cinfo->buckets; + for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { + if (list_empty(&bucket->committing)) + continue; + data = nfs_commitdata_alloc(); + if (!data) + break; + data->ds_commit_index = i; + spin_lock(cinfo->lock); + data->lseg = bucket->clseg; + bucket->clseg = NULL; + spin_unlock(cinfo->lock); + list_add(&data->pages, list); + nreq++; + } + + /* Clean up on error */ + pnfs_generic_retry_commit(cinfo, i); + return nreq; +} + +/* This follows nfs_commit_list pretty closely */ +int +pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, + int how, struct nfs_commit_info *cinfo, + int (*initiate_commit)(struct nfs_commit_data *data, + int how)) +{ + struct nfs_commit_data *data, *tmp; + LIST_HEAD(list); + unsigned int nreq = 0; + + if (!list_empty(mds_pages)) { + data = nfs_commitdata_alloc(); + if (data != NULL) { + data->lseg = NULL; + list_add(&data->pages, &list); + nreq++; + } else { + nfs_retry_commit(mds_pages, NULL, cinfo, 0); + pnfs_generic_retry_commit(cinfo, 0); + cinfo->completion_ops->error_cleanup(NFS_I(inode)); + return -ENOMEM; + } + } + + nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); + + if (nreq == 0) { + cinfo->completion_ops->error_cleanup(NFS_I(inode)); + goto out; + } + + atomic_add(nreq, &cinfo->mds->rpcs_out); + + list_for_each_entry_safe(data, tmp, &list, pages) { + list_del_init(&data->pages); + if (!data->lseg) { + nfs_init_commit(data, mds_pages, NULL, cinfo); + nfs_initiate_commit(NFS_CLIENT(inode), data, + NFS_PROTO(data->inode), + data->mds_ops, how, 0); + } else { + struct pnfs_commit_bucket *buckets; + + buckets = cinfo->ds->buckets; + nfs_init_commit(data, + &buckets[data->ds_commit_index].committing, + data->lseg, + cinfo); + initiate_commit(data, how); + } + } +out: + cinfo->ds->ncommitting = 0; + return PNFS_ATTEMPTED; +} +EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist); + +/* + * Data server cache + * + * Data servers can be mapped to different device ids. + * nfs4_pnfs_ds reference counting + * - set to 1 on allocation + * - incremented when a device id maps a data server already in the cache. + * - decremented when deviceid is removed from the cache. + */ +static DEFINE_SPINLOCK(nfs4_ds_cache_lock); +static LIST_HEAD(nfs4_data_server_cache); + +/* Debug routines */ +static void +print_ds(struct nfs4_pnfs_ds *ds) +{ + if (ds == NULL) { + printk(KERN_WARNING "%s NULL device\n", __func__); + return; + } + printk(KERN_WARNING " ds %s\n" + " ref count %d\n" + " client %p\n" + " cl_exchange_flags %x\n", + ds->ds_remotestr, + atomic_read(&ds->ds_count), ds->ds_clp, + ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); +} + +static bool +same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) +{ + struct sockaddr_in *a, *b; + struct sockaddr_in6 *a6, *b6; + + if (addr1->sa_family != addr2->sa_family) + return false; + + switch (addr1->sa_family) { + case AF_INET: + a = (struct sockaddr_in *)addr1; + b = (struct sockaddr_in *)addr2; + + if (a->sin_addr.s_addr == b->sin_addr.s_addr && + a->sin_port == b->sin_port) + return true; + break; + + case AF_INET6: + a6 = (struct sockaddr_in6 *)addr1; + b6 = (struct sockaddr_in6 *)addr2; + + /* LINKLOCAL addresses must have matching scope_id */ + if (ipv6_addr_src_scope(&a6->sin6_addr) == + IPV6_ADDR_SCOPE_LINKLOCAL && + a6->sin6_scope_id != b6->sin6_scope_id) + return false; + + if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) && + a6->sin6_port == b6->sin6_port) + return true; + break; + + default: + dprintk("%s: unhandled address family: %u\n", + __func__, addr1->sa_family); + return false; + } + + return false; +} + +static bool +_same_data_server_addrs_locked(const struct list_head *dsaddrs1, + const struct list_head *dsaddrs2) +{ + struct nfs4_pnfs_ds_addr *da1, *da2; + + /* step through both lists, comparing as we go */ + for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node), + da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node); + da1 != NULL && da2 != NULL; + da1 = list_entry(da1->da_node.next, typeof(*da1), da_node), + da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) { + if (!same_sockaddr((struct sockaddr *)&da1->da_addr, + (struct sockaddr *)&da2->da_addr)) + return false; + } + if (da1 == NULL && da2 == NULL) + return true; + + return false; +} + +/* + * Lookup DS by addresses. nfs4_ds_cache_lock is held + */ +static struct nfs4_pnfs_ds * +_data_server_lookup_locked(const struct list_head *dsaddrs) +{ + struct nfs4_pnfs_ds *ds; + + list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) + if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) + return ds; + return NULL; +} + +static void destroy_ds(struct nfs4_pnfs_ds *ds) +{ + struct nfs4_pnfs_ds_addr *da; + + dprintk("--> %s\n", __func__); + ifdebug(FACILITY) + print_ds(ds); + + nfs_put_client(ds->ds_clp); + + while (!list_empty(&ds->ds_addrs)) { + da = list_first_entry(&ds->ds_addrs, + struct nfs4_pnfs_ds_addr, + da_node); + list_del_init(&da->da_node); + kfree(da->da_remotestr); + kfree(da); + } + + kfree(ds->ds_remotestr); + kfree(ds); +} + +void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds) +{ + if (atomic_dec_and_lock(&ds->ds_count, + &nfs4_ds_cache_lock)) { + list_del_init(&ds->ds_node); + spin_unlock(&nfs4_ds_cache_lock); + destroy_ds(ds); + } +} +EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put); + +/* + * Create a string with a human readable address and port to avoid + * complicated setup around many dprinks. + */ +static char * +nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da; + char *remotestr; + size_t len; + char *p; + + len = 3; /* '{', '}' and eol */ + list_for_each_entry(da, dsaddrs, da_node) { + len += strlen(da->da_remotestr) + 1; /* string plus comma */ + } + + remotestr = kzalloc(len, gfp_flags); + if (!remotestr) + return NULL; + + p = remotestr; + *(p++) = '{'; + len--; + list_for_each_entry(da, dsaddrs, da_node) { + size_t ll = strlen(da->da_remotestr); + + if (ll > len) + goto out_err; + + memcpy(p, da->da_remotestr, ll); + p += ll; + len -= ll; + + if (len < 1) + goto out_err; + (*p++) = ','; + len--; + } + if (len < 2) + goto out_err; + *(p++) = '}'; + *p = '\0'; + return remotestr; +out_err: + kfree(remotestr); + return NULL; +} + +/* + * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to ds cache if + * uncached and return cached struct nfs4_pnfs_ds. + */ +struct nfs4_pnfs_ds * +nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; + char *remotestr; + + if (list_empty(dsaddrs)) { + dprintk("%s: no addresses defined\n", __func__); + goto out; + } + + ds = kzalloc(sizeof(*ds), gfp_flags); + if (!ds) + goto out; + + /* this is only used for debugging, so it's ok if its NULL */ + remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); + + spin_lock(&nfs4_ds_cache_lock); + tmp_ds = _data_server_lookup_locked(dsaddrs); + if (tmp_ds == NULL) { + INIT_LIST_HEAD(&ds->ds_addrs); + list_splice_init(dsaddrs, &ds->ds_addrs); + ds->ds_remotestr = remotestr; + atomic_set(&ds->ds_count, 1); + INIT_LIST_HEAD(&ds->ds_node); + ds->ds_clp = NULL; + list_add(&ds->ds_node, &nfs4_data_server_cache); + dprintk("%s add new data server %s\n", __func__, + ds->ds_remotestr); + } else { + kfree(remotestr); + kfree(ds); + atomic_inc(&tmp_ds->ds_count); + dprintk("%s data server %s found, inc'ed ds_count to %d\n", + __func__, tmp_ds->ds_remotestr, + atomic_read(&tmp_ds->ds_count)); + ds = tmp_ds; + } + spin_unlock(&nfs4_ds_cache_lock); +out: + return ds; +} +EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add); + +static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +{ + might_sleep(); + wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, + TASK_KILLABLE); +} + +static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) +{ + smp_mb__before_atomic(); + clear_bit(NFS4DS_CONNECTING, &ds->ds_state); + smp_mb__after_atomic(); + wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); +} + +static struct nfs_client *(*get_v3_ds_connect)( + struct nfs_client *mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, + int ds_proto, + unsigned int ds_timeo, + unsigned int ds_retrans, + rpc_authflavor_t au_flavor); + +static bool load_v3_ds_connect(void) +{ + if (!get_v3_ds_connect) { + get_v3_ds_connect = symbol_request(nfs3_set_ds_client); + WARN_ON_ONCE(!get_v3_ds_connect); + } + + return(get_v3_ds_connect != NULL); +} + +void __exit nfs4_pnfs_v3_ds_connect_unload(void) +{ + if (get_v3_ds_connect) { + symbol_put(nfs3_set_ds_client); + get_v3_ds_connect = NULL; + } +} +EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload); + +static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, + struct nfs4_pnfs_ds *ds, + unsigned int timeo, + unsigned int retrans, + rpc_authflavor_t au_flavor) +{ + struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs4_pnfs_ds_addr *da; + int status = 0; + + dprintk("--> %s DS %s au_flavor %d\n", __func__, + ds->ds_remotestr, au_flavor); + + if (!load_v3_ds_connect()) + goto out; + + list_for_each_entry(da, &ds->ds_addrs, da_node) { + dprintk("%s: DS %s: trying address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + + clp = get_v3_ds_connect(mds_srv->nfs_client, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + timeo, retrans, au_flavor); + if (!IS_ERR(clp)) + break; + } + + if (IS_ERR(clp)) { + status = PTR_ERR(clp); + goto out; + } + + smp_wmb(); + ds->ds_clp = clp; + dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); +out: + return status; +} + +static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, + struct nfs4_pnfs_ds *ds, + unsigned int timeo, + unsigned int retrans, + u32 minor_version, + rpc_authflavor_t au_flavor) +{ + struct nfs_client *clp = ERR_PTR(-EIO); + struct nfs4_pnfs_ds_addr *da; + int status = 0; + + dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, + au_flavor); + + list_for_each_entry(da, &ds->ds_addrs, da_node) { + dprintk("%s: DS %s: trying address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + + clp = nfs4_set_ds_client(mds_srv->nfs_client, + (struct sockaddr *)&da->da_addr, + da->da_addrlen, IPPROTO_TCP, + timeo, retrans, minor_version, + au_flavor); + if (!IS_ERR(clp)) + break; + } + + if (IS_ERR(clp)) { + status = PTR_ERR(clp); + goto out; + } + + status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time); + if (status) + goto out_put; + + smp_wmb(); + ds->ds_clp = clp; + dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr); +out: + return status; +out_put: + nfs_put_client(clp); + goto out; +} + +/* + * Create an rpc connection to the nfs4_pnfs_ds data server. + * Currently only supports IPv4 and IPv6 addresses. + * If connection fails, make devid unavailable. + */ +void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, + struct nfs4_deviceid_node *devid, unsigned int timeo, + unsigned int retrans, u32 version, + u32 minor_version, rpc_authflavor_t au_flavor) +{ + if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { + int err = 0; + + if (version == 3) { + err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, + retrans, au_flavor); + } else if (version == 4) { + err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, + retrans, minor_version, + au_flavor); + } else { + dprintk("%s: unsupported DS version %d\n", __func__, + version); + err = -EPROTONOSUPPORT; + } + + if (err) + nfs4_mark_deviceid_unavailable(devid); + nfs4_clear_ds_conn_bit(ds); + } else { + nfs4_wait_ds_connect(ds); + } +} +EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect); + +/* + * Currently only supports ipv4, ipv6 and one multi-path address. + */ +struct nfs4_pnfs_ds_addr * +nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags) +{ + struct nfs4_pnfs_ds_addr *da = NULL; + char *buf, *portstr; + __be16 port; + int nlen, rlen; + int tmp[2]; + __be32 *p; + char *netid, *match_netid; + size_t len, match_netid_len; + char *startsep = ""; + char *endsep = ""; + + + /* r_netid */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_err; + nlen = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, nlen); + if (unlikely(!p)) + goto out_err; + + netid = kmalloc(nlen+1, gfp_flags); + if (unlikely(!netid)) + goto out_err; + + netid[nlen] = '\0'; + memcpy(netid, p, nlen); + + /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_free_netid; + rlen = be32_to_cpup(p); + + p = xdr_inline_decode(xdr, rlen); + if (unlikely(!p)) + goto out_free_netid; + + /* port is ".ABC.DEF", 8 chars max */ + if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) { + dprintk("%s: Invalid address, length %d\n", __func__, + rlen); + goto out_free_netid; + } + buf = kmalloc(rlen + 1, gfp_flags); + if (!buf) { + dprintk("%s: Not enough memory\n", __func__); + goto out_free_netid; + } + buf[rlen] = '\0'; + memcpy(buf, p, rlen); + + /* replace port '.' with '-' */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot in port\n", + __func__); + goto out_free_buf; + } + *portstr = '-'; + + /* find '.' between address and port */ + portstr = strrchr(buf, '.'); + if (!portstr) { + dprintk("%s: Failed finding expected dot between address and " + "port\n", __func__); + goto out_free_buf; + } + *portstr = '\0'; + + da = kzalloc(sizeof(*da), gfp_flags); + if (unlikely(!da)) + goto out_free_buf; + + INIT_LIST_HEAD(&da->da_node); + + if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr, + sizeof(da->da_addr))) { + dprintk("%s: error parsing address %s\n", __func__, buf); + goto out_free_da; + } + + portstr++; + sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]); + port = htons((tmp[0] << 8) | (tmp[1])); + + switch (da->da_addr.ss_family) { + case AF_INET: + ((struct sockaddr_in *)&da->da_addr)->sin_port = port; + da->da_addrlen = sizeof(struct sockaddr_in); + match_netid = "tcp"; + match_netid_len = 3; + break; + + case AF_INET6: + ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port; + da->da_addrlen = sizeof(struct sockaddr_in6); + match_netid = "tcp6"; + match_netid_len = 4; + startsep = "["; + endsep = "]"; + break; + + default: + dprintk("%s: unsupported address family: %u\n", + __func__, da->da_addr.ss_family); + goto out_free_da; + } + + if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) { + dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n", + __func__, netid, match_netid); + goto out_free_da; + } + + /* save human readable address */ + len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7; + da->da_remotestr = kzalloc(len, gfp_flags); + + /* NULL is ok, only used for dprintk */ + if (da->da_remotestr) + snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep, + buf, endsep, ntohs(port)); + + dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr); + kfree(buf); + kfree(netid); + return da; + +out_free_da: + kfree(da); +out_free_buf: + dprintk("%s: Error parsing DS addr: %s\n", __func__, buf); + kfree(buf); +out_free_netid: + kfree(netid); +out_err: + return NULL; +} +EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr); + +void +pnfs_layout_mark_request_commit(struct nfs_page *req, + struct pnfs_layout_segment *lseg, + struct nfs_commit_info *cinfo, + u32 ds_commit_idx) +{ + struct list_head *list; + struct pnfs_commit_bucket *buckets; + + spin_lock(cinfo->lock); + buckets = cinfo->ds->buckets; + list = &buckets[ds_commit_idx].written; + if (list_empty(list)) { + /* Non-empty buckets hold a reference on the lseg. That ref + * is normally transferred to the COMMIT call and released + * there. It could also be released if the last req is pulled + * off due to a rewrite, in which case it will be done in + * pnfs_common_clear_request_commit + */ + WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL); + buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg); + } + set_bit(PG_COMMIT_TO_DS, &req->wb_flags); + cinfo->ds->nwritten++; + spin_unlock(cinfo->lock); + + nfs_request_add_commit_list(req, list, cinfo); +} +EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b09cc23d6f43..c63189acd052 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -139,7 +139,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); if (status == 0) - nfs_setattr_update_inode(inode, sattr); + nfs_setattr_update_inode(inode, sattr, fattr); dprintk("NFS reply setattr: %d\n", status); return status; } @@ -609,10 +609,8 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { - struct inode *inode = hdr->inode; - if (task->tk_status >= 0) - nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr); + nfs_writeback_update_inode(hdr); return 0; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index beff2769c5c5..568ecf0a880f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -70,8 +70,15 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_read); void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { + struct nfs_pgio_mirror *mirror; + pgio->pg_ops = &nfs_pgio_rw_ops; - pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; + + /* read path should never have more than one mirror */ + WARN_ON_ONCE(pgio->pg_mirror_count != 1); + + mirror = &pgio->pg_mirrors[0]; + mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); @@ -81,6 +88,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct nfs_page *new; unsigned int len; struct nfs_pageio_descriptor pgio; + struct nfs_pgio_mirror *pgm; len = nfs_page_length(page); if (len == 0) @@ -97,7 +105,13 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, &nfs_async_read_completion_ops); nfs_pageio_add_request(&pgio, new); nfs_pageio_complete(&pgio); - NFS_I(inode)->read_io += pgio.pg_bytes_written; + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio.pg_mirror_count != 1); + + pgm = &pgio.pg_mirrors[0]; + NFS_I(inode)->read_io += pgm->pg_bytes_written; + return 0; } @@ -168,13 +182,14 @@ out: static void nfs_initiate_read(struct nfs_pgio_header *hdr, struct rpc_message *msg, + const struct nfs_rpc_ops *rpc_ops, struct rpc_task_setup *task_setup_data, int how) { struct inode *inode = hdr->inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; task_setup_data->flags |= swap_flags; - NFS_PROTO(inode)->read_setup(hdr, msg); + rpc_ops->read_setup(hdr, msg); } static void @@ -269,7 +284,7 @@ int nfs_readpage(struct file *file, struct page *page) dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_CACHE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_add_stats(inode, NFSIOS_READPAGES, 1); + nfs_inc_stats(inode, NFSIOS_READPAGES); /* * Try to flush any pending writes to the file.. @@ -351,6 +366,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { struct nfs_pageio_descriptor pgio; + struct nfs_pgio_mirror *pgm; struct nfs_readdesc desc = { .pgio = &pgio, }; @@ -386,10 +402,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete(&pgio); - NFS_I(inode)->read_io += pgio.pg_bytes_written; - npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio.pg_mirror_count != 1); + + pgm = &pgio.pg_mirrors[0]; + NFS_I(inode)->read_io += pgm->pg_bytes_written; + npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: put_nfs_open_context(desc.ctx); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 31a11b0e885d..322b2de02988 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -311,7 +311,6 @@ const struct super_operations nfs_sops = { .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, .drop_inode = nfs_drop_inode, - .put_super = nfs_put_super, .statfs = nfs_statfs, .evict_inode = nfs_evict_inode, .umount_begin = nfs_umount_begin, @@ -405,12 +404,15 @@ void __exit unregister_nfs_fs(void) unregister_filesystem(&nfs_fs_type); } -void nfs_sb_active(struct super_block *sb) +bool nfs_sb_active(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); - if (atomic_inc_return(&server->active) == 1) - atomic_inc(&sb->s_active); + if (!atomic_inc_not_zero(&sb->s_active)) + return false; + if (atomic_inc_return(&server->active) != 1) + atomic_dec(&sb->s_active); + return true; } EXPORT_SYMBOL_GPL(nfs_sb_active); @@ -2569,7 +2571,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, error = nfs_bdi_register(server); if (error) { mntroot = ERR_PTR(error); - goto error_splat_bdi; + goto error_splat_super; } server->super = s; } @@ -2601,9 +2603,6 @@ error_splat_root: dput(mntroot); mntroot = ERR_PTR(error); error_splat_super: - if (server && !s->s_root) - bdi_unregister(&server->backing_dev_info); -error_splat_bdi: deactivate_locked_super(s); goto out; } @@ -2651,27 +2650,19 @@ out: EXPORT_SYMBOL_GPL(nfs_fs_mount); /* - * Ensure that we unregister the bdi before kill_anon_super - * releases the device name - */ -void nfs_put_super(struct super_block *s) -{ - struct nfs_server *server = NFS_SB(s); - - bdi_unregister(&server->backing_dev_info); -} -EXPORT_SYMBOL_GPL(nfs_put_super); - -/* * Destroy an NFS2/3 superblock */ void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); + dev_t dev = s->s_dev; + + generic_shutdown_super(s); - kill_anon_super(s); nfs_fscache_release_super_cookie(s); + nfs_free_server(server); + free_anon_bdev(dev); } EXPORT_SYMBOL_GPL(nfs_kill_super); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f83b02dc9166..849ed784d6ac 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -473,13 +473,18 @@ try_again: do { /* * Subrequests are always contiguous, non overlapping - * and in order. If not, it's a programming error. + * and in order - but may be repeated (mirrored writes). */ - WARN_ON_ONCE(subreq->wb_offset != - (head->wb_offset + total_bytes)); - - /* keep track of how many bytes this group covers */ - total_bytes += subreq->wb_bytes; + if (subreq->wb_offset == (head->wb_offset + total_bytes)) { + /* keep track of how many bytes this group covers */ + total_bytes += subreq->wb_bytes; + } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset || + ((subreq->wb_offset + subreq->wb_bytes) > + (head->wb_offset + total_bytes)))) { + nfs_page_group_unlock(head); + spin_unlock(&inode->i_lock); + return ERR_PTR(-EIO); + } if (!nfs_lock_request(subreq)) { /* releases page group bit lock and @@ -575,7 +580,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); + nfs_inc_stats(inode, NFSIOS_WRITEPAGES); nfs_pageio_cond_complete(pgio, page_file_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); @@ -670,7 +675,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) nfs_lock_request(req); spin_lock(&inode->i_lock); - if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) + if (!nfsi->nrequests && + NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) inode->i_version++; /* * Swap-space should not get truncated. Hence no need to plug the race @@ -681,9 +687,11 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); } - nfsi->npages++; + nfsi->nrequests++; /* this a head request for a page group - mark it as having an - * extra reference so sub groups can follow suit */ + * extra reference so sub groups can follow suit. + * This flag also informs pgio layer when to bump nrequests when + * adding subrequests. */ WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); spin_unlock(&inode->i_lock); @@ -709,7 +717,11 @@ static void nfs_inode_remove_request(struct nfs_page *req) wake_up_page(head->wb_page, PG_private); clear_bit(PG_MAPPED, &head->wb_flags); } - nfsi->npages--; + nfsi->nrequests--; + spin_unlock(&inode->i_lock); + } else { + spin_lock(&inode->i_lock); + nfsi->nrequests--; spin_unlock(&inode->i_lock); } @@ -777,13 +789,8 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, nfs_list_add_request(req, dst); cinfo->mds->ncommit++; spin_unlock(cinfo->lock); - if (!cinfo->dreq) { - inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, - BDI_RECLAIMABLE); - __mark_inode_dirty(req->wb_context->dentry->d_inode, - I_DIRTY_DATASYNC); - } + if (!cinfo->dreq) + nfs_mark_page_unstable(req->wb_page); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); @@ -835,9 +842,9 @@ EXPORT_SYMBOL_GPL(nfs_init_cinfo); */ void nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo) + struct nfs_commit_info *cinfo, u32 ds_commit_idx) { - if (pnfs_mark_request_commit(req, lseg, cinfo)) + if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx)) return; nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); } @@ -846,7 +853,7 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); } /* Called holding inode (/cinfo) lock */ @@ -893,7 +900,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) } if (nfs_write_need_commit(hdr)) { memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); - nfs_mark_request_commit(req, hdr->lseg, &cinfo); + nfs_mark_request_commit(req, hdr->lseg, &cinfo, + hdr->pgio_mirror_idx); goto next; } remove_req: @@ -1084,6 +1092,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_lock_context *l_ctx; + struct file_lock_context *flctx = file_inode(file)->i_flctx; struct nfs_page *req; int do_flush, status; /* @@ -1102,7 +1111,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page) do_flush = req->wb_page != page || req->wb_context != ctx; /* for now, flush if more than 1 request in page_group */ do_flush |= req->wb_this_page != req; - if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { + if (l_ctx && flctx && + !(list_empty_careful(&flctx->flc_posix) && + list_empty_careful(&flctx->flc_flock))) { do_flush |= l_ctx->lockowner.l_owner != current->files || l_ctx->lockowner.l_pid != current->tgid; } @@ -1163,6 +1174,13 @@ out: return PageUptodate(page) != 0; } +static bool +is_whole_file_wrlock(struct file_lock *fl) +{ + return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX && + fl->fl_type == F_WRLCK; +} + /* If we know the page is up to date, and we're not using byte range locks (or * if we have the whole file locked for writing), it may be more efficient to * extend the write to cover the entire page in order to avoid fragmentation @@ -1173,17 +1191,36 @@ out: */ static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) { + int ret; + struct file_lock_context *flctx = inode->i_flctx; + struct file_lock *fl; + if (file->f_flags & O_DSYNC) return 0; if (!nfs_write_pageuptodate(page, inode)) return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; - if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 && - inode->i_flock->fl_end == OFFSET_MAX && - inode->i_flock->fl_type != F_RDLCK)) - return 1; - return 0; + if (!flctx || (list_empty_careful(&flctx->flc_flock) && + list_empty_careful(&flctx->flc_posix))) + return 0; + + /* Check to see if there are whole file write locks */ + ret = 0; + spin_lock(&flctx->flc_lock); + if (!list_empty(&flctx->flc_posix)) { + fl = list_first_entry(&flctx->flc_posix, struct file_lock, + fl_list); + if (is_whole_file_wrlock(fl)) + ret = 1; + } else if (!list_empty(&flctx->flc_flock)) { + fl = list_first_entry(&flctx->flc_flock, struct file_lock, + fl_list); + if (fl->fl_type == F_WRLCK) + ret = 1; + } + spin_unlock(&flctx->flc_lock); + return ret; } /* @@ -1233,15 +1270,15 @@ static int flush_task_priority(int how) static void nfs_initiate_write(struct nfs_pgio_header *hdr, struct rpc_message *msg, + const struct nfs_rpc_ops *rpc_ops, struct rpc_task_setup *task_setup_data, int how) { - struct inode *inode = hdr->inode; int priority = flush_task_priority(how); task_setup_data->priority = priority; - NFS_PROTO(inode)->write_setup(hdr, msg); + rpc_ops->write_setup(hdr, msg); - nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, + nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client, &task_setup_data->rpc_client, msg, hdr); } @@ -1291,8 +1328,14 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_write); void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) { + struct nfs_pgio_mirror *mirror; + pgio->pg_ops = &nfs_pgio_rw_ops; - pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; + + nfs_pageio_stop_mirroring(pgio); + + mirror = &pgio->pg_mirrors[0]; + mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; } EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); @@ -1334,6 +1377,36 @@ static int nfs_should_remove_suid(const struct inode *inode) return 0; } +static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, + struct nfs_fattr *fattr) +{ + struct nfs_pgio_args *argp = &hdr->args; + struct nfs_pgio_res *resp = &hdr->res; + + if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + return; + if (argp->offset + resp->count != fattr->size) + return; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + return; + /* Set attribute barrier */ + nfs_fattr_set_barrier(fattr); +} + +void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) +{ + struct nfs_fattr *fattr = hdr->res.fattr; + struct inode *inode = hdr->inode; + + if (fattr == NULL) + return; + spin_lock(&inode->i_lock); + nfs_writeback_check_extend(hdr, fattr); + nfs_post_op_update_inode_force_wcc_locked(inode, fattr); + spin_unlock(&inode->i_lock); +} +EXPORT_SYMBOL_GPL(nfs_writeback_update_inode); + /* * This function is called when the WRITE call is complete. */ @@ -1458,6 +1531,7 @@ void nfs_commitdata_release(struct nfs_commit_data *data) EXPORT_SYMBOL_GPL(nfs_commitdata_release); int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, + const struct nfs_rpc_ops *nfs_ops, const struct rpc_call_ops *call_ops, int how, int flags) { @@ -1479,7 +1553,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .priority = priority, }; /* Set up the initial task struct. */ - NFS_PROTO(data->inode)->commit_setup(data, &msg); + nfs_ops->commit_setup(data, &msg); dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); @@ -1547,19 +1621,17 @@ EXPORT_SYMBOL_GPL(nfs_init_commit); void nfs_retry_commit(struct list_head *page_list, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo) + struct nfs_commit_info *cinfo, + u32 ds_commit_idx) { struct nfs_page *req; while (!list_empty(page_list)) { req = nfs_list_entry(page_list->next); nfs_list_remove_request(req); - nfs_mark_request_commit(req, lseg, cinfo); - if (!cinfo->dreq) { - dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, - BDI_RECLAIMABLE); - } + nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx); + if (!cinfo->dreq) + nfs_clear_page_commit(req->wb_page); nfs_unlock_and_release_request(req); } } @@ -1582,10 +1654,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); - return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, - how, 0); + return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), + data->mds_ops, how, 0); out_bad: - nfs_retry_commit(head, NULL, cinfo); + nfs_retry_commit(head, NULL, cinfo, 0); cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } @@ -1735,7 +1807,7 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. */ - if (nfsi->commit_info.ncommit <= (nfsi->npages >> 1)) + if (nfsi->commit_info.ncommit <= (nfsi->nrequests >> 1)) goto out_mark_dirty; /* don't wait for the COMMIT response */ diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 73395156bdb4..683bf718aead 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -82,6 +82,16 @@ config NFSD_V4 If unsure, say N. +config NFSD_PNFS + bool "NFSv4.1 server support for Parallel NFS (pNFS)" + depends on NFSD_V4 + help + This option enables support for the parallel NFS features of the + minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS + server. + + If unsure, say N. + config NFSD_V4_SECURITY_LABEL bool "Provide Security Label support for NFSv4 server" depends on NFSD_V4 && SECURITY diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index af32ef06b4fe..9a6028e120c6 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -2,9 +2,14 @@ # Makefile for the Linux nfs server # +ccflags-y += -I$(src) # needed for trace events + obj-$(CONFIG_NFSD) += nfsd.o -nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ +# this one should be compiled first, as the tracing macros can easily blow up +nfsd-y += trace.o + +nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o @@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ nfs4acl.o nfs4callback.o nfs4recover.o +nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c new file mode 100644 index 000000000000..03d647bf195d --- /dev/null +++ b/fs/nfsd/blocklayout.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include <linux/exportfs.h> +#include <linux/genhd.h> +#include <linux/slab.h> + +#include <linux/nfsd/debug.h> + +#include "blocklayoutxdr.h" +#include "pnfs.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + + +static int +nfsd4_block_get_device_info_simple(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev; + struct pnfs_block_volume *b; + + dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + + sizeof(struct pnfs_block_volume), GFP_KERNEL); + if (!dev) + return -ENOMEM; + gdp->gd_device = dev; + + dev->nr_volumes = 1; + b = &dev->volumes[0]; + + b->type = PNFS_BLOCK_VOLUME_SIMPLE; + b->simple.sig_len = PNFS_BLOCK_UUID_LEN; + return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len, + &b->simple.offset); +} + +static __be32 +nfsd4_block_proc_getdeviceinfo(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdp) +{ + if (sb->s_bdev != sb->s_bdev->bd_contains) + return nfserr_inval; + return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp)); +} + +static __be32 +nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, + struct nfsd4_layoutget *args) +{ + struct nfsd4_layout_seg *seg = &args->lg_seg; + struct super_block *sb = inode->i_sb; + u32 block_size = (1 << inode->i_blkbits); + struct pnfs_block_extent *bex; + struct iomap iomap; + u32 device_generation = 0; + int error; + + /* + * We do not attempt to support I/O smaller than the fs block size, + * or not aligned to it. + */ + if (args->lg_minlength < block_size) { + dprintk("pnfsd: I/O too small\n"); + goto out_layoutunavailable; + } + if (seg->offset & (block_size - 1)) { + dprintk("pnfsd: I/O misaligned\n"); + goto out_layoutunavailable; + } + + /* + * Some clients barf on non-zero block numbers for NONE or INVALID + * layouts, so make sure to zero the whole structure. + */ + error = -ENOMEM; + bex = kzalloc(sizeof(*bex), GFP_KERNEL); + if (!bex) + goto out_error; + args->lg_content = bex; + + error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length, + &iomap, seg->iomode != IOMODE_READ, + &device_generation); + if (error) { + if (error == -ENXIO) + goto out_layoutunavailable; + goto out_error; + } + + if (iomap.length < args->lg_minlength) { + dprintk("pnfsd: extent smaller than minlength\n"); + goto out_layoutunavailable; + } + + switch (iomap.type) { + case IOMAP_MAPPED: + if (seg->iomode == IOMODE_READ) + bex->es = PNFS_BLOCK_READ_DATA; + else + bex->es = PNFS_BLOCK_READWRITE_DATA; + bex->soff = (iomap.blkno << 9); + break; + case IOMAP_UNWRITTEN: + if (seg->iomode & IOMODE_RW) { + /* + * Crack monkey special case from section 2.3.1. + */ + if (args->lg_minlength == 0) { + dprintk("pnfsd: no soup for you!\n"); + goto out_layoutunavailable; + } + + bex->es = PNFS_BLOCK_INVALID_DATA; + bex->soff = (iomap.blkno << 9); + break; + } + /*FALLTHRU*/ + case IOMAP_HOLE: + if (seg->iomode == IOMODE_READ) { + bex->es = PNFS_BLOCK_NONE_DATA; + break; + } + /*FALLTHRU*/ + case IOMAP_DELALLOC: + default: + WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type); + goto out_layoutunavailable; + } + + error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation); + if (error) + goto out_error; + bex->foff = iomap.offset; + bex->len = iomap.length; + + seg->offset = iomap.offset; + seg->length = iomap.length; + + dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es); + return 0; + +out_error: + seg->length = 0; + return nfserrno(error); +out_layoutunavailable: + seg->length = 0; + return nfserr_layoutunavailable; +} + +static __be32 +nfsd4_block_proc_layoutcommit(struct inode *inode, + struct nfsd4_layoutcommit *lcp) +{ + loff_t new_size = lcp->lc_last_wr + 1; + struct iattr iattr = { .ia_valid = 0 }; + struct iomap *iomaps; + int nr_iomaps; + int error; + + nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, + lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits); + if (nr_iomaps < 0) + return nfserrno(nr_iomaps); + + if (lcp->lc_mtime.tv_nsec == UTIME_NOW || + timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) + lcp->lc_mtime = current_fs_time(inode->i_sb); + iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; + iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; + + if (new_size > i_size_read(inode)) { + iattr.ia_valid |= ATTR_SIZE; + iattr.ia_size = new_size; + } + + error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps, + nr_iomaps, &iattr); + kfree(iomaps); + return nfserrno(error); +} + +const struct nfsd4_layout_ops bl_layout_ops = { + .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo, + .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, + .proc_layoutget = nfsd4_block_proc_layoutget, + .encode_layoutget = nfsd4_block_encode_layoutget, + .proc_layoutcommit = nfsd4_block_proc_layoutcommit, +}; diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c new file mode 100644 index 000000000000..9aa2796da90d --- /dev/null +++ b/fs/nfsd/blocklayoutxdr.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include <linux/sunrpc/svc.h> +#include <linux/exportfs.h> +#include <linux/nfs4.h> + +#include "nfsd.h" +#include "blocklayoutxdr.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + + +__be32 +nfsd4_block_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp) +{ + struct pnfs_block_extent *b = lgp->lg_content; + int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32); + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(__be32) + len); + if (!p) + return nfserr_toosmall; + + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(1); /* we always return a single extent */ + + p = xdr_encode_opaque_fixed(p, &b->vol_id, + sizeof(struct nfsd4_deviceid)); + p = xdr_encode_hyper(p, b->foff); + p = xdr_encode_hyper(p, b->len); + p = xdr_encode_hyper(p, b->soff); + *p++ = cpu_to_be32(b->es); + return 0; +} + +static int +nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) +{ + __be32 *p; + int len; + + switch (b->type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + len = 4 + 4 + 8 + 4 + b->simple.sig_len; + p = xdr_reserve_space(xdr, len); + if (!p) + return -ETOOSMALL; + + *p++ = cpu_to_be32(b->type); + *p++ = cpu_to_be32(1); /* single signature */ + p = xdr_encode_hyper(p, b->simple.offset); + p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len); + break; + default: + return -ENOTSUPP; + } + + return len; +} + +__be32 +nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp) +{ + struct pnfs_block_deviceaddr *dev = gdp->gd_device; + int len = sizeof(__be32), ret, i; + __be32 *p; + + p = xdr_reserve_space(xdr, len + sizeof(__be32)); + if (!p) + return nfserr_resource; + + for (i = 0; i < dev->nr_volumes; i++) { + ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]); + if (ret < 0) + return nfserrno(ret); + len += ret; + } + + /* + * Fill in the overall length and number of volumes at the beginning + * of the layout. + */ + *p++ = cpu_to_be32(len); + *p++ = cpu_to_be32(dev->nr_volumes); + return 0; +} + +int +nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size) +{ + struct iomap *iomaps; + u32 nr_iomaps, expected, i; + + if (len < sizeof(u32)) { + dprintk("%s: extent array too small: %u\n", __func__, len); + return -EINVAL; + } + + nr_iomaps = be32_to_cpup(p++); + expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE; + if (len != expected) { + dprintk("%s: extent array size mismatch: %u/%u\n", + __func__, len, expected); + return -EINVAL; + } + + iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); + if (!iomaps) { + dprintk("%s: failed to allocate extent array\n", __func__); + return -ENOMEM; + } + + for (i = 0; i < nr_iomaps; i++) { + struct pnfs_block_extent bex; + + memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); + p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); + + p = xdr_decode_hyper(p, &bex.foff); + if (bex.foff & (block_size - 1)) { + dprintk("%s: unaligned offset 0x%llx\n", + __func__, bex.foff); + goto fail; + } + p = xdr_decode_hyper(p, &bex.len); + if (bex.len & (block_size - 1)) { + dprintk("%s: unaligned length 0x%llx\n", + __func__, bex.foff); + goto fail; + } + p = xdr_decode_hyper(p, &bex.soff); + if (bex.soff & (block_size - 1)) { + dprintk("%s: unaligned disk offset 0x%llx\n", + __func__, bex.soff); + goto fail; + } + bex.es = be32_to_cpup(p++); + if (bex.es != PNFS_BLOCK_READWRITE_DATA) { + dprintk("%s: incorrect extent state %d\n", + __func__, bex.es); + goto fail; + } + + iomaps[i].offset = bex.foff; + iomaps[i].length = bex.len; + } + + *iomapp = iomaps; + return nr_iomaps; +fail: + kfree(iomaps); + return -EINVAL; +} diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h new file mode 100644 index 000000000000..fdc79037c0e7 --- /dev/null +++ b/fs/nfsd/blocklayoutxdr.h @@ -0,0 +1,62 @@ +#ifndef _NFSD_BLOCKLAYOUTXDR_H +#define _NFSD_BLOCKLAYOUTXDR_H 1 + +#include <linux/blkdev.h> +#include "xdr4.h" + +struct iomap; +struct xdr_stream; + +enum pnfs_block_extent_state { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, + PNFS_BLOCK_NONE_DATA = 3, +}; + +struct pnfs_block_extent { + struct nfsd4_deviceid vol_id; + u64 foff; + u64 len; + u64 soff; + enum pnfs_block_extent_state es; +}; +#define NFS4_BLOCK_EXTENT_SIZE 44 + +enum pnfs_block_volume_type { + PNFS_BLOCK_VOLUME_SIMPLE = 0, + PNFS_BLOCK_VOLUME_SLICE = 1, + PNFS_BLOCK_VOLUME_CONCAT = 2, + PNFS_BLOCK_VOLUME_STRIPE = 3, +}; + +/* + * Random upper cap for the uuid length to avoid unbounded allocation. + * Not actually limited by the protocol. + */ +#define PNFS_BLOCK_UUID_LEN 128 + +struct pnfs_block_volume { + enum pnfs_block_volume_type type; + union { + struct { + u64 offset; + u32 sig_len; + u8 sig[PNFS_BLOCK_UUID_LEN]; + } simple; + }; +}; + +struct pnfs_block_deviceaddr { + u32 nr_volumes; + struct pnfs_block_volume volumes[]; +}; + +__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdp); +__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, + struct nfsd4_layoutget *lgp); +int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, + u32 block_size); + +#endif /* _NFSD_BLOCKLAYOUTXDR_H */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 30a739d896ff..c3e3b6e55ae2 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -20,6 +20,7 @@ #include "nfsd.h" #include "nfsfh.h" #include "netns.h" +#include "pnfs.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; exp.cd = cd; + exp.ex_devid_map = NULL; /* expiry */ err = -EINVAL; @@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (!gid_valid(exp.ex_anon_gid)) goto out4; err = 0; + + nfsd4_setup_layout_type(&exp); } expp = svc_export_lookup(&exp); @@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; + new->ex_layout_type = 0; new->ex_uuid = NULL; new->cd = item->cd; } @@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; + new->ex_devid_map = item->ex_devid_map; + item->ex_devid_map = NULL; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; @@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; + new->ex_layout_type = item->ex_layout_type; new->ex_nflavors = item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 04dc8c167b0c..1f52bfcc436f 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -56,6 +56,8 @@ struct svc_export { struct nfsd4_fs_locations ex_fslocs; uint32_t ex_nflavors; struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; + enum pnfs_layouttype ex_layout_type; + struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; }; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index ed2b1151b171..58277859a467 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -546,6 +546,102 @@ out: return status; } +#ifdef CONFIG_NFSD_PNFS +/* + * CB_LAYOUTRECALL4args + * + * struct layoutrecall_file4 { + * nfs_fh4 lor_fh; + * offset4 lor_offset; + * length4 lor_length; + * stateid4 lor_stateid; + * }; + * + * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) { + * case LAYOUTRECALL4_FILE: + * layoutrecall_file4 lor_layout; + * case LAYOUTRECALL4_FSID: + * fsid4 lor_fsid; + * case LAYOUTRECALL4_ALL: + * void; + * }; + * + * struct CB_LAYOUTRECALL4args { + * layouttype4 clora_type; + * layoutiomode4 clora_iomode; + * bool clora_changed; + * layoutrecall4 clora_recall; + * }; + */ +static void encode_cb_layout4args(struct xdr_stream *xdr, + const struct nfs4_layout_stateid *ls, + struct nfs4_cb_compound_hdr *hdr) +{ + __be32 *p; + + BUG_ON(hdr->minorversion == 0); + + p = xdr_reserve_space(xdr, 5 * 4); + *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL); + *p++ = cpu_to_be32(ls->ls_layout_type); + *p++ = cpu_to_be32(IOMODE_ANY); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(RETURN_FILE); + + encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle); + + p = xdr_reserve_space(xdr, 2 * 8); + p = xdr_encode_hyper(p, 0); + xdr_encode_hyper(p, NFS4_MAX_UINT64); + + encode_stateid4(xdr, &ls->ls_recall_sid); + + hdr->nops++; +} + +static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req, + struct xdr_stream *xdr, + const struct nfsd4_callback *cb) +{ + const struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + struct nfs4_cb_compound_hdr hdr = { + .ident = 0, + .minorversion = cb->cb_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_layout4args(xdr, ls, &hdr); + encode_cb_nops(&hdr); +} + +static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfsd4_callback *cb) +{ + struct nfs4_cb_compound_hdr hdr; + enum nfsstat4 nfserr; + int status; + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + goto out; + if (cb) { + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status)) + goto out; + } + status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr); + if (unlikely(status)) + goto out; + if (unlikely(nfserr != NFS4_OK)) + status = nfs_cb_stat_to_errno(nfserr); +out: + return status; +} +#endif /* CONFIG_NFSD_PNFS */ + /* * RPC procedure tables */ @@ -563,6 +659,9 @@ out: static struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_NULL, NULL, cb_null, cb_null), PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), +#ifdef CONFIG_NFSD_PNFS + PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout), +#endif }; static struct rpc_version nfs_cb_version4 = { @@ -774,8 +873,12 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) { if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); - dprintk("%s slot is busy\n", __func__); - return false; + /* Race breaker */ + if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + dprintk("%s slot is busy\n", __func__); + return false; + } + rpc_wake_up_queued_task(&clp->cl_cb_waitq, task); } return true; } diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c new file mode 100644 index 000000000000..6904213a4363 --- /dev/null +++ b/fs/nfsd/nfs4layouts.c @@ -0,0 +1,723 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include <linux/kmod.h> +#include <linux/file.h> +#include <linux/jhash.h> +#include <linux/sched.h> +#include <linux/sunrpc/addr.h> + +#include "pnfs.h" +#include "netns.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct nfs4_layout { + struct list_head lo_perstate; + struct nfs4_layout_stateid *lo_state; + struct nfsd4_layout_seg lo_seg; +}; + +static struct kmem_cache *nfs4_layout_cache; +static struct kmem_cache *nfs4_layout_stateid_cache; + +static struct nfsd4_callback_ops nfsd4_cb_layout_ops; +static const struct lock_manager_operations nfsd4_layouts_lm_ops; + +const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { + [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops, +}; + +/* pNFS device ID to export fsid mapping */ +#define DEVID_HASH_BITS 8 +#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS) +#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1) +static u64 nfsd_devid_seq = 1; +static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfsd_devid_lock); + +static inline u32 devid_hashfn(u64 idx) +{ + return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK; +} + +static void +nfsd4_alloc_devid_map(const struct svc_fh *fhp) +{ + const struct knfsd_fh *fh = &fhp->fh_handle; + size_t fsid_len = key_len(fh->fh_fsid_type); + struct nfsd4_deviceid_map *map, *old; + int i; + + map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL); + if (!map) + return; + + map->fsid_type = fh->fh_fsid_type; + memcpy(&map->fsid, fh->fh_fsid, fsid_len); + + spin_lock(&nfsd_devid_lock); + if (fhp->fh_export->ex_devid_map) + goto out_unlock; + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + list_for_each_entry(old, &nfsd_devid_hash[i], hash) { + if (old->fsid_type != fh->fh_fsid_type) + continue; + if (memcmp(old->fsid, fh->fh_fsid, + key_len(old->fsid_type))) + continue; + + fhp->fh_export->ex_devid_map = old; + goto out_unlock; + } + } + + map->idx = nfsd_devid_seq++; + list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]); + fhp->fh_export->ex_devid_map = map; + map = NULL; + +out_unlock: + spin_unlock(&nfsd_devid_lock); + kfree(map); +} + +struct nfsd4_deviceid_map * +nfsd4_find_devid_map(int idx) +{ + struct nfsd4_deviceid_map *map, *ret = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash) + if (map->idx == idx) + ret = map; + rcu_read_unlock(); + + return ret; +} + +int +nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation) +{ + if (!fhp->fh_export->ex_devid_map) { + nfsd4_alloc_devid_map(fhp); + if (!fhp->fh_export->ex_devid_map) + return -ENOMEM; + } + + id->fsid_idx = fhp->fh_export->ex_devid_map->idx; + id->generation = device_generation; + id->pad = 0; + return 0; +} + +void nfsd4_setup_layout_type(struct svc_export *exp) +{ + struct super_block *sb = exp->ex_path.mnt->mnt_sb; + + if (!(exp->ex_flags & NFSEXP_PNFS)) + return; + + if (sb->s_export_op->get_uuid && + sb->s_export_op->map_blocks && + sb->s_export_op->commit_blocks) + exp->ex_layout_type = LAYOUT_BLOCK_VOLUME; +} + +static void +nfsd4_free_layout_stateid(struct nfs4_stid *stid) +{ + struct nfs4_layout_stateid *ls = layoutstateid(stid); + struct nfs4_client *clp = ls->ls_stid.sc_client; + struct nfs4_file *fp = ls->ls_stid.sc_file; + + trace_layoutstate_free(&ls->ls_stid.sc_stateid); + + spin_lock(&clp->cl_lock); + list_del_init(&ls->ls_perclnt); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_del_init(&ls->ls_perfile); + spin_unlock(&fp->fi_lock); + + vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); + fput(ls->ls_file); + + if (ls->ls_recalled) + atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); + + kmem_cache_free(nfs4_layout_stateid_cache, ls); +} + +static int +nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) +{ + struct file_lock *fl; + int status; + + fl = locks_alloc_lock(); + if (!fl) + return -ENOMEM; + locks_init_lock(fl); + fl->fl_lmops = &nfsd4_layouts_lm_ops; + fl->fl_flags = FL_LAYOUT; + fl->fl_type = F_RDLCK; + fl->fl_end = OFFSET_MAX; + fl->fl_owner = ls; + fl->fl_pid = current->tgid; + fl->fl_file = ls->ls_file; + + status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); + if (status) { + locks_free_lock(fl); + return status; + } + BUG_ON(fl != NULL); + return 0; +} + +static struct nfs4_layout_stateid * +nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, + struct nfs4_stid *parent, u32 layout_type) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_file *fp = parent->sc_file; + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stp; + + stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache); + if (!stp) + return NULL; + stp->sc_free = nfsd4_free_layout_stateid; + get_nfs4_file(fp); + stp->sc_file = fp; + + ls = layoutstateid(stp); + INIT_LIST_HEAD(&ls->ls_perclnt); + INIT_LIST_HEAD(&ls->ls_perfile); + spin_lock_init(&ls->ls_lock); + INIT_LIST_HEAD(&ls->ls_layouts); + ls->ls_layout_type = layout_type; + nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, + NFSPROC4_CLNT_CB_LAYOUT); + + if (parent->sc_type == NFS4_DELEG_STID) + ls->ls_file = get_file(fp->fi_deleg_file); + else + ls->ls_file = find_any_file(fp); + BUG_ON(!ls->ls_file); + + if (nfsd4_layout_setlease(ls)) { + put_nfs4_file(fp); + kmem_cache_free(nfs4_layout_stateid_cache, ls); + return NULL; + } + + spin_lock(&clp->cl_lock); + stp->sc_type = NFS4_LAYOUT_STID; + list_add(&ls->ls_perclnt, &clp->cl_lo_states); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_add(&ls->ls_perfile, &fp->fi_lo_states); + spin_unlock(&fp->fi_lock); + + trace_layoutstate_alloc(&ls->ls_stid.sc_stateid); + return ls; +} + +__be32 +nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stid; + unsigned char typemask = NFS4_LAYOUT_STID; + __be32 status; + + if (create) + typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + net_generic(SVC_NET(rqstp), nfsd_net_id)); + if (status) + goto out; + + if (!fh_match(&cstate->current_fh.fh_handle, + &stid->sc_file->fi_fhandle)) { + status = nfserr_bad_stateid; + goto out_put_stid; + } + + if (stid->sc_type != NFS4_LAYOUT_STID) { + ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); + nfs4_put_stid(stid); + + status = nfserr_jukebox; + if (!ls) + goto out; + } else { + ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); + + status = nfserr_bad_stateid; + if (stateid->si_generation > stid->sc_stateid.si_generation) + goto out_put_stid; + if (layout_type != ls->ls_layout_type) + goto out_put_stid; + } + + *lsp = ls; + return 0; + +out_put_stid: + nfs4_put_stid(stid); +out: + return status; +} + +static void +nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) +{ + spin_lock(&ls->ls_lock); + if (ls->ls_recalled) + goto out_unlock; + + ls->ls_recalled = true; + atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); + if (list_empty(&ls->ls_layouts)) + goto out_unlock; + + trace_layout_recall(&ls->ls_stid.sc_stateid); + + atomic_inc(&ls->ls_stid.sc_count); + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + nfsd4_run_cb(&ls->ls_recall); + +out_unlock: + spin_unlock(&ls->ls_lock); +} + +static inline u64 +layout_end(struct nfsd4_layout_seg *seg) +{ + u64 end = seg->offset + seg->length; + return end >= seg->offset ? end : NFS4_MAX_UINT64; +} + +static void +layout_update_len(struct nfsd4_layout_seg *lo, u64 end) +{ + if (end == NFS4_MAX_UINT64) + lo->length = NFS4_MAX_UINT64; + else + lo->length = end - lo->offset; +} + +static bool +layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s) +{ + if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode) + return false; + if (layout_end(&lo->lo_seg) <= s->offset) + return false; + if (layout_end(s) <= lo->lo_seg.offset) + return false; + return true; +} + +static bool +layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new) +{ + if (lo->iomode != new->iomode) + return false; + if (layout_end(new) < lo->offset) + return false; + if (layout_end(lo) < new->offset) + return false; + + lo->offset = min(lo->offset, new->offset); + layout_update_len(lo, max(layout_end(lo), layout_end(new))); + return true; +} + +static __be32 +nfsd4_recall_conflict(struct nfs4_layout_stateid *ls) +{ + struct nfs4_file *fp = ls->ls_stid.sc_file; + struct nfs4_layout_stateid *l, *n; + __be32 nfserr = nfs_ok; + + assert_spin_locked(&fp->fi_lock); + + list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) { + if (l != ls) { + nfsd4_recall_file_layout(l); + nfserr = nfserr_recallconflict; + } + } + + return nfserr; +} + +__be32 +nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) +{ + struct nfsd4_layout_seg *seg = &lgp->lg_seg; + struct nfs4_file *fp = ls->ls_stid.sc_file; + struct nfs4_layout *lp, *new = NULL; + __be32 nfserr; + + spin_lock(&fp->fi_lock); + nfserr = nfsd4_recall_conflict(ls); + if (nfserr) + goto out; + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + spin_unlock(&ls->ls_lock); + spin_unlock(&fp->fi_lock); + + new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL); + if (!new) + return nfserr_jukebox; + memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg)); + new->lo_state = ls; + + spin_lock(&fp->fi_lock); + nfserr = nfsd4_recall_conflict(ls); + if (nfserr) + goto out; + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + + atomic_inc(&ls->ls_stid.sc_count); + list_add_tail(&new->lo_perstate, &ls->ls_layouts); + new = NULL; +done: + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + spin_unlock(&ls->ls_lock); +out: + spin_unlock(&fp->fi_lock); + if (new) + kmem_cache_free(nfs4_layout_cache, new); + return nfserr; +} + +static void +nfsd4_free_layouts(struct list_head *reaplist) +{ + while (!list_empty(reaplist)) { + struct nfs4_layout *lp = list_first_entry(reaplist, + struct nfs4_layout, lo_perstate); + + list_del(&lp->lo_perstate); + nfs4_put_stid(&lp->lo_state->ls_stid); + kmem_cache_free(nfs4_layout_cache, lp); + } +} + +static void +nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg, + struct list_head *reaplist) +{ + struct nfsd4_layout_seg *lo = &lp->lo_seg; + u64 end = layout_end(lo); + + if (seg->offset <= lo->offset) { + if (layout_end(seg) >= end) { + list_move_tail(&lp->lo_perstate, reaplist); + return; + } + lo->offset = layout_end(seg); + } else { + /* retain the whole layout segment on a split. */ + if (layout_end(seg) < end) { + dprintk("%s: split not supported\n", __func__); + return; + } + end = seg->offset; + } + + layout_update_len(lo, end); +} + +__be32 +nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_layout *lp, *n; + LIST_HEAD(reaplist); + __be32 nfserr; + int found = 0; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid, + false, lrp->lr_layout_type, + &ls); + if (nfserr) { + trace_layout_return_lookup_fail(&lrp->lr_sid); + return nfserr; + } + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) { + if (layouts_overlapping(lp, &lrp->lr_seg)) { + nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist); + found++; + } + } + if (!list_empty(&ls->ls_layouts)) { + if (found) { + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid, + sizeof(stateid_t)); + } + lrp->lrs_present = 1; + } else { + trace_layoutstate_unhash(&ls->ls_stid.sc_stateid); + nfs4_unhash_stid(&ls->ls_stid); + lrp->lrs_present = 0; + } + spin_unlock(&ls->ls_lock); + + nfs4_put_stid(&ls->ls_stid); + nfsd4_free_layouts(&reaplist); + return nfs_ok; +} + +__be32 +nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls, *n; + struct nfs4_client *clp = cstate->clp; + struct nfs4_layout *lp, *t; + LIST_HEAD(reaplist); + + lrp->lrs_present = 0; + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { + if (ls->ls_layout_type != lrp->lr_layout_type) + continue; + + if (lrp->lr_return_type == RETURN_FSID && + !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle, + &cstate->current_fh.fh_handle)) + continue; + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) { + if (lrp->lr_seg.iomode == IOMODE_ANY || + lrp->lr_seg.iomode == lp->lo_seg.iomode) + list_move_tail(&lp->lo_perstate, &reaplist); + } + spin_unlock(&ls->ls_lock); + } + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); + return 0; +} + +static void +nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls, + struct list_head *reaplist) +{ + spin_lock(&ls->ls_lock); + list_splice_init(&ls->ls_layouts, reaplist); + spin_unlock(&ls->ls_lock); +} + +void +nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) + nfsd4_return_all_layouts(ls, &reaplist); + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); +} + +void +nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&fp->fi_lock); + list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) { + if (ls->ls_stid.sc_client == clp) + nfsd4_return_all_layouts(ls, &reaplist); + } + spin_unlock(&fp->fi_lock); + + nfsd4_free_layouts(&reaplist); +} + +static void +nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) +{ + struct nfs4_client *clp = ls->ls_stid.sc_client; + char addr_str[INET6_ADDRSTRLEN]; + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[8]; + int error; + + rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str)); + + trace_layout_recall_fail(&ls->ls_stid.sc_stateid); + + printk(KERN_WARNING + "nfsd: client %s failed to respond to layout recall. " + " Fencing..\n", addr_str); + + argv[0] = "/sbin/nfsd-recall-failed"; + argv[1] = addr_str; + argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; + argv[3] = NULL; + + error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (error) { + printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n", + addr_str, error); + } +} + +static int +nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + LIST_HEAD(reaplist); + + switch (task->tk_status) { + case 0: + return 1; + case -NFS4ERR_NOMATCHING_LAYOUT: + trace_layout_recall_done(&ls->ls_stid.sc_stateid); + task->tk_status = 0; + return 1; + case -NFS4ERR_DELAY: + /* Poll the client until it's done with the layout */ + /* FIXME: cap number of retries. + * The pnfs standard states that we need to only expire + * the client after at-least "lease time" .eg lease-time * 2 + * when failing to communicate a recall + */ + rpc_delay(task, HZ/100); /* 10 mili-seconds */ + return 0; + default: + /* + * Unknown error or non-responding client, we'll need to fence. + */ + nfsd4_cb_layout_fail(ls); + return -1; + } +} + +static void +nfsd4_cb_layout_release(struct nfsd4_callback *cb) +{ + struct nfs4_layout_stateid *ls = + container_of(cb, struct nfs4_layout_stateid, ls_recall); + LIST_HEAD(reaplist); + + trace_layout_recall_release(&ls->ls_stid.sc_stateid); + + nfsd4_return_all_layouts(ls, &reaplist); + nfsd4_free_layouts(&reaplist); + nfs4_put_stid(&ls->ls_stid); +} + +static struct nfsd4_callback_ops nfsd4_cb_layout_ops = { + .done = nfsd4_cb_layout_done, + .release = nfsd4_cb_layout_release, +}; + +static bool +nfsd4_layout_lm_break(struct file_lock *fl) +{ + /* + * We don't want the locks code to timeout the lease for us; + * we'll remove it ourself if a layout isn't returned + * in time: + */ + fl->fl_break_time = 0; + nfsd4_recall_file_layout(fl->fl_owner); + return false; +} + +static int +nfsd4_layout_lm_change(struct file_lock *onlist, int arg, + struct list_head *dispose) +{ + BUG_ON(!(arg & F_UNLCK)); + return lease_modify(onlist, arg, dispose); +} + +static const struct lock_manager_operations nfsd4_layouts_lm_ops = { + .lm_break = nfsd4_layout_lm_break, + .lm_change = nfsd4_layout_lm_change, +}; + +int +nfsd4_init_pnfs(void) +{ + int i; + + for (i = 0; i < DEVID_HASH_SIZE; i++) + INIT_LIST_HEAD(&nfsd_devid_hash[i]); + + nfs4_layout_cache = kmem_cache_create("nfs4_layout", + sizeof(struct nfs4_layout), 0, 0, NULL); + if (!nfs4_layout_cache) + return -ENOMEM; + + nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", + sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + if (!nfs4_layout_stateid_cache) { + kmem_cache_destroy(nfs4_layout_cache); + return -ENOMEM; + } + return 0; +} + +void +nfsd4_exit_pnfs(void) +{ + int i; + + kmem_cache_destroy(nfs4_layout_cache); + kmem_cache_destroy(nfs4_layout_stateid_cache); + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + struct nfsd4_deviceid_map *map, *n; + + list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash) + kfree(map); + } +} diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0beb023f25ac..92b9d97aff4f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -33,6 +33,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/file.h> +#include <linux/falloc.h> #include <linux/slab.h> #include "idmap.h" @@ -42,6 +43,8 @@ #include "current_stateid.h" #include "netns.h" #include "acl.h" +#include "pnfs.h" +#include "trace.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -772,7 +775,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * the client wants us to do more in this compound: */ if (!nfsd4_last_compound_op(rqstp)) - rqstp->rq_splice_ok = false; + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), @@ -1014,6 +1017,44 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } static __be32 +nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate, int flags) +{ + __be32 status = nfserr_notsupp; + struct file *file; + + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + &fallocate->falloc_stateid, + WR_STATE, &file); + if (status != nfs_ok) { + dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); + return status; + } + + status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, + fallocate->falloc_offset, + fallocate->falloc_length, + flags); + fput(file); + return status; +} + +static __be32 +nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate) +{ + return nfsd4_fallocate(rqstp, cstate, fallocate, 0); +} + +static __be32 +nfsd4_deallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate) +{ + return nfsd4_fallocate(rqstp, cstate, fallocate, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE); +} + +static __be32 nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_seek *seek) { @@ -1139,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status == nfserr_same ? nfs_ok : status; } +#ifdef CONFIG_NFSD_PNFS +static const struct nfsd4_layout_ops * +nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) +{ + if (!exp->ex_layout_type) { + dprintk("%s: export does not support pNFS\n", __func__); + return NULL; + } + + if (exp->ex_layout_type != layout_type) { + dprintk("%s: layout type %d not supported\n", + __func__, layout_type); + return NULL; + } + + return nfsd4_layout_ops[layout_type]; +} + +static __be32 +nfsd4_getdeviceinfo(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_getdeviceinfo *gdp) +{ + const struct nfsd4_layout_ops *ops; + struct nfsd4_deviceid_map *map; + struct svc_export *exp; + __be32 nfserr; + + dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n", + __func__, + gdp->gd_layout_type, + gdp->gd_devid.fsid_idx, gdp->gd_devid.generation, + gdp->gd_maxcount); + + map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx); + if (!map) { + dprintk("%s: couldn't find device ID to export mapping!\n", + __func__); + return nfserr_noent; + } + + exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid); + if (IS_ERR(exp)) { + dprintk("%s: could not find device id\n", __func__); + return nfserr_noent; + } + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(exp, gdp->gd_layout_type); + if (!ops) + goto out; + + nfserr = nfs_ok; + if (gdp->gd_maxcount != 0) + nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); + + gdp->gd_notify_types &= ops->notify_types; +out: + exp_put(exp); + return nfserr; +} + +static __be32 +nfsd4_layoutget(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutget *lgp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + int accmode; + + switch (lgp->lg_seg.iomode) { + case IOMODE_READ: + accmode = NFSD_MAY_READ; + break; + case IOMODE_RW: + accmode = NFSD_MAY_READ | NFSD_MAY_WRITE; + break; + default: + dprintk("%s: invalid iomode %d\n", + __func__, lgp->lg_seg.iomode); + nfserr = nfserr_badiomode; + goto out; + } + + nfserr = fh_verify(rqstp, current_fh, 0, accmode); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type); + if (!ops) + goto out; + + /* + * Verify minlength and range as per RFC5661: + * o If loga_length is less than loga_minlength, + * the metadata server MUST return NFS4ERR_INVAL. + * o If the sum of loga_offset and loga_minlength exceeds + * NFS4_UINT64_MAX, and loga_minlength is not + * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result. + * o If the sum of loga_offset and loga_length exceeds + * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX, + * the error NFS4ERR_INVAL MUST result. + */ + nfserr = nfserr_inval; + if (lgp->lg_seg.length < lgp->lg_minlength || + (lgp->lg_minlength != NFS4_MAX_UINT64 && + lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) || + (lgp->lg_seg.length != NFS4_MAX_UINT64 && + lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset)) + goto out; + if (lgp->lg_seg.length == 0) + goto out; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, + true, lgp->lg_layout_type, &ls); + if (nfserr) { + trace_layout_get_lookup_fail(&lgp->lg_sid); + goto out; + } + + nfserr = nfserr_recallconflict; + if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) + goto out_put_stid; + + nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode, + current_fh, lgp); + if (nfserr) + goto out_put_stid; + + nfserr = nfsd4_insert_layout(lgp, ls); + +out_put_stid: + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutcommit(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutcommit *lcp) +{ + const struct nfsd4_layout_seg *seg = &lcp->lc_seg; + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + loff_t new_size = lcp->lc_last_wr + 1; + struct inode *inode; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type); + if (!ops) + goto out; + inode = current_fh->fh_dentry->d_inode; + + nfserr = nfserr_inval; + if (new_size <= seg->offset) { + dprintk("pnfsd: last write before layout segment\n"); + goto out; + } + if (new_size > seg->offset + seg->length) { + dprintk("pnfsd: last write beyond layout segment\n"); + goto out; + } + if (!lcp->lc_newoffset && new_size > i_size_read(inode)) { + dprintk("pnfsd: layoutcommit beyond EOF\n"); + goto out; + } + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, + false, lcp->lc_layout_type, + &ls); + if (nfserr) { + trace_layout_commit_lookup_fail(&lcp->lc_sid); + /* fixup error code as per RFC5661 */ + if (nfserr == nfserr_bad_stateid) + nfserr = nfserr_badlayout; + goto out; + } + + nfserr = ops->proc_layoutcommit(inode, lcp); + if (nfserr) + goto out_put_stid; + + if (new_size > i_size_read(inode)) { + lcp->lc_size_chg = 1; + lcp->lc_newsize = new_size; + } else { + lcp->lc_size_chg = 0; + } + +out_put_stid: + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type)) + goto out; + + switch (lrp->lr_seg.iomode) { + case IOMODE_READ: + case IOMODE_RW: + case IOMODE_ANY: + break; + default: + dprintk("%s: invalid iomode %d\n", __func__, + lrp->lr_seg.iomode); + nfserr = nfserr_inval; + goto out; + } + + switch (lrp->lr_return_type) { + case RETURN_FILE: + nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp); + break; + case RETURN_FSID: + case RETURN_ALL: + nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp); + break; + default: + dprintk("%s: invalid return_type %d\n", __func__, + lrp->lr_return_type); + nfserr = nfserr_inval; + break; + } +out: + return nfserr; +} +#endif /* CONFIG_NFSD_PNFS */ + /* * NULL call. */ @@ -1331,7 +1625,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. */ - rqstp->rq_usedeferral = false; + clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); /* * According to RFC3010, this takes precedence over all other errors. @@ -1447,7 +1741,7 @@ encode_op: BUG_ON(cstate->replay_owner); out: /* Reset deferral mechanism for RPC deferrals */ - rqstp->rq_usedeferral = true; + set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } @@ -1640,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } +#ifdef CONFIG_NFSD_PNFS +/* + * At this stage we don't really know what layout driver will handle the request, + * so we need to define an arbitrary upper bound here. + */ +#define MAX_LAYOUT_SIZE 128 +static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* logr_return_on_close */ + + op_encode_stateid_maxsz + + 1 /* nr of layouts */ + + MAX_LAYOUT_SIZE) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* locr_newsize */ + + 2 /* ns_size */) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* lrs_stateid */ + + op_encode_stateid_maxsz) * sizeof(__be32); +} +#endif /* CONFIG_NFSD_PNFS */ + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, @@ -1927,8 +2251,45 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = { + .op_func = (nfsd4op_func)nfsd4_getdeviceinfo, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_GETDEVICEINFO", + }, + [OP_LAYOUTGET] = { + .op_func = (nfsd4op_func)nfsd4_layoutget, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTGET", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize, + }, + [OP_LAYOUTCOMMIT] = { + .op_func = (nfsd4op_func)nfsd4_layoutcommit, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTCOMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize, + }, + [OP_LAYOUTRETURN] = { + .op_func = (nfsd4op_func)nfsd4_layoutreturn, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTRETURN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize, + }, +#endif /* CONFIG_NFSD_PNFS */ /* NFSv4.2 operations */ + [OP_ALLOCATE] = { + .op_func = (nfsd4op_func)nfsd4_allocate, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_ALLOCATE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + }, + [OP_DEALLOCATE] = { + .op_func = (nfsd4op_func)nfsd4_deallocate, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_DEALLOCATE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index a25490ae6c62..1c307f02baa8 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -245,10 +245,11 @@ struct nfs4_dir_ctx { }; static int -nfsd4_build_namelist(void *arg, const char *name, int namlen, +nfsd4_build_namelist(struct dir_context *__ctx, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { - struct nfs4_dir_ctx *ctx = arg; + struct nfs4_dir_ctx *ctx = + container_of(__ctx, struct nfs4_dir_ctx, ctx); struct name_list *entry; if (namlen != HEXDIR_LEN - 1) @@ -582,7 +583,7 @@ nfs4_reset_recoverydir(char *recdir) if (status) return status; status = -ENOTDIR; - if (S_ISDIR(path.dentry->d_inode->i_mode)) { + if (d_is_dir(path.dentry)) { strcpy(user_recovery_dirname, recdir); status = 0; } @@ -704,7 +705,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) struct cld_upcall *tmp, *cup; struct cld_msg __user *cmsg = (struct cld_msg __user *)src; uint32_t xid; - struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, + struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfsd_net_id); struct cld_net *cn = nn->cld_net; @@ -1425,7 +1426,7 @@ nfsd4_client_tracking_init(struct net *net) nn->client_tracking_ops = &nfsd4_legacy_tracking_ops; status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); if (!status) { - status = S_ISDIR(path.dentry->d_inode->i_mode); + status = d_is_dir(path.dentry); path_put(&path); if (status) goto do_init; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e9c3afe4b5d3..8ba1d888f1e6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -41,13 +41,14 @@ #include <linux/ratelimit.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/addr.h> -#include <linux/hash.h> +#include <linux/jhash.h> #include "xdr4.h" #include "xdr4cb.h" #include "vfs.h" #include "current_stateid.h" #include "netns.h" +#include "pnfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp) clp->cl_time = get_seconds(); } -static inline void -renew_client(struct nfs4_client *clp) -{ - struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); - - spin_lock(&nn->client_lock); - renew_client_locked(clp); - spin_unlock(&nn->client_lock); -} - static void put_client_renew_locked(struct nfs4_client *clp) { struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); @@ -275,29 +266,26 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -static void nfsd4_free_file(struct nfs4_file *f) +static void nfsd4_free_file_rcu(struct rcu_head *rcu) { - kmem_cache_free(file_slab, f); + struct nfs4_file *fp = container_of(rcu, struct nfs4_file, fi_rcu); + + kmem_cache_free(file_slab, fp); } -static inline void +void put_nfs4_file(struct nfs4_file *fi) { might_lock(&state_lock); if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { - hlist_del(&fi->fi_hash); + hlist_del_rcu(&fi->fi_hash); spin_unlock(&state_lock); - nfsd4_free_file(fi); + WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); + call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); } } -static inline void -get_nfs4_file(struct nfs4_file *fi) -{ - atomic_inc(&fi->fi_ref); -} - static struct file * __nfs4_get_fd(struct nfs4_file *f, int oflag) { @@ -355,7 +343,7 @@ find_readable_file(struct nfs4_file *f) return ret; } -static struct file * +struct file * find_any_file(struct nfs4_file *f) { struct file *ret; @@ -405,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh) return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); } -static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) -{ - return fh1->fh_size == fh2->fh_size && - !memcmp(fh1->fh_base.fh_pad, - fh2->fh_base.fh_pad, - fh1->fh_size); -} - static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; static void @@ -491,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) __nfs4_file_put_access(fp, O_RDONLY); } -static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) { struct nfs4_stid *stid; @@ -594,7 +574,7 @@ static int delegation_blocked(struct knfsd_fh *fh) } spin_unlock(&blocked_delegations_lock); } - hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); + hash = jhash(&fh->fh_base, fh->fh_size, 0); if (test_bit(hash&255, bd->set[0]) && test_bit((hash>>8)&255, bd->set[0]) && test_bit((hash>>16)&255, bd->set[0])) @@ -613,7 +593,7 @@ static void block_delegations(struct knfsd_fh *fh) u32 hash; struct bloom_pair *bd = &blocked_delegations; - hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); + hash = jhash(&fh->fh_base, fh->fh_size, 0); spin_lock(&blocked_delegations_lock); __set_bit(hash&255, bd->set[bd->new]); @@ -685,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp) struct file *filp = NULL; spin_lock(&fp->fi_lock); - if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees)) + if (fp->fi_deleg_file && --fp->fi_delegees == 0) swap(filp, fp->fi_deleg_file); spin_unlock(&fp->fi_lock); if (filp) { - vfs_setlease(filp, F_UNLCK, NULL, NULL); + vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp); fput(filp); } } -static void unhash_stid(struct nfs4_stid *s) +void nfs4_unhash_stid(struct nfs4_stid *s) { s->sc_type = 0; } @@ -1003,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp) list_del_init(&stp->st_locks); unhash_ol_stateid(stp); - unhash_stid(&stp->st_stid); + nfs4_unhash_stid(&stp->st_stid); } static void release_lock_stateid(struct nfs4_ol_stateid *stp) @@ -1440,7 +1420,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); - if (cses->flags & SESSION4_BACK_CHAN) { + { struct sockaddr *sa = svc_addr(rqstp); /* * This is a little silly; with sessions there's no real @@ -1515,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses) static int STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) { - if (clid->cl_boot == nn->boot_time) + /* + * We're assuming the clid was not given out from a boot + * precisely 2^32 (about 136 years) before this one. That seems + * a safe assumption: + */ + if (clid->cl_boot == (u32)nn->boot_time) return 0; dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", clid->cl_boot, clid->cl_id, nn->boot_time); @@ -1555,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) INIT_LIST_HEAD(&clp->cl_lru); INIT_LIST_HEAD(&clp->cl_callbacks); INIT_LIST_HEAD(&clp->cl_revoked); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&clp->cl_lo_states); +#endif spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; @@ -1650,7 +1638,7 @@ __destroy_client(struct nfs4_client *clp) nfs4_put_stid(&dp->dl_stid); } while (!list_empty(&clp->cl_revoked)) { - dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); + dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); nfs4_put_stid(&dp->dl_stid); } @@ -1659,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp) nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } + nfsd4_return_all_client_layouts(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); @@ -1711,15 +1700,14 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source) return 0; } -static long long +static int compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2) { - long long res; - - res = o1->len - o2->len; - if (res) - return res; - return (long long)memcmp(o1->data, o2->data, o1->len); + if (o1->len < o2->len) + return -1; + if (o1->len > o2->len) + return 1; + return memcmp(o1->data, o2->data, o1->len); } static int same_name(const char *n1, const char *n2) @@ -1907,7 +1895,7 @@ add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root) static struct nfs4_client * find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root) { - long long cmp; + int cmp; struct rb_node *node = root->rb_node; struct nfs4_client *clp; @@ -2143,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, static void nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) { - /* pNFS is not supported */ +#ifdef CONFIG_NFSD_PNFS + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS; +#else new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; +#endif /* Referrals are supported, Migration is not. */ new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; @@ -3057,10 +3048,9 @@ static struct nfs4_file *nfsd4_alloc_file(void) } /* OPEN Share state helper functions */ -static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh) +static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, + struct nfs4_file *fp) { - unsigned int hashval = file_hashval(fh); - lockdep_assert_held(&state_lock); atomic_set(&fp->fi_ref, 1); @@ -3073,7 +3063,11 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh) fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); - hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&fp->fi_lo_states); + atomic_set(&fp->fi_lo_recalls, 0); +#endif + hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } void @@ -3227,7 +3221,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, } else nfs4_free_openowner(&oo->oo_owner); spin_unlock(&clp->cl_lock); - return oo; + return ret; } static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { @@ -3294,30 +3288,28 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) /* search file_hashtbl[] for file */ static struct nfs4_file * -find_file_locked(struct knfsd_fh *fh) +find_file_locked(struct knfsd_fh *fh, unsigned int hashval) { - unsigned int hashval = file_hashval(fh); struct nfs4_file *fp; - lockdep_assert_held(&state_lock); - - hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { - if (nfsd_fh_match(&fp->fi_fhandle, fh)) { - get_nfs4_file(fp); - return fp; + hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { + if (fh_match(&fp->fi_fhandle, fh)) { + if (atomic_inc_not_zero(&fp->fi_ref)) + return fp; } } return NULL; } -static struct nfs4_file * +struct nfs4_file * find_file(struct knfsd_fh *fh) { struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); - spin_lock(&state_lock); - fp = find_file_locked(fh); - spin_unlock(&state_lock); + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); return fp; } @@ -3325,11 +3317,18 @@ static struct nfs4_file * find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh) { struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); + + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); + if (fp) + return fp; spin_lock(&state_lock); - fp = find_file_locked(fh); - if (fp == NULL) { - nfsd4_init_file(new, fh); + fp = find_file_locked(fh, hashval); + if (likely(fp == NULL)) { + nfsd4_init_file(fh, hashval, new); fp = new; } spin_unlock(&state_lock); @@ -3471,7 +3470,8 @@ nfsd_break_deleg_cb(struct file_lock *fl) } static int -nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose) +nfsd_change_deleg_cb(struct file_lock *onlist, int arg, + struct list_head *dispose) { if (arg & F_UNLCK) return lease_modify(onlist, arg, dispose); @@ -3849,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp) /* Race breaker */ if (fp->fi_deleg_file) { status = 0; - atomic_inc(&fp->fi_delegees); + ++fp->fi_delegees; hash_delegation_locked(dp, fp); goto out_unlock; } fp->fi_deleg_file = filp; - atomic_set(&fp->fi_delegees, 1); + fp->fi_delegees = 1; hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); @@ -3891,11 +3891,11 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, status = nfs4_setlease(dp); goto out; } - atomic_inc(&fp->fi_delegees); if (fp->fi_had_conflict) { status = -EAGAIN; goto out_unlock; } + ++fp->fi_delegees; hash_delegation_locked(dp, fp); status = 0; out_unlock: @@ -4127,7 +4127,7 @@ void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, nfs4_put_stateowner(so); } if (open->op_file) - nfsd4_free_file(open->op_file); + kmem_cache_free(file_slab, open->op_file); if (open->op_stp) nfs4_put_stid(&open->op_stp->st_stid); } @@ -4288,7 +4288,7 @@ laundromat_main(struct work_struct *laundry) static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) { - if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) + if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) return nfserr_bad_stateid; return nfs_ok; } @@ -4439,7 +4439,7 @@ out_unlock: return status; } -static __be32 +__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn) @@ -4853,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfsd4_return_all_file_layouts(stp->st_stateowner->so_client, + stp->st_stid.sc_file); + nfsd4_close_open_stateid(stp); /* put reference from nfs4_preprocess_seqid_op */ @@ -5059,7 +5062,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, } else nfs4_free_lockowner(&lo->lo_owner); spin_unlock(&clp->cl_lock); - return lo; + return ret; } static void @@ -5550,10 +5553,11 @@ out_nfserr: static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { - struct file_lock **flpp; + struct file_lock *fl; int status = false; struct file *filp = find_any_file(fp); struct inode *inode; + struct file_lock_context *flctx; if (!filp) { /* Any valid lock stateid should have some sort of access */ @@ -5562,15 +5566,18 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } inode = file_inode(filp); + flctx = inode->i_flctx; - spin_lock(&inode->i_lock); - for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { - if ((*flpp)->fl_owner == (fl_owner_t)lowner) { - status = true; - break; + if (flctx && !list_empty_careful(&flctx->flc_posix)) { + spin_lock(&flctx->flc_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_owner == (fl_owner_t)lowner) { + status = true; + break; + } } + spin_unlock(&flctx->flc_lock); } - spin_unlock(&inode->i_lock); fput(filp); return status; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index eeea7a90eb87..5fb7e78169a6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -47,6 +47,7 @@ #include "state.h" #include "cache.h" #include "netns.h" +#include "pnfs.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) return ret; } +/* + * We require the high 32 bits of 'seconds' to be 0, and + * we ignore all 32 bits of 'nseconds'. + */ +static __be32 +nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv) +{ + DECODE_HEAD; + u64 sec; + + READ_BUF(12); + p = xdr_decode_hyper(p, &sec); + tv->tv_sec = sec; + tv->tv_nsec = be32_to_cpup(p++); + if (tv->tv_nsec >= (u32)1000000000) + return nfserr_inval; + + DECODE_TAIL; +} + static __be32 nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) { @@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, { int expected_len, len = 0; u32 dummy32; - u64 sec; char *buf; DECODE_HEAD; @@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, dummy32 = be32_to_cpup(p++); switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: - /* We require the high 32 bits of 'seconds' to be 0, and we ignore - all 32 bits of 'nseconds'. */ - READ_BUF(12); len += 12; - p = xdr_decode_hyper(p, &sec); - iattr->ia_atime.tv_sec = (time_t)sec; - iattr->ia_atime.tv_nsec = be32_to_cpup(p++); - if (iattr->ia_atime.tv_nsec >= (u32)1000000000) - return nfserr_inval; + status = nfsd4_decode_time(argp, &iattr->ia_atime); + if (status) + return status; iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); break; case NFS4_SET_TO_SERVER_TIME: @@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, dummy32 = be32_to_cpup(p++); switch (dummy32) { case NFS4_SET_TO_CLIENT_TIME: - /* We require the high 32 bits of 'seconds' to be 0, and we ignore - all 32 bits of 'nseconds'. */ - READ_BUF(12); len += 12; - p = xdr_decode_hyper(p, &sec); - iattr->ia_mtime.tv_sec = sec; - iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); - if (iattr->ia_mtime.tv_nsec >= (u32)1000000000) - return nfserr_inval; + status = nfsd4_decode_time(argp, &iattr->ia_mtime); + if (status) + return status; iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); break; case NFS4_SET_TO_SERVER_TIME: @@ -1513,6 +1523,156 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str DECODE_TAIL; } +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_getdeviceinfo *gdev) +{ + DECODE_HEAD; + u32 num, i; + + READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4); + COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid)); + gdev->gd_layout_type = be32_to_cpup(p++); + gdev->gd_maxcount = be32_to_cpup(p++); + num = be32_to_cpup(p++); + if (num) { + READ_BUF(4 * num); + gdev->gd_notify_types = be32_to_cpup(p++); + for (i = 1; i < num; i++) { + if (be32_to_cpup(p++)) { + status = nfserr_inval; + goto out; + } + } + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutget *lgp) +{ + DECODE_HEAD; + + READ_BUF(36); + lgp->lg_signal = be32_to_cpup(p++); + lgp->lg_layout_type = be32_to_cpup(p++); + lgp->lg_seg.iomode = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lgp->lg_seg.offset); + p = xdr_decode_hyper(p, &lgp->lg_seg.length); + p = xdr_decode_hyper(p, &lgp->lg_minlength); + + status = nfsd4_decode_stateid(argp, &lgp->lg_sid); + if (status) + return status; + + READ_BUF(4); + lgp->lg_maxcount = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutcommit *lcp) +{ + DECODE_HEAD; + u32 timechange; + + READ_BUF(20); + p = xdr_decode_hyper(p, &lcp->lc_seg.offset); + p = xdr_decode_hyper(p, &lcp->lc_seg.length); + lcp->lc_reclaim = be32_to_cpup(p++); + + status = nfsd4_decode_stateid(argp, &lcp->lc_sid); + if (status) + return status; + + READ_BUF(4); + lcp->lc_newoffset = be32_to_cpup(p++); + if (lcp->lc_newoffset) { + READ_BUF(8); + p = xdr_decode_hyper(p, &lcp->lc_last_wr); + } else + lcp->lc_last_wr = 0; + READ_BUF(4); + timechange = be32_to_cpup(p++); + if (timechange) { + status = nfsd4_decode_time(argp, &lcp->lc_mtime); + if (status) + return status; + } else { + lcp->lc_mtime.tv_nsec = UTIME_NOW; + } + READ_BUF(8); + lcp->lc_layout_type = be32_to_cpup(p++); + + /* + * Save the layout update in XDR format and let the layout driver deal + * with it later. + */ + lcp->lc_up_len = be32_to_cpup(p++); + if (lcp->lc_up_len > 0) { + READ_BUF(lcp->lc_up_len); + READMEM(lcp->lc_up_layout, lcp->lc_up_len); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutreturn *lrp) +{ + DECODE_HEAD; + + READ_BUF(16); + lrp->lr_reclaim = be32_to_cpup(p++); + lrp->lr_layout_type = be32_to_cpup(p++); + lrp->lr_seg.iomode = be32_to_cpup(p++); + lrp->lr_return_type = be32_to_cpup(p++); + if (lrp->lr_return_type == RETURN_FILE) { + READ_BUF(16); + p = xdr_decode_hyper(p, &lrp->lr_seg.offset); + p = xdr_decode_hyper(p, &lrp->lr_seg.length); + + status = nfsd4_decode_stateid(argp, &lrp->lr_sid); + if (status) + return status; + + READ_BUF(4); + lrp->lrf_body_len = be32_to_cpup(p++); + if (lrp->lrf_body_len > 0) { + READ_BUF(lrp->lrf_body_len); + READMEM(lrp->lrf_body, lrp->lrf_body_len); + } + } else { + lrp->lr_seg.offset = 0; + lrp->lr_seg.length = NFS4_MAX_UINT64; + } + + DECODE_TAIL; +} +#endif /* CONFIG_NFSD_PNFS */ + +static __be32 +nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, + struct nfsd4_fallocate *fallocate) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid); + if (status) + return status; + + READ_BUF(16); + p = xdr_decode_hyper(p, &fallocate->falloc_offset); + xdr_decode_hyper(p, &fallocate->falloc_length); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { @@ -1590,11 +1750,19 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, +#else [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, +#endif [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -1604,10 +1772,10 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, /* new operations for NFSv4.2 */ - [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -1714,7 +1882,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) - argp->rqstp->rq_splice_ok = false; + clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); DECODE_TAIL; } @@ -1795,9 +1963,12 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, } else end++; + if (found_esc) + end = next; + str = end; } - pathlen = htonl(xdr->buf->len - pathlen_offset); + pathlen = htonl(count); write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4); return 0; } @@ -1886,7 +2057,7 @@ static __be32 nfsd4_encode_path(struct xdr_stream *xdr, goto out_free; } p = xdr_encode_opaque(p, dentry->d_name.name, len); - dprintk("/%s", dentry->d_name.name); + dprintk("/%pd", dentry); spin_unlock(&dentry->d_lock); dput(dentry); ncomponents--; @@ -2519,6 +2690,30 @@ out_acl: get_parent_attributes(exp, &stat); p = xdr_encode_hyper(p, stat.ino); } +#ifdef CONFIG_NFSD_PNFS + if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) || + (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) { + if (exp->ex_layout_type) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(exp->ex_layout_type); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + } + + if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(stat.blksize); + } +#endif /* CONFIG_NFSD_PNFS */ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { status = nfsd4_encode_security_label(xdr, rqstp, context, contextlen); @@ -2748,16 +2943,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, if (entry_bytes > cd->rd_maxcount) goto fail; cd->rd_maxcount -= entry_bytes; - if (!cd->rd_dircount) - goto fail; /* * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so * let's always let through the first entry, at least: */ - name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; + if (!cd->rd_dircount) + goto fail; + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) goto fail; cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + cd->cookie_offset = cookie_offset; skip_entry: cd->common.err = nfs_ok; @@ -3236,10 +3432,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) { - WARN_ON_ONCE(resp->rqstp->rq_splice_ok); + WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); return nfserr_resource; } - if (resp->xdr.buf->page_len && resp->rqstp->rq_splice_ok) { + if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { WARN_ON_ONCE(1); return nfserr_resource; } @@ -3256,7 +3452,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, goto err_truncate; } - if (file->f_op->splice_read && resp->rqstp->rq_splice_ok) + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) err = nfsd4_encode_splice_read(resp, read, file, maxcount); else err = nfsd4_encode_readv(resp, read, file, maxcount); @@ -3794,6 +3990,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr; } +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_getdeviceinfo *gdev) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops = + nfsd4_layout_ops[gdev->gd_layout_type]; + u32 starting_len = xdr->buf->len, needed_len; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + goto out; + + nfserr = nfserr_resource; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + + *p++ = cpu_to_be32(gdev->gd_layout_type); + + /* If maxcount is 0 then just update notifications */ + if (gdev->gd_maxcount != 0) { + nfserr = ops->encode_getdeviceinfo(xdr, gdev); + if (nfserr) { + /* + * We don't bother to burden the layout drivers with + * enforcing gd_maxcount, just tell the client to + * come back with a bigger buffer if it's not enough. + */ + if (xdr->buf->len + 4 > gdev->gd_maxcount) + goto toosmall; + goto out; + } + } + + nfserr = nfserr_resource; + if (gdev->gd_notify_types) { + p = xdr_reserve_space(xdr, 4 + 4); + if (!p) + goto out; + *p++ = cpu_to_be32(1); /* bitmap length */ + *p++ = cpu_to_be32(gdev->gd_notify_types); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + *p++ = 0; + } + + nfserr = 0; +out: + kfree(gdev->gd_device); + dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr)); + return nfserr; + +toosmall: + dprintk("%s: maxcount too small\n", __func__); + needed_len = xdr->buf->len + 4 /* notifications */; + xdr_truncate_encode(xdr, starting_len); + p = xdr_reserve_space(xdr, 4); + if (!p) { + nfserr = nfserr_resource; + } else { + *p++ = cpu_to_be32(needed_len); + nfserr = nfserr_toosmall; + } + goto out; +} + +static __be32 +nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutget *lgp) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops = + nfsd4_layout_ops[lgp->lg_layout_type]; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + goto out; + + nfserr = nfserr_resource; + p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); + if (!p) + goto out; + + *p++ = cpu_to_be32(1); /* we always set return-on-close */ + *p++ = cpu_to_be32(lgp->lg_sid.si_generation); + p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque, + sizeof(stateid_opaque_t)); + + *p++ = cpu_to_be32(1); /* we always return a single layout */ + p = xdr_encode_hyper(p, lgp->lg_seg.offset); + p = xdr_encode_hyper(p, lgp->lg_seg.length); + *p++ = cpu_to_be32(lgp->lg_seg.iomode); + *p++ = cpu_to_be32(lgp->lg_layout_type); + + nfserr = ops->encode_layoutget(xdr, lgp); +out: + kfree(lgp->lg_content); + return nfserr; +} + +static __be32 +nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutcommit *lcp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lcp->lc_size_chg); + if (lcp->lc_size_chg) { + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + p = xdr_encode_hyper(p, lcp->lc_newsize); + } + + return nfs_ok; +} + +static __be32 +nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutreturn *lrp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lrp->lrs_present); + if (lrp->lrs_present) + return nfsd4_encode_stateid(xdr, &lrp->lr_sid); + return nfs_ok; +} +#endif /* CONFIG_NFSD_PNFS */ + static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_seek *seek) @@ -3870,11 +4216,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, +#else [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, +#endif [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 122f69185ef5..46ec934f5dee 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -165,13 +165,17 @@ int nfsd_reply_cache_init(void) { unsigned int hashsize; unsigned int i; + int status = 0; max_drc_entries = nfsd_cache_size_limit(); atomic_set(&num_drc_entries, 0); hashsize = nfsd_hashsize(max_drc_entries); maskbits = ilog2(hashsize); - register_shrinker(&nfsd_reply_cache_shrinker); + status = register_shrinker(&nfsd_reply_cache_shrinker); + if (status) + return status; + drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep), 0, 0, NULL); if (!drc_slab) @@ -490,7 +494,7 @@ found_entry: /* From the hall of fame of impractical attacks: * Is this a user who tries to snoop on the cache? */ rtn = RC_DOIT; - if (!rqstp->rq_secure && rp->c_secure) + if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure) goto out; /* Compose RPC reply header */ @@ -579,7 +583,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) spin_lock(&b->cache_lock); drc_mem_usage += bufsize; lru_put_end(b, rp); - rp->c_secure = rqstp->rq_secure; + rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags); rp->c_type = cachetype; rp->c_state = RC_DONE; spin_unlock(&b->cache_lock); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ca73ca79a0ee..aa47d75ddb26 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -21,6 +21,7 @@ #include "cache.h" #include "state.h" #include "netns.h" +#include "pnfs.h" /* * We have a single directory with several nodes in it. @@ -231,6 +232,10 @@ static struct file_operations reply_cache_stats_operations = { * payload - write methods */ +static inline struct net *netns(struct file *file) +{ + return file_inode(file)->i_sb->s_fs_info; +} /** * write_unlock_ip - Release all locks used by a client @@ -252,7 +257,7 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) struct sockaddr *sap = (struct sockaddr *)&address; size_t salen = sizeof(address); char *fo_path; - struct net *net = file->f_dentry->d_sb->s_fs_info; + struct net *net = netns(file); /* sanity check */ if (size == 0) @@ -350,7 +355,6 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) int len; struct auth_domain *dom; struct knfsd_fh fh; - struct net *net = file->f_dentry->d_sb->s_fs_info; if (size == 0) return -EINVAL; @@ -385,7 +389,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) if (!dom) return -ENOMEM; - len = exp_rootfh(net, dom, path, &fh, maxsize); + len = exp_rootfh(netns(file), dom, path, &fh, maxsize); auth_domain_put(dom); if (len) return len; @@ -429,7 +433,7 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) { char *mesg = buf; int rv; - struct net *net = file->f_dentry->d_sb->s_fs_info; + struct net *net = netns(file); if (size > 0) { int newthreads; @@ -480,7 +484,7 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) int len; int npools; int *nthreads; - struct net *net = file->f_dentry->d_sb->s_fs_info; + struct net *net = netns(file); mutex_lock(&nfsd_mutex); npools = nfsd_nrpools(net); @@ -543,8 +547,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) unsigned minor; ssize_t tlen = 0; char *sep; - struct net *net = file->f_dentry->d_sb->s_fs_info; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); if (size>0) { if (nn->nfsd_serv) @@ -606,7 +609,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) num); sep = " "; - if (len > remaining) + if (len >= remaining) break; remaining -= len; buf += len; @@ -621,7 +624,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) '+' : '-', minor); - if (len > remaining) + if (len >= remaining) break; remaining -= len; buf += len; @@ -629,7 +632,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) } len = snprintf(buf, remaining, "\n"); - if (len > remaining) + if (len >= remaining) return -EINVAL; return tlen + len; } @@ -830,10 +833,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size, static ssize_t write_ports(struct file *file, char *buf, size_t size) { ssize_t rv; - struct net *net = file->f_dentry->d_sb->s_fs_info; mutex_lock(&nfsd_mutex); - rv = __write_ports(file, buf, size, net); + rv = __write_ports(file, buf, size, netns(file)); mutex_unlock(&nfsd_mutex); return rv; } @@ -865,8 +867,7 @@ int nfsd_max_blksize; static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) { char *mesg = buf; - struct net *net = file->f_dentry->d_sb->s_fs_info; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); if (size > 0) { int bsize; @@ -915,8 +916,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) static ssize_t write_maxconn(struct file *file, char *buf, size_t size) { char *mesg = buf; - struct net *net = file->f_dentry->d_sb->s_fs_info; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); unsigned int maxconn = nn->max_connections; if (size > 0) { @@ -997,8 +997,7 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, */ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) { - struct net *net = file->f_dentry->d_sb->s_fs_info; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn); } @@ -1014,8 +1013,7 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) */ static ssize_t write_gracetime(struct file *file, char *buf, size_t size) { - struct net *net = file->f_dentry->d_sb->s_fs_info; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn); } @@ -1071,8 +1069,7 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size, static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) { ssize_t rv; - struct net *net = file->f_dentry->d_sb->s_fs_info; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); mutex_lock(&nfsd_mutex); rv = __write_recoverydir(file, buf, size, nn); @@ -1102,8 +1099,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) */ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) { - struct net *net = file->f_dentry->d_sb->s_fs_info; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct nfsd_net *nn = net_generic(netns(file), nfsd_net_id); if (size > 0) { switch(buf[0]) { @@ -1263,9 +1259,12 @@ static int __init init_nfsd(void) retval = nfsd4_init_slabs(); if (retval) goto out_unregister_pernet; - retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; + retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + if (retval) + goto out_exit_pnfs; nfsd_stat_init(); /* Statistics */ retval = nfsd_reply_cache_init(); if (retval) @@ -1287,6 +1286,8 @@ out_free_lockd: out_free_stat: nfsd_stat_shutdown(); nfsd_fault_inject_cleanup(); +out_exit_pnfs: + nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); out_unregister_pernet: @@ -1304,6 +1305,7 @@ static void __exit exit_nfsd(void) nfsd_stat_shutdown(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); + nfsd4_exit_pnfs(); nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); unregister_pernet_subsys(&nfsd_net_ops); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 747f3b95bd11..565c4da1a9eb 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -325,22 +325,37 @@ void nfsd_lockd_shutdown(void); #define NFSD4_SUPPORTED_ATTRS_WORD2 0 +/* 4.1 */ +#ifdef CONFIG_NFSD_PNFS +#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES +#define PNFSD_SUPPORTED_ATTRS_WORD2 \ +(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES) +#else +#define PNFSD_SUPPORTED_ATTRS_WORD1 0 +#define PNFSD_SUPPORTED_ATTRS_WORD2 0 +#endif /* CONFIG_NFSD_PNFS */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ - NFSD4_SUPPORTED_ATTRS_WORD1 + (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1) #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ - (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) + (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_SUPPATTR_EXCLCREAT) +/* 4.2 */ #ifdef CONFIG_NFSD_V4_SECURITY_LABEL -#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ - (NFSD4_1_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SECURITY_LABEL) +#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL #else -#define NFSD4_2_SUPPORTED_ATTRS_WORD2 0 +#define NFSD4_2_SECURITY_ATTRS 0 #endif +#define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ + (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ + NFSD4_2_SECURITY_ATTRS) + static inline u32 nfsd_suppattrs0(u32 minorversion) { return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 88026fc6a981..e9fa966fc37f 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -86,7 +86,7 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, int flags = nfsexp_flags(rqstp, exp); /* Check if the request originated from a secure port. */ - if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { + if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && !(flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("nfsd: request from insecure port %s!\n", svc_print_addr(rqstp, buf, sizeof(buf))); @@ -114,8 +114,8 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp, * We're exposing only the directories and symlinks that have to be * traversed on the way to real exports: */ - if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) && - !S_ISLNK(dentry->d_inode->i_mode))) + if (unlikely(!d_is_dir(dentry) && + !d_is_symlink(dentry))) return nfserr_stale; /* * A pseudoroot export gives permission to access only one @@ -259,7 +259,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) goto out; } - if (S_ISDIR(dentry->d_inode->i_mode) && + if (d_is_dir(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED)) { printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %pd2\n", dentry); @@ -414,7 +414,7 @@ static inline void _fh_update_old(struct dentry *dentry, { fh->ofh_ino = ino_t_to_u32(dentry->d_inode->i_ino); fh->ofh_generation = dentry->d_inode->i_generation; - if (S_ISDIR(dentry->d_inode->i_mode) || + if (d_is_dir(dentry) || (exp->ex_flags & NFSEXP_NOSUBTREECHECK)) fh->ofh_dirino = 0; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 08236d70c667..f22920442172 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize) return fhp; } +static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +{ + if (fh1->fh_size != fh2->fh_size) + return false; + if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0) + return false; + return true; +} + +static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) +{ + if (fh1->fh_fsid_type != fh2->fh_fsid_type) + return false; + if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0) + return false; + return true; +} + #ifdef CONFIG_NFSD_V3 /* * The wcc data stored in current_fh should be cleared diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 752d56bbe0ba..9277cc91c21b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -119,6 +119,7 @@ struct svc_program nfsd_program = { static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = { [0] = 1, [1] = 1, + [2] = 1, }; int nfsd_vers(int vers, enum vers_op change) @@ -692,7 +693,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_dropit || rqstp->rq_dropme) { + if (nfserr == nfserr_dropit || test_bit(RQ_DROPME, &rqstp->rq_flags)) { dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); return 0; diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h new file mode 100644 index 000000000000..d4c4453674c6 --- /dev/null +++ b/fs/nfsd/pnfs.h @@ -0,0 +1,86 @@ +#ifndef _FS_NFSD_PNFS_H +#define _FS_NFSD_PNFS_H 1 + +#ifdef CONFIG_NFSD_V4 +#include <linux/exportfs.h> +#include <linux/nfsd/export.h> + +#include "state.h" +#include "xdr4.h" + +struct xdr_stream; + +struct nfsd4_deviceid_map { + struct list_head hash; + u64 idx; + int fsid_type; + u32 fsid[]; +}; + +struct nfsd4_layout_ops { + u32 notify_types; + + __be32 (*proc_getdeviceinfo)(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdevp); + __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdevp); + + __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, + struct nfsd4_layoutget *lgp); + __be32 (*encode_layoutget)(struct xdr_stream *, + struct nfsd4_layoutget *lgp); + + __be32 (*proc_layoutcommit)(struct inode *inode, + struct nfsd4_layoutcommit *lcp); +}; + +extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; +extern const struct nfsd4_layout_ops bl_layout_ops; + +__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp); +__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp, + struct nfs4_layout_stateid *ls); +__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation); +struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx); +#endif /* CONFIG_NFSD_V4 */ + +#ifdef CONFIG_NFSD_PNFS +void nfsd4_setup_layout_type(struct svc_export *exp); +void nfsd4_return_all_client_layouts(struct nfs4_client *); +void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp); +int nfsd4_init_pnfs(void); +void nfsd4_exit_pnfs(void); +#else +struct nfs4_client; +struct nfs4_file; + +static inline void nfsd4_setup_layout_type(struct svc_export *exp) +{ +} + +static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ +} +static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp) +{ +} +static inline void nfsd4_exit_pnfs(void) +{ +} +static inline int nfsd4_init_pnfs(void) +{ + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _FS_NFSD_PNFS_H */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 2712042a66b1..4f3bfeb11766 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -92,6 +92,7 @@ struct nfs4_stid { /* For a deleg stateid kept around only to process free_stateid's: */ #define NFS4_REVOKED_DELEG_STID 16 #define NFS4_CLOSED_DELEG_STID 32 +#define NFS4_LAYOUT_STID 64 unsigned char sc_type; stateid_t sc_stateid; struct nfs4_client *sc_client; @@ -297,6 +298,9 @@ struct nfs4_client { struct list_head cl_delegations; struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ struct list_head cl_lru; /* tail queue */ +#ifdef CONFIG_NFSD_PNFS + struct list_head cl_lo_states; /* outstanding layout states */ +#endif struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ @@ -463,17 +467,24 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) /* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. * - * These objects are global. nfsd only keeps one instance of a nfs4_file per - * inode (though it may keep multiple file descriptors open per inode). These - * are tracked in the file_hashtbl which is protected by the state_lock - * spinlock. + * These objects are global. nfsd keeps one instance of a nfs4_file per + * filehandle (though it may keep multiple file descriptors for each). Each + * inode can have multiple filehandles associated with it, so there is + * (potentially) a many to one relationship between this struct and struct + * inode. + * + * These are hashed by filehandle in the file_hashtbl, which is protected by + * the global state_lock spinlock. */ struct nfs4_file { atomic_t fi_ref; spinlock_t fi_lock; - struct hlist_node fi_hash; /* hash by "struct inode *" */ + struct hlist_node fi_hash; /* hash on fi_fhandle */ struct list_head fi_stateids; - struct list_head fi_delegations; + union { + struct list_head fi_delegations; + struct rcu_head fi_rcu; + }; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* @@ -486,9 +497,13 @@ struct nfs4_file { atomic_t fi_access[2]; u32 fi_share_deny; struct file *fi_deleg_file; - atomic_t fi_delegees; + int fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; +#ifdef CONFIG_NFSD_PNFS + struct list_head fi_lo_states; + atomic_t fi_lo_recalls; +#endif }; /* @@ -521,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) return container_of(s, struct nfs4_ol_stateid, st_stid); } +struct nfs4_layout_stateid { + struct nfs4_stid ls_stid; + struct list_head ls_perclnt; + struct list_head ls_perfile; + spinlock_t ls_lock; + struct list_head ls_layouts; + u32 ls_layout_type; + struct file *ls_file; + struct nfsd4_callback ls_recall; + stateid_t ls_recall_sid; + bool ls_recalled; +}; + +static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_layout_stateid, ls_stid); +} + /* flags for preprocess_seqid_op() */ #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 @@ -528,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) enum nfsd4_cb_op { NFSPROC4_CLNT_CB_NULL = 0, NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_LAYOUT, NFSPROC4_CLNT_CB_SEQUENCE, }; @@ -538,6 +572,12 @@ struct nfsd_net; extern __be32 nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, stateid_t *stateid, int flags, struct file **filp); +__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, + stateid_t *stateid, unsigned char typemask, + struct nfs4_stid **s, struct nfsd_net *nn); +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, + struct kmem_cache *slab); +void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); extern void nfs4_release_reclaim(struct nfsd_net *); @@ -560,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); +struct nfs4_file *find_file(struct knfsd_fh *fh); +void put_nfs4_file(struct nfs4_file *fi); +static inline void get_nfs4_file(struct nfs4_file *fi) +{ + atomic_inc(&fi->fi_ref); +} +struct file *find_any_file(struct nfs4_file *f); + /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c new file mode 100644 index 000000000000..82f89070594c --- /dev/null +++ b/fs/nfsd/trace.c @@ -0,0 +1,5 @@ + +#include "state.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h new file mode 100644 index 000000000000..c668520c344b --- /dev/null +++ b/fs/nfsd/trace.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nfsd + +#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _NFSD_TRACE_H + +#include <linux/tracepoint.h> + +DECLARE_EVENT_CLASS(nfsd_stateid_class, + TP_PROTO(stateid_t *stp), + TP_ARGS(stp), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, si_id) + __field(u32, si_generation) + ), + TP_fast_assign( + __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; + __entry->cl_id = stp->si_opaque.so_clid.cl_id; + __entry->si_id = stp->si_opaque.so_id; + __entry->si_generation = stp->si_generation; + ), + TP_printk("client %08x:%08x stateid %08x:%08x", + __entry->cl_boot, + __entry->cl_id, + __entry->si_id, + __entry->si_generation) +) + +#define DEFINE_STATEID_EVENT(name) \ +DEFINE_EVENT(nfsd_stateid_class, name, \ + TP_PROTO(stateid_t *stp), \ + TP_ARGS(stp)) +DEFINE_STATEID_EVENT(layoutstate_alloc); +DEFINE_STATEID_EVENT(layoutstate_unhash); +DEFINE_STATEID_EVENT(layoutstate_free); +DEFINE_STATEID_EVENT(layout_get_lookup_fail); +DEFINE_STATEID_EVENT(layout_commit_lookup_fail); +DEFINE_STATEID_EVENT(layout_return_lookup_fail); +DEFINE_STATEID_EVENT(layout_recall); +DEFINE_STATEID_EVENT(layout_recall_done); +DEFINE_STATEID_EVENT(layout_recall_fail); +DEFINE_STATEID_EVENT(layout_recall_release); + +#endif /* _NFSD_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 989129e2d6ea..368526582429 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -16,6 +16,7 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/splice.h> +#include <linux/falloc.h> #include <linux/fcntl.h> #include <linux/namei.h> #include <linux/delay.h> @@ -533,6 +534,26 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif +__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, loff_t len, + int flags) +{ + __be32 err; + int error; + + if (!S_ISREG(file_inode(file)->i_mode)) + return nfserr_inval; + + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE); + if (err) + return err; + + error = vfs_fallocate(file, flags, offset, len); + if (!error) + error = commit_metadata(fhp); + + return nfserrno(error); +} #endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 @@ -594,9 +615,9 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor export = fhp->fh_export; dentry = fhp->fh_dentry; - if (S_ISREG(dentry->d_inode->i_mode)) + if (d_is_reg(dentry)) map = nfs3_regaccess; - else if (S_ISDIR(dentry->d_inode->i_mode)) + else if (d_is_dir(dentry)) map = nfs3_diraccess; else map = nfs3_anyaccess; @@ -881,7 +902,7 @@ static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - if (file->f_op->splice_read && rqstp->rq_splice_ok) + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) return nfsd_splice_read(rqstp, file, offset, count); else return nfsd_readv(file, offset, vec, vlen, count); @@ -930,7 +951,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, unsigned long *cnt, int *stablep) { struct svc_export *exp; - struct dentry *dentry; struct inode *inode; mm_segment_t oldfs; __be32 err = 0; @@ -938,9 +958,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; loff_t pos = offset; + loff_t end = LLONG_MAX; unsigned int pflags = current->flags; - if (rqstp->rq_local) + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* * We want less throttling in balance_dirty_pages() * and shrink_inactive_list() so that nfs to @@ -949,8 +970,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, */ current->flags |= PF_LESS_THROTTLE; - dentry = file->f_path.dentry; - inode = dentry->d_inode; + inode = file_inode(file); exp = fhp->fh_export; use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); @@ -969,10 +989,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, fsnotify_modify(file); if (stable) { - if (use_wgather) + if (use_wgather) { host_err = wait_for_concurrent_writes(file); - else - host_err = vfs_fsync_range(file, offset, offset+*cnt, 0); + } else { + if (*cnt) + end = offset + *cnt - 1; + host_err = vfs_fsync_range(file, offset, end, 0); + } } out_nfserr: @@ -981,7 +1004,7 @@ out_nfserr: err = 0; else err = nfserrno(host_err); - if (rqstp->rq_local) + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); return err; } @@ -1379,7 +1402,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, switch (createmode) { case NFS3_CREATE_UNCHECKED: - if (! S_ISREG(dchild->d_inode->i_mode)) + if (! d_is_reg(dchild)) goto out; else if (truncp) { /* in nfsv4, we need to treat this case a little @@ -1592,7 +1615,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (err) goto out; err = nfserr_isdir; - if (S_ISDIR(tfhp->fh_dentry->d_inode->i_mode)) + if (d_is_dir(tfhp->fh_dentry)) goto out; err = nfserr_perm; if (!len) @@ -1819,10 +1842,12 @@ struct readdir_data { int full; }; -static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, - loff_t offset, u64 ino, unsigned int d_type) +static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) { - struct readdir_data *buf = __buf; + struct readdir_data *buf = + container_of(ctx, struct readdir_data, ctx); struct buffered_dirent *de = (void *)(buf->dirent + buf->used); unsigned int reclen; @@ -1842,7 +1867,7 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, return 0; } -static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, +static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, struct readdir_cd *cdp, loff_t *offsetp) { struct buffered_dirent *de; @@ -1926,7 +1951,7 @@ static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, */ __be32 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, - struct readdir_cd *cdp, filldir_t func) + struct readdir_cd *cdp, nfsd_filldir_t func) { __be32 err; struct file *file; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index c2ff3f14e5f6..2050cb016998 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -36,7 +36,7 @@ /* * Callback function for readdir */ -typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int); +typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); /* nfsd/vfs.c */ int nfsd_racache_init(int); @@ -54,6 +54,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); +__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, + struct file *, loff_t, loff_t, int); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, @@ -95,7 +97,7 @@ __be32 nfsd_rename(struct svc_rqst *, __be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, char *name, int len); __be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *, - loff_t *, struct readdir_cd *, filldir_t); + loff_t *, struct readdir_cd *, nfsd_filldir_t); __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, struct kstatfs *, int access); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 5720e9457f33..0bda93e58e1b 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -428,6 +428,68 @@ struct nfsd4_reclaim_complete { u32 rca_one_fs; }; +struct nfsd4_deviceid { + u64 fsid_idx; + u32 generation; + u32 pad; +}; + +struct nfsd4_layout_seg { + u32 iomode; + u64 offset; + u64 length; +}; + +struct nfsd4_getdeviceinfo { + struct nfsd4_deviceid gd_devid; /* request */ + u32 gd_layout_type; /* request */ + u32 gd_maxcount; /* request */ + u32 gd_notify_types;/* request - response */ + void *gd_device; /* response */ +}; + +struct nfsd4_layoutget { + u64 lg_minlength; /* request */ + u32 lg_signal; /* request */ + u32 lg_layout_type; /* request */ + u32 lg_maxcount; /* request */ + stateid_t lg_sid; /* request/response */ + struct nfsd4_layout_seg lg_seg; /* request/response */ + void *lg_content; /* response */ +}; + +struct nfsd4_layoutcommit { + stateid_t lc_sid; /* request */ + struct nfsd4_layout_seg lc_seg; /* request */ + u32 lc_reclaim; /* request */ + u32 lc_newoffset; /* request */ + u64 lc_last_wr; /* request */ + struct timespec lc_mtime; /* request */ + u32 lc_layout_type; /* request */ + u32 lc_up_len; /* layout length */ + void *lc_up_layout; /* decoded by callback */ + u32 lc_size_chg; /* boolean for response */ + u64 lc_newsize; /* response */ +}; + +struct nfsd4_layoutreturn { + u32 lr_return_type; /* request */ + u32 lr_layout_type; /* request */ + struct nfsd4_layout_seg lr_seg; /* request */ + u32 lr_reclaim; /* request */ + u32 lrf_body_len; /* request */ + void *lrf_body; /* request */ + stateid_t lr_sid; /* request/response */ + u32 lrs_present; /* response */ +}; + +struct nfsd4_fallocate { + /* request */ + stateid_t falloc_stateid; + loff_t falloc_offset; + u64 falloc_length; +}; + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -484,8 +546,14 @@ struct nfsd4_op { struct nfsd4_reclaim_complete reclaim_complete; struct nfsd4_test_stateid test_stateid; struct nfsd4_free_stateid free_stateid; + struct nfsd4_getdeviceinfo getdeviceinfo; + struct nfsd4_layoutget layoutget; + struct nfsd4_layoutcommit layoutcommit; + struct nfsd4_layoutreturn layoutreturn; /* NFSv4.2 */ + struct nfsd4_fallocate allocate; + struct nfsd4_fallocate deallocate; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index c5c55dfb91a9..c47f6fdb111a 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -21,3 +21,10 @@ #define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) +#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + 3 + \ + enc_nfs4_fh_sz + 4) +#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + \ + op_dec_sz) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b2e3ff347620..ecdbae19a766 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -31,6 +31,8 @@ #include "alloc.h" #include "dat.h" +static void __nilfs_btree_init(struct nilfs_bmap *bmap); + static struct nilfs_btree_path *nilfs_btree_alloc_path(void) { struct nilfs_btree_path *path; @@ -368,6 +370,34 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, return ret; } +/** + * nilfs_btree_root_broken - verify consistency of btree root node + * @node: btree root node to be examined + * @ino: inode number + * + * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. + */ +static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, + unsigned long ino) +{ + int level, flags, nchildren; + int ret = 0; + + level = nilfs_btree_node_get_level(node); + flags = nilfs_btree_node_get_flags(node); + nchildren = nilfs_btree_node_get_nchildren(node); + + if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || + level > NILFS_BTREE_LEVEL_MAX || + nchildren < 0 || + nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { + pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", + ino, level, flags, nchildren); + ret = 1; + } + return ret; +} + int nilfs_btree_broken_node_block(struct buffer_head *bh) { int ret; @@ -1713,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, /* convert and insert */ dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL; - nilfs_btree_init(btree); + __nilfs_btree_init(btree); if (nreq != NULL) { nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); nilfs_bmap_commit_alloc_ptr(btree, nreq, dat); @@ -2294,12 +2324,23 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { .bop_gather_data = NULL, }; -int nilfs_btree_init(struct nilfs_bmap *bmap) +static void __nilfs_btree_init(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops; bmap->b_nchildren_per_block = NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); - return 0; +} + +int nilfs_btree_init(struct nilfs_bmap *bmap) +{ + int ret = 0; + + __nilfs_btree_init(bmap); + + if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), + bmap->b_inode->i_ino)) + ret = -EIO; + return ret; } void nilfs_btree_init_gc(struct nilfs_bmap *bmap) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index e9e3325f29f3..a8c728acb7a8 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -39,21 +39,15 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ struct the_nilfs *nilfs; struct inode *inode = file->f_mapping->host; - int err; - - err = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (err) - return err; - mutex_lock(&inode->i_mutex); + int err = 0; if (nilfs_inode_dirty(inode)) { if (datasync) err = nilfs_construct_dsync_segment(inode->i_sb, inode, - 0, LLONG_MAX); + start, end); else err = nilfs_construct_segment(inode->i_sb); } - mutex_unlock(&inode->i_mutex); nilfs = inode->i_sb->s_fs_info; if (!err) @@ -134,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 57ceaf33d177..748ca238915a 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); inode->i_mapping->a_ops = &empty_aops; - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index e1fa69b341b9..8b5969538f39 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -49,6 +49,8 @@ struct nilfs_iget_args { int for_gc; }; +static int nilfs_iget_test(struct inode *inode, void *opaque); + void nilfs_inode_add_blocks(struct inode *inode, int n) { struct nilfs_root *root = NILFS_I(inode)->i_root; @@ -348,6 +350,17 @@ const struct address_space_operations nilfs_aops = { .is_partially_uptodate = block_is_partially_uptodate, }; +static int nilfs_insert_inode_locked(struct inode *inode, + struct nilfs_root *root, + unsigned long ino) +{ + struct nilfs_iget_args args = { + .ino = ino, .root = root, .cno = 0, .for_gc = 0 + }; + + return insert_inode_locked4(inode, ino, nilfs_iget_test, &args); +} + struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; @@ -383,7 +396,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { err = nilfs_bmap_read(ii->i_bmap, NULL); if (err < 0) - goto failed_bmap; + goto failed_after_creation; set_bit(NILFS_I_BMAP, &ii->i_state); /* No lock is needed; iget() ensures it. */ @@ -399,21 +412,24 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) spin_lock(&nilfs->ns_next_gen_lock); inode->i_generation = nilfs->ns_next_generation++; spin_unlock(&nilfs->ns_next_gen_lock); - insert_inode_hash(inode); + if (nilfs_insert_inode_locked(inode, root, ino) < 0) { + err = -EIO; + goto failed_after_creation; + } err = nilfs_init_acl(inode, dir); if (unlikely(err)) - goto failed_acl; /* never occur. When supporting + goto failed_after_creation; /* never occur. When supporting nilfs_init_acl(), proper cancellation of above jobs should be considered */ return inode; - failed_acl: - failed_bmap: + failed_after_creation: clear_nlink(inode); + unlock_new_inode(inode); iput(inode); /* raw_inode will be deleted through - generic_delete_inode() */ + nilfs_evict_inode() */ goto failed; failed_ifile_create_inode: @@ -461,8 +477,8 @@ int nilfs_read_inode_common(struct inode *inode, inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - if (inode->i_nlink == 0 && inode->i_mode == 0) - return -EINVAL; /* this inode is deleted */ + if (inode->i_nlink == 0) + return -ESTALE; /* this inode is deleted */ inode->i_blocks = le64_to_cpu(raw_inode->i_blocks); ii->i_flags = le32_to_cpu(raw_inode->i_flags); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index c4dcd1db57ee..892cf5ffdb8e 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, gfp_mask); - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; inode->i_op = &def_mdt_iops; inode->i_fop = &def_mdt_fops; @@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); - struct backing_dev_info *bdi = inode->i_sb->s_bdi; INIT_LIST_HEAD(&shadow->frozen_buffers); address_space_init_once(&shadow->frozen_data); - nilfs_mapping_init(&shadow->frozen_data, inode, bdi); + nilfs_mapping_init(&shadow->frozen_data, inode); address_space_init_once(&shadow->frozen_btnodes); - nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); + nilfs_mapping_init(&shadow->frozen_btnodes, inode); mi->mi_shadow = shadow; return 0; } diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 9de78f08989e..0f84b257932c 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -51,9 +51,11 @@ static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode) int err = nilfs_add_link(dentry, inode); if (!err) { d_instantiate(dentry, inode); + unlock_new_inode(inode); return 0; } inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -182,6 +184,7 @@ out: out_fail: drop_nlink(inode); nilfs_mark_inode_dirty(inode); + unlock_new_inode(inode); iput(inode); goto out; } @@ -201,11 +204,15 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, inode_inc_link_count(inode); ihold(inode); - err = nilfs_add_nondir(dentry, inode); - if (!err) + err = nilfs_add_link(dentry, inode); + if (!err) { + d_instantiate(dentry, inode); err = nilfs_transaction_commit(dir->i_sb); - else + } else { + inode_dec_link_count(inode); + iput(inode); nilfs_transaction_abort(dir->i_sb); + } return err; } @@ -243,6 +250,7 @@ static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) nilfs_mark_inode_dirty(inode); d_instantiate(dentry, inode); + unlock_new_inode(inode); out: if (!err) err = nilfs_transaction_commit(dir->i_sb); @@ -255,6 +263,7 @@ out_fail: drop_nlink(inode); drop_nlink(inode); nilfs_mark_inode_dirty(inode); + unlock_new_inode(inode); iput(inode); out_dir: drop_nlink(dir); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 91093cd74f0d..385704027575 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -141,7 +141,6 @@ enum { * @ti_save: Backup of journal_info field of task_struct * @ti_flags: Flags * @ti_count: Nest level - * @ti_garbage: List of inode to be put when releasing semaphore */ struct nilfs_transaction_info { u32 ti_magic; @@ -150,7 +149,6 @@ struct nilfs_transaction_info { one of other filesystems has a bug. */ unsigned short ti_flags; unsigned short ti_count; - struct list_head ti_garbage; }; /* ti_magic */ diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index da276640f776..700ecbcca55d 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, - struct backing_dev_info *bdi) +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode) { mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = bdi; mapping->a_ops = &empty_aops; } diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index ef30c5c2426f..a43b8287d012 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_page(struct page *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, - struct backing_dev_info *bdi); +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 7ef18fc656c2..0c3f303baf32 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struct super_block *sb, ti->ti_count = 0; ti->ti_save = cur_ti; ti->ti_magic = NILFS_TI_MAGIC; - INIT_LIST_HEAD(&ti->ti_garbage); current->journal_info = ti; for (;;) { @@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(struct super_block *sb) up_write(&nilfs->ns_segctor_sem); current->journal_info = ti->ti_save; - if (!list_empty(&ti->ti_garbage)) - nilfs_dispose_list(nilfs, &ti->ti_garbage, 0); } static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, @@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs, } } +static void nilfs_iput_work_func(struct work_struct *work) +{ + struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info, + sc_iput_work); + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + + nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0); +} + static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, struct nilfs_root *root) { @@ -1900,8 +1906,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { - struct nilfs_transaction_info *ti = current->journal_info; struct nilfs_inode_info *ii, *n; + int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE); + int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { @@ -1912,9 +1919,24 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, clear_bit(NILFS_I_BUSY, &ii->i_state); brelse(ii->i_bh); ii->i_bh = NULL; - list_move_tail(&ii->i_dirty, &ti->ti_garbage); + list_del_init(&ii->i_dirty); + if (!ii->vfs_inode.i_nlink || during_mount) { + /* + * Defer calling iput() to avoid deadlocks if + * i_nlink == 0 or mount is not yet finished. + */ + list_add_tail(&ii->i_dirty, &sci->sc_iput_queue); + defer_iput = true; + } else { + spin_unlock(&nilfs->ns_inode_lock); + iput(&ii->vfs_inode); + spin_lock(&nilfs->ns_inode_lock); + } } spin_unlock(&nilfs->ns_inode_lock); + + if (defer_iput) + schedule_work(&sci->sc_iput_work); } /* @@ -2583,6 +2605,8 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, INIT_LIST_HEAD(&sci->sc_segbufs); INIT_LIST_HEAD(&sci->sc_write_logs); INIT_LIST_HEAD(&sci->sc_gc_inodes); + INIT_LIST_HEAD(&sci->sc_iput_queue); + INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); init_timer(&sci->sc_timer); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; @@ -2609,6 +2633,8 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) ret = nilfs_segctor_construct(sci, SC_LSEG_SR); nilfs_transaction_unlock(sci->sc_super); + flush_work(&sci->sc_iput_work); + } while (ret && retrycount-- > 0); } @@ -2633,6 +2659,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) || sci->sc_seq_request != sci->sc_seq_done); spin_unlock(&sci->sc_state_lock); + if (flush_work(&sci->sc_iput_work)) + flag = true; + if (flag || !nilfs_segctor_confirm(sci)) nilfs_segctor_write_out(sci); @@ -2642,6 +2671,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); } + if (!list_empty(&sci->sc_iput_queue)) { + nilfs_warning(sci->sc_super, __func__, + "iput queue is not empty\n"); + nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1); + } + WARN_ON(!list_empty(&sci->sc_segbufs)); WARN_ON(!list_empty(&sci->sc_write_logs)); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 38a1d0013314..a48d6de1e02c 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -26,6 +26,7 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/buffer_head.h> +#include <linux/workqueue.h> #include <linux/nilfs2_fs.h> #include "nilfs.h" @@ -92,6 +93,8 @@ struct nilfs_segsum_pointer { * @sc_nblk_inc: Block count of current generation * @sc_dirty_files: List of files to be written * @sc_gc_inodes: List of GC inodes having blocks to be written + * @sc_iput_queue: list of inodes for which iput should be done + * @sc_iput_work: work struct to defer iput call * @sc_freesegs: array of segment numbers to be freed * @sc_nfreesegs: number of segments on @sc_freesegs * @sc_dsync_inode: inode whose data pages are written for a sync operation @@ -135,6 +138,8 @@ struct nilfs_sc_info { struct list_head sc_dirty_files; struct list_head sc_gc_inodes; + struct list_head sc_iput_queue; + struct work_struct sc_iput_work; __u64 *sc_freesegs; size_t sc_nfreesegs; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 2e5b3ec85b8f..5bc2a1cf73c3 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_state = 0; ii->i_cno = 0; ii->vfs_inode.i_version = 1; - nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); + nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } @@ -1057,7 +1057,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) { struct the_nilfs *nilfs; struct nilfs_root *fsroot; - struct backing_dev_info *bdi; __u64 cno; int err; @@ -1077,8 +1076,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; - sb->s_bdi = bdi ? : &default_backing_dev_info; + sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 9da25fe9ea61..69bd801afb53 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -808,8 +808,7 @@ void nilfs_put_root(struct nilfs_root *root) spin_lock(&nilfs->ns_cptree_lock); rb_erase(&root->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); - if (root->ifile) - iput(root->ifile); + iput(root->ifile); kfree(root); } diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 22c629eedd82..2a24249b30af 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,5 +1,6 @@ config FSNOTIFY def_bool n + select SRCU source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index caaaf9dfe353..44523f4a6084 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -69,8 +69,8 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) if (old_mask == new_mask) return; - if (fsn_mark->i.inode) - fsnotify_recalc_inode_mask(fsn_mark->i.inode); + if (fsn_mark->inode) + fsnotify_recalc_inode_mask(fsn_mark->inode); } /* diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 30d3addfad75..d2f97ecca6a5 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -115,8 +115,8 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, return false; /* sorry, fanotify only gives a damn about files and dirs */ - if (!S_ISREG(path->dentry->d_inode->i_mode) && - !S_ISDIR(path->dentry->d_inode->i_mode)) + if (!d_is_reg(path->dentry) && + !d_can_lookup(path->dentry)) return false; if (inode_mark && vfsmnt_mark) { @@ -139,11 +139,12 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, BUG(); } - if (S_ISDIR(path->dentry->d_inode->i_mode) && - (marks_ignored_mask & FS_ISDIR)) + if (d_is_dir(path->dentry) && + !(marks_mask & FS_ISDIR & ~marks_ignored_mask)) return false; - if (event_mask & marks_mask & ~marks_ignored_mask) + if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask & + ~marks_ignored_mask) return true; return false; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c991616acca9..cf275500a665 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -259,16 +259,15 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, struct fsnotify_event *kevent; char __user *start; int ret; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); start = buf; group = file->private_data; pr_debug("%s: group=%p\n", __func__, group); + add_wait_queue(&group->notification_waitq, &wait); while (1) { - prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); - mutex_lock(&group->notification_mutex); kevent = get_one_event(group, count); mutex_unlock(&group->notification_mutex); @@ -289,7 +288,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, if (start != buf) break; - schedule(); + + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); continue; } @@ -318,8 +318,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, buf += ret; count -= ret; } + remove_wait_queue(&group->notification_waitq, &wait); - finish_wait(&group->notification_waitq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; @@ -487,20 +487,27 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, unsigned int flags, int *destroy) { - __u32 oldmask; + __u32 oldmask = 0; spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { + __u32 tmask = fsn_mark->mask & ~mask; + + if (flags & FAN_MARK_ONDIR) + tmask &= ~FAN_ONDIR; + oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask)); + fsnotify_set_mark_mask_locked(fsn_mark, tmask); } else { - oldmask = fsn_mark->ignored_mask; - fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask)); + __u32 tmask = fsn_mark->ignored_mask & ~mask; + if (flags & FAN_MARK_ONDIR) + tmask &= ~FAN_ONDIR; + + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); } + *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); spin_unlock(&fsn_mark->lock); - *destroy = !(oldmask & ~mask); - return mask & oldmask; } @@ -569,20 +576,22 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { + __u32 tmask = fsn_mark->mask | mask; + + if (flags & FAN_MARK_ONDIR) + tmask |= FAN_ONDIR; + oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); + fsnotify_set_mark_mask_locked(fsn_mark, tmask); } else { __u32 tmask = fsn_mark->ignored_mask | mask; + if (flags & FAN_MARK_ONDIR) + tmask |= FAN_ONDIR; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } - - if (!(flags & FAN_MARK_ONDIR)) { - __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR; - fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); - } - spin_unlock(&fsn_mark->lock); return mask & ~oldmask; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 9d7e2b9659cb..58b7cdb63da9 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -20,25 +20,24 @@ #if defined(CONFIG_INOTIFY_USER) || defined(CONFIG_FANOTIFY) -static int show_fdinfo(struct seq_file *m, struct file *f, - int (*show)(struct seq_file *m, struct fsnotify_mark *mark)) +static void show_fdinfo(struct seq_file *m, struct file *f, + void (*show)(struct seq_file *m, + struct fsnotify_mark *mark)) { struct fsnotify_group *group = f->private_data; struct fsnotify_mark *mark; - int ret = 0; mutex_lock(&group->mark_mutex); list_for_each_entry(mark, &group->marks_list, g_list) { - ret = show(m, mark); - if (ret) + show(m, mark); + if (seq_has_overflowed(m)) break; } mutex_unlock(&group->mark_mutex); - return ret; } #if defined(CONFIG_EXPORTFS) -static int show_mark_fhandle(struct seq_file *m, struct inode *inode) +static void show_mark_fhandle(struct seq_file *m, struct inode *inode) { struct { struct file_handle handle; @@ -52,98 +51,85 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); if ((ret == FILEID_INVALID) || (ret < 0)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); - return 0; + return; } f.handle.handle_type = ret; f.handle.handle_bytes = size * sizeof(u32); - ret = seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:", - f.handle.handle_bytes, f.handle.handle_type); + seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:", + f.handle.handle_bytes, f.handle.handle_type); for (i = 0; i < f.handle.handle_bytes; i++) - ret |= seq_printf(m, "%02x", (int)f.handle.f_handle[i]); - - return ret; + seq_printf(m, "%02x", (int)f.handle.f_handle[i]); } #else -static int show_mark_fhandle(struct seq_file *m, struct inode *inode) +static void show_mark_fhandle(struct seq_file *m, struct inode *inode) { - return 0; } #endif #ifdef CONFIG_INOTIFY_USER -static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) +static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) { struct inotify_inode_mark *inode_mark; struct inode *inode; - int ret = 0; if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) - return 0; + return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); - inode = igrab(mark->i.inode); + inode = igrab(mark->inode); if (inode) { - ret = seq_printf(m, "inotify wd:%x ino:%lx sdev:%x " - "mask:%x ignored_mask:%x ", - inode_mark->wd, inode->i_ino, - inode->i_sb->s_dev, - mark->mask, mark->ignored_mask); - ret |= show_mark_fhandle(m, inode); - ret |= seq_putc(m, '\n'); + seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ", + inode_mark->wd, inode->i_ino, inode->i_sb->s_dev, + mark->mask, mark->ignored_mask); + show_mark_fhandle(m, inode); + seq_putc(m, '\n'); iput(inode); } - - return ret; } -int inotify_show_fdinfo(struct seq_file *m, struct file *f) +void inotify_show_fdinfo(struct seq_file *m, struct file *f) { - return show_fdinfo(m, f, inotify_fdinfo); + show_fdinfo(m, f, inotify_fdinfo); } #endif /* CONFIG_INOTIFY_USER */ #ifdef CONFIG_FANOTIFY -static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) +static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) { unsigned int mflags = 0; struct inode *inode; - int ret = 0; if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) - return 0; + return; if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = igrab(mark->i.inode); + inode = igrab(mark->inode); if (!inode) - goto out; - ret = seq_printf(m, "fanotify ino:%lx sdev:%x " - "mflags:%x mask:%x ignored_mask:%x ", - inode->i_ino, inode->i_sb->s_dev, - mflags, mark->mask, mark->ignored_mask); - ret |= show_mark_fhandle(m, inode); - ret |= seq_putc(m, '\n'); + return; + seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", + inode->i_ino, inode->i_sb->s_dev, + mflags, mark->mask, mark->ignored_mask); + show_mark_fhandle(m, inode); + seq_putc(m, '\n'); iput(inode); } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { - struct mount *mnt = real_mount(mark->m.mnt); + struct mount *mnt = real_mount(mark->mnt); - ret = seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x " - "ignored_mask:%x\n", mnt->mnt_id, mflags, - mark->mask, mark->ignored_mask); + seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", + mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); } -out: - return ret; } -int fanotify_show_fdinfo(struct seq_file *m, struct file *f) +void fanotify_show_fdinfo(struct seq_file *m, struct file *f) { struct fsnotify_group *group = f->private_data; unsigned int flags = 0; @@ -169,7 +155,7 @@ int fanotify_show_fdinfo(struct seq_file *m, struct file *f) seq_printf(m, "fanotify flags:%x event-flags:%x\n", flags, group->fanotify_data.f_flags); - return show_fdinfo(m, f, fanotify_fdinfo); + show_fdinfo(m, f, fanotify_fdinfo); } #endif /* CONFIG_FANOTIFY */ diff --git a/fs/notify/fdinfo.h b/fs/notify/fdinfo.h index 556afda990e9..9664c4904d6b 100644 --- a/fs/notify/fdinfo.h +++ b/fs/notify/fdinfo.h @@ -10,11 +10,11 @@ struct file; #ifdef CONFIG_PROC_FS #ifdef CONFIG_INOTIFY_USER -extern int inotify_show_fdinfo(struct seq_file *m, struct file *f); +void inotify_show_fdinfo(struct seq_file *m, struct file *f); #endif #ifdef CONFIG_FANOTIFY -extern int fanotify_show_fdinfo(struct seq_file *m, struct file *f); +void fanotify_show_fdinfo(struct seq_file *m, struct file *f); #endif #else /* CONFIG_PROC_FS */ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 89326acd4561..dd3fb0b17be7 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -63,14 +63,14 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ - hlist_for_each_entry(alias, &inode->i_dentry, d_alias) { + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); - list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { + list_for_each_entry(child, &alias->d_subdirs, d_child) { if (!child->d_inode) continue; @@ -242,13 +242,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_node) { inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), - struct fsnotify_mark, i.i_list); + struct fsnotify_mark, obj_list); inode_group = inode_mark->group; } if (vfsmount_node) { vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), - struct fsnotify_mark, m.m_list); + struct fsnotify_mark, obj_list); vfsmount_group = vfsmount_mark->group; } diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 3b68b0ae0a97..13a00be516d2 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -12,12 +12,19 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; +/* Calculate mask of events for a list of marks */ +extern u32 fsnotify_recalc_mask(struct hlist_head *head); + /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, __u32 mask); +/* Add mark to a proper place in mark list */ +extern int fsnotify_add_mark_list(struct hlist_head *head, + struct fsnotify_mark *mark, + int allow_dups); /* add a mark to an inode */ extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, @@ -31,6 +38,11 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); /* inode specific destruction of a mark */ extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); +/* Destroy all marks in the given list */ +extern void fsnotify_destroy_marks(struct list_head *to_free); +/* Find mark belonging to given group in the list of marks */ +extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, + struct fsnotify_group *group); /* run the list of all marks associated with inode and flag them to be freed */ extern void fsnotify_clear_marks_by_inode(struct inode *inode); /* run the list of all marks associated with vfsmount and flag them to be freed */ diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index dfbf5447eea4..3daf513ee99e 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -31,28 +31,13 @@ #include "../internal.h" /* - * Recalculate the mask of events relevant to a given inode locked. - */ -static void fsnotify_recalc_inode_mask_locked(struct inode *inode) -{ - struct fsnotify_mark *mark; - __u32 new_mask = 0; - - assert_spin_locked(&inode->i_lock); - - hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) - new_mask |= mark->mask; - inode->i_fsnotify_mask = new_mask; -} - -/* * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types * any notifier is interested in hearing for this inode. */ void fsnotify_recalc_inode_mask(struct inode *inode) { spin_lock(&inode->i_lock); - fsnotify_recalc_inode_mask_locked(inode); + inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); __fsnotify_update_child_dentry_flags(inode); @@ -60,23 +45,22 @@ void fsnotify_recalc_inode_mask(struct inode *inode) void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) { - struct inode *inode = mark->i.inode; + struct inode *inode = mark->inode; BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); assert_spin_locked(&mark->lock); spin_lock(&inode->i_lock); - hlist_del_init_rcu(&mark->i.i_list); - mark->i.inode = NULL; + hlist_del_init_rcu(&mark->obj_list); + mark->inode = NULL; /* * this mark is now off the inode->i_fsnotify_marks list and we * hold the inode->i_lock, so this is the perfect time to update the * inode->i_fsnotify_mask */ - fsnotify_recalc_inode_mask_locked(inode); - + inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); } @@ -85,30 +69,19 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) */ void fsnotify_clear_marks_by_inode(struct inode *inode) { - struct fsnotify_mark *mark, *lmark; + struct fsnotify_mark *mark; struct hlist_node *n; LIST_HEAD(free_list); spin_lock(&inode->i_lock); - hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, i.i_list) { - list_add(&mark->i.free_i_list, &free_list); - hlist_del_init_rcu(&mark->i.i_list); + hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) { + list_add(&mark->free_list, &free_list); + hlist_del_init_rcu(&mark->obj_list); fsnotify_get_mark(mark); } spin_unlock(&inode->i_lock); - list_for_each_entry_safe(mark, lmark, &free_list, i.free_i_list) { - struct fsnotify_group *group; - - spin_lock(&mark->lock); - fsnotify_get_group(mark->group); - group = mark->group; - spin_unlock(&mark->lock); - - fsnotify_destroy_mark(mark, group); - fsnotify_put_mark(mark); - fsnotify_put_group(group); - } + fsnotify_destroy_marks(&free_list); } /* @@ -123,34 +96,13 @@ void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) * given a group and inode, find the mark associated with that combination. * if found take a reference to that mark and return it, else return NULL */ -static struct fsnotify_mark *fsnotify_find_inode_mark_locked( - struct fsnotify_group *group, - struct inode *inode) -{ - struct fsnotify_mark *mark; - - assert_spin_locked(&inode->i_lock); - - hlist_for_each_entry(mark, &inode->i_fsnotify_marks, i.i_list) { - if (mark->group == group) { - fsnotify_get_mark(mark); - return mark; - } - } - return NULL; -} - -/* - * given a group and inode, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode) { struct fsnotify_mark *mark; spin_lock(&inode->i_lock); - mark = fsnotify_find_inode_mark_locked(group, inode); + mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); spin_unlock(&inode->i_lock); return mark; @@ -168,10 +120,10 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, assert_spin_locked(&mark->lock); if (mask && - mark->i.inode && + mark->inode && !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; - inode = igrab(mark->i.inode); + inode = igrab(mark->inode); /* * we shouldn't be able to get here if the inode wasn't * already safely held in memory. But bug in case it @@ -192,9 +144,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, int allow_dups) { - struct fsnotify_mark *lmark, *last = NULL; - int ret = 0; - int cmp; + int ret; mark->flags |= FSNOTIFY_MARK_FLAG_INODE; @@ -202,37 +152,10 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, assert_spin_locked(&mark->lock); spin_lock(&inode->i_lock); - - mark->i.inode = inode; - - /* is mark the first mark? */ - if (hlist_empty(&inode->i_fsnotify_marks)) { - hlist_add_head_rcu(&mark->i.i_list, &inode->i_fsnotify_marks); - goto out; - } - - /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, &inode->i_fsnotify_marks, i.i_list) { - last = lmark; - - if ((lmark->group == group) && !allow_dups) { - ret = -EEXIST; - goto out; - } - - cmp = fsnotify_compare_groups(lmark->group, mark->group); - if (cmp < 0) - continue; - - hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); - goto out; - } - - BUG_ON(last == NULL); - /* mark should be the last entry. last is the current last entry */ - hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list); -out: - fsnotify_recalc_inode_mask_locked(inode); + mark->inode = inode; + ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, + allow_dups); + inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); spin_unlock(&inode->i_lock); return ret; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 7d888d77d59a..2cd900c2c737 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -156,7 +156,7 @@ static int idr_callback(int id, void *p, void *data) */ if (fsn_mark) printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", - fsn_mark->group, fsn_mark->i.inode, i_mark->wd); + fsn_mark->group, fsn_mark->inode, i_mark->wd); return 0; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index daf76652fe58..450648697433 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -227,14 +227,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf, struct fsnotify_event *kevent; char __user *start; int ret; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); start = buf; group = file->private_data; + add_wait_queue(&group->notification_waitq, &wait); while (1) { - prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); - mutex_lock(&group->notification_mutex); kevent = get_one_event(group, count); mutex_unlock(&group->notification_mutex); @@ -264,10 +263,10 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (start != buf) break; - schedule(); + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } + remove_wait_queue(&group->notification_waitq, &wait); - finish_wait(&group->notification_waitq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; @@ -434,7 +433,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, if (wd == -1) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + i_mark->fsn_mark.group, i_mark->fsn_mark.inode); goto out; } @@ -443,7 +442,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, if (unlikely(!found_i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + i_mark->fsn_mark.group, i_mark->fsn_mark.inode); goto out; } @@ -457,9 +456,9 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " "found_i_mark->group=%p found_i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, - i_mark->fsn_mark.i.inode, found_i_mark, found_i_mark->wd, + i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd, found_i_mark->fsn_mark.group, - found_i_mark->fsn_mark.i.inode); + found_i_mark->fsn_mark.inode); goto out; } @@ -471,7 +470,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.i.inode); + i_mark->fsn_mark.group, i_mark->fsn_mark.inode); /* we can't really recover with bad ref cnting.. */ BUG(); } diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 34c38fabf514..92e48c70f0f0 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -110,6 +110,17 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) } } +/* Calculate mask of events for a list of marks */ +u32 fsnotify_recalc_mask(struct hlist_head *head) +{ + u32 new_mask = 0; + struct fsnotify_mark *mark; + + hlist_for_each_entry(mark, head, obj_list) + new_mask |= mark->mask; + return new_mask; +} + /* * Any time a mark is getting freed we end up here. * The caller had better be holding a reference to this mark so we don't actually @@ -133,7 +144,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = mark->i.inode; + inode = mark->inode; fsnotify_destroy_inode_mark(mark); } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) fsnotify_destroy_vfsmount_mark(mark); @@ -150,7 +161,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); spin_lock(&destroy_lock); - list_add(&mark->destroy_list, &destroy_list); + list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); wake_up(&destroy_waitq); /* @@ -192,6 +203,27 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); } +/* + * Destroy all marks in the given list. The marks must be already detached from + * the original inode / vfsmount. + */ +void fsnotify_destroy_marks(struct list_head *to_free) +{ + struct fsnotify_mark *mark, *lmark; + struct fsnotify_group *group; + + list_for_each_entry_safe(mark, lmark, to_free, free_list) { + spin_lock(&mark->lock); + fsnotify_get_group(mark->group); + group = mark->group; + spin_unlock(&mark->lock); + + fsnotify_destroy_mark(mark, group); + fsnotify_put_mark(mark); + fsnotify_put_group(group); + } +} + void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) { assert_spin_locked(&mark->lock); @@ -245,6 +277,39 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) return -1; } +/* Add mark into proper place in given list of marks */ +int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, + int allow_dups) +{ + struct fsnotify_mark *lmark, *last = NULL; + int cmp; + + /* is mark the first mark? */ + if (hlist_empty(head)) { + hlist_add_head_rcu(&mark->obj_list, head); + return 0; + } + + /* should mark be in the middle of the current list? */ + hlist_for_each_entry(lmark, head, obj_list) { + last = lmark; + + if ((lmark->group == mark->group) && !allow_dups) + return -EEXIST; + + cmp = fsnotify_compare_groups(lmark->group, mark->group); + if (cmp >= 0) { + hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); + return 0; + } + } + + BUG_ON(last == NULL); + /* mark should be the last entry. last is the current last entry */ + hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); + return 0; +} + /* * Attach an initialized mark to a given group and fs object. * These marks may be used for the fsnotify backend to determine which @@ -305,7 +370,7 @@ err: spin_unlock(&mark->lock); spin_lock(&destroy_lock); - list_add(&mark->destroy_list, &destroy_list); + list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); wake_up(&destroy_waitq); @@ -323,6 +388,24 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, } /* + * Given a list of marks, find the mark associated with given group. If found + * take a reference to that mark and return it, else return NULL. + */ +struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, + struct fsnotify_group *group) +{ + struct fsnotify_mark *mark; + + hlist_for_each_entry(mark, head, obj_list) { + if (mark->group == group) { + fsnotify_get_mark(mark); + return mark; + } + } + return NULL; +} + +/* * clear any marks in a group in which mark->flags & flags is true */ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, @@ -352,8 +435,8 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group) void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) { assert_spin_locked(&old->lock); - new->i.inode = old->i.inode; - new->m.mnt = old->m.mnt; + new->inode = old->inode; + new->mnt = old->mnt; if (old->group) fsnotify_get_group(old->group); new->group = old->group; @@ -386,8 +469,8 @@ static int fsnotify_mark_destroy(void *ignored) synchronize_srcu(&fsnotify_mark_srcu); - list_for_each_entry_safe(mark, next, &private_destroy_list, destroy_list) { - list_del_init(&mark->destroy_list); + list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { + list_del_init(&mark->g_list); fsnotify_put_mark(mark); } diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index faefa72a11eb..326b148e623c 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -32,31 +32,20 @@ void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { - struct fsnotify_mark *mark, *lmark; + struct fsnotify_mark *mark; struct hlist_node *n; struct mount *m = real_mount(mnt); LIST_HEAD(free_list); spin_lock(&mnt->mnt_root->d_lock); - hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, m.m_list) { - list_add(&mark->m.free_m_list, &free_list); - hlist_del_init_rcu(&mark->m.m_list); + hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) { + list_add(&mark->free_list, &free_list); + hlist_del_init_rcu(&mark->obj_list); fsnotify_get_mark(mark); } spin_unlock(&mnt->mnt_root->d_lock); - list_for_each_entry_safe(mark, lmark, &free_list, m.free_m_list) { - struct fsnotify_group *group; - - spin_lock(&mark->lock); - fsnotify_get_group(mark->group); - group = mark->group; - spin_unlock(&mark->lock); - - fsnotify_destroy_mark(mark, group); - fsnotify_put_mark(mark); - fsnotify_put_group(group); - } + fsnotify_destroy_marks(&free_list); } void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) @@ -65,66 +54,35 @@ void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) } /* - * Recalculate the mask of events relevant to a given vfsmount locked. - */ -static void fsnotify_recalc_vfsmount_mask_locked(struct vfsmount *mnt) -{ - struct mount *m = real_mount(mnt); - struct fsnotify_mark *mark; - __u32 new_mask = 0; - - assert_spin_locked(&mnt->mnt_root->d_lock); - - hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) - new_mask |= mark->mask; - m->mnt_fsnotify_mask = new_mask; -} - -/* * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types * any notifier is interested in hearing for this mount point */ void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt) { + struct mount *m = real_mount(mnt); + spin_lock(&mnt->mnt_root->d_lock); - fsnotify_recalc_vfsmount_mask_locked(mnt); + m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); } void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark) { - struct vfsmount *mnt = mark->m.mnt; + struct vfsmount *mnt = mark->mnt; + struct mount *m = real_mount(mnt); BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); assert_spin_locked(&mark->lock); spin_lock(&mnt->mnt_root->d_lock); - hlist_del_init_rcu(&mark->m.m_list); - mark->m.mnt = NULL; - - fsnotify_recalc_vfsmount_mask_locked(mnt); + hlist_del_init_rcu(&mark->obj_list); + mark->mnt = NULL; + m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); } -static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_group *group, - struct vfsmount *mnt) -{ - struct mount *m = real_mount(mnt); - struct fsnotify_mark *mark; - - assert_spin_locked(&mnt->mnt_root->d_lock); - - hlist_for_each_entry(mark, &m->mnt_fsnotify_marks, m.m_list) { - if (mark->group == group) { - fsnotify_get_mark(mark); - return mark; - } - } - return NULL; -} - /* * given a group and vfsmount, find the mark associated with that combination. * if found take a reference to that mark and return it, else return NULL @@ -132,10 +90,11 @@ static struct fsnotify_mark *fsnotify_find_vfsmount_mark_locked(struct fsnotify_ struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt) { + struct mount *m = real_mount(mnt); struct fsnotify_mark *mark; spin_lock(&mnt->mnt_root->d_lock); - mark = fsnotify_find_vfsmount_mark_locked(group, mnt); + mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group); spin_unlock(&mnt->mnt_root->d_lock); return mark; @@ -151,9 +110,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, int allow_dups) { struct mount *m = real_mount(mnt); - struct fsnotify_mark *lmark, *last = NULL; - int ret = 0; - int cmp; + int ret; mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; @@ -161,37 +118,9 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, assert_spin_locked(&mark->lock); spin_lock(&mnt->mnt_root->d_lock); - - mark->m.mnt = mnt; - - /* is mark the first mark? */ - if (hlist_empty(&m->mnt_fsnotify_marks)) { - hlist_add_head_rcu(&mark->m.m_list, &m->mnt_fsnotify_marks); - goto out; - } - - /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, &m->mnt_fsnotify_marks, m.m_list) { - last = lmark; - - if ((lmark->group == group) && !allow_dups) { - ret = -EEXIST; - goto out; - } - - cmp = fsnotify_compare_groups(lmark->group, mark->group); - if (cmp < 0) - continue; - - hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); - goto out; - } - - BUG_ON(last == NULL); - /* mark should be the last entry. last is the current last entry */ - hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list); -out: - fsnotify_recalc_vfsmount_mask_locked(mnt); + mark->mnt = mnt; + ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); + m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); spin_unlock(&mnt->mnt_root->d_lock); return ret; diff --git a/fs/nsfs.c b/fs/nsfs.c new file mode 100644 index 000000000000..af1b24fa899d --- /dev/null +++ b/fs/nsfs.c @@ -0,0 +1,161 @@ +#include <linux/mount.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/proc_ns.h> +#include <linux/magic.h> +#include <linux/ktime.h> + +static struct vfsmount *nsfs_mnt; + +static const struct file_operations ns_file_operations = { + .llseek = no_llseek, +}; + +static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) +{ + struct inode *inode = dentry->d_inode; + const struct proc_ns_operations *ns_ops = dentry->d_fsdata; + + return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", + ns_ops->name, inode->i_ino); +} + +static void ns_prune_dentry(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + if (inode) { + struct ns_common *ns = inode->i_private; + atomic_long_set(&ns->stashed, 0); + } +} + +const struct dentry_operations ns_dentry_operations = +{ + .d_prune = ns_prune_dentry, + .d_delete = always_delete_dentry, + .d_dname = ns_dname, +}; + +static void nsfs_evict(struct inode *inode) +{ + struct ns_common *ns = inode->i_private; + clear_inode(inode); + ns->ops->put(ns); +} + +void *ns_get_path(struct path *path, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct vfsmount *mnt = mntget(nsfs_mnt); + struct qstr qname = { .name = "", }; + struct dentry *dentry; + struct inode *inode; + struct ns_common *ns; + unsigned long d; + +again: + ns = ns_ops->get(task); + if (!ns) { + mntput(mnt); + return ERR_PTR(-ENOENT); + } + rcu_read_lock(); + d = atomic_long_read(&ns->stashed); + if (!d) + goto slow; + dentry = (struct dentry *)d; + if (!lockref_get_not_dead(&dentry->d_lockref)) + goto slow; + rcu_read_unlock(); + ns_ops->put(ns); +got_it: + path->mnt = mnt; + path->dentry = dentry; + return NULL; +slow: + rcu_read_unlock(); + inode = new_inode_pseudo(mnt->mnt_sb); + if (!inode) { + ns_ops->put(ns); + mntput(mnt); + return ERR_PTR(-ENOMEM); + } + inode->i_ino = ns->inum; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_flags |= S_IMMUTABLE; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_fop = &ns_file_operations; + inode->i_private = ns; + + dentry = d_alloc_pseudo(mnt->mnt_sb, &qname); + if (!dentry) { + iput(inode); + mntput(mnt); + return ERR_PTR(-ENOMEM); + } + d_instantiate(dentry, inode); + dentry->d_fsdata = (void *)ns_ops; + d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); + if (d) { + d_delete(dentry); /* make sure ->d_prune() does nothing */ + dput(dentry); + cpu_relax(); + goto again; + } + goto got_it; +} + +int ns_get_name(char *buf, size_t size, struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct ns_common *ns; + int res = -ENOENT; + ns = ns_ops->get(task); + if (ns) { + res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + ns_ops->put(ns); + } + return res; +} + +struct file *proc_ns_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &ns_file_operations) + goto out_invalid; + + return file; + +out_invalid: + fput(file); + return ERR_PTR(-EINVAL); +} + +static const struct super_operations nsfs_ops = { + .statfs = simple_statfs, + .evict_inode = nsfs_evict, +}; +static struct dentry *nsfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_pseudo(fs_type, "nsfs:", &nsfs_ops, + &ns_dentry_operations, NSFS_MAGIC); +} +static struct file_system_type nsfs = { + .name = "nsfs", + .mount = nsfs_mount, + .kill_sb = kill_anon_super, +}; + +void __init nsfs_init(void) +{ + nsfs_mnt = kern_mount(&nsfs); + if (IS_ERR(nsfs_mnt)) + panic("can't set nsfs up\n"); + nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; +} diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 643faa44f22b..1da9b2d184dc 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -19,6 +19,7 @@ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/gfp.h> #include <linux/pagemap.h> @@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, count = iov_length(iov, nr_segs); pos = *ppos; /* We can write back this queue in page reclaim. */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); written = 0; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 436f36037e09..b3973c2fd190 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -111,8 +111,8 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, unsigned long dent_ino; int uname_len; - ntfs_debug("Looking up %s in directory inode 0x%lx.", - dent->d_name.name, dir_ino->i_ino); + ntfs_debug("Looking up %pd in directory inode 0x%lx.", + dent, dir_ino->i_ino); /* Convert the name of the dentry to Unicode. */ uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len, &uname); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 7e8282dcea2a..c58a1bcfda0f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -245,16 +245,14 @@ int ocfs2_set_acl(handle_t *handle, ret = posix_acl_equiv_mode(acl, &mode); if (ret < 0) return ret; - else { - if (ret == 0) - acl = NULL; - ret = ocfs2_acl_set_mode(inode, di_bh, - handle, mode); - if (ret) - return ret; + if (ret == 0) + acl = NULL; - } + ret = ocfs2_acl_set_mode(inode, di_bh, + handle, mode); + if (ret) + return ret; } break; case ACL_TYPE_DEFAULT: diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a93bf9892256..044158bd22be 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5662,7 +5662,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_extent_tree *et, u32 cpos, u32 phys_cpos, u32 len, int flags, struct ocfs2_cached_dealloc_ctxt *dealloc, - u64 refcount_loc) + u64 refcount_loc, bool refcount_tree_locked) { int ret, credits = 0, extra_blocks = 0; u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); @@ -5676,11 +5676,13 @@ int ocfs2_remove_btree_range(struct inode *inode, BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); - ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, - &ref_tree, NULL); - if (ret) { - mlog_errno(ret); - goto bail; + if (!refcount_tree_locked) { + ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (ret) { + mlog_errno(ret); + goto bail; + } } ret = ocfs2_prepare_refcount_change_for_del(inode, @@ -6871,7 +6873,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_unlock; + goto out; } ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, @@ -6929,7 +6931,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, if (ret) { mlog_errno(ret); need_free = 1; - goto out_commit; + goto out_unlock; } page_end = PAGE_CACHE_SIZE; @@ -6962,12 +6964,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, if (ret) { mlog_errno(ret); need_free = 1; - goto out_commit; + goto out_unlock; } inode->i_blocks = ocfs2_inode_sector_count(inode); } +out_unlock: + if (pages) + ocfs2_unlock_and_free_pages(pages, num_pages); + out_commit: if (ret < 0 && did_quota) dquot_free_space_nodirty(inode, @@ -6987,15 +6993,11 @@ out_commit: ocfs2_commit_trans(osb, handle); -out_unlock: +out: if (data_ac) ocfs2_free_alloc_context(data_ac); - -out: - if (pages) { - ocfs2_unlock_and_free_pages(pages, num_pages); + if (pages) kfree(pages); - } return ret; } @@ -7021,6 +7023,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, u64 refcount_loc = le64_to_cpu(di->i_refcount_loc); struct ocfs2_extent_tree et; struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_refcount_tree *ref_tree = NULL; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); ocfs2_init_dealloc_ctxt(&dealloc); @@ -7130,9 +7133,18 @@ start: phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, blkno); + if ((flags & OCFS2_EXT_REFCOUNTED) && trunc_len && !ref_tree) { + status = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, + &ref_tree, NULL); + if (status) { + mlog_errno(status); + goto bail; + } + } + status = ocfs2_remove_btree_range(inode, &et, trunc_cpos, phys_cpos, trunc_len, flags, &dealloc, - refcount_loc); + refcount_loc, true); if (status < 0) { mlog_errno(status); goto bail; @@ -7147,6 +7159,8 @@ start: goto start; bail: + if (ref_tree) + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); ocfs2_schedule_truncate_log_flush(osb, 1); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index ca381c584127..fb09b97db162 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -142,7 +142,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_extent_tree *et, u32 cpos, u32 phys_cpos, u32 len, int flags, struct ocfs2_cached_dealloc_ctxt *dealloc, - u64 refcount_loc); + u64 refcount_loc, bool refcount_tree_locked); int ocfs2_num_free_extents(struct ocfs2_super *osb, struct ocfs2_extent_tree *et); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1ef547e49373..44db1808cdb5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -28,6 +28,7 @@ #include <linux/pipe_fs_i.h> #include <linux/mpage.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> #include <cluster/masklog.h> @@ -47,6 +48,9 @@ #include "ocfs2_trace.h" #include "buffer_head_io.h" +#include "dir.h" +#include "namei.h" +#include "sysfile.h" static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -506,18 +510,21 @@ bail: * * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); - * - * Note that we never bother to allocate blocks here, and thus ignore the - * create argument. */ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; + u32 cpos = 0; + int alloc_locked = 0; u64 p_blkno, inode_blocks, contig_blocks; unsigned int ext_flags; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + unsigned long len = bh_result->b_size; + unsigned int clusters_to_alloc = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); /* This function won't even be called if the request isn't all * nicely aligned and of the right size, so there's no need @@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, /* We should already CoW the refcounted extent in case of create. */ BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); + /* allocate blocks if no p_blkno is found, and create == 1 */ + if (!p_blkno && create) { + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + alloc_locked = 1; + + /* fill hole, allocate blocks can't be larger than the size + * of the hole */ + clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); + if (clusters_to_alloc > contig_blocks) + clusters_to_alloc = contig_blocks; + + /* allocate extent and insert them into the extent tree */ + ret = ocfs2_extend_allocation(inode, cpos, + clusters_to_alloc, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); + if (ret < 0) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + } + /* * get_more_blocks() expects us to describe a hole by clearing * the mapped bit on bh_result(). @@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, contig_blocks = max_blocks; bh_result->b_size = contig_blocks << blocksize_bits; bail: + if (alloc_locked) + ocfs2_inode_unlock(inode, 1); return ret; } @@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static int ocfs2_is_overwrite(struct ocfs2_super *osb, + struct inode *inode, loff_t offset) +{ + int ret = 0; + u32 v_cpos = 0; + u32 p_cpos = 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + return 1; + + return 0; +} + +static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset) +{ + ssize_t ret = 0; + ssize_t written = 0; + bool orphaned = false; + int is_overwrite = 0; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + size_t count = iter->count; + journal_t *journal = osb->journal->j_journal; + u32 zero_len; + int cluster_align; + loff_t final_size = offset + count; + int append_write = offset >= i_size_read(inode) ? 1 : 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + { + u64 o = offset; + + zero_len = do_div(o, 1 << osb->s_clustersize_bits); + cluster_align = !zero_len; + } + + /* + * when final_size > inode->i_size, inode->i_size will be + * updated after direct write, so add the inode to orphan + * dir first. + */ + if (final_size > i_size_read(inode)) { + ret = ocfs2_add_inode_to_orphan(osb, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + orphaned = true; + } + + if (append_write) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, offset); + else + ret = ocfs2_extend_no_holes(inode, di_bh, offset, + offset); + if (ret < 0) { + mlog_errno(ret); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + + is_overwrite = ocfs2_is_overwrite(osb, inode, offset); + if (is_overwrite < 0) { + mlog_errno(is_overwrite); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; + } + + written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, + iter, offset, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); + if (unlikely(written < 0)) { + loff_t i_size = i_size_read(inode); + + if (offset + count > i_size) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (i_size == i_size_read(inode)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + ret = jbd2_journal_force_commit(journal); + if (ret < 0) + mlog_errno(ret); + } + } else if (written < 0 && append_write && !is_overwrite && + !cluster_align) { + u32 p_cpos = 0; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); + + ret = blkdev_issue_zeroout(osb->sb->s_bdev, + p_cpos << (osb->s_clustersize_bits - 9), + zero_len >> 9, GFP_KERNEL, false); + if (ret < 0) + mlog_errno(ret); + } + +clean_orphan: + if (orphaned) { + int tmp_ret; + int update_isize = written > 0 ? 1 : 0; + loff_t end = update_isize ? offset + written : 0; + + tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, + update_isize, end); + if (tmp_ret < 0) { + ret = tmp_ret; + goto out; + } + + tmp_ret = jbd2_journal_force_commit(journal); + if (tmp_ret < 0) { + ret = tmp_ret; + mlog_errno(tmp_ret); + } + } + +out: + if (ret >= 0) + ret = written; + return ret; +} + static ssize_t ocfs2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, @@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw, { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * Fallback to buffered I/O if we see an inode without @@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; - /* Fallback to buffered I/O if we are appending. */ - if (i_size_read(inode) <= offset) + /* Fallback to buffered I/O if we are appending and + * concurrent O_DIRECT writes are allowed. + */ + if (i_size_read(inode) <= offset && !full_coherency) return 0; - return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + if (rw == READ) + return __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); + else + return ocfs2_direct_IO_write(iocb, iter, offset); } static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, @@ -894,7 +1124,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) } } -static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc) { int i; @@ -915,7 +1145,11 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) page_cache_release(wc->w_target_page); } ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); +} +static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +{ + ocfs2_unlock_pages(wc); brelse(wc->w_di_bh); kfree(wc); } @@ -1251,7 +1485,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, NULL); if (ret < 0) { - ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " + mlog(ML_ERROR, "Get physical blkno failed for inode %llu, " "at logical block %llu", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)v_blkno); @@ -2042,11 +2276,19 @@ out_write_size: ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); + /* unlock pages before dealloc since it needs acquiring j_trans_barrier + * lock, or it will cause a deadlock since journal commit threads holds + * this lock and will ask for the page lock when flushing the data. + * put it here to preserve the unlock order. + */ + ocfs2_unlock_pages(wc); + ocfs2_commit_trans(osb, handle); ocfs2_run_deallocs(osb, &wc->w_dealloc); - ocfs2_free_write_ctxt(wc); + brelse(wc->w_di_bh); + kfree(wc); return copied; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index eb9d48746ab4..16eff45727ee 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1127,10 +1127,10 @@ static int o2hb_thread(void *data) elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); mlog(ML_HEARTBEAT, - "start = %lu.%lu, end = %lu.%lu, msec = %u\n", + "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", before_hb.tv_sec, (unsigned long) before_hb.tv_usec, after_hb.tv_sec, (unsigned long) after_hb.tv_usec, - elapsed_msec); + elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a96044004064..56c403a563bc 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1016,7 +1016,8 @@ void o2net_fill_node_map(unsigned long *map, unsigned bytes) memset(map, 0, bytes); for (node = 0; node < O2NM_MAX_NODES; ++node) { - o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); + if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) + continue; if (!ret) { set_bit(node, map); sc_put(sc); @@ -1736,7 +1737,7 @@ static void o2net_connect_expired(struct work_struct *work) o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); - o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + o2net_set_nn_state(nn, NULL, 0, 0); } spin_unlock(&nn->nn_lock); } diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index dc024367110a..b95e7df5b76a 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -107,12 +107,12 @@ struct o2net_node { struct list_head nn_status_list; /* connects are attempted from when heartbeat comes up until either hb - * goes down, the node is unconfigured, no connect attempts succeed - * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work - * is queued from set_nn_state both from hb up and from itself if a - * connect attempt fails and so can be self-arming. shutdown is - * careful to first mark the nn such that no connects will be attempted - * before canceling delayed connect work and flushing the queue. */ + * goes down, the node is unconfigured, or a connect succeeds. + * connect_work is queued from set_nn_state both from hb up and from + * itself if a connect attempt fails and so can be self-arming. + * shutdown is careful to first mark the nn such that no connects will + * be attempted before canceling delayed connect work and flushing the + * queue. */ struct delayed_work nn_connect_work; unsigned long nn_last_connect_attempt; diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index e2e05a106beb..4fda7a5f3088 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -172,7 +172,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { trace_ocfs2_find_local_alias(dentry->d_name.len, @@ -251,8 +251,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, if (dl) { mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, - " \"%.*s\": old parent: %llu, new: %llu\n", - dentry->d_name.len, dentry->d_name.name, + " \"%pd\": old parent: %llu, new: %llu\n", + dentry, (unsigned long long)parent_blkno, (unsigned long long)dl->dl_parent_blkno); return 0; @@ -277,8 +277,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog_bug_on_msg(dl->dl_parent_blkno != parent_blkno, - " \"%.*s\": old parent: %llu, new: %llu\n", - dentry->d_name.len, dentry->d_name.name, + " \"%pd\": old parent: %llu, new: %llu\n", + dentry, (unsigned long long)parent_blkno, (unsigned long long)dl->dl_parent_blkno); @@ -406,17 +406,15 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) if (inode) ino = (unsigned long long)OCFS2_I(inode)->ip_blkno; mlog(ML_ERROR, "Dentry is missing cluster lock. " - "inode: %llu, d_flags: 0x%x, d_name: %.*s\n", - ino, dentry->d_flags, dentry->d_name.len, - dentry->d_name.name); + "inode: %llu, d_flags: 0x%x, d_name: %pd\n", + ino, dentry->d_flags, dentry); } goto out; } - mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n", - dentry->d_name.len, dentry->d_name.name, - dl->dl_count); + mlog_bug_on_msg(dl->dl_count == 0, "dentry: %pd, count: %u\n", + dentry, dl->dl_count); ocfs2_dentry_lock_put(OCFS2_SB(dentry->d_sb), dl); diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 0717662b4aef..b08050bd3f2e 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -744,7 +744,7 @@ restart: if (ocfs2_read_dir_block(dir, block, &bh, 0)) { /* read error, skip block & hope for the best. * ocfs2_read_dir_block() has released the bh. */ - ocfs2_error(dir->i_sb, "reading directory %llu, " + mlog(ML_ERROR, "reading directory %llu, " "offset %lu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, block); @@ -2073,10 +2073,12 @@ struct ocfs2_empty_dir_priv { unsigned seen_other; unsigned dx_dir; }; -static int ocfs2_empty_dir_filldir(void *priv, const char *name, int name_len, - loff_t pos, u64 ino, unsigned type) +static int ocfs2_empty_dir_filldir(struct dir_context *ctx, const char *name, + int name_len, loff_t pos, u64 ino, + unsigned type) { - struct ocfs2_empty_dir_priv *p = priv; + struct ocfs2_empty_dir_priv *p = + container_of(ctx, struct ocfs2_empty_dir_priv, ctx); /* * Check the positions of "." and ".." records to be sure @@ -3454,10 +3456,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, int blocksize = dir->i_sb->s_blocksize; status = ocfs2_read_dir_block(dir, 0, &bh, 0); - if (status) { - mlog_errno(status); + if (status) goto bail; - } rec_len = OCFS2_DIR_REC_LEN(namelen); offset = 0; @@ -3478,10 +3478,9 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = ocfs2_read_dir_block(dir, offset >> sb->s_blocksize_bits, &bh, 0); - if (status) { - mlog_errno(status); + if (status) goto bail; - } + /* move to next block */ de = (struct ocfs2_dir_entry *) bh->b_data; } @@ -3511,7 +3510,6 @@ next: de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); } - status = 0; bail: brelse(bh); if (status) @@ -4477,7 +4475,7 @@ int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh) p_cpos = ocfs2_blocks_to_clusters(dir->i_sb, blkno); ret = ocfs2_remove_btree_range(dir, &et, cpos, p_cpos, clen, 0, - &dealloc, 0); + &dealloc, 0, false); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index b46278f9ae44..fd6bbbbd7d78 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -385,8 +385,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, head = &res->granted; list_for_each_entry(lock, head, list) { - if (lock->ml.cookie == cookie) + /* if lock is found but unlock is pending ignore the bast */ + if (lock->ml.cookie == cookie) { + if (lock->unlock_pending) + break; goto do_ast; + } } mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 149eb556b8c6..825136070d2c 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -406,7 +406,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len) } spin_unlock(&dlm->spinlock); - out += snprintf(buf + out, len - out, "Total on list: %ld\n", total); + out += snprintf(buf + out, len - out, "Total on list: %lu\n", total); return out; } @@ -464,7 +464,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) spin_unlock(&dlm->master_lock); out += snprintf(buf + out, len - out, - "Total: %ld, Longest: %ld\n", total, longest); + "Total: %lu, Longest: %lu\n", total, longest); return out; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 02d315fef432..7df88a6dd626 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -674,20 +674,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm) spin_unlock(&dlm->spinlock); } -int dlm_joined(struct dlm_ctxt *dlm) -{ - int ret = 0; - - spin_lock(&dlm_domain_lock); - - if (dlm->dlm_state == DLM_CTXT_JOINED) - ret = 1; - - spin_unlock(&dlm_domain_lock); - - return ret; -} - int dlm_shutting_down(struct dlm_ctxt *dlm) { int ret = 0; @@ -877,7 +863,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, * to be put in someone's domain map. * Also, explicitly disallow joining at certain troublesome * times (ie. during recovery). */ - if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { + if (dlm->dlm_state != DLM_CTXT_LEAVING) { int bit = query->node_idx; spin_lock(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h index 2f7f60bfeb3b..fd6122a38dbd 100644 --- a/fs/ocfs2/dlm/dlmdomain.h +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -28,7 +28,6 @@ extern spinlock_t dlm_domain_lock; extern struct list_head dlm_domains; -int dlm_joined(struct dlm_ctxt *dlm); int dlm_shutting_down(struct dlm_ctxt *dlm); void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 215e41abf101..a6944b25fd5b 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -695,14 +695,6 @@ void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, res->inflight_assert_workers); } -static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res) -{ - spin_lock(&res->spinlock); - __dlm_lockres_grab_inflight_worker(dlm, res); - spin_unlock(&res->spinlock); -} - static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { @@ -1460,6 +1452,18 @@ way_up_top: /* take care of the easy cases up front */ spin_lock(&res->spinlock); + + /* + * Right after dlm spinlock was released, dlm_thread could have + * purged the lockres. Check if lockres got unhashed. If so + * start over. + */ + if (hlist_unhashed(&res->hash_node)) { + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + goto way_up_top; + } + if (res->state & (DLM_LOCK_RES_RECOVERING| DLM_LOCK_RES_MIGRATING)) { spin_unlock(&res->spinlock); @@ -1634,6 +1638,7 @@ send_response: } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); + spin_lock(&res->spinlock); ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, DLM_ASSERT_MASTER_MLE_CLEANUP); if (ret < 0) { @@ -1641,7 +1646,8 @@ send_response: response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); } else - dlm_lockres_grab_inflight_worker(dlm, res); + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); } else { if (res) dlm_lockres_put(res); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 3365839d2971..ce12e0b1a31f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1070,6 +1070,9 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, dead_node, dlm->name); list_del_init(&lock->list); dlm_lock_put(lock); + /* Can't schedule DLM_UNLOCK_FREE_LOCK + * - do manually */ + dlm_lock_put(lock); break; } } @@ -1656,14 +1659,18 @@ int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, req.namelen = res->lockname.len; memcpy(req.name, res->lockname.name, res->lockname.len); +resend: ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, &req, sizeof(req), nodenum, &status); - /* XXX: negative status not handled properly here. */ if (ret < 0) mlog(ML_ERROR, "Error %d when sending message %u (key " "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG, dlm->key, nodenum); - else { + else if (status == -ENOMEM) { + mlog_errno(status); + msleep(50); + goto resend; + } else { BUG_ON(status < 0); BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN); *real_master = (u8) (status & 0xff); @@ -1705,9 +1712,13 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, int ret = dlm_dispatch_assert_master(dlm, res, 0, 0, flags); if (ret < 0) { - mlog_errno(-ENOMEM); - /* retry!? */ - BUG(); + mlog_errno(ret); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + spin_unlock(&dlm->spinlock); + dlm_put(dlm); + /* sender will take care of this and retry */ + return ret; } else __dlm_lockres_grab_inflight_worker(dlm, res); spin_unlock(&res->spinlock); @@ -2015,11 +2026,8 @@ leave: dlm_lockres_drop_inflight_ref(dlm, res); spin_unlock(&res->spinlock); - if (ret < 0) { + if (ret < 0) mlog_errno(ret); - if (newlock) - dlm_lock_put(newlock); - } return ret; } @@ -2341,6 +2349,10 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dead_node, dlm->name); list_del_init(&lock->list); dlm_lock_put(lock); + /* Can't schedule + * DLM_UNLOCK_FREE_LOCK + * - do manually */ + dlm_lock_put(lock); break; } } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 09b7d9dac71d..061ba6a91bf2 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -390,12 +390,6 @@ clear_fields: ip->ip_conn = NULL; } -static struct backing_dev_info dlmfs_backing_dev_info = { - .name = "ocfs2-dlmfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - static struct inode *dlmfs_get_root_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -404,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(inode, NULL, mode); - inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inc_nlink(inode); @@ -428,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_ino = get_next_ino(); inode_init_owner(inode, parent, mode); - inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ip = DLMFS_I(inode); @@ -565,8 +557,8 @@ static int dlmfs_unlink(struct inode *dir, * to acquire a lock, this basically destroys our lockres. */ status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); if (status < 0) { - mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", - dentry->d_name.len, dentry->d_name.name, status); + mlog(ML_ERROR, "unlink %pd, error %d from destroy\n", + dentry, status); goto bail; } status = simple_unlink(dir, dentry); @@ -643,10 +635,6 @@ static int __init init_dlmfs_fs(void) int status; int cleanup_inode = 0, cleanup_worker = 0; - status = bdi_init(&dlmfs_backing_dev_info); - if (status) - return status; - dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -673,7 +661,6 @@ bail: kmem_cache_destroy(dlmfs_inode_cache); if (cleanup_worker) destroy_workqueue(user_dlm_worker); - bdi_destroy(&dlmfs_backing_dev_info); } else printk("OCFS2 User DLM kernel interface loaded\n"); return status; @@ -693,7 +680,6 @@ static void __exit exit_dlmfs_fs(void) rcu_barrier(); kmem_cache_destroy(dlmfs_inode_cache); - bdi_destroy(&dlmfs_backing_dev_info); } MODULE_AUTHOR("Oracle"); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 21262f2b1654..11849a44dc5a 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -861,8 +861,13 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo * We set the OCFS2_LOCK_UPCONVERT_FINISHING flag before clearing * the OCFS2_LOCK_BUSY flag to prevent the dc thread from * downconverting the lock before the upconvert has fully completed. + * Do not prevent the dc thread from downconverting if NONBLOCK lock + * had already returned. */ - lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + if (!(lockres->l_flags & OCFS2_LOCK_NONBLOCK_FINISHED)) + lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + else + lockres_clear_flags(lockres, OCFS2_LOCK_NONBLOCK_FINISHED); lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); } @@ -1324,13 +1329,12 @@ static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, /* returns 0 if the mw that was removed was already satisfied, -EBUSY * if the mask still hadn't reached its goal */ -static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, +static int __lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, struct ocfs2_mask_waiter *mw) { - unsigned long flags; int ret = 0; - spin_lock_irqsave(&lockres->l_lock, flags); + assert_spin_locked(&lockres->l_lock); if (!list_empty(&mw->mw_item)) { if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) ret = -EBUSY; @@ -1338,6 +1342,18 @@ static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, list_del_init(&mw->mw_item); init_completion(&mw->mw_complete); } + + return ret; +} + +static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + ret = __lockres_remove_mask_waiter(lockres, mw); spin_unlock_irqrestore(&lockres->l_lock, flags); return ret; @@ -1373,6 +1389,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unsigned long flags; unsigned int gen; int noqueue_attempted = 0; + int dlm_locked = 0; ocfs2_init_mask_waiter(&mw); @@ -1481,6 +1498,7 @@ again: ocfs2_recover_from_dlm_error(lockres, 1); goto out; } + dlm_locked = 1; mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", lockres->l_name); @@ -1514,10 +1532,17 @@ out: if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { wait = 0; - if (lockres_remove_mask_waiter(lockres, &mw)) + spin_lock_irqsave(&lockres->l_lock, flags); + if (__lockres_remove_mask_waiter(lockres, &mw)) { + if (dlm_locked) + lockres_or_flags(lockres, + OCFS2_LOCK_NONBLOCK_FINISHED); + spin_unlock_irqrestore(&lockres->l_lock, flags); ret = -EAGAIN; - else + } else { + spin_unlock_irqrestore(&lockres->l_lock, flags); goto again; + } } if (wait) { ret = ocfs2_wait_for_mask(&mw); @@ -3725,8 +3750,10 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, break; spin_unlock(&dentry_attach_lock); - mlog(0, "d_delete(%.*s);\n", dentry->d_name.len, - dentry->d_name.name); + if (S_ISDIR(dl->dl_inode->i_mode)) + shrink_dcache_parent(dentry); + + mlog(0, "d_delete(%pd);\n", dentry); /* * The following dcache calls may do an diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 324dc93ac896..ba1790e52ff2 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -295,7 +295,7 @@ out: return ret; } -static int ocfs2_set_inode_size(handle_t *handle, +int ocfs2_set_inode_size(handle_t *handle, struct inode *inode, struct buffer_head *fe_bh, u64 new_i_size) @@ -441,7 +441,7 @@ out: return status; } -static int ocfs2_truncate_file(struct inode *inode, +int ocfs2_truncate_file(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size) { @@ -569,7 +569,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, handle_t *handle = NULL; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; - enum ocfs2_alloc_restarted why; + enum ocfs2_alloc_restarted why = RESTART_NONE; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_extent_tree et; int did_quota = 0; @@ -709,6 +709,13 @@ leave: return status; } +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) +{ + return __ocfs2_extend_allocation(inode, logical_start, + clusters_to_add, mark_unwritten); +} + /* * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. @@ -1803,7 +1810,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, ret = ocfs2_remove_btree_range(inode, &et, trunc_cpos, phys_cpos, trunc_len, flags, - &dealloc, refcount_loc); + &dealloc, refcount_loc, false); if (ret < 0) { mlog_errno(ret); goto out; @@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file, struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; loff_t saved_pos = 0, end; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * We start with a read level meta lock and only jump to an ex @@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * one node could wind up truncating another * nodes writes. */ - if (end > i_size_read(inode)) { + if (end > i_size_read(inode) && !full_coherency) { + *direct_io = 0; + break; + } + + /* + * Fallback to old way if the feature bit is not set. + */ + if (end > i_size_read(inode) && + !ocfs2_supports_append_dio(osb)) { *direct_io = 0; break; } @@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file, */ ret = ocfs2_check_range_for_holes(inode, saved_pos, count); if (ret == 1) { - *direct_io = 0; + /* + * Fallback to old way if the feature bit is not set. + * Otherwise try dio first and then complete the rest + * request through buffer io. + */ + if (!ocfs2_supports_append_dio(osb)) + *direct_io = 0; ret = 0; } else if (ret < 0) mlog_errno(ret); @@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); @@ -2357,13 +2383,52 @@ relock: iov_iter_truncate(from, count); if (direct_io) { + loff_t endbyte; + ssize_t written_buffered; written = generic_file_direct_write(iocb, from, *ppos); - if (written < 0) { + if (written < 0 || written == count) { ret = written; goto out_dio; } + + /* + * for completing the rest of the request. + */ + count -= written; + written_buffered = generic_perform_write(file, from, *ppos); + /* + * If generic_file_buffered_write() returned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + ret = written_buffered; + goto out_dio; + } + + /* We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = *ppos + written_buffered - 1; + ret = filemap_write_and_wait_range(file->f_mapping, *ppos, + endbyte); + if (ret == 0) { + iocb->ki_pos = *ppos + written_buffered; + written += written_buffered; + invalidate_mapping_pages(mapping, + *ppos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } } else { - current->backing_dev_info = file->f_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); written = generic_perform_write(file, from, *ppos); if (likely(written >= 0)) iocb->ki_pos = *ppos + written; @@ -2374,26 +2439,30 @@ out_dio: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); + if (unlikely(written <= 0)) + goto no_sync; + if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || ((file->f_flags & O_DIRECT) && !direct_io)) { - ret = filemap_fdatawrite_range(file->f_mapping, *ppos, - *ppos + count - 1); + ret = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); if (ret < 0) written = ret; - if (!ret && ((old_size != i_size_read(inode)) || - (old_clusters != OCFS2_I(inode)->ip_clusters) || - has_refcount)) { + if (!ret) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; } if (!ret) - ret = filemap_fdatawait_range(file->f_mapping, *ppos, - *ppos + count - 1); + ret = filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); } +no_sync: /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 97bf761c9e7c..e8c62f22215c 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size); int ocfs2_simple_size_update(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size); +int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 437de7f768c6..3025c0da6b8a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -540,8 +540,7 @@ bail: if (status < 0) make_bad_inode(inode); - if (args && bh) - brelse(bh); + brelse(bh); return status; } @@ -649,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode, if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto bail_commit; diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index a9b76de46047..5e86b247c821 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -80,6 +80,10 @@ struct ocfs2_inode_info */ tid_t i_sync_tid; tid_t i_datasync_tid; + + wait_queue_head_t append_dio_wq; + + struct dquot *i_dquot[MAXQUOTAS]; }; /* diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4b0c68849b36..ff531928269e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -50,6 +50,8 @@ #include "sysfile.h" #include "uptodate.h" #include "quota.h" +#include "file.h" +#include "namei.h" #include "buffer_head_io.h" #include "ocfs2_trace.h" @@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, static int ocfs2_trylock_journal(struct ocfs2_super *osb, int slot_num); static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot); + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type); static int ocfs2_commit_thread(void *arg); static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec); + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type); static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) { @@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) return 0; } -void ocfs2_queue_replay_slots(struct ocfs2_super *osb) +void ocfs2_queue_replay_slots(struct ocfs2_super *osb, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_replay_map *replay_map = osb->replay_map; int i; @@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb) for (i = 0; i < replay_map->rm_slots; i++) if (replay_map->rm_replay_slots[i]) ocfs2_queue_recovery_completion(osb->journal, i, NULL, - NULL, NULL); + NULL, NULL, + orphan_reco_type); replay_map->rm_state = REPLAY_DONE; } @@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item { struct ocfs2_dinode *lri_la_dinode; struct ocfs2_dinode *lri_tl_dinode; struct ocfs2_quota_recovery *lri_qrec; + enum ocfs2_orphan_reco_type lri_orphan_reco_type; }; /* Does the second half of the recovery process. By this point, the @@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work) struct ocfs2_dinode *la_dinode, *tl_dinode; struct ocfs2_la_recovery_item *item, *n; struct ocfs2_quota_recovery *qrec; + enum ocfs2_orphan_reco_type orphan_reco_type; LIST_HEAD(tmp_la_list); trace_ocfs2_complete_recovery( @@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work) la_dinode = item->lri_la_dinode; tl_dinode = item->lri_tl_dinode; qrec = item->lri_qrec; + orphan_reco_type = item->lri_orphan_reco_type; trace_ocfs2_complete_recovery_slot(item->lri_slot, la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, @@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work) kfree(tl_dinode); } - ret = ocfs2_recover_orphans(osb, item->lri_slot); + ret = ocfs2_recover_orphans(osb, item->lri_slot, + orphan_reco_type); if (ret < 0) mlog_errno(ret); @@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec) + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_la_recovery_item *item; @@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, item->lri_slot = slot_num; item->lri_tl_dinode = tl_dinode; item->lri_qrec = qrec; + item->lri_orphan_reco_type = orphan_reco_type; spin_lock(&journal->j_lock); list_add_tail(&item->lri_list, &journal->j_la_cleanups); @@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* No need to queue up our truncate_log as regular cleanup will catch * that */ ocfs2_queue_recovery_completion(journal, osb->slot_num, - osb->local_alloc_copy, NULL, NULL); + osb->local_alloc_copy, NULL, NULL, + ORPHAN_NEED_TRUNCATE); ocfs2_schedule_truncate_log_flush(osb, 0); osb->local_alloc_copy = NULL; @@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* queue to recover orphan slots for all offline slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); ocfs2_free_replay_slots(osb); } @@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) osb->slot_num, NULL, NULL, - osb->quota_rec); + osb->quota_rec, + ORPHAN_NEED_TRUNCATE); osb->quota_rec = NULL; } } @@ -1360,7 +1374,7 @@ restart: /* queue recovery for our own slot */ ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, - NULL, NULL); + NULL, NULL, ORPHAN_NO_NEED_TRUNCATE); spin_lock(&osb->osb_lock); while (rm->rm_used) { @@ -1419,13 +1433,14 @@ skip_recovery: continue; } ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], - NULL, NULL, qrec); + NULL, NULL, qrec, + ORPHAN_NEED_TRUNCATE); } ocfs2_super_unlock(osb, 1); /* queue recovery for offline slots */ - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); bail: mutex_lock(&osb->recovery_lock); @@ -1447,7 +1462,6 @@ bail: * requires that we call do_exit(). And it isn't exported, but * complete_and_exit() seems to be a minimal wrapper around it. */ complete_and_exit(NULL, status); - return status; } void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) @@ -1712,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* This will kfree the memory pointed to by la_copy and tl_copy */ ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, - tl_copy, NULL); + tl_copy, NULL, ORPHAN_NEED_TRUNCATE); status = 0; done: @@ -1902,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) for (i = 0; i < osb->max_slots; i++) ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, - NULL); + NULL, ORPHAN_NO_NEED_TRUNCATE); /* * We queued a recovery on orphan slots, increment the sequence * number and update LVB so other node will skip the scan for a while @@ -1982,10 +1996,12 @@ struct ocfs2_orphan_filldir_priv { struct ocfs2_super *osb; }; -static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len, - loff_t pos, u64 ino, unsigned type) +static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, + int name_len, loff_t pos, u64 ino, + unsigned type) { - struct ocfs2_orphan_filldir_priv *p = priv; + struct ocfs2_orphan_filldir_priv *p = + container_of(ctx, struct ocfs2_orphan_filldir_priv, ctx); struct inode *iter; if (name_len == 1 && !strncmp(".", name, 1)) @@ -1999,6 +2015,13 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len, if (IS_ERR(iter)) return 0; + /* Skip inodes which are already added to recover list, since dio may + * happen concurrently with unlink/rename */ + if (OCFS2_I(iter)->ip_next_orphan) { + iput(iter); + return 0; + } + trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); /* No locking is required for the next_orphan queue as there * is only ever a single process doing orphan recovery. */ @@ -2107,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, * advertising our state to ocfs2_delete_inode(). */ static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot) + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type) { int ret = 0; struct inode *inode = NULL; @@ -2131,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, (unsigned long long)oi->ip_blkno); iter = oi->ip_next_orphan; + oi->ip_next_orphan = NULL; + + /* + * We need to take and drop the inode lock to + * force read inode from disk. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret) { + mlog_errno(ret); + goto next; + } + ocfs2_inode_unlock(inode, 0); + + if (inode->i_nlink == 0) { + spin_lock(&oi->ip_lock); + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { + struct buffer_head *di_bh = NULL; + + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto next; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + ocfs2_rw_unlock(inode, 1); + mlog_errno(ret); + goto next; + } + + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); + brelse(di_bh); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto next; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); + if (ret) + mlog_errno(ret); - spin_lock(&oi->ip_lock); - /* Set the proper information to get us going into - * ocfs2_delete_inode. */ - oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; - spin_unlock(&oi->ip_lock); + wake_up(&OCFS2_I(inode)->append_dio_wq); + } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ +next: iput(inode); inode = iter; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 7f8cde94abfe..f4cd3c3e9fb7 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb) * orphan dir index leaf */ #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) +/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry + + * orphan dir index root + orphan dir index leaf */ +#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4) +#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS + /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming * directory + target unlink + 3 x dir index leaves */ diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 10d66c75cecb..9581d190f6e1 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -173,7 +173,6 @@ out: static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 74caffeeee1d..56a768d06aa6 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -904,9 +904,6 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) struct buffer_head *di_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if (!inode) - return -ENOENT; - if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b931e04e3388..b5c3a5ea3ee6 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup); + struct ocfs2_dir_lookup_result *lookup, + bool dio); static int ocfs2_orphan_add(struct ocfs2_super *osb, handle_t *handle, @@ -87,15 +88,26 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode); + struct inode *orphan_dir_inode, + bool dio); static int ocfs2_create_symlink_data(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, const char *symname); +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2, + int rename); + +static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -678,8 +690,10 @@ static int ocfs2_link(struct dentry *old_dentry, { handle_t *handle; struct inode *inode = old_dentry->d_inode; + struct inode *old_dir = old_dentry->d_parent->d_inode; int err; struct buffer_head *fe_bh = NULL; + struct buffer_head *old_dir_bh = NULL; struct buffer_head *parent_fe_bh = NULL; struct ocfs2_dinode *fe = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); @@ -696,19 +710,33 @@ static int ocfs2_link(struct dentry *old_dentry, dquot_initialize(dir); - err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT); + err = ocfs2_double_lock(osb, &old_dir_bh, old_dir, + &parent_fe_bh, dir, 0); if (err < 0) { if (err != -ENOENT) mlog_errno(err); return err; } + /* make sure both dirs have bhs + * get an extra ref on old_dir_bh if old==new */ + if (!parent_fe_bh) { + if (old_dir_bh) { + parent_fe_bh = old_dir_bh; + get_bh(parent_fe_bh); + } else { + mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str); + err = -EIO; + goto out; + } + } + if (!dir->i_nlink) { err = -ENOENT; goto out; } - err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name, + err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name, old_dentry->d_name.len, &old_de_ino); if (err) { err = -ENOENT; @@ -801,10 +829,11 @@ out_unlock_inode: ocfs2_inode_unlock(inode, 1); out: - ocfs2_inode_unlock(dir, 1); + ocfs2_double_unlock(old_dir, dir); brelse(fe_bh); brelse(parent_fe_bh); + brelse(old_dir_bh); ocfs2_free_dir_lookup_result(&lookup); @@ -927,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir, if (ocfs2_inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto leave; @@ -979,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir, if (is_unlinkable) { status = ocfs2_orphan_add(osb, handle, inode, fe_bh, - orphan_name, &orphan_insert, orphan_dir); + orphan_name, &orphan_insert, orphan_dir, false); if (status < 0) mlog_errno(status); } @@ -1072,14 +1102,15 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, } /* - * The only place this should be used is rename! + * The only place this should be used is rename and link! * if they have the same id, then the 1st one is the only one locked. */ static int ocfs2_double_lock(struct ocfs2_super *osb, struct buffer_head **bh1, struct inode *inode1, struct buffer_head **bh2, - struct inode *inode2) + struct inode *inode2, + int rename) { int status; int inode1_is_ancestor, inode2_is_ancestor; @@ -1127,7 +1158,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, } /* lock id2 */ status = ocfs2_inode_lock_nested(inode2, bh2, 1, - OI_LS_RENAME1); + rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT); if (status < 0) { if (status != -ENOENT) mlog_errno(status); @@ -1136,7 +1167,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, } /* lock id1 */ - status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2); + status = ocfs2_inode_lock_nested(inode1, bh1, 1, + rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT); if (status < 0) { /* * An error return must mean that no cluster locks @@ -1252,7 +1284,7 @@ static int ocfs2_rename(struct inode *old_dir, /* if old and new are the same, this'll just do one lock. */ status = ocfs2_double_lock(osb, &old_dir_bh, old_dir, - &new_dir_bh, new_dir); + &new_dir_bh, new_dir, 1); if (status < 0) { mlog_errno(status); goto bail; @@ -1413,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(new_inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto bail; @@ -1480,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir, if (should_add_orphan) { status = ocfs2_orphan_add(osb, handle, new_inode, newfe_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto bail; @@ -2061,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, struct buffer_head *orphan_dir_bh, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { int ret; struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; + + if (dio) { + ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + ret = -EINVAL; + mlog_errno(ret); + return ret; + } - ret = ocfs2_blkno_stringify(blkno, name); + ret = ocfs2_blkno_stringify(blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + ret = ocfs2_blkno_stringify(blkno, name); if (ret < 0) { mlog_errno(ret); return ret; @@ -2074,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); + namelen, lookup); if (ret < 0) { mlog_errno(ret); return ret; @@ -2101,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; @@ -2115,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, } ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, - blkno, name, lookup); + blkno, name, lookup, dio); if (ret < 0) { mlog_errno(ret); goto out; @@ -2143,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode) + struct inode *orphan_dir_inode, + bool dio) { struct buffer_head *orphan_dir_bh = NULL; int status = 0; struct ocfs2_dinode *orphan_fe; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; trace_ocfs2_orphan_add_begin( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -2192,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, - OCFS2_ORPHAN_NAMELEN, inode, + namelen, inode, OCFS2_I(inode)->ip_blkno, orphan_dir_bh, lookup); if (status < 0) { @@ -2200,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto rollback; } - fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); - OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; + if (dio) { + /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan + * slot. + */ + fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num); + } else { + fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; - /* Record which orphan dir our inode now resides - * in. delete_inode will use this to determine which orphan - * dir to lock. */ - fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. */ + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + } ocfs2_journal_dirty(handle, fe_bh); @@ -2231,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh) + struct buffer_head *orphan_dir_bh, + bool dio) { - char name[OCFS2_ORPHAN_NAMELEN + 1]; + const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN; + char name[namelen + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (dio) { + status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + status = -EINVAL; + mlog_errno(status); + return status; + } + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); if (status < 0) { mlog_errno(status); goto leave; @@ -2246,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, trace_ocfs2_orphan_del( (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, - name, OCFS2_ORPHAN_NAMELEN); + name, namelen); /* find it's spot in the orphan directory */ - status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, + status = ocfs2_find_entry(name, namelen, orphan_dir_inode, &lookup); if (status) { mlog_errno(status); @@ -2349,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, - di_blkno, orphan_name, orphan_insert); + di_blkno, orphan_name, orphan_insert, + false); if (ret < 0) { mlog_errno(ret); goto out; @@ -2455,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, di = (struct ocfs2_dinode *)new_di_bh->b_data; status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto leave; @@ -2500,6 +2577,186 @@ leave: return status; } +static int ocfs2_dio_orphan_recovered(struct inode *inode) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + return 0; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + return ret; +} + +#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode) +{ + char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; + struct inode *orphan_dir_inode = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct buffer_head *di_bh = NULL; + int status = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = NULL; + +restart: + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + /* + * Another append dio crashed? + * If so, wait for recovery first. + */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, + ocfs2_dio_orphan_recovered(inode), + msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); + goto restart; + } + + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, + OCFS2_I(inode)->ip_blkno, + orphan_name, + &orphan_insert, + true); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_ADD_TO_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name, + &orphan_insert, orphan_dir_inode, true); + if (status) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + ocfs2_free_dir_lookup_result(&orphan_insert); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + int status = 0; + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *) di_bh->b_data; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + le16_to_cpu(di->i_dio_orphaned_slot)); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail_unlock_inode; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))); + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, + inode, orphan_dir_bh, true); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + di->i_dio_orphaned_slot = 0; + + if (update_isize) { + status = ocfs2_set_inode_size(handle, inode, di_bh, end); + if (status) + mlog_errno(status); + } else + ocfs2_journal_dirty(handle, di_bh); + +bail_commit: + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + brelse(orphan_dir_bh); + iput(orphan_dir_inode); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *inode, struct dentry *dentry) @@ -2588,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, } status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto out_commit; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e5d059d4f115..5ddecce172fa 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh); + struct buffer_head *orphan_dir_bh, + bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, struct inode **new_inode); +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode); +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index bbec539230fd..460c6c37e683 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -144,6 +144,12 @@ enum ocfs2_unlock_action { * before the upconvert * has completed */ +#define OCFS2_LOCK_NONBLOCK_FINISHED (0x00001000) /* NONBLOCK cluster + * lock has already + * returned, do not block + * dc thread from + * downconverting */ + struct ocfs2_lock_res_ops; typedef void (*ocfs2_lock_callback)(int status, unsigned long data); @@ -203,6 +209,11 @@ struct ocfs2_lock_res { #endif }; +enum ocfs2_orphan_reco_type { + ORPHAN_NO_NEED_TRUNCATE = 0, + ORPHAN_NEED_TRUNCATE, +}; + enum ocfs2_orphan_scan_state { ORPHAN_SCAN_ACTIVE, ORPHAN_SCAN_INACTIVE @@ -273,6 +284,8 @@ enum ocfs2_mount_options writes */ OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ + + OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -487,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO) + return 1; + return 0; +} + + static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) @@ -718,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, return clusters; } +static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = (unsigned int)(bytes >> cl_bits); + return clusters; +} + static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, u64 bytes) { diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 938387a10d5d..db64ce2d4667 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -102,7 +102,8 @@ | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \ | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \ | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \ - | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) + | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \ + | OCFS2_FEATURE_INCOMPAT_APPEND_DIO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) @@ -178,6 +179,11 @@ #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000 /* + * Append Direct IO support + */ +#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000 + +/* * backup superblock flag is used to indicate that this volume * has backup superblocks. */ @@ -199,6 +205,7 @@ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 + /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ @@ -229,6 +236,8 @@ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ #define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ +#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially + * for dio */ /* * Flags on ocfs2_dinode.i_dyn_features @@ -729,7 +738,9 @@ struct ocfs2_dinode { inode belongs to. Only valid if allocated from a discontiguous block group */ -/*A0*/ __le64 i_reserved2[3]; +/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */ + __le16 i_reserved1[3]; + __le64 i_reserved2[2]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 1eae330193a6..b6d51333ad02 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -48,6 +48,7 @@ struct ocfs2_quota_recovery { /* In-memory structure with quota header information */ struct ocfs2_mem_dqinfo { unsigned int dqi_type; /* Quota type this structure describes */ + unsigned int dqi_flags; /* Flags OLQF_* */ unsigned int dqi_chunks; /* Number of chunks in local quota file */ unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ unsigned int dqi_syncms; /* How often should we sync with other nodes */ diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 10b653930ee2..3d0b63d34225 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -73,12 +73,6 @@ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off) ol_dqblk_block_off(sb, c, off); } -/* Compute block number from given offset */ -static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off) -{ - return off >> sb->s_blocksize_bits; -} - static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) { return off & ((1 << sb->s_blocksize_bits) - 1); @@ -292,7 +286,7 @@ static void olq_update_info(struct buffer_head *bh, void *private) ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + OCFS2_LOCAL_INFO_OFF); spin_lock(&dq_data_lock); - ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); + ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags); ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); spin_unlock(&dq_data_lock); @@ -701,8 +695,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) /* We don't need the lock and we have to acquire quota file locks * which will later depend on this lock */ mutex_unlock(&sb_dqopt(sb)->dqio_mutex); - info->dqi_maxblimit = 0x7fffffffffffffffLL; - info->dqi_maxilimit = 0x7fffffffffffffffLL; + info->dqi_max_spc_limit = 0x7fffffffffffffffLL; + info->dqi_max_ino_limit = 0x7fffffffffffffffLL; oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); if (!oinfo) { mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" @@ -737,13 +731,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) } ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + OCFS2_LOCAL_INFO_OFF); - info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); + oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); oinfo->dqi_libh = bh; /* We crashed when using local quota file? */ - if (!(info->dqi_flags & OLQF_CLEAN)) { + if (!(oinfo->dqi_flags & OLQF_CLEAN)) { rec = OCFS2_SB(sb)->quota_rec; if (!rec) { rec = ocfs2_alloc_quota_recovery(); @@ -772,7 +766,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) } /* Now mark quota file as used */ - info->dqi_flags &= ~OLQF_CLEAN; + oinfo->dqi_flags &= ~OLQF_CLEAN; status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); if (status < 0) { mlog_errno(status); @@ -857,7 +851,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) goto out; /* Mark local file as clean */ - info->dqi_flags |= OLQF_CLEAN; + oinfo->dqi_flags |= OLQF_CLEAN; status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], oinfo->dqi_libh, olq_update_info, diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index d81f6e2a97f5..ee541f92dab4 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2428,8 +2428,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, get_bh(prev_bh); } - rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; - trace_ocfs2_calc_refcount_meta_credits_iterate( recs_add, (unsigned long long)cpos, clusters, (unsigned long long)le64_to_cpu(rec.r_cpos), diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c index 41ffd36c689c..6a348b0294ab 100644 --- a/fs/ocfs2/reservations.c +++ b/fs/ocfs2/reservations.c @@ -39,7 +39,7 @@ #define OCFS2_CHECK_RESERVATIONS #endif -DEFINE_SPINLOCK(resv_lock); +static DEFINE_SPINLOCK(resv_lock); #define OCFS2_MIN_RESV_WINDOW_BITS 8 #define OCFS2_MAX_RESV_WINDOW_BITS 1024 diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index a88b2a4fcc85..d5493e361a38 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -306,7 +306,7 @@ int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, assert_spin_locked(&osb->osb_lock); BUG_ON(slot_num < 0); - BUG_ON(slot_num > osb->max_slots); + BUG_ON(slot_num >= osb->max_slots); if (!si->si_slots[slot_num].sl_valid) return -ENOENT; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 93c85bc745e1..26675185b886 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -143,6 +143,11 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); static int ocfs2_enable_quotas(struct ocfs2_super *osb); static void ocfs2_disable_quotas(struct ocfs2_super *osb); +static struct dquot **ocfs2_get_dquots(struct inode *inode) +{ + return OCFS2_I(inode)->i_dquot; +} + static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, .alloc_inode = ocfs2_alloc_inode, @@ -155,6 +160,7 @@ static const struct super_operations ocfs2_sops = { .show_options = ocfs2_show_options, .quota_read = ocfs2_quota_read, .quota_write = ocfs2_quota_write, + .get_dquots = ocfs2_get_dquots, }; enum { @@ -185,6 +191,7 @@ enum { Opt_coherency_full, Opt_resv_level, Opt_dir_resv_level, + Opt_journal_async_commit, Opt_err, }; @@ -216,6 +223,7 @@ static const match_table_t tokens = { {Opt_coherency_full, "coherency=full"}, {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, + {Opt_journal_async_commit, "journal_async_commit"}, {Opt_err, NULL} }; @@ -563,6 +571,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) oi->i_sync_tid = 0; oi->i_datasync_tid = 0; + memset(&oi->i_dquot, 0, sizeof(oi->i_dquot)); jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; @@ -993,36 +1002,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) } } -/* Handle quota on quotactl */ -static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) -{ - unsigned int feature[OCFS2_MAXQUOTAS] = { - OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) - return -EINVAL; - - return dquot_enable(sb_dqopt(sb)->files[type], type, - format_id, DQUOT_LIMITS_ENABLED); -} - -/* Handle quota off quotactl */ -static int ocfs2_quota_off(struct super_block *sb, int type) -{ - return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); -} - -static const struct quotactl_ops ocfs2_quotactl_ops = { - .quota_on_meta = ocfs2_quota_on, - .quota_off = ocfs2_quota_off, - .quota_sync = dquot_quota_sync, - .get_info = dquot_get_dqinfo, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .set_dqblk = dquot_set_dqblk, -}; - static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) { struct dentry *root; @@ -1493,6 +1472,9 @@ static int ocfs2_parse_options(struct super_block *sb, option < OCFS2_MAX_RESV_LEVEL) mopt->dir_resv_level = option; break; + case Opt_journal_async_commit: + mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1599,6 +1581,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (osb->osb_dir_resv_level != osb->osb_resv_level) seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); + if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) + seq_printf(s, ",journal_async_commit"); + return 0; } @@ -1622,8 +1607,9 @@ static int __init ocfs2_init(void) ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); if (!ocfs2_debugfs_root) { - status = -EFAULT; + status = -ENOMEM; mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); + goto out4; } ocfs2_set_locking_protocol(); @@ -1760,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); + init_waitqueue_head(&oi->append_dio_wq); + ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), &ocfs2_inode_caching_ops); @@ -2071,8 +2059,9 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_op = &ocfs2_sops; sb->s_d_op = &ocfs2_dentry_ops; sb->s_export_op = &ocfs2_export_ops; - sb->s_qcop = &ocfs2_quotactl_ops; + sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->dq_op = &ocfs2_quota_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= MS_NOATIME; @@ -2466,6 +2455,15 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) goto finally; } + if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT) + jbd2_journal_set_features(osb->journal->j_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + else + jbd2_journal_clear_features(osb->journal->j_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); + if (dirty) { /* recover my local alloc if we didn't unmount cleanly. */ status = ocfs2_begin_local_alloc_recovery(osb, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 016f01df3825..85b190dc132f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1284,7 +1284,7 @@ int ocfs2_xattr_get_nolock(struct inode *inode, return -EOPNOTSUPP; if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) - ret = -ENODATA; + return -ENODATA; xis.inode_bh = xbs.inode_bh = di_bh; di = (struct ocfs2_dinode *)di_bh->b_data; @@ -5334,16 +5334,6 @@ out: return ret; } -static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, - struct ocfs2_xattr_bucket *bucket, - int offs) -{ - int block_off = offs >> inode->i_sb->s_blocksize_bits; - - offs = offs % inode->i_sb->s_blocksize; - return bucket_block(bucket, block_off) + offs; -} - /* * Truncate the specified xe_off entry in xattr bucket. * bucket is indicated by header_bh and len is the new length. diff --git a/fs/open.c b/fs/open.c index de92c13b58be..33f9cbf2610b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -222,7 +222,7 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) #endif /* BITS_PER_LONG == 32 */ -int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); long ret; @@ -295,9 +295,21 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) sb_start_write(inode->i_sb); ret = file->f_op->fallocate(file, mode, offset, len); + + /* + * Create inotify and fanotify events. + * + * To keep the logic simple always create events if fallocate succeeds. + * This implies that events are even created if the file size remains + * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE. + */ + if (ret == 0) + fsnotify_modify(file); + sb_end_write(inode->i_sb); return ret; } +EXPORT_SYMBOL_GPL(vfs_fallocate); SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { @@ -305,7 +317,7 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) int error = -EBADF; if (f.file) { - error = do_fallocate(f.file, mode, offset, len); + error = vfs_fallocate(f.file, mode, offset, len); fdput(f); } return error; @@ -516,7 +528,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) int err = -EBADF; if (f.file) { - audit_inode(NULL, f.file->f_path.dentry, 0); + audit_file(f.file); err = chmod_common(&f.file->f_path, mode); fdput(f); } @@ -642,7 +654,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) error = mnt_want_write_file(f.file); if (error) goto out_fput; - audit_inode(NULL, f.file->f_path.dentry, 0); + audit_file(f.file); error = chown_common(&f.file->f_path, user, group); mnt_drop_write_file(f.file); out_fput: @@ -655,11 +667,8 @@ int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || - ((!f->f_mapping->a_ops->direct_IO) && - (!f->f_mapping->a_ops->get_xip_mem))) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; - } } return 0; } @@ -959,8 +968,14 @@ struct file *file_open_name(struct filename *name, int flags, umode_t mode) */ struct file *filp_open(const char *filename, int flags, umode_t mode) { - struct filename name = {.name = filename}; - return file_open_name(&name, flags, mode); + struct filename *name = getname_kernel(filename); + struct file *file = ERR_CAST(name); + + if (!IS_ERR(name)) { + file = file_open_name(name, flags, mode); + putname(name); + } + return file; } EXPORT_SYMBOL(filp_open); diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index e60125976873..34355818a2e0 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,4 +1,4 @@ -config OVERLAYFS_FS +config OVERLAY_FS tristate "Overlay filesystem support" help An overlay filesystem combines two filesystems - an 'upper' filesystem diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile index 8f91889480d0..900daed3e91d 100644 --- a/fs/overlayfs/Makefile +++ b/fs/overlayfs/Makefile @@ -2,6 +2,6 @@ # Makefile for the overlay filesystem. # -obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o +obj-$(CONFIG_OVERLAY_FS) += overlay.o -overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o +overlay-objs := super.o inode.o dir.o readdir.o copy_up.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index ea10a8719107..24f640441bd9 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -191,7 +191,6 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) ovl_set_timestamps(upperdentry, stat); return err; - } static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, @@ -385,7 +384,7 @@ int ovl_copy_up(struct dentry *dentry) struct kstat stat; enum ovl_path_type type = ovl_path_type(dentry); - if (type != OVL_PATH_LOWER) + if (OVL_TYPE_UPPER(type)) break; next = dget(dentry); @@ -394,7 +393,7 @@ int ovl_copy_up(struct dentry *dentry) parent = dget_parent(next); type = ovl_path_type(parent); - if (type != OVL_PATH_LOWER) + if (OVL_TYPE_UPPER(type)) break; dput(next); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 15cd91ad9940..d139405d2bfa 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -19,7 +19,7 @@ void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) int err; dget(wdentry); - if (S_ISDIR(wdentry->d_inode->i_mode)) + if (d_is_dir(wdentry)) err = ovl_do_rmdir(wdir, wdentry); else err = ovl_do_unlink(wdir, wdentry); @@ -118,14 +118,14 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, static int ovl_set_opaque(struct dentry *upperdentry) { - return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0); + return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); } static void ovl_remove_opaque(struct dentry *upperdentry) { int err; - err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr); + err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE); if (err) { pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", upperdentry->d_name.name, err); @@ -152,7 +152,7 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, * correct link count. nlink=1 seems to pacify 'find' and * other utilities. */ - if (type == OVL_PATH_MERGE) + if (OVL_TYPE_MERGE(type)) stat->nlink = 1; return 0; @@ -284,8 +284,7 @@ out: return ERR_PTR(err); } -static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry, - enum ovl_path_type type) +static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) { int err; struct dentry *ret = NULL; @@ -294,8 +293,17 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry, err = ovl_check_empty_dir(dentry, &list); if (err) ret = ERR_PTR(err); - else if (type == OVL_PATH_MERGE) - ret = ovl_clear_empty(dentry, &list); + else { + /* + * If no upperdentry then skip clearing whiteouts. + * + * Can race with copy-up, since we don't hold the upperdir + * mutex. Doesn't matter, since copy-up can't create a + * non-empty directory from an empty one. + */ + if (ovl_dentry_upper(dentry)) + ret = ovl_clear_empty(dentry, &list); + } ovl_cache_free(&list); @@ -487,8 +495,7 @@ out: return err; } -static int ovl_remove_and_whiteout(struct dentry *dentry, - enum ovl_path_type type, bool is_dir) +static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) { struct dentry *workdir = ovl_workdir(dentry); struct inode *wdir = workdir->d_inode; @@ -499,8 +506,8 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, struct dentry *opaquedir = NULL; int err; - if (is_dir) { - opaquedir = ovl_check_empty_and_clear(dentry, type); + if (is_dir && OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { + opaquedir = ovl_check_empty_and_clear(dentry); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out; @@ -515,9 +522,10 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, if (IS_ERR(whiteout)) goto out_unlock; - if (type == OVL_PATH_LOWER) { + upper = ovl_dentry_upper(dentry); + if (!upper) { upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto kill_whiteout; @@ -529,7 +537,6 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, } else { int flags = 0; - upper = ovl_dentry_upper(dentry); if (opaquedir) upper = opaquedir; err = -ESTALE; @@ -623,7 +630,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) goto out_drop_write; type = ovl_path_type(dentry); - if (type == OVL_PATH_PURE_UPPER) { + if (OVL_TYPE_PURE_UPPER(type)) { err = ovl_remove_upper(dentry, is_dir); } else { const struct cred *old_cred; @@ -648,7 +655,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) cap_raise(override_cred->cap_effective, CAP_CHOWN); old_cred = override_creds(override_cred); - err = ovl_remove_and_whiteout(dentry, type, is_dir); + err = ovl_remove_and_whiteout(dentry, is_dir); revert_creds(old_cred); put_cred(override_cred); @@ -686,7 +693,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, bool new_create = false; bool cleanup_whiteout = false; bool overwrite = !(flags & RENAME_EXCHANGE); - bool is_dir = S_ISDIR(old->d_inode->i_mode); + bool is_dir = d_is_dir(old); bool new_is_dir = false; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; @@ -705,7 +712,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, /* Don't copy up directory trees */ old_type = ovl_path_type(old); err = -EXDEV; - if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir) + if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir) goto out; if (new->d_inode) { @@ -713,30 +720,30 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (err) goto out; - if (S_ISDIR(new->d_inode->i_mode)) + if (d_is_dir(new)) new_is_dir = true; new_type = ovl_path_type(new); err = -EXDEV; - if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) + if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) goto out; err = 0; - if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) { + if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) { if (ovl_dentry_lower(old)->d_inode == ovl_dentry_lower(new)->d_inode) goto out; } - if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) { + if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) { if (ovl_dentry_upper(old)->d_inode == ovl_dentry_upper(new)->d_inode) goto out; } } else { if (ovl_dentry_is_opaque(new)) - new_type = OVL_PATH_UPPER; + new_type = __OVL_PATH_UPPER; else - new_type = OVL_PATH_PURE_UPPER; + new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE; } err = ovl_want_write(old); @@ -756,8 +763,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, goto out_drop_write; } - old_opaque = old_type != OVL_PATH_PURE_UPPER; - new_opaque = new_type != OVL_PATH_PURE_UPPER; + old_opaque = !OVL_TYPE_PURE_UPPER(old_type); + new_opaque = !OVL_TYPE_PURE_UPPER(new_type); if (old_opaque || new_opaque) { err = -ENOMEM; @@ -780,8 +787,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, old_cred = override_creds(override_cred); } - if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) { - opaquedir = ovl_check_empty_and_clear(new, new_type); + if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { + opaquedir = ovl_check_empty_and_clear(new); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { opaquedir = NULL; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index af2d18c9fcee..04f124884687 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -205,7 +205,7 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) static bool ovl_is_private_xattr(const char *name) { - return strncmp(name, "trusted.overlay.", 14) == 0; + return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0; } int ovl_setxattr(struct dentry *dentry, const char *name, @@ -235,26 +235,39 @@ out: return err; } +static bool ovl_need_xattr_filter(struct dentry *dentry, + enum ovl_path_type type) +{ + if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER) + return S_ISDIR(dentry->d_inode->i_mode); + else + return false; +} + ssize_t ovl_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { - if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && - ovl_is_private_xattr(name)) + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); + + if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) return -ENODATA; - return vfs_getxattr(ovl_dentry_real(dentry), name, value, size); + return vfs_getxattr(realpath.dentry, name, value, size); } ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); ssize_t res; int off; - res = vfs_listxattr(ovl_dentry_real(dentry), list, size); + res = vfs_listxattr(realpath.dentry, list, size); if (res <= 0 || size == 0) return res; - if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE) + if (!ovl_need_xattr_filter(dentry, type)) return res; /* filter out private xattrs */ @@ -279,18 +292,17 @@ int ovl_removexattr(struct dentry *dentry, const char *name) { int err; struct path realpath; - enum ovl_path_type type; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); err = ovl_want_write(dentry); if (err) goto out; - if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && - ovl_is_private_xattr(name)) + err = -ENODATA; + if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) goto out_drop_write; - type = ovl_path_real(dentry, &realpath); - if (type == OVL_PATH_LOWER) { + if (!OVL_TYPE_UPPER(type)) { err = vfs_getxattr(realpath.dentry, name, NULL, 0); if (err < 0) goto out_drop_write; @@ -312,7 +324,7 @@ out: static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, struct dentry *realdentry) { - if (type != OVL_PATH_LOWER) + if (OVL_TYPE_UPPER(type)) return false; if (special_file(realdentry->d_inode->i_mode)) @@ -421,5 +433,4 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, } return inode; - } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 814bed33dd07..17ac5afc9ffb 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -12,13 +12,20 @@ struct ovl_entry; enum ovl_path_type { - OVL_PATH_PURE_UPPER, - OVL_PATH_UPPER, - OVL_PATH_MERGE, - OVL_PATH_LOWER, + __OVL_PATH_PURE = (1 << 0), + __OVL_PATH_UPPER = (1 << 1), + __OVL_PATH_MERGE = (1 << 2), }; -extern const char *ovl_opaque_xattr; +#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) +#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) +#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE) +#define OVL_TYPE_MERGE_OR_LOWER(type) \ + (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) + +#define OVL_XATTR_PRE_NAME "trusted.overlay." +#define OVL_XATTR_PRE_LEN 16 +#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque" static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { @@ -130,6 +137,7 @@ void ovl_dentry_version_inc(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +int ovl_path_next(int idx, struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 2a7ef4f8e2a6..907870e81a72 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -24,7 +24,6 @@ struct ovl_cache_entry { struct list_head l_node; struct rb_node node; bool is_whiteout; - bool is_cursor; char name[]; }; @@ -40,6 +39,7 @@ struct ovl_readdir_data { struct rb_root root; struct list_head *list; struct list_head middle; + struct dentry *dir; int count; int err; }; @@ -48,7 +48,7 @@ struct ovl_dir_file { bool is_real; bool is_upper; struct ovl_dir_cache *cache; - struct ovl_cache_entry cursor; + struct list_head *cursor; struct file *realfile; struct file *upperfile; }; @@ -79,23 +79,49 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, return NULL; } -static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, +static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, + const char *name, int len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); p = kmalloc(size, GFP_KERNEL); - if (p) { - memcpy(p->name, name, len); - p->name[len] = '\0'; - p->len = len; - p->type = d_type; - p->ino = ino; - p->is_whiteout = false; - p->is_cursor = false; - } + if (!p) + return NULL; + + memcpy(p->name, name, len); + p->name[len] = '\0'; + p->len = len; + p->type = d_type; + p->ino = ino; + p->is_whiteout = false; + + if (d_type == DT_CHR) { + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) { + kfree(p); + return NULL; + } + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + dentry = lookup_one_len(name, dir, len); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + revert_creds(old_cred); + put_cred(override_cred); + } return p; } @@ -122,7 +148,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, return 0; } - p = ovl_cache_entry_new(name, len, ino, d_type); + p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type); if (p == NULL) return -ENOMEM; @@ -143,7 +169,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd, if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { - p = ovl_cache_entry_new(name, namelen, ino, d_type); + p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else @@ -168,7 +194,6 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) { struct ovl_dir_cache *cache = od->cache; - list_del_init(&od->cursor.l_node); WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { @@ -180,10 +205,12 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) } } -static int ovl_fill_merge(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) +static int ovl_fill_merge(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) { - struct ovl_readdir_data *rdd = buf; + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; if (!rdd->is_merge) @@ -202,6 +229,7 @@ static inline int ovl_dir_read(struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); + rdd->dir = realpath->dentry; rdd->ctx.pos = 0; do { rdd->count = 0; @@ -225,112 +253,63 @@ static void ovl_dir_reset(struct file *file) if (cache && ovl_dentry_version_get(dentry) != cache->version) { ovl_cache_put(od, dentry); od->cache = NULL; + od->cursor = NULL; } - WARN_ON(!od->is_real && type != OVL_PATH_MERGE); - if (od->is_real && type == OVL_PATH_MERGE) + WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type)); + if (od->is_real && OVL_TYPE_MERGE(type)) od->is_real = false; } -static int ovl_dir_mark_whiteouts(struct dentry *dir, - struct ovl_readdir_data *rdd) -{ - struct ovl_cache_entry *p; - struct dentry *dentry; - const struct cred *old_cred; - struct cred *override_cred; - - override_cred = prepare_creds(); - if (!override_cred) { - ovl_cache_free(rdd->list); - return -ENOMEM; - } - - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); - - mutex_lock(&dir->d_inode->i_mutex); - list_for_each_entry(p, rdd->list, l_node) { - if (p->is_cursor) - continue; - - if (p->type != DT_CHR) - continue; - - dentry = lookup_one_len(p->name, dir, p->len); - if (IS_ERR(dentry)) - continue; - - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } - mutex_unlock(&dir->d_inode->i_mutex); - - revert_creds(old_cred); - put_cred(override_cred); - - return 0; -} - -static inline int ovl_dir_read_merged(struct path *upperpath, - struct path *lowerpath, - struct list_head *list) +static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) { int err; + struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .list = list, .root = RB_ROOT, .is_merge = false, }; + int idx, next; - if (upperpath->dentry) { - err = ovl_dir_read(upperpath, &rdd); - if (err) - goto out; + for (idx = 0; idx != -1; idx = next) { + next = ovl_path_next(idx, dentry, &realpath); - if (lowerpath->dentry) { - err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd); + if (next != -1) { + err = ovl_dir_read(&realpath, &rdd); if (err) - goto out; + break; + } else { + /* + * Insert lowest layer entries before upper ones, this + * allows offsets to be reasonably constant + */ + list_add(&rdd.middle, rdd.list); + rdd.is_merge = true; + err = ovl_dir_read(&realpath, &rdd); + list_del(&rdd.middle); } } - if (lowerpath->dentry) { - /* - * Insert lowerpath entries before upperpath ones, this allows - * offsets to be reasonably constant - */ - list_add(&rdd.middle, rdd.list); - rdd.is_merge = true; - err = ovl_dir_read(lowerpath, &rdd); - list_del(&rdd.middle); - } -out: return err; } static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) { - struct ovl_cache_entry *p; + struct list_head *p; loff_t off = 0; - list_for_each_entry(p, &od->cache->entries, l_node) { - if (p->is_cursor) - continue; + list_for_each(p, &od->cache->entries) { if (off >= pos) break; off++; } - list_move_tail(&od->cursor.l_node, &p->l_node); + /* Cursor is safe since the cache is stable */ + od->cursor = p; } static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; - struct path lowerpath; - struct path upperpath; struct ovl_dir_cache *cache; cache = ovl_dir_cache(dentry); @@ -347,10 +326,7 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) cache->refcount = 1; INIT_LIST_HEAD(&cache->entries); - ovl_path_lower(dentry, &lowerpath); - ovl_path_upper(dentry, &upperpath); - - res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries); + res = ovl_dir_read_merged(dentry, &cache->entries); if (res) { ovl_cache_free(&cache->entries); kfree(cache); @@ -367,6 +343,7 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; + struct ovl_cache_entry *p; if (!ctx->pos) ovl_dir_reset(file); @@ -385,19 +362,13 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) ovl_seek_cursor(od, ctx->pos); } - while (od->cursor.l_node.next != &od->cache->entries) { - struct ovl_cache_entry *p; - - p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node); - /* Skip cursors */ - if (!p->is_cursor) { - if (!p->is_whiteout) { - if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) - break; - } - ctx->pos++; - } - list_move(&od->cursor.l_node, &p->l_node); + while (od->cursor != &od->cache->entries) { + p = list_entry(od->cursor, struct ovl_cache_entry, l_node); + if (!p->is_whiteout) + if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) + break; + od->cursor = p->l_node.next; + ctx->pos++; } return 0; } @@ -452,10 +423,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, /* * Need to check if we started out being a lower dir, but got copied up */ - if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) { + if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) { struct inode *inode = file_inode(file); - realfile =lockless_dereference(od->upperfile); + realfile = lockless_dereference(od->upperfile); if (!realfile) { struct path upperpath; @@ -516,11 +487,9 @@ static int ovl_dir_open(struct inode *inode, struct file *file) kfree(od); return PTR_ERR(realfile); } - INIT_LIST_HEAD(&od->cursor.l_node); od->realfile = realfile; - od->is_real = (type != OVL_PATH_MERGE); - od->is_upper = (type != OVL_PATH_LOWER); - od->cursor.is_cursor = true; + od->is_real = !OVL_TYPE_MERGE(type); + od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; return 0; @@ -538,14 +507,9 @@ const struct file_operations ovl_dir_operations = { int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; - struct path lowerpath; - struct path upperpath; struct ovl_cache_entry *p; - ovl_path_upper(dentry, &upperpath); - ovl_path_lower(dentry, &lowerpath); - - err = ovl_dir_read_merged(&upperpath, &lowerpath, list); + err = ovl_dir_read_merged(dentry, list); if (err) return err; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 08b704cebfc4..5f0d1993e6e3 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -24,7 +24,7 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); -#define OVERLAYFS_SUPER_MAGIC 0x794c764f +#define OVERLAYFS_SUPER_MAGIC 0x794c7630 struct ovl_config { char *lowerdir; @@ -35,7 +35,8 @@ struct ovl_config { /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; - struct vfsmount *lower_mnt; + unsigned numlower; + struct vfsmount **lower_mnt; struct dentry *workdir; long lower_namelen; /* pathnames of lower and upper dirs, for show_options */ @@ -47,7 +48,6 @@ struct ovl_dir_cache; /* private information held for every overlayfs dentry */ struct ovl_entry { struct dentry *__upperdentry; - struct dentry *lowerdentry; struct ovl_dir_cache *cache; union { struct { @@ -56,40 +56,41 @@ struct ovl_entry { }; struct rcu_head rcu; }; + unsigned numlower; + struct path lowerstack[]; }; -const char *ovl_opaque_xattr = "trusted.overlay.opaque"; +#define OVL_MAX_STACK 500 +static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe) +{ + return oe->numlower ? oe->lowerstack[0].dentry : NULL; +} enum ovl_path_type ovl_path_type(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; + enum ovl_path_type type = 0; if (oe->__upperdentry) { - if (oe->lowerdentry) { + type = __OVL_PATH_UPPER; + + if (oe->numlower) { if (S_ISDIR(dentry->d_inode->i_mode)) - return OVL_PATH_MERGE; - else - return OVL_PATH_UPPER; - } else { - if (oe->opaque) - return OVL_PATH_UPPER; - else - return OVL_PATH_PURE_UPPER; + type |= __OVL_PATH_MERGE; + } else if (!oe->opaque) { + type |= __OVL_PATH_PURE; } } else { - return OVL_PATH_LOWER; + if (oe->numlower > 1) + type |= __OVL_PATH_MERGE; } + return type; } static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) { - struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); - /* - * Make sure to order reads to upperdentry wrt ovl_dentry_update() - */ - smp_read_barrier_depends(); - return upperdentry; + return lockless_dereference(oe->__upperdentry); } void ovl_path_upper(struct dentry *dentry, struct path *path) @@ -103,10 +104,9 @@ void ovl_path_upper(struct dentry *dentry, struct path *path) enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) { - enum ovl_path_type type = ovl_path_type(dentry); - if (type == OVL_PATH_LOWER) + if (!OVL_TYPE_UPPER(type)) ovl_path_lower(dentry, path); else ovl_path_upper(dentry, path); @@ -125,7 +125,7 @@ struct dentry *ovl_dentry_lower(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - return oe->lowerdentry; + return __ovl_dentry_lower(oe); } struct dentry *ovl_dentry_real(struct dentry *dentry) @@ -135,7 +135,7 @@ struct dentry *ovl_dentry_real(struct dentry *dentry) realdentry = ovl_upperdentry_dereference(oe); if (!realdentry) - realdentry = oe->lowerdentry; + realdentry = __ovl_dentry_lower(oe); return realdentry; } @@ -148,7 +148,7 @@ struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) if (realdentry) { *is_upper = true; } else { - realdentry = oe->lowerdentry; + realdentry = __ovl_dentry_lower(oe); *is_upper = false; } return realdentry; @@ -170,11 +170,9 @@ void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) void ovl_path_lower(struct dentry *dentry, struct path *path) { - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct ovl_entry *oe = dentry->d_fsdata; - path->mnt = ofs->lower_mnt; - path->dentry = oe->lowerdentry; + *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL }; } int ovl_want_write(struct dentry *dentry) @@ -254,7 +252,7 @@ static bool ovl_is_opaquedir(struct dentry *dentry) if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) return false; - res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1); + res = inode->i_op->getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); if (res == 1 && val == 'y') return true; @@ -266,8 +264,11 @@ static void ovl_dentry_release(struct dentry *dentry) struct ovl_entry *oe = dentry->d_fsdata; if (oe) { + unsigned int i; + dput(oe->__upperdentry); - dput(oe->lowerdentry); + for (i = 0; i < oe->numlower; i++) + dput(oe->lowerstack[i].dentry); kfree_rcu(oe, rcu); } } @@ -276,9 +277,15 @@ static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, }; -static struct ovl_entry *ovl_alloc_entry(void) +static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) { - return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); + size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); + struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); + + if (oe) + oe->numlower = numlower; + + return oe; } static inline struct dentry *ovl_lookup_real(struct dentry *dir, @@ -300,82 +307,154 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, return dentry; } +/* + * Returns next layer in stack starting from top. + * Returns -1 if this is the last layer. + */ +int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + BUG_ON(idx < 0); + if (idx == 0) { + ovl_path_upper(dentry, path); + if (path->dentry) + return oe->numlower ? 1 : -1; + idx++; + } + BUG_ON(idx > oe->numlower); + *path = oe->lowerstack[idx - 1]; + + return (idx < oe->numlower) ? idx + 1 : -1; +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe; - struct dentry *upperdir; - struct dentry *lowerdir; - struct dentry *upperdentry = NULL; - struct dentry *lowerdentry = NULL; + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct path *stack = NULL; + struct dentry *upperdir, *upperdentry = NULL; + unsigned int ctr = 0; struct inode *inode = NULL; + bool upperopaque = false; + struct dentry *this, *prev = NULL; + unsigned int i; int err; - err = -ENOMEM; - oe = ovl_alloc_entry(); - if (!oe) - goto out; - - upperdir = ovl_dentry_upper(dentry->d_parent); - lowerdir = ovl_dentry_lower(dentry->d_parent); - + upperdir = ovl_upperdentry_dereference(poe); if (upperdir) { - upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); - err = PTR_ERR(upperdentry); - if (IS_ERR(upperdentry)) - goto out_put_dir; - - if (lowerdir && upperdentry) { - if (ovl_is_whiteout(upperdentry)) { - dput(upperdentry); - upperdentry = NULL; - oe->opaque = true; - } else if (ovl_is_opaquedir(upperdentry)) { - oe->opaque = true; + this = ovl_lookup_real(upperdir, &dentry->d_name); + err = PTR_ERR(this); + if (IS_ERR(this)) + goto out; + + if (this) { + if (ovl_is_whiteout(this)) { + dput(this); + this = NULL; + upperopaque = true; + } else if (poe->numlower && ovl_is_opaquedir(this)) { + upperopaque = true; } } + upperdentry = prev = this; } - if (lowerdir && !oe->opaque) { - lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); - err = PTR_ERR(lowerdentry); - if (IS_ERR(lowerdentry)) - goto out_dput_upper; + + if (!upperopaque && poe->numlower) { + err = -ENOMEM; + stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL); + if (!stack) + goto out_put_upper; } - if (lowerdentry && upperdentry && - (!S_ISDIR(upperdentry->d_inode->i_mode) || - !S_ISDIR(lowerdentry->d_inode->i_mode))) { - dput(lowerdentry); - lowerdentry = NULL; - oe->opaque = true; + for (i = 0; !upperopaque && i < poe->numlower; i++) { + bool opaque = false; + struct path lowerpath = poe->lowerstack[i]; + + this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); + err = PTR_ERR(this); + if (IS_ERR(this)) { + /* + * If it's positive, then treat ENAMETOOLONG as ENOENT. + */ + if (err == -ENAMETOOLONG && (upperdentry || ctr)) + continue; + goto out_put; + } + if (!this) + continue; + if (ovl_is_whiteout(this)) { + dput(this); + break; + } + /* + * Only makes sense to check opaque dir if this is not the + * lowermost layer. + */ + if (i < poe->numlower - 1 && ovl_is_opaquedir(this)) + opaque = true; + + if (prev && (!S_ISDIR(prev->d_inode->i_mode) || + !S_ISDIR(this->d_inode->i_mode))) { + /* + * FIXME: check for upper-opaqueness maybe better done + * in remove code. + */ + if (prev == upperdentry) + upperopaque = true; + dput(this); + break; + } + /* + * If this is a non-directory then stop here. + */ + if (!S_ISDIR(this->d_inode->i_mode)) + opaque = true; + + stack[ctr].dentry = this; + stack[ctr].mnt = lowerpath.mnt; + ctr++; + prev = this; + if (opaque) + break; } - if (lowerdentry || upperdentry) { + oe = ovl_alloc_entry(ctr); + err = -ENOMEM; + if (!oe) + goto out_put; + + if (upperdentry || ctr) { struct dentry *realdentry; - realdentry = upperdentry ? upperdentry : lowerdentry; + realdentry = upperdentry ? upperdentry : stack[0].dentry; + err = -ENOMEM; inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, oe); if (!inode) - goto out_dput; + goto out_free_oe; ovl_copyattr(realdentry->d_inode, inode); } + oe->opaque = upperopaque; oe->__upperdentry = upperdentry; - oe->lowerdentry = lowerdentry; - + memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); + kfree(stack); dentry->d_fsdata = oe; d_add(dentry, inode); return NULL; -out_dput: - dput(lowerdentry); -out_dput_upper: - dput(upperdentry); -out_put_dir: +out_free_oe: kfree(oe); +out_put: + for (i = 0; i < ctr; i++) + dput(stack[i].dentry); + kfree(stack); +out_put_upper: + dput(upperdentry); out: return ERR_PTR(err); } @@ -388,10 +467,12 @@ struct file *ovl_path_open(struct path *path, int flags) static void ovl_put_super(struct super_block *sb) { struct ovl_fs *ufs = sb->s_fs_info; + unsigned i; dput(ufs->workdir); mntput(ufs->upper_mnt); - mntput(ufs->lower_mnt); + for (i = 0; i < ufs->numlower; i++) + mntput(ufs->lower_mnt[i]); kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); @@ -405,7 +486,7 @@ static void ovl_put_super(struct super_block *sb) * @buf: The struct kstatfs to fill in with stats * * Get the filesystem statistics. As writes always target the upper layer - * filesystem pass the statfs to the same filesystem. + * filesystem pass the statfs to the upper filesystem (if it exists) */ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) { @@ -414,7 +495,7 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) struct path path; int err; - ovl_path_upper(root_dentry, &path); + ovl_path_real(root_dentry, &path); err = vfs_statfs(&path, buf); if (!err) { @@ -437,8 +518,20 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) struct ovl_fs *ufs = sb->s_fs_info; seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); - seq_printf(m, ",upperdir=%s", ufs->config.upperdir); - seq_printf(m, ",workdir=%s", ufs->config.workdir); + if (ufs->config.upperdir) { + seq_printf(m, ",upperdir=%s", ufs->config.upperdir); + seq_printf(m, ",workdir=%s", ufs->config.workdir); + } + return 0; +} + +static int ovl_remount(struct super_block *sb, int *flags, char *data) +{ + struct ovl_fs *ufs = sb->s_fs_info; + + if (!(*flags & MS_RDONLY) && !ufs->upper_mnt) + return -EROFS; + return 0; } @@ -446,6 +539,7 @@ static const struct super_operations ovl_super_operations = { .put_super = ovl_put_super, .statfs = ovl_statfs, .show_options = ovl_show_options, + .remount_fs = ovl_remount, }; enum { @@ -462,11 +556,34 @@ static const match_table_t ovl_tokens = { {OPT_ERR, NULL} }; +static char *ovl_next_opt(char **s) +{ + char *sbegin = *s; + char *p; + + if (sbegin == NULL) + return NULL; + + for (p = sbegin; *p; p++) { + if (*p == '\\') { + p++; + if (!*p) + break; + } else if (*p == ',') { + *p = '\0'; + *s = p + 1; + return sbegin; + } + } + *s = NULL; + return sbegin; +} + static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; - while ((p = strsep(&opt, ",")) != NULL) { + while ((p = ovl_next_opt(&opt)) != NULL) { int token; substring_t args[MAX_OPT_ARGS]; @@ -497,9 +614,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) break; default: + pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; } } + + /* Workdir is useless in non-upper mount */ + if (!config->upperdir && config->workdir) { + pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n", + config->workdir); + kfree(config->workdir); + config->workdir = NULL; + } + return 0; } @@ -554,16 +681,17 @@ out_dput: goto out_unlock; } -static int ovl_mount_dir(const char *name, struct path *path) +static void ovl_unescape(char *s) { - int err; + char *d = s; - err = kern_path(name, LOOKUP_FOLLOW, path); - if (err) { - pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); - err = -EINVAL; + for (;; s++, d++) { + if (*s == '\\') + s++; + *d = *s; + if (!*s) + break; } - return err; } static bool ovl_is_allowed_fs_type(struct dentry *root) @@ -585,6 +713,75 @@ static bool ovl_is_allowed_fs_type(struct dentry *root) return true; } +static int ovl_mount_dir_noesc(const char *name, struct path *path) +{ + int err = -EINVAL; + + if (!*name) { + pr_err("overlayfs: empty lowerdir\n"); + goto out; + } + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + goto out; + } + err = -EINVAL; + if (!ovl_is_allowed_fs_type(path->dentry)) { + pr_err("overlayfs: filesystem on '%s' not supported\n", name); + goto out_put; + } + if (!S_ISDIR(path->dentry->d_inode->i_mode)) { + pr_err("overlayfs: '%s' not a directory\n", name); + goto out_put; + } + return 0; + +out_put: + path_put(path); +out: + return err; +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err = -ENOMEM; + char *tmp = kstrdup(name, GFP_KERNEL); + + if (tmp) { + ovl_unescape(tmp); + err = ovl_mount_dir_noesc(tmp, path); + kfree(tmp); + } + return err; +} + +static int ovl_lower_dir(const char *name, struct path *path, long *namelen, + int *stack_depth) +{ + int err; + struct kstatfs statfs; + + err = ovl_mount_dir_noesc(name, path); + if (err) + goto out; + + err = vfs_statfs(path, &statfs); + if (err) { + pr_err("overlayfs: statfs failed on '%s'\n", name); + goto out_put; + } + *namelen = max(*namelen, statfs.f_namelen); + *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); + + return 0; + +out_put: + path_put(path); +out: + return err; +} + /* Workdir should not be subdir of upperdir and vice versa */ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) { @@ -597,16 +794,39 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) return ok; } +static unsigned int ovl_split_lowerdirs(char *str) +{ + unsigned int ctr = 1; + char *s, *d; + + for (s = d = str;; s++, d++) { + if (*s == '\\') { + s++; + } else if (*s == ':') { + *d = '\0'; + ctr++; + continue; + } + *d = *s; + if (!*s) + break; + } + return ctr; +} + static int ovl_fill_super(struct super_block *sb, void *data, int silent) { - struct path lowerpath; - struct path upperpath; - struct path workpath; - struct inode *root_inode; + struct path upperpath = { NULL, NULL }; + struct path workpath = { NULL, NULL }; struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_fs *ufs; - struct kstatfs statfs; + struct path *stack = NULL; + char *lowertmp; + char *lower; + unsigned int numlower; + unsigned int stacklen = 0; + unsigned int i; int err; err = -ENOMEM; @@ -618,123 +838,147 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_free_config; - /* FIXME: workdir is not needed for a R/O mount */ err = -EINVAL; - if (!ufs->config.upperdir || !ufs->config.lowerdir || - !ufs->config.workdir) { - pr_err("overlayfs: missing upperdir or lowerdir or workdir\n"); + if (!ufs->config.lowerdir) { + pr_err("overlayfs: missing 'lowerdir'\n"); goto out_free_config; } - err = -ENOMEM; - oe = ovl_alloc_entry(); - if (oe == NULL) - goto out_free_config; - - err = ovl_mount_dir(ufs->config.upperdir, &upperpath); - if (err) - goto out_free_oe; + sb->s_stack_depth = 0; + if (ufs->config.upperdir) { + if (!ufs->config.workdir) { + pr_err("overlayfs: missing 'workdir'\n"); + goto out_free_config; + } - err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath); - if (err) - goto out_put_upperpath; + err = ovl_mount_dir(ufs->config.upperdir, &upperpath); + if (err) + goto out_free_config; - err = ovl_mount_dir(ufs->config.workdir, &workpath); - if (err) - goto out_put_lowerpath; + /* Upper fs should not be r/o */ + if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) { + pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out_put_upperpath; + } - err = -EINVAL; - if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || - !S_ISDIR(lowerpath.dentry->d_inode->i_mode) || - !S_ISDIR(workpath.dentry->d_inode->i_mode)) { - pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n"); - goto out_put_workpath; - } + err = ovl_mount_dir(ufs->config.workdir, &workpath); + if (err) + goto out_put_upperpath; - if (upperpath.mnt != workpath.mnt) { - pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); - goto out_put_workpath; + err = -EINVAL; + if (upperpath.mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out_put_workpath; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out_put_workpath; + } + sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth; } - if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { - pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + err = -ENOMEM; + lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL); + if (!lowertmp) goto out_put_workpath; - } - if (!ovl_is_allowed_fs_type(upperpath.dentry)) { - pr_err("overlayfs: filesystem of upperdir is not supported\n"); - goto out_put_workpath; + err = -EINVAL; + stacklen = ovl_split_lowerdirs(lowertmp); + if (stacklen > OVL_MAX_STACK) { + pr_err("overlayfs: too many lower directries, limit is %d\n", + OVL_MAX_STACK); + goto out_free_lowertmp; + } else if (!ufs->config.upperdir && stacklen == 1) { + pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); + goto out_free_lowertmp; } - if (!ovl_is_allowed_fs_type(lowerpath.dentry)) { - pr_err("overlayfs: filesystem of lowerdir is not supported\n"); - goto out_put_workpath; - } + stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); + if (!stack) + goto out_free_lowertmp; - err = vfs_statfs(&lowerpath, &statfs); - if (err) { - pr_err("overlayfs: statfs failed on lowerpath\n"); - goto out_put_workpath; - } - ufs->lower_namelen = statfs.f_namelen; + lower = lowertmp; + for (numlower = 0; numlower < stacklen; numlower++) { + err = ovl_lower_dir(lower, &stack[numlower], + &ufs->lower_namelen, &sb->s_stack_depth); + if (err) + goto out_put_lowerpath; - sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, - lowerpath.mnt->mnt_sb->s_stack_depth) + 1; + lower = strchr(lower, '\0') + 1; + } err = -EINVAL; + sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("overlayfs: maximum fs stacking depth exceeded\n"); - goto out_put_workpath; + goto out_put_lowerpath; } - ufs->upper_mnt = clone_private_mount(&upperpath); - err = PTR_ERR(ufs->upper_mnt); - if (IS_ERR(ufs->upper_mnt)) { - pr_err("overlayfs: failed to clone upperpath\n"); - goto out_put_workpath; - } + if (ufs->config.upperdir) { + ufs->upper_mnt = clone_private_mount(&upperpath); + err = PTR_ERR(ufs->upper_mnt); + if (IS_ERR(ufs->upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out_put_lowerpath; + } - ufs->lower_mnt = clone_private_mount(&lowerpath); - err = PTR_ERR(ufs->lower_mnt); - if (IS_ERR(ufs->lower_mnt)) { - pr_err("overlayfs: failed to clone lowerpath\n"); - goto out_put_upper_mnt; + ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); + err = PTR_ERR(ufs->workdir); + if (IS_ERR(ufs->workdir)) { + pr_err("overlayfs: failed to create directory %s/%s\n", + ufs->config.workdir, OVL_WORKDIR_NAME); + goto out_put_upper_mnt; + } } - ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); - err = PTR_ERR(ufs->workdir); - if (IS_ERR(ufs->workdir)) { - pr_err("overlayfs: failed to create directory %s/%s\n", - ufs->config.workdir, OVL_WORKDIR_NAME); - goto out_put_lower_mnt; - } + err = -ENOMEM; + ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL); + if (ufs->lower_mnt == NULL) + goto out_put_workdir; + for (i = 0; i < numlower; i++) { + struct vfsmount *mnt = clone_private_mount(&stack[i]); - /* - * Make lower_mnt R/O. That way fchmod/fchown on lower file - * will fail instead of modifying lower fs. - */ - ufs->lower_mnt->mnt_flags |= MNT_READONLY; + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + goto out_put_lower_mnt; + } + /* + * Make lower_mnt R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + mnt->mnt_flags |= MNT_READONLY; + + ufs->lower_mnt[ufs->numlower] = mnt; + ufs->numlower++; + } - /* If the upper fs is r/o, we mark overlayfs r/o too */ - if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; - root_inode = ovl_new_inode(sb, S_IFDIR, oe); - if (!root_inode) - goto out_put_workdir; + oe = ovl_alloc_entry(numlower); + if (!oe) + goto out_put_lower_mnt; - root_dentry = d_make_root(root_inode); + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe)); if (!root_dentry) - goto out_put_workdir; + goto out_free_oe; mntput(upperpath.mnt); - mntput(lowerpath.mnt); + for (i = 0; i < numlower; i++) + mntput(stack[i].mnt); path_put(&workpath); + kfree(lowertmp); oe->__upperdentry = upperpath.dentry; - oe->lowerdentry = lowerpath.dentry; + for (i = 0; i < numlower; i++) { + oe->lowerstack[i].dentry = stack[i].dentry; + oe->lowerstack[i].mnt = ufs->lower_mnt[i]; + } root_dentry->d_fsdata = oe; @@ -745,20 +989,26 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) return 0; +out_free_oe: + kfree(oe); +out_put_lower_mnt: + for (i = 0; i < ufs->numlower; i++) + mntput(ufs->lower_mnt[i]); + kfree(ufs->lower_mnt); out_put_workdir: dput(ufs->workdir); -out_put_lower_mnt: - mntput(ufs->lower_mnt); out_put_upper_mnt: mntput(ufs->upper_mnt); +out_put_lowerpath: + for (i = 0; i < numlower; i++) + path_put(&stack[i]); + kfree(stack); +out_free_lowertmp: + kfree(lowertmp); out_put_workpath: path_put(&workpath); -out_put_lowerpath: - path_put(&lowerpath); out_put_upperpath: path_put(&upperpath); -out_free_oe: - kfree(oe); out_free_config: kfree(ufs->config.lowerdir); kfree(ufs->config.upperdir); @@ -776,11 +1026,11 @@ static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, static struct file_system_type ovl_fs_type = { .owner = THIS_MODULE, - .name = "overlayfs", + .name = "overlay", .mount = ovl_mount, .kill_sb = kill_anon_super, }; -MODULE_ALIAS_FS("overlayfs"); +MODULE_ALIAS_FS("overlay"); static int __init ovl_init(void) { diff --git a/fs/pnode.c b/fs/pnode.c index aae331a5d03b..260ac8f898a4 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -242,6 +242,7 @@ static int propagate_one(struct mount *m) child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); + child->mnt.mnt_flags &= ~MNT_LOCKED; mnt_set_mountpoint(m, mp, child); last_dest = m; last_source = child; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 0855f772cd41..3a48bb789c9f 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -564,13 +564,11 @@ posix_acl_create(struct inode *dir, umode_t *mode, *acl = posix_acl_clone(p, GFP_NOFS); if (!*acl) - return -ENOMEM; + goto no_mem; ret = posix_acl_create_masq(*acl, mode); - if (ret < 0) { - posix_acl_release(*acl); - return -ENOMEM; - } + if (ret < 0) + goto no_mem_clone; if (ret == 0) { posix_acl_release(*acl); @@ -591,6 +589,12 @@ no_acl: *default_acl = NULL; *acl = NULL; return 0; + +no_mem_clone: + posix_acl_release(*acl); +no_mem: + posix_acl_release(p); + return -ENOMEM; } EXPORT_SYMBOL_GPL(posix_acl_create); @@ -772,7 +776,7 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name, if (!IS_POSIXACL(dentry->d_inode)) return -EOPNOTSUPP; - if (S_ISLNK(dentry->d_inode->i_mode)) + if (d_is_symlink(dentry)) return -EOPNOTSUPP; acl = get_acl(dentry->d_inode, type); @@ -832,7 +836,7 @@ posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, if (!IS_POSIXACL(dentry->d_inode)) return -EOPNOTSUPP; - if (S_ISLNK(dentry->d_inode->i_mode)) + if (d_is_symlink(dentry)) return -EOPNOTSUPP; if (type == ACL_TYPE_ACCESS) diff --git a/fs/proc/array.c b/fs/proc/array.c index cd3653e4f35c..1295a00ca316 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -81,6 +81,7 @@ #include <linux/pid_namespace.h> #include <linux/ptrace.h> #include <linux/tracehook.h> +#include <linux/string_helpers.h> #include <linux/user_namespace.h> #include <asm/pgtable.h> @@ -89,39 +90,18 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) { - int i; - char *buf, *end; - char *name; + char *buf; char tcomm[sizeof(p->comm)]; get_task_comm(tcomm, p); seq_puts(m, "Name:\t"); - end = m->buf + m->size; buf = m->buf + m->count; - name = tcomm; - i = sizeof(tcomm); - while (i && (buf < end)) { - unsigned char c = *name; - name++; - i--; - *buf = c; - if (!c) - break; - if (c == '\\') { - buf++; - if (buf < end) - *buf++ = c; - continue; - } - if (c == '\n') { - *buf++ = '\\'; - if (buf < end) - *buf++ = 'n'; - continue; - } - buf++; - } + + /* Ignore error for now */ + string_escape_str(tcomm, &buf, m->size - m->count, + ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + m->count = buf - m->buf; seq_putc(m, '\n'); } @@ -157,20 +137,29 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; - struct fdtable *fdt = NULL; + struct task_struct *tracer; const struct cred *cred; - pid_t ppid, tpid; + pid_t ppid, tpid = 0, tgid, ngid; + unsigned int max_fds = 0; rcu_read_lock(); ppid = pid_alive(p) ? task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; - tpid = 0; - if (pid_alive(p)) { - struct task_struct *tracer = ptrace_parent(p); - if (tracer) - tpid = task_pid_nr_ns(tracer, ns); - } + + tracer = ptrace_parent(p); + if (tracer) + tpid = task_pid_nr_ns(tracer, ns); + + tgid = task_tgid_nr_ns(p, ns); + ngid = task_numa_group_id(p); cred = get_task_cred(p); + + task_lock(p); + if (p->files) + max_fds = files_fdtable(p->files)->max_fds; + task_unlock(p); + rcu_read_unlock(); + seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" @@ -179,12 +168,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "PPid:\t%d\n" "TracerPid:\t%d\n" "Uid:\t%d\t%d\t%d\t%d\n" - "Gid:\t%d\t%d\t%d\t%d\n", + "Gid:\t%d\t%d\t%d\t%d\n" + "FDSize:\t%d\nGroups:\t", get_task_state(p), - task_tgid_nr_ns(p, ns), - task_numa_group_id(p), - pid_nr_ns(pid, ns), - ppid, tpid, + tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid, from_kuid_munged(user_ns, cred->uid), from_kuid_munged(user_ns, cred->euid), from_kuid_munged(user_ns, cred->suid), @@ -192,20 +179,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, from_kgid_munged(user_ns, cred->gid), from_kgid_munged(user_ns, cred->egid), from_kgid_munged(user_ns, cred->sgid), - from_kgid_munged(user_ns, cred->fsgid)); - - task_lock(p); - if (p->files) - fdt = files_fdtable(p->files); - seq_printf(m, - "FDSize:\t%d\n" - "Groups:\t", - fdt ? fdt->max_fds : 0); - rcu_read_unlock(); + from_kgid_munged(user_ns, cred->fsgid), + max_fds); group_info = cred->group_info; - task_unlock(p); - for (g = 0; g < group_info->ngroups; g++) seq_printf(m, "%d ", from_kgid_munged(user_ns, GROUP_AT(group_info, g))); @@ -339,12 +316,10 @@ static inline void task_context_switch_counts(struct seq_file *m, static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { - seq_puts(m, "Cpus_allowed:\t"); - seq_cpumask(m, &task->cpus_allowed); - seq_putc(m, '\n'); - seq_puts(m, "Cpus_allowed_list:\t"); - seq_cpumask_list(m, &task->cpus_allowed); - seq_putc(m, '\n'); + seq_printf(m, "Cpus_allowed:\t%*pb\n", + cpumask_pr_args(&task->cpus_allowed)); + seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", + cpumask_pr_args(&task->cpus_allowed)); } int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, diff --git a/fs/proc/base.c b/fs/proc/base.c index 772efa45a452..3f3d7aeb0712 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2464,6 +2464,57 @@ static const struct file_operations proc_projid_map_operations = { .llseek = seq_lseek, .release = proc_id_map_release, }; + +static int proc_setgroups_open(struct inode *inode, struct file *file) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + int ret; + + ret = -ESRCH; + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + if (file->f_mode & FMODE_WRITE) { + ret = -EACCES; + if (!ns_capable(ns, CAP_SYS_ADMIN)) + goto err_put_ns; + } + + ret = single_open(file, &proc_setgroups_show, ns); + if (ret) + goto err_put_ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_setgroups_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + int ret = single_release(inode, file); + put_user_ns(ns); + return ret; +} + +static const struct file_operations proc_setgroups_operations = { + .open = proc_setgroups_open, + .write = proc_setgroups_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_setgroups_release, +}; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, @@ -2572,6 +2623,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #ifdef CONFIG_CHECKPOINT_RESTORE REG("timers", S_IRUGO, proc_timers_operations), @@ -2618,6 +2670,9 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) dput(dentry); } + if (pid == tgid) + return; + name.name = buf; name.len = snprintf(buf, sizeof(buf), "%d", tgid); leader = d_hash_and_lookup(mnt->mnt_root, &name); @@ -2789,7 +2844,7 @@ retry: int proc_pid_readdir(struct file *file, struct dir_context *ctx) { struct tgid_iter iter; - struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info; + struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info; loff_t pos = ctx->pos; if (pos >= PID_MAX_LIMIT + TGID_OFFSET) @@ -2913,6 +2968,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif }; @@ -3095,7 +3151,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - ns = file->f_dentry->d_sb->s_fs_info; + ns = inode->i_sb->s_fs_info; tid = (int)file->f_version; file->f_version = 0; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index e11d7c590bb0..8e5ad83b629a 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -53,7 +53,8 @@ static int seq_show(struct seq_file *m, void *v) (long long)file->f_pos, f_flags, real_mount(file->f_path.mnt)->mnt_id); if (file->f_op->show_fdinfo) - ret = file->f_op->show_fdinfo(m, file); + file->f_op->show_fdinfo(m, file); + ret = seq_has_overflowed(m); fput(file); } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 317b72641ebf..be65b2082135 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -19,7 +19,6 @@ #include <linux/mount.h> #include <linux/init.h> #include <linux/idr.h> -#include <linux/namei.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/completion.h> @@ -31,9 +30,73 @@ static DEFINE_SPINLOCK(proc_subdir_lock); static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { - if (de->namelen != len) - return 0; - return !memcmp(name, de->name, len); + if (len < de->namelen) + return -1; + if (len > de->namelen) + return 1; + + return memcmp(name, de->name, len); +} + +static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) +{ + return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, + subdir_node); +} + +static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) +{ + return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry, + subdir_node); +} + +static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, + const char *name, + unsigned int len) +{ + struct rb_node *node = dir->subdir.rb_node; + + while (node) { + struct proc_dir_entry *de = container_of(node, + struct proc_dir_entry, + subdir_node); + int result = proc_match(len, name, de); + + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return de; + } + return NULL; +} + +static bool pde_subdir_insert(struct proc_dir_entry *dir, + struct proc_dir_entry *de) +{ + struct rb_root *root = &dir->subdir; + struct rb_node **new = &root->rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct proc_dir_entry *this = + container_of(*new, struct proc_dir_entry, subdir_node); + int result = proc_match(de->namelen, de->name, this); + + parent = *new; + if (result < 0) + new = &(*new)->rb_left; + else if (result > 0) + new = &(*new)->rb_right; + else + return false; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&de->subdir_node, parent, new); + rb_insert_color(&de->subdir_node, root); + return true; } static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) @@ -58,7 +121,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; - struct proc_dir_entry *de = PROC_I(inode)->pde; + struct proc_dir_entry *de = PDE(inode); if (de && de->nlink) set_nlink(inode, de->nlink); @@ -92,10 +155,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, break; len = next - cp; - for (de = de->subdir; de ; de = de->next) { - if (proc_match(len, cp, de)) - break; - } + de = pde_subdir_find(de, cp, len); if (!de) { WARN(1, "name '%s'\n", name); return -ENOENT; @@ -162,17 +222,6 @@ void proc_free_inum(unsigned int inum) spin_unlock_irqrestore(&proc_inum_lock, flags); } -static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, __PDE_DATA(dentry->d_inode)); - return NULL; -} - -static const struct inode_operations proc_link_inode_operations = { - .readlink = generic_readlink, - .follow_link = proc_follow_link, -}; - /* * Don't create negative dentries here, return -ENOENT by hand * instead. @@ -183,19 +232,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct inode *inode; spin_lock(&proc_subdir_lock); - for (de = de->subdir; de ; de = de->next) { - if (de->namelen != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - pde_get(de); - spin_unlock(&proc_subdir_lock); - inode = proc_get_inode(dir->i_sb, de); - if (!inode) - return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &simple_dentry_operations); - d_add(dentry, inode); - return NULL; - } + de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); + if (de) { + pde_get(de); + spin_unlock(&proc_subdir_lock); + inode = proc_get_inode(dir->i_sb, de); + if (!inode) + return ERR_PTR(-ENOMEM); + d_set_d_op(dentry, &simple_dentry_operations); + d_add(dentry, inode); + return NULL; } spin_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); @@ -225,7 +271,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, return 0; spin_lock(&proc_subdir_lock); - de = de->subdir; + de = pde_subdir_first(de); i = ctx->pos - 2; for (;;) { if (!de) { @@ -234,7 +280,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, } if (!i) break; - de = de->next; + de = pde_subdir_next(de); i--; } @@ -249,7 +295,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, } spin_lock(&proc_subdir_lock); ctx->pos++; - next = de->next; + next = pde_subdir_next(de); pde_put(de); de = next; } while (de); @@ -286,39 +332,21 @@ static const struct inode_operations proc_dir_inode_operations = { static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { - struct proc_dir_entry *tmp; int ret; - + ret = proc_alloc_inum(&dp->low_ino); if (ret) return ret; - if (S_ISDIR(dp->mode)) { - dp->proc_fops = &proc_dir_operations; - dp->proc_iops = &proc_dir_inode_operations; - dir->nlink++; - } else if (S_ISLNK(dp->mode)) { - dp->proc_iops = &proc_link_inode_operations; - } else if (S_ISREG(dp->mode)) { - BUG_ON(dp->proc_fops == NULL); - dp->proc_iops = &proc_file_inode_operations; - } else { - WARN_ON(1); - return -EINVAL; - } - spin_lock(&proc_subdir_lock); - - for (tmp = dir->subdir; tmp; tmp = tmp->next) - if (strcmp(tmp->name, dp->name) == 0) { - WARN(1, "proc_dir_entry '%s/%s' already registered\n", - dir->name, dp->name); - break; - } - - dp->next = dir->subdir; dp->parent = dir; - dir->subdir = dp; + if (pde_subdir_insert(dir, dp) == false) { + WARN(1, "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); + spin_unlock(&proc_subdir_lock); + proc_free_inum(dp->low_ino); + return -EEXIST; + } spin_unlock(&proc_subdir_lock); return 0; @@ -354,6 +382,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; + ent->subdir = RB_ROOT; atomic_set(&ent->count, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); @@ -373,6 +402,7 @@ struct proc_dir_entry *proc_symlink(const char *name, ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); if (ent->data) { strcpy((char*)ent->data,dest); + ent->proc_iops = &proc_link_inode_operations; if (proc_register(parent, ent) < 0) { kfree(ent->data); kfree(ent); @@ -398,8 +428,12 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { ent->data = data; + ent->proc_fops = &proc_dir_operations; + ent->proc_iops = &proc_dir_inode_operations; + parent->nlink++; if (proc_register(parent, ent) < 0) { kfree(ent); + parent->nlink--; ent = NULL; } } @@ -435,6 +469,8 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, return NULL; } + BUG_ON(proc_fops == NULL); + if ((mode & S_IALLUGO) == 0) mode |= S_IRUGO; pde = __proc_create(&parent, name, mode, 1); @@ -442,6 +478,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, goto out; pde->proc_fops = proc_fops; pde->data = data; + pde->proc_iops = &proc_file_inode_operations; if (proc_register(parent, pde) < 0) goto out_free; return pde; @@ -485,7 +522,6 @@ void pde_put(struct proc_dir_entry *pde) */ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) { - struct proc_dir_entry **p; struct proc_dir_entry *de = NULL; const char *fn = name; unsigned int len; @@ -497,14 +533,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) } len = strlen(fn); - for (p = &parent->subdir; *p; p=&(*p)->next ) { - if (proc_match(len, fn, *p)) { - de = *p; - *p = de->next; - de->next = NULL; - break; - } - } + de = pde_subdir_find(parent, fn, len); + if (de) + rb_erase(&de->subdir_node, &parent->subdir); spin_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -516,16 +547,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) if (S_ISDIR(de->mode)) parent->nlink--; de->nlink = 0; - WARN(de->subdir, "%s: removing non-empty directory " - "'%s/%s', leaking at least '%s'\n", __func__, - de->parent->name, de->name, de->subdir->name); + WARN(pde_subdir_first(de), + "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", + __func__, de->parent->name, de->name, pde_subdir_first(de)->name); pde_put(de); } EXPORT_SYMBOL(remove_proc_entry); int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { - struct proc_dir_entry **p; struct proc_dir_entry *root = NULL, *de, *next; const char *fn = name; unsigned int len; @@ -537,24 +567,18 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) } len = strlen(fn); - for (p = &parent->subdir; *p; p=&(*p)->next ) { - if (proc_match(len, fn, *p)) { - root = *p; - *p = root->next; - root->next = NULL; - break; - } - } + root = pde_subdir_find(parent, fn, len); if (!root) { spin_unlock(&proc_subdir_lock); return -ENOENT; } + rb_erase(&root->subdir_node, &parent->subdir); + de = root; while (1) { - next = de->subdir; + next = pde_subdir_first(de); if (next) { - de->subdir = next->next; - next->next = NULL; + rb_erase(&next->subdir_node, &de->subdir); de = next; continue; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 333080d7a671..7697b6621cfd 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -23,6 +23,7 @@ #include <linux/slab.h> #include <linux/mount.h> #include <linux/magic.h> +#include <linux/namei.h> #include <asm/uaccess.h> @@ -32,8 +33,6 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; struct ctl_table_header *head; - const struct proc_ns_operations *ns_ops; - void *ns; truncate_inode_pages_final(&inode->i_data); clear_inode(inode); @@ -42,7 +41,7 @@ static void proc_evict_inode(struct inode *inode) put_pid(PROC_I(inode)->pid); /* Let go of any associated proc directory entry */ - de = PROC_I(inode)->pde; + de = PDE(inode); if (de) pde_put(de); head = PROC_I(inode)->sysctl; @@ -50,11 +49,6 @@ static void proc_evict_inode(struct inode *inode) RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL); sysctl_head_put(head); } - /* Release any associated namespace */ - ns_ops = PROC_I(inode)->ns.ns_ops; - ns = PROC_I(inode)->ns.ns; - if (ns_ops && ns) - ns_ops->put(ns); } static struct kmem_cache * proc_inode_cachep; @@ -73,8 +67,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei->pde = NULL; ei->sysctl = NULL; ei->sysctl_entry = NULL; - ei->ns.ns = NULL; - ei->ns.ns_ops = NULL; + ei->ns_ops = NULL; inode = &ei->vfs_inode; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; return inode; @@ -401,6 +394,26 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif +static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct proc_dir_entry *pde = PDE(dentry->d_inode); + if (unlikely(!use_pde(pde))) + return ERR_PTR(-EINVAL); + nd_set_link(nd, pde->data); + return pde; +} + +static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + unuse_pde(p); +} + +const struct inode_operations proc_link_inode_operations = { + .readlink = generic_readlink, + .follow_link = proc_follow_link, + .put_link = proc_put_link, +}; + struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) { struct inode *inode = new_inode_pseudo(sb); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa7a0ee182e1..c835b94c0cd3 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -24,10 +24,9 @@ struct mempolicy; * tree) of these proc_dir_entries, so that we can dynamically * add new files to /proc. * - * The "next" pointer creates a linked list of one /proc directory, - * while parent/subdir create the directory structure (every - * /proc file has a parent, but "subdir" is NULL for all - * non-directory entries). + * parent/subdir are used for the directory structure (every /proc file has a + * parent, but "subdir" is empty for all non-directory entries). + * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { unsigned int low_ino; @@ -38,7 +37,9 @@ struct proc_dir_entry { loff_t size; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; - struct proc_dir_entry *next, *parent, *subdir; + struct proc_dir_entry *parent; + struct rb_root subdir; + struct rb_node subdir_node; void *data; atomic_t count; /* use count */ atomic_t in_use; /* number of callers into module in progress; */ @@ -64,7 +65,7 @@ struct proc_inode { struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; - struct proc_ns ns; + const struct proc_ns_operations *ns_ops; struct inode vfs_inode; }; @@ -199,6 +200,7 @@ struct pde_opener { int closing; struct completion *c; }; +extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index aa1eee06420f..d3ebf2e61853 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -12,6 +12,9 @@ #include <linux/vmstat.h> #include <linux/atomic.h> #include <linux/vmalloc.h> +#ifdef CONFIG_CMA +#include <linux/cma.h> +#endif #include <asm/page.h> #include <asm/pgtable.h> #include "internal.h" @@ -138,6 +141,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_TRANSPARENT_HUGEPAGE "AnonHugePages: %8lu kB\n" #endif +#ifdef CONFIG_CMA + "CmaTotal: %8lu kB\n" + "CmaFree: %8lu kB\n" +#endif , K(i.totalram), K(i.freeram), @@ -187,12 +194,16 @@ static int meminfo_proc_show(struct seq_file *m, void *v) vmi.used >> 10, vmi.largest_chunk >> 10 #ifdef CONFIG_MEMORY_FAILURE - ,atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) + , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * HPAGE_PMD_NR) #endif +#ifdef CONFIG_CMA + , K(totalcma_pages) + , K(global_page_state(NR_FREE_CMA_PAGES)) +#endif ); hugetlb_report_meminfo(m); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 89026095f2b5..c9eac4563fa8 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -1,10 +1,6 @@ #include <linux/proc_fs.h> #include <linux/nsproxy.h> -#include <linux/sched.h> #include <linux/ptrace.h> -#include <linux/fs_struct.h> -#include <linux/mount.h> -#include <linux/path.h> #include <linux/namei.h> #include <linux/file.h> #include <linux/utsname.h> @@ -34,138 +30,45 @@ static const struct proc_ns_operations *ns_entries[] = { &mntns_operations, }; -static const struct file_operations ns_file_operations = { - .llseek = no_llseek, -}; - -static const struct inode_operations ns_inode_operations = { - .setattr = proc_setattr, -}; - -static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) -{ - struct inode *inode = dentry->d_inode; - const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns.ns_ops; - - return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", - ns_ops->name, inode->i_ino); -} - -const struct dentry_operations ns_dentry_operations = -{ - .d_delete = always_delete_dentry, - .d_dname = ns_dname, -}; - -static struct dentry *proc_ns_get_dentry(struct super_block *sb, - struct task_struct *task, const struct proc_ns_operations *ns_ops) -{ - struct dentry *dentry, *result; - struct inode *inode; - struct proc_inode *ei; - struct qstr qname = { .name = "", }; - void *ns; - - ns = ns_ops->get(task); - if (!ns) - return ERR_PTR(-ENOENT); - - dentry = d_alloc_pseudo(sb, &qname); - if (!dentry) { - ns_ops->put(ns); - return ERR_PTR(-ENOMEM); - } - - inode = iget_locked(sb, ns_ops->inum(ns)); - if (!inode) { - dput(dentry); - ns_ops->put(ns); - return ERR_PTR(-ENOMEM); - } - - ei = PROC_I(inode); - if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_op = &ns_inode_operations; - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &ns_file_operations; - ei->ns.ns_ops = ns_ops; - ei->ns.ns = ns; - unlock_new_inode(inode); - } else { - ns_ops->put(ns); - } - - d_set_d_op(dentry, &ns_dentry_operations); - result = d_instantiate_unique(dentry, inode); - if (result) { - dput(dentry); - dentry = result; - } - - return dentry; -} - static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct proc_inode *ei = PROC_I(inode); + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; void *error = ERR_PTR(-EACCES); task = get_proc_task(inode); if (!task) - goto out; + return error; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out_put_task; - - ns_path.dentry = proc_ns_get_dentry(sb, task, ei->ns.ns_ops); - if (IS_ERR(ns_path.dentry)) { - error = ERR_CAST(ns_path.dentry); - goto out_put_task; + if (ptrace_may_access(task, PTRACE_MODE_READ)) { + error = ns_get_path(&ns_path, task, ns_ops); + if (!error) + nd_jump_link(nd, &ns_path); } - - ns_path.mnt = mntget(nd->path.mnt); - nd_jump_link(nd, &ns_path); - error = NULL; - -out_put_task: put_task_struct(task); -out: return error; } static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = dentry->d_inode; - struct proc_inode *ei = PROC_I(inode); - const struct proc_ns_operations *ns_ops = ei->ns.ns_ops; + const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; - void *ns; char name[50]; int res = -EACCES; task = get_proc_task(inode); if (!task) - goto out; - - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto out_put_task; + return res; - res = -ENOENT; - ns = ns_ops->get(task); - if (!ns) - goto out_put_task; - - snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); - res = readlink_copy(buffer, buflen, name); - ns_ops->put(ns); -out_put_task: + if (ptrace_may_access(task, PTRACE_MODE_READ)) { + res = ns_get_name(name, sizeof(name), task, ns_ops); + if (res >= 0) + res = readlink_copy(buffer, buflen, name); + } put_task_struct(task); -out: return res; } @@ -189,7 +92,7 @@ static int proc_ns_instantiate(struct inode *dir, ei = PROC_I(inode); inode->i_mode = S_IFLNK|S_IRWXUGO; inode->i_op = &proc_ns_link_inode_operations; - ei->ns.ns_ops = ns_ops; + ei->ns_ops = ns_ops; d_set_d_op(dentry, &pid_dentry_operations); d_add(dentry, inode); @@ -267,31 +170,3 @@ const struct inode_operations proc_ns_dir_inode_operations = { .getattr = pid_getattr, .setattr = proc_setattr, }; - -struct file *proc_ns_fget(int fd) -{ - struct file *file; - - file = fget(fd); - if (!file) - return ERR_PTR(-EBADF); - - if (file->f_op != &ns_file_operations) - goto out_invalid; - - return file; - -out_invalid: - fput(file); - return ERR_PTR(-EINVAL); -} - -struct proc_ns *get_proc_ns(struct inode *inode) -{ - return &PROC_I(inode)->ns; -} - -bool proc_ns_inode(struct inode *inode) -{ - return inode->i_fop == &ns_file_operations; -} diff --git a/fs/proc/page.c b/fs/proc/page.c index 1e3187da1fed..7eee2d8b97d9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -5,6 +5,7 @@ #include <linux/ksm.h> #include <linux/mm.h> #include <linux/mmzone.h> +#include <linux/huge_mm.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/hugetlb.h> @@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page) * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || - PageAnon(compound_head(page)))) - u |= 1 << KPF_THP; + else if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + if (PageLRU(head) || PageAnon(head)) + u |= 1 << KPF_THP; + else if (is_huge_zero_page(head)) { + u |= 1 << KPF_ZERO_PAGE; + u |= 1 << KPF_THP; + } + } else if (is_zero_pfn(page_to_pfn(page))) + u |= 1 << KPF_ZERO_PAGE; + /* * Caveats on high order pages: page->_count will only be set diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a63af3e0a612..1bde894bc624 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -192,6 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net) if (!netd) goto out; + netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; diff --git a/fs/proc/root.c b/fs/proc/root.c index 094e44d4a6be..e74ac9f1a2c0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -251,6 +251,7 @@ struct proc_dir_entry proc_root = { .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, + .subdir = RB_ROOT, .name = "/proc", }; diff --git a/fs/proc/stat.c b/fs/proc/stat.c index bf2d03f8fd3e..510413eb25b8 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -159,7 +159,7 @@ static int show_stat(struct seq_file *p, void *v) /* sum again ? it could be updated? */ for_each_irq_nr(j) - seq_put_decimal_ull(p, ' ', kstat_irqs(j)); + seq_put_decimal_ull(p, ' ', kstat_irqs_usr(j)); seq_printf(p, "\nctxt %llu\n" diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4e0388cffe3d..6dee68d013ff 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -21,7 +21,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap; + unsigned long data, text, lib, swap, ptes, pmds; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); + ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" + "VmPMD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE * sizeof(pte_t) * - atomic_long_read(&mm->nr_ptes)) >> 10, + ptes >> 10, + pmds >> 10, swap << (PAGE_SHIFT-10)); } @@ -433,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = { #ifdef CONFIG_PROC_PAGE_MONITOR struct mem_size_stats { - struct vm_area_struct *vma; unsigned long resident; unsigned long shared_clean; unsigned long shared_dirty; @@ -443,75 +445,97 @@ struct mem_size_stats { unsigned long anonymous; unsigned long anonymous_thp; unsigned long swap; - unsigned long nonlinear; u64 pss; }; +static void smaps_account(struct mem_size_stats *mss, struct page *page, + unsigned long size, bool young, bool dirty) +{ + int mapcount; + + if (PageAnon(page)) + mss->anonymous += size; + + mss->resident += size; + /* Accumulate the size in pages that have been accessed. */ + if (young || PageReferenced(page)) + mss->referenced += size; + mapcount = page_mapcount(page); + if (mapcount >= 2) { + u64 pss_delta; + + if (dirty || PageDirty(page)) + mss->shared_dirty += size; + else + mss->shared_clean += size; + pss_delta = (u64)size << PSS_SHIFT; + do_div(pss_delta, mapcount); + mss->pss += pss_delta; + } else { + if (dirty || PageDirty(page)) + mss->private_dirty += size; + else + mss->private_clean += size; + mss->pss += (u64)size << PSS_SHIFT; + } +} -static void smaps_pte_entry(pte_t ptent, unsigned long addr, - unsigned long ptent_size, struct mm_walk *walk) +static void smaps_pte_entry(pte_t *pte, unsigned long addr, + struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; - struct vm_area_struct *vma = mss->vma; - pgoff_t pgoff = linear_page_index(vma, addr); + struct vm_area_struct *vma = walk->vma; struct page *page = NULL; - int mapcount; - if (pte_present(ptent)) { - page = vm_normal_page(vma, addr, ptent); - } else if (is_swap_pte(ptent)) { - swp_entry_t swpent = pte_to_swp_entry(ptent); + if (pte_present(*pte)) { + page = vm_normal_page(vma, addr, *pte); + } else if (is_swap_pte(*pte)) { + swp_entry_t swpent = pte_to_swp_entry(*pte); if (!non_swap_entry(swpent)) - mss->swap += ptent_size; + mss->swap += PAGE_SIZE; else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); - } else if (pte_file(ptent)) { - if (pte_to_pgoff(ptent) != pgoff) - mss->nonlinear += ptent_size; } if (!page) return; + smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); +} - if (PageAnon(page)) - mss->anonymous += ptent_size; - - if (page->index != pgoff) - mss->nonlinear += ptent_size; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; + struct page *page; - mss->resident += ptent_size; - /* Accumulate the size in pages that have been accessed. */ - if (pte_young(ptent) || PageReferenced(page)) - mss->referenced += ptent_size; - mapcount = page_mapcount(page); - if (mapcount >= 2) { - if (pte_dirty(ptent) || PageDirty(page)) - mss->shared_dirty += ptent_size; - else - mss->shared_clean += ptent_size; - mss->pss += (ptent_size << PSS_SHIFT) / mapcount; - } else { - if (pte_dirty(ptent) || PageDirty(page)) - mss->private_dirty += ptent_size; - else - mss->private_clean += ptent_size; - mss->pss += (ptent_size << PSS_SHIFT); - } + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (IS_ERR_OR_NULL(page)) + return; + mss->anonymous_thp += HPAGE_PMD_SIZE; + smaps_account(mss, page, HPAGE_PMD_SIZE, + pmd_young(*pmd), pmd_dirty(*pmd)); +} +#else +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, + struct mm_walk *walk) +{ } +#endif static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct mem_size_stats *mss = walk->private; - struct vm_area_struct *vma = mss->vma; + struct vm_area_struct *vma = walk->vma; pte_t *pte; spinlock_t *ptl; if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); - mss->anonymous_thp += HPAGE_PMD_SIZE; return 0; } @@ -524,7 +548,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) - smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); + smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); cond_resched(); return 0; @@ -552,6 +576,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", [ilog2(VM_DENYWRITE)] = "dw", +#ifdef CONFIG_X86_INTEL_MPX + [ilog2(VM_MPX)] = "mp", +#endif [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", @@ -561,7 +588,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", - [ilog2(VM_NONLINEAR)] = "nl", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_DONTDUMP)] = "dd", #ifdef CONFIG_MEM_SOFT_DIRTY @@ -595,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) }; memset(&mss, 0, sizeof mss); - mss.vma = vma; /* mmap_sem is held in m_start */ - if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); + walk_page_vma(vma, &smaps_walk); show_map_vma(m, vma, is_pid); @@ -633,10 +657,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) (vma->vm_flags & VM_LOCKED) ? (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); - if (vma->vm_flags & VM_NONLINEAR) - seq_printf(m, "Nonlinear: %8lu kB\n", - mss.nonlinear >> 10); - show_smap_vma_flags(m, vma); m_cache_vma(m, vma); return 0; @@ -712,18 +732,18 @@ enum clear_refs_types { CLEAR_REFS_ANON, CLEAR_REFS_MAPPED, CLEAR_REFS_SOFT_DIRTY, + CLEAR_REFS_MM_HIWATER_RSS, CLEAR_REFS_LAST, }; struct clear_refs_private { - struct vm_area_struct *vma; enum clear_refs_types type; }; +#ifdef CONFIG_MEM_SOFT_DIRTY static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { -#ifdef CONFIG_MEM_SOFT_DIRTY /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the @@ -737,24 +757,63 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); } else if (is_swap_pte(ptent)) { ptent = pte_swp_clear_soft_dirty(ptent); - } else if (pte_file(ptent)) { - ptent = pte_file_clear_soft_dirty(ptent); } set_pte_at(vma->vm_mm, addr, pte, ptent); -#endif } +static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + + pmd = pmd_wrprotect(pmd); + pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); + + if (vma->vm_flags & VM_SOFTDIRTY) + vma->vm_flags &= ~VM_SOFTDIRTY; + + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); +} + +#else + +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +} + +static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ +} +#endif + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct clear_refs_private *cp = walk->private; - struct vm_area_struct *vma = cp->vma; + struct vm_area_struct *vma = walk->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; - split_huge_page_pmd(vma, addr, pmd); + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty_pmd(vma, addr, pmd); + goto out; + } + + page = pmd_page(*pmd); + + /* Clear accessed and referenced bits. */ + pmdp_test_and_clear_young(vma, addr, pmd); + ClearPageReferenced(page); +out: + spin_unlock(ptl); + return 0; + } + if (pmd_trans_unstable(pmd)) return 0; @@ -783,6 +842,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } +static int clear_refs_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = walk->vma; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. + * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. + * Writing 4 to /proc/pid/clear_refs affects all pages. + */ + if (cp->type == CLEAR_REFS_ANON && vma->vm_file) + return 1; + if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) + return 1; + return 0; +} + static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -823,9 +904,22 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, + .test_walk = clear_refs_test_walk, .mm = mm, .private = &cp, }; + + if (type == CLEAR_REFS_MM_HIWATER_RSS) { + /* + * Writing 5 to /proc/pid/clear_refs resets the peak + * resident set size to this mm's current rss value. + */ + down_write(&mm->mmap_sem); + reset_mm_hiwater_rss(mm); + up_write(&mm->mmap_sem); + goto out_mm; + } + down_read(&mm->mmap_sem); if (type == CLEAR_REFS_SOFT_DIRTY) { for (vma = mm->mmap; vma; vma = vma->vm_next) { @@ -842,32 +936,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } mmu_notifier_invalidate_range_start(mm, 0, -1); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { - cp.vma = vma; - if (is_vm_hugetlb_page(vma)) - continue; - /* - * Writing 1 to /proc/pid/clear_refs affects all pages. - * - * Writing 2 to /proc/pid/clear_refs only affects - * Anonymous pages. - * - * Writing 3 to /proc/pid/clear_refs only affects file - * mapped pages. - * - * Writing 4 to /proc/pid/clear_refs affects all pages. - */ - if (type == CLEAR_REFS_ANON && vma->vm_file) - continue; - if (type == CLEAR_REFS_MAPPED && !vma->vm_file) - continue; - walk_page_range(vma->vm_start, vma->vm_end, - &clear_refs_walk); - } + walk_page_range(0, ~0UL, &clear_refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); up_read(&mm->mmap_sem); +out_mm: mmput(mm); } put_task_struct(task); @@ -1031,15 +1105,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma; + struct vm_area_struct *vma = walk->vma; struct pagemapread *pm = walk->private; spinlock_t *ptl; - pte_t *pte; + pte_t *pte, *orig_pte; int err = 0; - /* find the first VMA at or above 'addr' */ - vma = find_vma(walk->mm, addr); - if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { int pmd_flags2; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) @@ -1065,51 +1137,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (pmd_trans_unstable(pmd)) return 0; - while (1) { - /* End of address space hole, which we mark as non-present. */ - unsigned long hole_end; - - if (vma) - hole_end = min(end, vma->vm_start); - else - hole_end = end; - - for (; addr < hole_end; addr += PAGE_SIZE) { - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); - - err = add_to_pagemap(addr, &pme, pm); - if (err) - return err; - } - - if (!vma || vma->vm_start >= end) - break; - /* - * We can't possibly be in a hugetlb VMA. In general, - * for a mm_walk with a pmd_entry and a hugetlb_entry, - * the pmd_entry can only be called on addresses in a - * hugetlb if the walk starts in a non-hugetlb VMA and - * spans a hugepage VMA. Since pagemap_read walks are - * PMD-sized and PMD-aligned, this will never be true. - */ - BUG_ON(is_vm_hugetlb_page(vma)); - - /* Addresses in the VMA. */ - for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { - pagemap_entry_t pme; - pte = pte_offset_map(pmd, addr); - pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); - pte_unmap(pte); - err = add_to_pagemap(addr, &pme, pm); - if (err) - return err; - } + /* + * We can assume that @vma always points to a valid one and @end never + * goes beyond vma->vm_end. + */ + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + for (; addr < end; pte++, addr += PAGE_SIZE) { + pagemap_entry_t pme; - if (addr == end) + pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); + err = add_to_pagemap(addr, &pme, pm); + if (err) break; - - vma = find_vma(walk->mm, addr); } + pte_unmap_unlock(orig_pte, ptl); cond_resched(); @@ -1135,15 +1176,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, struct mm_walk *walk) { struct pagemapread *pm = walk->private; - struct vm_area_struct *vma; + struct vm_area_struct *vma = walk->vma; int err = 0; int flags2; pagemap_entry_t pme; - vma = find_vma(walk->mm, addr); - WARN_ON_ONCE(!vma); - - if (vma && (vma->vm_flags & VM_SOFTDIRTY)) + if (vma->vm_flags & VM_SOFTDIRTY) flags2 = __PM_SOFT_DIRTY; else flags2 = 0; @@ -1287,6 +1325,9 @@ out: static int pagemap_open(struct inode *inode, struct file *file) { + /* do not disclose physical addresses: attack vector */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " "to stop being page-shift some time soon. See the " "linux/Documentation/vm/pagemap.txt for details.\n"); @@ -1303,7 +1344,6 @@ const struct file_operations proc_pagemap_operations = { #ifdef CONFIG_NUMA struct numa_maps { - struct vm_area_struct *vma; unsigned long pages; unsigned long anon; unsigned long active; @@ -1372,18 +1412,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct numa_maps *md; + struct numa_maps *md = walk->private; + struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *orig_pte; pte_t *pte; - md = walk->private; - - if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; - page = can_gather_numa_stats(huge_pte, md->vma, addr); + page = can_gather_numa_stats(huge_pte, vma, addr); if (page) gather_stats(page, md, pte_dirty(huge_pte), HPAGE_PMD_SIZE/PAGE_SIZE); @@ -1395,7 +1434,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, return 0; orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { - struct page *page = can_gather_numa_stats(*pte, md->vma, addr); + struct page *page = can_gather_numa_stats(*pte, vma, addr); if (!page) continue; gather_stats(page, md, pte_dirty(*pte), 1); @@ -1405,7 +1444,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, return 0; } #ifdef CONFIG_HUGETLB_PAGE -static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, +static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct numa_maps *md; @@ -1424,7 +1463,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, } #else -static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, +static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; @@ -1442,7 +1481,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mm_walk walk = {}; + struct mm_walk walk = { + .hugetlb_entry = gather_hugetlb_stats, + .pmd_entry = gather_pte_stats, + .private = md, + .mm = mm, + }; struct mempolicy *pol; char buffer[64]; int nid; @@ -1453,13 +1497,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) /* Ensure we start with an empty set of numa_maps statistics. */ memset(md, 0, sizeof(*md)); - md->vma = vma; - - walk.hugetlb_entry = gather_hugetbl_stats; - walk.pmd_entry = gather_pte_stats; - walk.private = md; - walk.mm = mm; - pol = __get_vma_policy(vma, vma->vm_start); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); @@ -1493,7 +1530,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (is_vm_hugetlb_page(vma)) seq_puts(m, " huge"); - walk_page_range(vma->vm_start, vma->vm_end, &walk); + /* mmap_sem is held by m_start */ + walk_page_vma(vma, &walk); if (!md->pages) goto out; @@ -1522,6 +1560,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) for_each_node_state(nid, N_MEMORY) if (md->node[nid]) seq_printf(m, " N%d=%lu", nid, md->node[nid]); + + seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); out: seq_putc(m, '\n'); m_cache_vma(m, vma); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a90d6d354199..4e61388ec03d 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -546,8 +546,8 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) nhdr_ptr = notes_section; while (nhdr_ptr->n_namesz != 0) { sz = sizeof(Elf64_Nhdr) + - ((nhdr_ptr->n_namesz + 3) & ~3) + - ((nhdr_ptr->n_descsz + 3) & ~3); + (((u64)nhdr_ptr->n_namesz + 3) & ~3) + + (((u64)nhdr_ptr->n_descsz + 3) & ~3); if ((real_sz + sz) > max_sz) { pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); @@ -732,8 +732,8 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) nhdr_ptr = notes_section; while (nhdr_ptr->n_namesz != 0) { sz = sizeof(Elf32_Nhdr) + - ((nhdr_ptr->n_namesz + 3) & ~3) + - ((nhdr_ptr->n_descsz + 3) & ~3); + (((u64)nhdr_ptr->n_namesz + 3) & ~3) + + (((u64)nhdr_ptr->n_descsz + 3) & ~3); if ((real_sz + sz) > max_sz) { pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 73ca1740d839..8db932da4009 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb) { MS_SYNCHRONOUS, ",sync" }, { MS_DIRSYNC, ",dirsync" }, { MS_MANDLOCK, ",mand" }, + { MS_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; @@ -91,6 +92,7 @@ static void show_type(struct seq_file *m, struct super_block *sb) static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { + struct proc_mounts *p = proc_mounts(m); struct mount *r = real_mount(mnt); int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -104,7 +106,10 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) mangle(m, r->mnt_devname ? r->mnt_devname : "none"); } seq_putc(m, ' '); - seq_path(m, &mnt_path, " \t\n\\"); + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + if (err) + goto out; seq_putc(m, ' '); show_type(m, sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); @@ -125,7 +130,6 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - struct path root = p->root; int err = 0; seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, @@ -139,7 +143,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ - err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); if (err) goto out; @@ -182,6 +186,7 @@ out: static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) { + struct proc_mounts *p = proc_mounts(m); struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; @@ -201,7 +206,10 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) /* mount point */ seq_puts(m, " mounted on "); - seq_path(m, &mnt_path, " \t\n\\"); + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &p->root, " \t\n\\"); + if (err) + goto out; seq_putc(m, ' '); /* file system type */ @@ -216,6 +224,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) } seq_putc(m, '\n'); +out: return err; } diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 983d9510becc..916b8e23d968 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -21,6 +21,16 @@ config PSTORE_CONSOLE When the option is enabled, pstore will log all kernel messages, even if no oops or panic happened. +config PSTORE_PMSG + bool "Log user space messages" + depends on PSTORE + help + When the option is enabled, pstore will export a character + interface /dev/pmsg0 to log user space messages. On reboot + data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID]. + + If unsure, say N. + config PSTORE_FTRACE bool "Persistent function tracer" depends on PSTORE diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 4c9095c2781e..e647d8e81712 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -7,5 +7,7 @@ obj-y += pstore.o pstore-objs += inode.o platform.o obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o +obj-$(CONFIG_PSTORE_PMSG) += pmsg.o + ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index fafb7a02a5d6..b32ce53d24ee 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -36,6 +36,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/uaccess.h> +#include <linux/syslog.h> #include "internal.h" @@ -120,6 +121,18 @@ static const struct seq_operations pstore_ftrace_seq_ops = { .show = pstore_ftrace_seq_show, }; +static int pstore_check_syslog_permissions(struct pstore_private *ps) +{ + switch (ps->type) { + case PSTORE_TYPE_DMESG: + case PSTORE_TYPE_CONSOLE: + return check_syslog_permissions(SYSLOG_ACTION_READ_ALL, + SYSLOG_FROM_READER); + default: + return 0; + } +} + static ssize_t pstore_file_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -138,6 +151,10 @@ static int pstore_file_open(struct inode *inode, struct file *file) int err; const struct seq_operations *sops = NULL; + err = pstore_check_syslog_permissions(ps); + if (err) + return err; + if (ps->type == PSTORE_TYPE_FTRACE) sops = &pstore_ftrace_seq_ops; @@ -174,6 +191,11 @@ static const struct file_operations pstore_file_operations = { static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = dentry->d_inode->i_private; + int err; + + err = pstore_check_syslog_permissions(p); + if (err) + return err; if (p->psi->erase) p->psi->erase(p->type, p->id, p->count, @@ -316,32 +338,38 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, switch (type) { case PSTORE_TYPE_DMESG: - sprintf(name, "dmesg-%s-%lld%s", psname, id, - compressed ? ".enc.z" : ""); + scnprintf(name, sizeof(name), "dmesg-%s-%lld%s", + psname, id, compressed ? ".enc.z" : ""); break; case PSTORE_TYPE_CONSOLE: - sprintf(name, "console-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "console-%s-%lld", psname, id); break; case PSTORE_TYPE_FTRACE: - sprintf(name, "ftrace-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id); break; case PSTORE_TYPE_MCE: - sprintf(name, "mce-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id); break; case PSTORE_TYPE_PPC_RTAS: - sprintf(name, "rtas-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id); break; case PSTORE_TYPE_PPC_OF: - sprintf(name, "powerpc-ofw-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld", + psname, id); break; case PSTORE_TYPE_PPC_COMMON: - sprintf(name, "powerpc-common-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "powerpc-common-%s-%lld", + psname, id); + break; + case PSTORE_TYPE_PMSG: + scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id); break; case PSTORE_TYPE_UNKNOWN: - sprintf(name, "unknown-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id); break; default: - sprintf(name, "type%d-%s-%lld", type, psname, id); + scnprintf(name, sizeof(name), "type%d-%s-%lld", + type, psname, id); break; } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 3b3d305277c4..c36ba2cd0b5d 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -45,6 +45,12 @@ extern void pstore_register_ftrace(void); static inline void pstore_register_ftrace(void) {} #endif +#ifdef CONFIG_PSTORE_PMSG +extern void pstore_register_pmsg(void); +#else +static inline void pstore_register_pmsg(void) {} +#endif + extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 0a9b72cdfeca..c4c9a10c5760 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -301,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, if (big_oops_buf) { dst = big_oops_buf; - hsize = sprintf(dst, "%s#%d Part%d\n", why, + hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part); size = big_oops_buf_sz - hsize; @@ -321,7 +321,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, } } else { dst = psinfo->buf; - hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, + hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part); size = psinfo->bufsize - hsize; dst += hsize; @@ -447,6 +447,7 @@ int pstore_register(struct pstore_info *psi) if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { pstore_register_console(); pstore_register_ftrace(); + pstore_register_pmsg(); } if (pstore_update_ms >= 0) { diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c new file mode 100644 index 000000000000..feb5dd2948b4 --- /dev/null +++ b/fs/pstore/pmsg.c @@ -0,0 +1,114 @@ +/* + * Copyright 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/cdev.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include "internal.h" + +static DEFINE_MUTEX(pmsg_lock); +#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE) + +static ssize_t write_pmsg(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + size_t i, buffer_size; + char *buffer; + + if (!count) + return 0; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + buffer_size = count; + if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) + buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; + buffer = vmalloc(buffer_size); + + mutex_lock(&pmsg_lock); + for (i = 0; i < count; ) { + size_t c = min(count - i, buffer_size); + u64 id; + long ret; + + ret = __copy_from_user(buffer, buf + i, c); + if (unlikely(ret != 0)) { + mutex_unlock(&pmsg_lock); + vfree(buffer); + return -EFAULT; + } + psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c, + psinfo); + + i += c; + } + + mutex_unlock(&pmsg_lock); + vfree(buffer); + return count; +} + +static const struct file_operations pmsg_fops = { + .owner = THIS_MODULE, + .llseek = noop_llseek, + .write = write_pmsg, +}; + +static struct class *pmsg_class; +static int pmsg_major; +#define PMSG_NAME "pmsg" +#undef pr_fmt +#define pr_fmt(fmt) PMSG_NAME ": " fmt + +static char *pmsg_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0220; + return NULL; +} + +void pstore_register_pmsg(void) +{ + struct device *pmsg_device; + + pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops); + if (pmsg_major < 0) { + pr_err("register_chrdev failed\n"); + goto err; + } + + pmsg_class = class_create(THIS_MODULE, PMSG_NAME); + if (IS_ERR(pmsg_class)) { + pr_err("device class file already in use\n"); + goto err_class; + } + pmsg_class->devnode = pmsg_devnode; + + pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0), + NULL, "%s%d", PMSG_NAME, 0); + if (IS_ERR(pmsg_device)) { + pr_err("failed to create device\n"); + goto err_device; + } + return; + +err_device: + class_destroy(pmsg_class); +err_class: + unregister_chrdev(pmsg_major, PMSG_NAME); +err: + return; +} diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 3b5744306ed8..39d1373128e9 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -51,6 +51,10 @@ static ulong ramoops_ftrace_size = MIN_MEM_SIZE; module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400); MODULE_PARM_DESC(ftrace_size, "size of ftrace log"); +static ulong ramoops_pmsg_size = MIN_MEM_SIZE; +module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400); +MODULE_PARM_DESC(pmsg_size, "size of user space message log"); + static ulong mem_address; module_param(mem_address, ulong, 0400); MODULE_PARM_DESC(mem_address, @@ -61,6 +65,11 @@ module_param(mem_size, ulong, 0400); MODULE_PARM_DESC(mem_size, "size of reserved RAM used to store oops/panic logs"); +static unsigned int mem_type; +module_param(mem_type, uint, 0600); +MODULE_PARM_DESC(mem_type, + "set to 1 to try to use unbuffered memory (default 0)"); + static int dump_oops = 1; module_param(dump_oops, int, 0600); MODULE_PARM_DESC(dump_oops, @@ -77,11 +86,14 @@ struct ramoops_context { struct persistent_ram_zone **przs; struct persistent_ram_zone *cprz; struct persistent_ram_zone *fprz; + struct persistent_ram_zone *mprz; phys_addr_t phys_addr; unsigned long size; + unsigned int memtype; size_t record_size; size_t console_size; size_t ftrace_size; + size_t pmsg_size; int dump_oops; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; @@ -90,6 +102,7 @@ struct ramoops_context { unsigned int dump_read_cnt; unsigned int console_read_cnt; unsigned int ftrace_read_cnt; + unsigned int pmsg_read_cnt; struct pstore_info pstore; }; @@ -103,6 +116,7 @@ static int ramoops_pstore_open(struct pstore_info *psi) cxt->dump_read_cnt = 0; cxt->console_read_cnt = 0; cxt->ftrace_read_cnt = 0; + cxt->pmsg_read_cnt = 0; return 0; } @@ -135,25 +149,33 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, return prz; } -static void ramoops_read_kmsg_hdr(char *buffer, struct timespec *time, +static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time, bool *compressed) { char data_type; + int header_length = 0; - if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n", - &time->tv_sec, &time->tv_nsec, &data_type) == 3) { + if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu-%c\n%n", &time->tv_sec, + &time->tv_nsec, &data_type, &header_length) == 3) { if (data_type == 'C') *compressed = true; else *compressed = false; - } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", - &time->tv_sec, &time->tv_nsec) == 2) { + } else if (sscanf(buffer, RAMOOPS_KERNMSG_HDR "%lu.%lu\n%n", + &time->tv_sec, &time->tv_nsec, &header_length) == 2) { *compressed = false; } else { time->tv_sec = 0; time->tv_nsec = 0; *compressed = false; } + return header_length; +} + +static bool prz_ok(struct persistent_ram_zone *prz) +{ + return !!prz && !!(persistent_ram_old_size(prz) + + persistent_ram_ecc_string(prz, NULL, 0)); } static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, @@ -165,20 +187,30 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, ssize_t ecc_notice_size; struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz; + int header_length; prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, cxt->max_dump_cnt, id, type, PSTORE_TYPE_DMESG, 1); - if (!prz) + if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, 1, id, type, PSTORE_TYPE_CONSOLE, 0); - if (!prz) + if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, 1, id, type, PSTORE_TYPE_FTRACE, 0); - if (!prz) + if (!prz_ok(prz)) + prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, + 1, id, type, PSTORE_TYPE_PMSG, 0); + if (!prz_ok(prz)) + return 0; + + if (!persistent_ram_old(prz)) return 0; size = persistent_ram_old_size(prz); + header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), time, + compressed); + size -= header_length; /* ECC correction notice */ ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); @@ -187,8 +219,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (*buf == NULL) return -ENOMEM; - memcpy(*buf, persistent_ram_old(prz), size); - ramoops_read_kmsg_hdr(*buf, time, compressed); + memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); persistent_ram_ecc_string(prz, *buf + size, ecc_notice_size + 1); return size + ecc_notice_size; @@ -238,6 +269,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return -ENOMEM; persistent_ram_write(cxt->fprz, buf, size); return 0; + } else if (type == PSTORE_TYPE_PMSG) { + if (!cxt->mprz) + return -ENOMEM; + persistent_ram_write(cxt->mprz, buf, size); + return 0; } if (type != PSTORE_TYPE_DMESG) @@ -295,6 +331,9 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, case PSTORE_TYPE_FTRACE: prz = cxt->fprz; break; + case PSTORE_TYPE_PMSG: + prz = cxt->mprz; + break; default: return -EINVAL; } @@ -358,7 +397,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, size_t sz = cxt->record_size; cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, - &cxt->ecc_info); + &cxt->ecc_info, + cxt->memtype); if (IS_ERR(cxt->przs[i])) { err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", @@ -388,7 +428,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); @@ -420,7 +460,7 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && - !pdata->ftrace_size)) { + !pdata->ftrace_size && !pdata->pmsg_size)) { pr_err("The memory size and the record/console size must be " "non-zero\n"); goto fail_out; @@ -432,18 +472,23 @@ static int ramoops_probe(struct platform_device *pdev) pdata->console_size = rounddown_pow_of_two(pdata->console_size); if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); + if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size)) + pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size); cxt->size = pdata->mem_size; cxt->phys_addr = pdata->mem_address; + cxt->memtype = pdata->mem_type; cxt->record_size = pdata->record_size; cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; + cxt->pmsg_size = pdata->pmsg_size; cxt->dump_oops = pdata->dump_oops; cxt->ecc_info = pdata->ecc_info; paddr = cxt->phys_addr; - dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size; + dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size + - cxt->pmsg_size; err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); if (err) goto fail_out; @@ -458,13 +503,9 @@ static int ramoops_probe(struct platform_device *pdev) if (err) goto fail_init_fprz; - if (!cxt->przs && !cxt->cprz && !cxt->fprz) { - pr_err("memory size too small, minimum is %zu\n", - cxt->console_size + cxt->record_size + - cxt->ftrace_size); - err = -EINVAL; - goto fail_cnt; - } + err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0); + if (err) + goto fail_init_mprz; cxt->pstore.data = cxt; /* @@ -509,7 +550,8 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; -fail_cnt: + kfree(cxt->mprz); +fail_init_mprz: kfree(cxt->fprz); fail_init_fprz: kfree(cxt->cprz); @@ -545,7 +587,6 @@ static struct platform_driver ramoops_driver = { .remove = __exit_p(ramoops_remove), .driver = { .name = "ramoops", - .owner = THIS_MODULE, }, }; @@ -564,9 +605,11 @@ static void ramoops_register_dummy(void) dummy_data->mem_size = mem_size; dummy_data->mem_address = mem_address; + dummy_data->mem_type = 0; dummy_data->record_size = record_size; dummy_data->console_size = ramoops_console_size; dummy_data->ftrace_size = ramoops_ftrace_size; + dummy_data->pmsg_size = ramoops_pmsg_size; dummy_data->dump_oops = dump_oops; /* * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 9d7b9a83699e..76c3f80efdfa 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -380,7 +380,8 @@ void persistent_ram_zap(struct persistent_ram_zone *prz) persistent_ram_update_header_ecc(prz); } -static void *persistent_ram_vmap(phys_addr_t start, size_t size) +static void *persistent_ram_vmap(phys_addr_t start, size_t size, + unsigned int memtype) { struct page **pages; phys_addr_t page_start; @@ -392,7 +393,10 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size) page_start = start - offset_in_page(start); page_count = DIV_ROUND_UP(size + offset_in_page(start), PAGE_SIZE); - prot = pgprot_noncached(PAGE_KERNEL); + if (memtype) + prot = pgprot_noncached(PAGE_KERNEL); + else + prot = pgprot_writecombine(PAGE_KERNEL); pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); if (!pages) { @@ -411,8 +415,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size) return vaddr; } -static void *persistent_ram_iomap(phys_addr_t start, size_t size) +static void *persistent_ram_iomap(phys_addr_t start, size_t size, + unsigned int memtype) { + void *va; + if (!request_mem_region(start, size, "persistent_ram")) { pr_err("request mem region (0x%llx@0x%llx) failed\n", (unsigned long long)size, (unsigned long long)start); @@ -422,19 +429,24 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size) buffer_start_add = buffer_start_add_locked; buffer_size_add = buffer_size_add_locked; - return ioremap(start, size); + if (memtype) + va = ioremap(start, size); + else + va = ioremap_wc(start, size); + + return va; } static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, - struct persistent_ram_zone *prz) + struct persistent_ram_zone *prz, int memtype) { prz->paddr = start; prz->size = size; if (pfn_valid(start >> PAGE_SHIFT)) - prz->vaddr = persistent_ram_vmap(start, size); + prz->vaddr = persistent_ram_vmap(start, size, memtype); else - prz->vaddr = persistent_ram_iomap(start, size); + prz->vaddr = persistent_ram_iomap(start, size, memtype); if (!prz->vaddr) { pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__, @@ -500,7 +512,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz) } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, - u32 sig, struct persistent_ram_ecc_info *ecc_info) + u32 sig, struct persistent_ram_ecc_info *ecc_info, + unsigned int memtype) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -511,7 +524,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, goto err; } - ret = persistent_ram_buffer_map(start, size, prz); + ret = persistent_ram_buffer_map(start, size, prz, memtype); if (ret) goto err; diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index c51df1dd237e..4a09975aac90 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -5,6 +5,7 @@ config QUOTA bool "Quota support" select QUOTACTL + select SRCU help If you say Y here, you will be able to set per user limits for disk usage (also called disk quotas). Currently, it works for the diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 6b4527216a7f..0ccd4ba3a246 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -893,6 +893,11 @@ out: } EXPORT_SYMBOL(dqget); +static inline struct dquot **i_dquot(struct inode *inode) +{ + return inode->i_sb->s_op->get_dquots(inode); +} + static int dqinit_needed(struct inode *inode, int type) { int cnt; @@ -900,9 +905,9 @@ static int dqinit_needed(struct inode *inode, int type) if (IS_NOQUOTA(inode)) return 0; if (type != -1) - return !inode->i_dquot[type]; + return !i_dquot(inode)[type]; for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (!inode->i_dquot[cnt]) + if (!i_dquot(inode)[cnt]) return 1; return 0; } @@ -965,9 +970,9 @@ static void add_dquot_ref(struct super_block *sb, int type) static void remove_inode_dquot_ref(struct inode *inode, int type, struct list_head *tofree_head) { - struct dquot *dquot = inode->i_dquot[type]; + struct dquot *dquot = i_dquot(inode)[type]; - inode->i_dquot[type] = NULL; + i_dquot(inode)[type] = NULL; if (!dquot) return; @@ -1243,7 +1248,7 @@ static int ignore_hardlimit(struct dquot *dquot) return capable(CAP_SYS_RESOURCE) && (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || - !(info->dqi_flags & V1_DQF_RSQUASH)); + !(info->dqi_flags & DQF_ROOT_SQUASH)); } /* needs dq_data_lock */ @@ -1402,7 +1407,7 @@ static void __dquot_initialize(struct inode *inode, int type) * we check it without locking here to avoid unnecessary * dqget()/dqput() calls. */ - if (inode->i_dquot[cnt]) + if (i_dquot(inode)[cnt]) continue; init_needed = 1; @@ -1433,8 +1438,8 @@ static void __dquot_initialize(struct inode *inode, int type) /* We could race with quotaon or dqget() could have failed */ if (!got[cnt]) continue; - if (!inode->i_dquot[cnt]) { - inode->i_dquot[cnt] = got[cnt]; + if (!i_dquot(inode)[cnt]) { + i_dquot(inode)[cnt] = got[cnt]; got[cnt] = NULL; /* * Make quota reservation system happy if someone @@ -1442,7 +1447,7 @@ static void __dquot_initialize(struct inode *inode, int type) */ rsv = inode_get_rsv_space(inode); if (unlikely(rsv)) - dquot_resv_space(inode->i_dquot[cnt], rsv); + dquot_resv_space(i_dquot(inode)[cnt], rsv); } } out_err: @@ -1472,8 +1477,8 @@ static void __dquot_drop(struct inode *inode) spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - put[cnt] = inode->i_dquot[cnt]; - inode->i_dquot[cnt] = NULL; + put[cnt] = i_dquot(inode)[cnt]; + i_dquot(inode)[cnt] = NULL; } spin_unlock(&dq_data_lock); dqput_all(put); @@ -1494,7 +1499,7 @@ void dquot_drop(struct inode *inode) * add quota pointers back anyway. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt]) + if (i_dquot(inode)[cnt]) break; } @@ -1595,7 +1600,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; - struct dquot **dquots = inode->i_dquot; + struct dquot **dquots = i_dquot(inode); int reserve = flags & DQUOT_SPACE_RESERVE; if (!dquot_active(inode)) { @@ -1643,11 +1648,11 @@ EXPORT_SYMBOL(__dquot_alloc_space); /* * This operation can block, but only after everything is updated */ -int dquot_alloc_inode(const struct inode *inode) +int dquot_alloc_inode(struct inode *inode) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; - struct dquot * const *dquots = inode->i_dquot; + struct dquot * const *dquots = i_dquot(inode); if (!dquot_active(inode)) return 0; @@ -1696,14 +1701,14 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt]) - dquot_claim_reserved_space(inode->i_dquot[cnt], + if (i_dquot(inode)[cnt]) + dquot_claim_reserved_space(i_dquot(inode)[cnt], number); } /* Update inode bytes */ inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); - mark_all_dquot_dirty(inode->i_dquot); + mark_all_dquot_dirty(i_dquot(inode)); srcu_read_unlock(&dquot_srcu, index); return 0; } @@ -1725,14 +1730,14 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt]) - dquot_reclaim_reserved_space(inode->i_dquot[cnt], + if (i_dquot(inode)[cnt]) + dquot_reclaim_reserved_space(i_dquot(inode)[cnt], number); } /* Update inode bytes */ inode_reclaim_rsv_space(inode, number); spin_unlock(&dq_data_lock); - mark_all_dquot_dirty(inode->i_dquot); + mark_all_dquot_dirty(i_dquot(inode)); srcu_read_unlock(&dquot_srcu, index); return; } @@ -1745,7 +1750,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; - struct dquot **dquots = inode->i_dquot; + struct dquot **dquots = i_dquot(inode); int reserve = flags & DQUOT_SPACE_RESERVE, index; if (!dquot_active(inode)) { @@ -1784,11 +1789,11 @@ EXPORT_SYMBOL(__dquot_free_space); /* * This operation can block, but only after everything is updated */ -void dquot_free_inode(const struct inode *inode) +void dquot_free_inode(struct inode *inode) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; - struct dquot * const *dquots = inode->i_dquot; + struct dquot * const *dquots = i_dquot(inode); int index; if (!dquot_active(inode)) @@ -1865,7 +1870,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) if (!sb_has_quota_active(inode->i_sb, cnt)) continue; is_valid[cnt] = 1; - transfer_from[cnt] = inode->i_dquot[cnt]; + transfer_from[cnt] = i_dquot(inode)[cnt]; ret = check_idq(transfer_to[cnt], 1, &warn_to[cnt]); if (ret) goto over_quota; @@ -1901,7 +1906,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) dquot_incr_space(transfer_to[cnt], cur_space); dquot_resv_space(transfer_to[cnt], rsv_space); - inode->i_dquot[cnt] = transfer_to[cnt]; + i_dquot(inode)[cnt] = transfer_to[cnt]; } spin_unlock(&dq_data_lock); @@ -2380,41 +2385,106 @@ out: } EXPORT_SYMBOL(dquot_quota_on_mount); -static inline qsize_t qbtos(qsize_t blocks) +static int dquot_quota_enable(struct super_block *sb, unsigned int flags) { - return blocks << QIF_DQBLKSIZE_BITS; + int ret; + int type; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) + return -ENOSYS; + /* Accounting cannot be turned on while fs is mounted */ + flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT); + if (!flags) + return -EINVAL; + for (type = 0; type < MAXQUOTAS; type++) { + if (!(flags & qtype_enforce_flag(type))) + continue; + /* Can't enforce without accounting */ + if (!sb_has_quota_usage_enabled(sb, type)) + return -EINVAL; + ret = dquot_enable(dqopt->files[type], type, + dqopt->info[type].dqi_fmt_id, + DQUOT_LIMITS_ENABLED); + if (ret < 0) + goto out_err; + } + return 0; +out_err: + /* Backout enforcement enablement we already did */ + for (type--; type >= 0; type--) { + if (flags & qtype_enforce_flag(type)) + dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); + } + /* Error code translation for better compatibility with XFS */ + if (ret == -EBUSY) + ret = -EEXIST; + return ret; } -static inline qsize_t stoqb(qsize_t space) +static int dquot_quota_disable(struct super_block *sb, unsigned int flags) { - return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; + int ret; + int type; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) + return -ENOSYS; + /* + * We don't support turning off accounting via quotactl. In principle + * quota infrastructure can do this but filesystems don't expect + * userspace to be able to do it. + */ + if (flags & + (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT)) + return -EOPNOTSUPP; + + /* Filter out limits not enabled */ + for (type = 0; type < MAXQUOTAS; type++) + if (!sb_has_quota_limits_enabled(sb, type)) + flags &= ~qtype_enforce_flag(type); + /* Nothing left? */ + if (!flags) + return -EEXIST; + for (type = 0; type < MAXQUOTAS; type++) { + if (flags & qtype_enforce_flag(type)) { + ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); + if (ret < 0) + goto out_err; + } + } + return 0; +out_err: + /* Backout enforcement disabling we already did */ + for (type--; type >= 0; type--) { + if (flags & qtype_enforce_flag(type)) + dquot_enable(dqopt->files[type], type, + dqopt->info[type].dqi_fmt_id, + DQUOT_LIMITS_ENABLED); + } + return ret; } /* Generic routine for getting common part of quota structure */ -static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) +static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; memset(di, 0, sizeof(*di)); - di->d_version = FS_DQUOT_VERSION; - di->d_flags = dquot->dq_id.type == USRQUOTA ? - FS_USER_QUOTA : FS_GROUP_QUOTA; - di->d_id = from_kqid_munged(current_user_ns(), dquot->dq_id); - spin_lock(&dq_data_lock); - di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit); - di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit); + di->d_spc_hardlimit = dm->dqb_bhardlimit; + di->d_spc_softlimit = dm->dqb_bsoftlimit; di->d_ino_hardlimit = dm->dqb_ihardlimit; di->d_ino_softlimit = dm->dqb_isoftlimit; - di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace; - di->d_icount = dm->dqb_curinodes; - di->d_btimer = dm->dqb_btime; - di->d_itimer = dm->dqb_itime; + di->d_space = dm->dqb_curspace + dm->dqb_rsvspace; + di->d_ino_count = dm->dqb_curinodes; + di->d_spc_timer = dm->dqb_btime; + di->d_ino_timer = dm->dqb_itime; spin_unlock(&dq_data_lock); } int dquot_get_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *di) + struct qc_dqblk *di) { struct dquot *dquot; @@ -2428,70 +2498,70 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid, } EXPORT_SYMBOL(dquot_get_dqblk); -#define VFS_FS_DQ_MASK \ - (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ - FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \ - FS_DQ_BTIMER | FS_DQ_ITIMER) +#define VFS_QC_MASK \ + (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \ + QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \ + QC_SPC_TIMER | QC_INO_TIMER) /* Generic routine for setting common part of quota structure */ -static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) +static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; - if (di->d_fieldmask & ~VFS_FS_DQ_MASK) + if (di->d_fieldmask & ~VFS_QC_MASK) return -EINVAL; - if (((di->d_fieldmask & FS_DQ_BSOFT) && - (di->d_blk_softlimit > dqi->dqi_maxblimit)) || - ((di->d_fieldmask & FS_DQ_BHARD) && - (di->d_blk_hardlimit > dqi->dqi_maxblimit)) || - ((di->d_fieldmask & FS_DQ_ISOFT) && - (di->d_ino_softlimit > dqi->dqi_maxilimit)) || - ((di->d_fieldmask & FS_DQ_IHARD) && - (di->d_ino_hardlimit > dqi->dqi_maxilimit))) + if (((di->d_fieldmask & QC_SPC_SOFT) && + di->d_spc_softlimit > dqi->dqi_max_spc_limit) || + ((di->d_fieldmask & QC_SPC_HARD) && + di->d_spc_hardlimit > dqi->dqi_max_spc_limit) || + ((di->d_fieldmask & QC_INO_SOFT) && + (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) || + ((di->d_fieldmask & QC_INO_HARD) && + (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) return -ERANGE; spin_lock(&dq_data_lock); - if (di->d_fieldmask & FS_DQ_BCOUNT) { - dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace; + if (di->d_fieldmask & QC_SPACE) { + dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_BSOFT) - dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit); - if (di->d_fieldmask & FS_DQ_BHARD) - dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit); - if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) { + if (di->d_fieldmask & QC_SPC_SOFT) + dm->dqb_bsoftlimit = di->d_spc_softlimit; + if (di->d_fieldmask & QC_SPC_HARD) + dm->dqb_bhardlimit = di->d_spc_hardlimit; + if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) { check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_ICOUNT) { - dm->dqb_curinodes = di->d_icount; + if (di->d_fieldmask & QC_INO_COUNT) { + dm->dqb_curinodes = di->d_ino_count; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_ISOFT) + if (di->d_fieldmask & QC_INO_SOFT) dm->dqb_isoftlimit = di->d_ino_softlimit; - if (di->d_fieldmask & FS_DQ_IHARD) + if (di->d_fieldmask & QC_INO_HARD) dm->dqb_ihardlimit = di->d_ino_hardlimit; - if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) { + if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) { check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_BTIMER) { - dm->dqb_btime = di->d_btimer; + if (di->d_fieldmask & QC_SPC_TIMER) { + dm->dqb_btime = di->d_spc_timer; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_ITIMER) { - dm->dqb_itime = di->d_itimer; + if (di->d_fieldmask & QC_INO_TIMER) { + dm->dqb_itime = di->d_ino_timer; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); } @@ -2501,7 +2571,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) dm->dqb_curspace < dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); - } else if (!(di->d_fieldmask & FS_DQ_BTIMER)) + } else if (!(di->d_fieldmask & QC_SPC_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; } @@ -2510,7 +2580,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) dm->dqb_curinodes < dm->dqb_isoftlimit) { dm->dqb_itime = 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); - } else if (!(di->d_fieldmask & FS_DQ_ITIMER)) + } else if (!(di->d_fieldmask & QC_INO_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_itime = get_seconds() + dqi->dqi_igrace; } @@ -2526,7 +2596,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) } int dquot_set_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *di) + struct qc_dqblk *di) { struct dquot *dquot; int rc; @@ -2577,6 +2647,14 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) goto out; } mi = sb_dqopt(sb)->info + type; + if (ii->dqi_valid & IIF_FLAGS) { + if (ii->dqi_flags & ~DQF_SETINFO_MASK || + (ii->dqi_flags & DQF_ROOT_SQUASH && + mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) { + err = -EINVAL; + goto out; + } + } spin_lock(&dq_data_lock); if (ii->dqi_valid & IIF_BGRACE) mi->dqi_bgrace = ii->dqi_bgrace; @@ -2606,6 +2684,17 @@ const struct quotactl_ops dquot_quotactl_ops = { }; EXPORT_SYMBOL(dquot_quotactl_ops); +const struct quotactl_ops dquot_quotactl_sysfile_ops = { + .quota_enable = dquot_quota_enable, + .quota_disable = dquot_quota_disable, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); + static int do_proc_dqstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2743,8 +2832,8 @@ static int __init dquot_init(void) for (i = 0; i < nr_hash; i++) INIT_HLIST_HEAD(dquot_hash + i); - printk("Dquot-cache hash table entries: %ld (order %ld, %ld bytes)\n", - nr_hash, order, (PAGE_SIZE << order)); + pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," + " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); register_shrinker(&dqcache_shrinker); diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 75621649dbd7..d14a799c7785 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -47,8 +47,11 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd, static void quota_sync_one(struct super_block *sb, void *arg) { - if (sb->s_qcop && sb->s_qcop->quota_sync) - sb->s_qcop->quota_sync(sb, *(int *)arg); + int type = *(int *)arg; + + if (sb->s_qcop && sb->s_qcop->quota_sync && + (sb->s_quota_types & (1 << type))) + sb->s_qcop->quota_sync(sb, type); } static int quota_sync_all(int type) @@ -63,18 +66,40 @@ static int quota_sync_all(int type) return ret; } +unsigned int qtype_enforce_flag(int type) +{ + switch (type) { + case USRQUOTA: + return FS_QUOTA_UDQ_ENFD; + case GRPQUOTA: + return FS_QUOTA_GDQ_ENFD; + case PRJQUOTA: + return FS_QUOTA_PDQ_ENFD; + } + return 0; +} + static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, struct path *path) { - if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta) + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) return -ENOSYS; - if (sb->s_qcop->quota_on_meta) - return sb->s_qcop->quota_on_meta(sb, type, id); + if (sb->s_qcop->quota_enable) + return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type)); if (IS_ERR(path)) return PTR_ERR(path); return sb->s_qcop->quota_on(sb, type, id, path); } +static int quota_quotaoff(struct super_block *sb, int type) +{ + if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable) + return -ENOSYS; + if (sb->s_qcop->quota_disable) + return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type)); + return sb->s_qcop->quota_off(sb, type); +} + static int quota_getfmt(struct super_block *sb, int type, void __user *addr) { __u32 fmt; @@ -115,17 +140,27 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr) return sb->s_qcop->set_info(sb, type, &info); } -static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) +static inline qsize_t qbtos(qsize_t blocks) +{ + return blocks << QIF_DQBLKSIZE_BITS; +} + +static inline qsize_t stoqb(qsize_t space) +{ + return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; +} + +static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src) { memset(dst, 0, sizeof(*dst)); - dst->dqb_bhardlimit = src->d_blk_hardlimit; - dst->dqb_bsoftlimit = src->d_blk_softlimit; - dst->dqb_curspace = src->d_bcount; + dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit); + dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit); + dst->dqb_curspace = src->d_space; dst->dqb_ihardlimit = src->d_ino_hardlimit; dst->dqb_isoftlimit = src->d_ino_softlimit; - dst->dqb_curinodes = src->d_icount; - dst->dqb_btime = src->d_btimer; - dst->dqb_itime = src->d_itimer; + dst->dqb_curinodes = src->d_ino_count; + dst->dqb_btime = src->d_spc_timer; + dst->dqb_itime = src->d_ino_timer; dst->dqb_valid = QIF_ALL; } @@ -133,7 +168,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct kqid qid; - struct fs_disk_quota fdq; + struct qc_dqblk fdq; struct if_dqblk idq; int ret; @@ -151,36 +186,36 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, return 0; } -static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src) +static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src) { - dst->d_blk_hardlimit = src->dqb_bhardlimit; - dst->d_blk_softlimit = src->dqb_bsoftlimit; - dst->d_bcount = src->dqb_curspace; + dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit); + dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit); + dst->d_space = src->dqb_curspace; dst->d_ino_hardlimit = src->dqb_ihardlimit; dst->d_ino_softlimit = src->dqb_isoftlimit; - dst->d_icount = src->dqb_curinodes; - dst->d_btimer = src->dqb_btime; - dst->d_itimer = src->dqb_itime; + dst->d_ino_count = src->dqb_curinodes; + dst->d_spc_timer = src->dqb_btime; + dst->d_ino_timer = src->dqb_itime; dst->d_fieldmask = 0; if (src->dqb_valid & QIF_BLIMITS) - dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD; + dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD; if (src->dqb_valid & QIF_SPACE) - dst->d_fieldmask |= FS_DQ_BCOUNT; + dst->d_fieldmask |= QC_SPACE; if (src->dqb_valid & QIF_ILIMITS) - dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD; + dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD; if (src->dqb_valid & QIF_INODES) - dst->d_fieldmask |= FS_DQ_ICOUNT; + dst->d_fieldmask |= QC_INO_COUNT; if (src->dqb_valid & QIF_BTIME) - dst->d_fieldmask |= FS_DQ_BTIMER; + dst->d_fieldmask |= QC_SPC_TIMER; if (src->dqb_valid & QIF_ITIME) - dst->d_fieldmask |= FS_DQ_ITIMER; + dst->d_fieldmask |= QC_INO_TIMER; } static int quota_setquota(struct super_block *sb, int type, qid_t id, void __user *addr) { - struct fs_disk_quota fdq; + struct qc_dqblk fdq; struct if_dqblk idq; struct kqid qid; @@ -195,15 +230,26 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, return sb->s_qcop->set_dqblk(sb, qid, &fdq); } -static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) +static int quota_enable(struct super_block *sb, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->quota_enable) + return -ENOSYS; + return sb->s_qcop->quota_enable(sb, flags); +} + +static int quota_disable(struct super_block *sb, void __user *addr) { __u32 flags; if (copy_from_user(&flags, addr, sizeof(flags))) return -EFAULT; - if (!sb->s_qcop->set_xstate) + if (!sb->s_qcop->quota_disable) return -ENOSYS; - return sb->s_qcop->set_xstate(sb, flags, cmd); + return sb->s_qcop->quota_disable(sb, flags); } static int quota_getxstate(struct super_block *sb, void __user *addr) @@ -244,10 +290,78 @@ static int quota_getxstatev(struct super_block *sb, void __user *addr) return ret; } +/* + * XFS defines BBTOB and BTOBB macros inside fs/xfs/ and we cannot move them + * out of there as xfsprogs rely on definitions being in that header file. So + * just define same functions here for quota purposes. + */ +#define XFS_BB_SHIFT 9 + +static inline u64 quota_bbtob(u64 blocks) +{ + return blocks << XFS_BB_SHIFT; +} + +static inline u64 quota_btobb(u64 bytes) +{ + return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT; +} + +static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) +{ + dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit); + dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit); + dst->d_ino_hardlimit = src->d_ino_hardlimit; + dst->d_ino_softlimit = src->d_ino_softlimit; + dst->d_space = quota_bbtob(src->d_bcount); + dst->d_ino_count = src->d_icount; + dst->d_ino_timer = src->d_itimer; + dst->d_spc_timer = src->d_btimer; + dst->d_ino_warns = src->d_iwarns; + dst->d_spc_warns = src->d_bwarns; + dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit); + dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit); + dst->d_rt_space = quota_bbtob(src->d_rtbcount); + dst->d_rt_spc_timer = src->d_rtbtimer; + dst->d_rt_spc_warns = src->d_rtbwarns; + dst->d_fieldmask = 0; + if (src->d_fieldmask & FS_DQ_ISOFT) + dst->d_fieldmask |= QC_INO_SOFT; + if (src->d_fieldmask & FS_DQ_IHARD) + dst->d_fieldmask |= QC_INO_HARD; + if (src->d_fieldmask & FS_DQ_BSOFT) + dst->d_fieldmask |= QC_SPC_SOFT; + if (src->d_fieldmask & FS_DQ_BHARD) + dst->d_fieldmask |= QC_SPC_HARD; + if (src->d_fieldmask & FS_DQ_RTBSOFT) + dst->d_fieldmask |= QC_RT_SPC_SOFT; + if (src->d_fieldmask & FS_DQ_RTBHARD) + dst->d_fieldmask |= QC_RT_SPC_HARD; + if (src->d_fieldmask & FS_DQ_BTIMER) + dst->d_fieldmask |= QC_SPC_TIMER; + if (src->d_fieldmask & FS_DQ_ITIMER) + dst->d_fieldmask |= QC_INO_TIMER; + if (src->d_fieldmask & FS_DQ_RTBTIMER) + dst->d_fieldmask |= QC_RT_SPC_TIMER; + if (src->d_fieldmask & FS_DQ_BWARNS) + dst->d_fieldmask |= QC_SPC_WARNS; + if (src->d_fieldmask & FS_DQ_IWARNS) + dst->d_fieldmask |= QC_INO_WARNS; + if (src->d_fieldmask & FS_DQ_RTBWARNS) + dst->d_fieldmask |= QC_RT_SPC_WARNS; + if (src->d_fieldmask & FS_DQ_BCOUNT) + dst->d_fieldmask |= QC_SPACE; + if (src->d_fieldmask & FS_DQ_ICOUNT) + dst->d_fieldmask |= QC_INO_COUNT; + if (src->d_fieldmask & FS_DQ_RTBCOUNT) + dst->d_fieldmask |= QC_RT_SPACE; +} + static int quota_setxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; + struct qc_dqblk qdq; struct kqid qid; if (copy_from_user(&fdq, addr, sizeof(fdq))) @@ -257,13 +371,44 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, qid = make_kqid(current_user_ns(), type, id); if (!qid_valid(qid)) return -EINVAL; - return sb->s_qcop->set_dqblk(sb, qid, &fdq); + copy_from_xfs_dqblk(&qdq, &fdq); + return sb->s_qcop->set_dqblk(sb, qid, &qdq); +} + +static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, + int type, qid_t id) +{ + memset(dst, 0, sizeof(*dst)); + dst->d_version = FS_DQUOT_VERSION; + dst->d_id = id; + if (type == USRQUOTA) + dst->d_flags = FS_USER_QUOTA; + else if (type == PRJQUOTA) + dst->d_flags = FS_PROJ_QUOTA; + else + dst->d_flags = FS_GROUP_QUOTA; + dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit); + dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit); + dst->d_ino_hardlimit = src->d_ino_hardlimit; + dst->d_ino_softlimit = src->d_ino_softlimit; + dst->d_bcount = quota_btobb(src->d_space); + dst->d_icount = src->d_ino_count; + dst->d_itimer = src->d_ino_timer; + dst->d_btimer = src->d_spc_timer; + dst->d_iwarns = src->d_ino_warns; + dst->d_bwarns = src->d_spc_warns; + dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit); + dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit); + dst->d_rtbcount = quota_btobb(src->d_rt_space); + dst->d_rtbtimer = src->d_rt_spc_timer; + dst->d_rtbwarns = src->d_rt_spc_warns; } static int quota_getxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; + struct qc_dqblk qdq; struct kqid qid; int ret; @@ -272,8 +417,11 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, qid = make_kqid(current_user_ns(), type, id); if (!qid_valid(qid)) return -EINVAL; - ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); - if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) + ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); + if (ret) + return ret; + copy_to_xfs_dqblk(&fdq, &qdq, type, id); + if (copy_to_user(addr, &fdq, sizeof(fdq))) return -EFAULT; return ret; } @@ -297,8 +445,14 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, if (type >= (XQM_COMMAND(cmd) ? XQM_MAXQUOTAS : MAXQUOTAS)) return -EINVAL; + /* + * Quota not supported on this fs? Check this before s_quota_types + * since they needn't be set if quota is not supported at all. + */ if (!sb->s_qcop) return -ENOSYS; + if (!(sb->s_quota_types & (1 << type))) + return -EINVAL; ret = check_quotactl_permission(sb, type, cmd, id); if (ret < 0) @@ -308,9 +462,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_QUOTAON: return quota_quotaon(sb, type, cmd, id, path); case Q_QUOTAOFF: - if (!sb->s_qcop->quota_off) - return -ENOSYS; - return sb->s_qcop->quota_off(sb, type); + return quota_quotaoff(sb, type); case Q_GETFMT: return quota_getfmt(sb, type, addr); case Q_GETINFO: @@ -326,8 +478,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return -ENOSYS; return sb->s_qcop->quota_sync(sb, type); case Q_XQUOTAON: + return quota_enable(sb, addr); case Q_XQUOTAOFF: - return quota_setxstate(sb, cmd, addr); + return quota_disable(sb, addr); case Q_XQUOTARM: return quota_rmxquota(sb, addr); case Q_XGETQSTAT: diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 469c6848b322..8fe79beced5c 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -169,8 +169,8 @@ static int v1_read_file_info(struct super_block *sb, int type) } ret = 0; /* limits are stored as unsigned 32-bit data */ - dqopt->info[type].dqi_maxblimit = 0xffffffff; - dqopt->info[type].dqi_maxilimit = 0xffffffff; + dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS; + dqopt->info[type].dqi_max_ino_limit = 0xffffffff; dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; dqopt->info[type].dqi_bgrace = diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 02751ec695c5..9cb10d7197f7 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -117,16 +117,17 @@ static int v2_read_file_info(struct super_block *sb, int type) qinfo = info->dqi_priv; if (version == 0) { /* limits are stored as unsigned 32-bit data */ - info->dqi_maxblimit = 0xffffffff; - info->dqi_maxilimit = 0xffffffff; + info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS; + info->dqi_max_ino_limit = 0xffffffff; } else { - /* used space is stored as unsigned 64-bit value */ - info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ - info->dqi_maxilimit = 0xffffffffffffffffULL; + /* used space is stored as unsigned 64-bit value in bytes */ + info->dqi_max_spc_limit = 0xffffffffffffffffULL; /* 2^64-1 */ + info->dqi_max_ino_limit = 0xffffffffffffffffULL; } info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); - info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); + /* No flags currently supported */ + info->dqi_flags = 0; qinfo->dqi_sb = sb; qinfo->dqi_type = type; qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); @@ -157,7 +158,8 @@ static int v2_write_file_info(struct super_block *sb, int type) info->dqi_flags &= ~DQF_INFO_DIRTY; dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); - dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); + /* No flags currently supported */ + dinfo.dqi_flags = cpu_to_le32(0); spin_unlock(&dq_data_lock); dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks); dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index bbafbde3471a..f6ab41b39612 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long flags); static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); +static unsigned ramfs_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ | + NOMMU_MAP_WRITE | NOMMU_MAP_EXEC; +} + const struct file_operations ramfs_file_operations = { + .mmap_capabilities = ramfs_mmap_capabilities, .mmap = ramfs_nommu_mmap, .get_unmapped_area = ramfs_nommu_get_unmapped_area, .read = new_sync_read, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index d365b1c4eb3c..889d558b4e05 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = { .set_page_dirty = __set_page_dirty_no_writeback, }; -static struct backing_dev_info ramfs_backing_dev_info = { - .name = "ramfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | - BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, -}; - struct inode *ramfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev) { @@ -67,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; - inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -267,19 +258,9 @@ static struct file_system_type ramfs_fs_type = { int __init init_ramfs_fs(void) { static unsigned long once; - int err; if (test_and_set_bit(0, &once)) return 0; - - err = bdi_init(&ramfs_backing_dev_info); - if (err) - return err; - - err = register_filesystem(&ramfs_fs_type); - if (err) - bdi_destroy(&ramfs_backing_dev_info); - - return err; + return register_filesystem(&ramfs_fs_type); } fs_initcall(init_ramfs_fs); diff --git a/fs/read_write.c b/fs/read_write.c index 7d9318c3d43c..8e1b68786d66 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -333,6 +333,52 @@ out_putf: } #endif +ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + if (!file->f_op->read_iter) + return -EINVAL; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = iov_iter_count(iter); + + iter->type |= READ; + ret = file->f_op->read_iter(&kiocb, iter); + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + + if (ret > 0) + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(vfs_iter_read); + +ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + if (!file->f_op->write_iter) + return -EINVAL; + + init_sync_kiocb(&kiocb, file); + kiocb.ki_pos = *ppos; + kiocb.ki_nbytes = iov_iter_count(iter); + + iter->type |= WRITE; + ret = file->f_op->write_iter(&kiocb, iter); + if (ret == -EIOCBQUEUED) + ret = wait_on_sync_kiocb(&kiocb); + + if (ret > 0) + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(vfs_iter_write); + /* * rw_verify_area doesn't like huge counts. We limit * them to something that fits in "int" so that others @@ -358,7 +404,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t return retval; } - if (unlikely(inode->i_flock && mandatory_lock(inode))) { + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); @@ -412,6 +458,23 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p EXPORT_SYMBOL(new_sync_read); +ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, + loff_t *pos) +{ + ssize_t ret; + + if (file->f_op->read) + ret = file->f_op->read(file, buf, count, pos); + else if (file->f_op->aio_read) + ret = do_sync_read(file, buf, count, pos); + else if (file->f_op->read_iter) + ret = new_sync_read(file, buf, count, pos); + else + ret = -EINVAL; + + return ret; +} + ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { ssize_t ret; @@ -426,12 +489,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) ret = rw_verify_area(READ, file, pos, count); if (ret >= 0) { count = ret; - if (file->f_op->read) - ret = file->f_op->read(file, buf, count, pos); - else if (file->f_op->aio_read) - ret = do_sync_read(file, buf, count, pos); - else - ret = new_sync_read(file, buf, count, pos); + ret = __vfs_read(file, buf, count, pos); if (ret > 0) { fsnotify_access(file); add_rchar(current, ret); diff --git a/fs/readdir.c b/fs/readdir.c index 33fd92208cb7..ced679179cac 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -74,10 +74,11 @@ struct readdir_callback { int result; }; -static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset, - u64 ino, unsigned int d_type) +static int fillonedir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) { - struct readdir_callback *buf = (struct readdir_callback *) __buf; + struct readdir_callback *buf = + container_of(ctx, struct readdir_callback, ctx); struct old_linux_dirent __user * dirent; unsigned long d_ino; @@ -148,11 +149,12 @@ struct getdents_callback { int error; }; -static int filldir(void * __buf, const char * name, int namlen, loff_t offset, - u64 ino, unsigned int d_type) +static int filldir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent __user * dirent; - struct getdents_callback * buf = (struct getdents_callback *) __buf; + struct getdents_callback *buf = + container_of(ctx, struct getdents_callback, ctx); unsigned long d_ino; int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); @@ -232,11 +234,12 @@ struct getdents_callback64 { int error; }; -static int filldir64(void * __buf, const char * name, int namlen, loff_t offset, - u64 ino, unsigned int d_type) +static int filldir64(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) { struct linux_dirent64 __user *dirent; - struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; + struct getdents_callback64 *buf = + container_of(ctx, struct getdents_callback64, ctx); int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index a7eec9888f10..e72401e1f995 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2766,7 +2766,7 @@ static int reiserfs_write_begin(struct file *file, int old_ref = 0; inode = mapping->host; - *fsdata = 0; + *fsdata = NULL; if (flags & AOP_FLAG_CONT_EXPAND && (pos & (inode->i_sb->s_blocksize - 1)) == 0) { pos ++; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 1894d96ccb7c..bb79cddf0a1f 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -97,6 +97,10 @@ struct reiserfs_inode_info { #ifdef CONFIG_REISERFS_FS_XATTR struct rw_semaphore i_xattr_sem; #endif +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + struct inode vfs_inode; }; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f1376c92cf74..71fbbe3e2dab 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -594,6 +594,10 @@ static struct inode *reiserfs_alloc_inode(struct super_block *sb) return NULL; atomic_set(&ei->openers, 0); mutex_init(&ei->tailpack); +#ifdef CONFIG_QUOTA + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); +#endif + return &ei->vfs_inode; } @@ -750,6 +754,11 @@ static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t); static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, loff_t); + +static struct dquot **reiserfs_get_dquots(struct inode *inode) +{ + return REISERFS_I(inode)->i_dquot; +} #endif static const struct super_operations reiserfs_sops = { @@ -768,6 +777,7 @@ static const struct super_operations reiserfs_sops = { #ifdef CONFIG_QUOTA .quota_read = reiserfs_quota_read, .quota_write = reiserfs_quota_write, + .get_dquots = reiserfs_get_dquots, #endif }; @@ -1633,6 +1643,7 @@ static int read_super_block(struct super_block *s, int offset) #ifdef CONFIG_QUOTA s->s_qcop = &reiserfs_qctl_operations; s->dq_op = &reiserfs_quota_operations; + s->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif /* @@ -2161,6 +2172,9 @@ error_unlocked: reiserfs_write_unlock(s); } + if (sbi->commit_wq) + destroy_workqueue(sbi->commit_wq); + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); reiserfs_free_bitmap_cache(s); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 7c36898af402..4e781e697c90 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -188,10 +188,11 @@ struct reiserfs_dentry_buf { }; static int -fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, - u64 ino, unsigned int d_type) +fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) { - struct reiserfs_dentry_buf *dbuf = buf; + struct reiserfs_dentry_buf *dbuf = + container_of(ctx, struct reiserfs_dentry_buf, ctx); struct dentry *dentry; WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); @@ -209,9 +210,9 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, } else if (!dentry->d_inode) { /* A directory entry exists, but no file? */ reiserfs_error(dentry->d_sb, "xattr-20003", - "Corrupted directory: xattr %s listed but " - "not found for file %s.\n", - dentry->d_name.name, dbuf->xadir->d_name.name); + "Corrupted directory: xattr %pd listed but " + "not found for file %pd.\n", + dentry, dbuf->xadir); dput(dentry); return -EIO; } @@ -265,7 +266,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, for (i = 0; !err && i < buf.count && buf.dentries[i]; i++) { struct dentry *dentry = buf.dentries[i]; - if (!S_ISDIR(dentry->d_inode->i_mode)) + if (!d_is_dir(dentry)) err = action(dentry, data); dput(dentry); @@ -321,7 +322,7 @@ static int delete_one_xattr(struct dentry *dentry, void *data) struct inode *dir = dentry->d_parent->d_inode; /* This is the xattr dir, handle specially. */ - if (S_ISDIR(dentry->d_inode->i_mode)) + if (d_is_dir(dentry)) return xattr_rmdir(dir, dentry); return xattr_unlink(dir, dentry); @@ -824,10 +825,12 @@ struct listxattr_buf { struct dentry *dentry; }; -static int listxattr_filler(void *buf, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) +static int listxattr_filler(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) { - struct listxattr_buf *b = (struct listxattr_buf *)buf; + struct listxattr_buf *b = + container_of(ctx, struct listxattr_buf, ctx); size_t size; if (name[0] != '.' || diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index ea06c7554860..7da9e2153953 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma) return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; } +static unsigned romfs_mmap_capabilities(struct file *file) +{ + struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd; + + if (!mtd) + return NOMMU_MAP_COPY; + return mtd_mmap_capabilities(mtd); +} + const struct file_operations romfs_ro_fops = { .llseek = generic_file_llseek, .read = new_sync_read, @@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = { .splice_read = generic_file_splice_read, .mmap = romfs_mmap, .get_unmapped_area = romfs_get_unmapped_area, + .mmap_capabilities = romfs_mmap_capabilities, }; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e98dd88197d5..268733cda397 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) case ROMFH_REG: i->i_fop = &romfs_ro_fops; i->i_data.a_ops = &romfs_aops; - if (i->i_sb->s_mtd) - i->i_data.backing_dev_info = - i->i_sb->s_mtd->backing_dev_info; if (nextfh & ROMFH_EXEC) mode |= S_IXUGO; break; diff --git a/fs/select.c b/fs/select.c index 467bb1cb3ea5..f684c750e08a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, if (ret == -EINTR) { struct restart_block *restart_block; - restart_block = ¤t_thread_info()->restart_block; + restart_block = ¤t->restart_block; restart_block->fn = do_restart_poll; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; diff --git a/fs/seq_file.c b/fs/seq_file.c index 3857b720cb1b..555f82155be8 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -16,17 +16,6 @@ #include <asm/uaccess.h> #include <asm/page.h> - -/* - * seq_files have a buffer which can may overflow. When this happens a larger - * buffer is reallocated and all the data will be printed again. - * The overflow state is true when m->count == m->size. - */ -static bool seq_overflow(struct seq_file *m) -{ - return m->count == m->size; -} - static void seq_set_overflow(struct seq_file *m) { m->count = m->size; @@ -36,7 +25,11 @@ static void *seq_buf_alloc(unsigned long size) { void *buf; - buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + /* + * __GFP_NORETRY to avoid oom-killings with high-order allocations - + * it's better to fall back to vmalloc() than to kill things. + */ + buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); if (!buf && size > PAGE_SIZE) buf = vmalloc(size); return buf; @@ -124,7 +117,7 @@ static int traverse(struct seq_file *m, loff_t offset) error = 0; m->count = 0; } - if (seq_overflow(m)) + if (seq_has_overflowed(m)) goto Eoverflow; if (pos + m->count > offset) { m->from = offset - pos; @@ -267,7 +260,7 @@ Fill: break; } err = m->op->show(m, p); - if (seq_overflow(m) || err) { + if (seq_has_overflowed(m) || err) { m->count = offs; if (likely(err <= 0)) break; @@ -546,38 +539,6 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) return res; } -int seq_bitmap(struct seq_file *m, const unsigned long *bits, - unsigned int nr_bits) -{ - if (m->count < m->size) { - int len = bitmap_scnprintf(m->buf + m->count, - m->size - m->count, bits, nr_bits); - if (m->count + len < m->size) { - m->count += len; - return 0; - } - } - seq_set_overflow(m); - return -1; -} -EXPORT_SYMBOL(seq_bitmap); - -int seq_bitmap_list(struct seq_file *m, const unsigned long *bits, - unsigned int nr_bits) -{ - if (m->count < m->size) { - int len = bitmap_scnlistprintf(m->buf + m->count, - m->size - m->count, bits, nr_bits); - if (m->count + len < m->size) { - m->count += len; - return 0; - } - } - seq_set_overflow(m); - return -1; -} -EXPORT_SYMBOL(seq_bitmap_list); - static void *single_start(struct seq_file *p, loff_t *pos) { return NULL + (*pos == 0); diff --git a/fs/signalfd.c b/fs/signalfd.c index 424b7b65321f..7e412ad74836 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -230,7 +230,7 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count, } #ifdef CONFIG_PROC_FS -static int signalfd_show_fdinfo(struct seq_file *m, struct file *f) +static void signalfd_show_fdinfo(struct seq_file *m, struct file *f) { struct signalfd_ctx *ctx = f->private_data; sigset_t sigmask; @@ -238,8 +238,6 @@ static int signalfd_show_fdinfo(struct seq_file *m, struct file *f) sigmask = ctx->sigmask; signotset(&sigmask); render_sigset_t(m, "sigmask:\t", &sigmask); - - return 0; } #endif diff --git a/fs/splice.c b/fs/splice.c index 75c6058eabf2..7968da96bebb 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -961,7 +961,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, splice_from_pipe_begin(&sd); while (sd.total_len) { struct iov_iter from; - struct kiocb kiocb; size_t left; int n, idx; @@ -1005,29 +1004,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, left -= this_len; } - /* ... iov_iter */ - from.type = ITER_BVEC | WRITE; - from.bvec = array; - from.nr_segs = n; - from.count = sd.total_len - left; - from.iov_offset = 0; - - /* ... and iocb */ - init_sync_kiocb(&kiocb, out); - kiocb.ki_pos = sd.pos; - kiocb.ki_nbytes = sd.total_len - left; - - /* now, send it */ - ret = out->f_op->write_iter(&kiocb, &from); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - + iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, + sd.total_len - left); + ret = vfs_iter_write(out, &from, &sd.pos); if (ret <= 0) break; sd.num_spliced += ret; sd.total_len -= ret; - *ppos = sd.pos = kiocb.ki_pos; + *ppos = sd.pos; /* dismiss the fully eaten buffers, adjust the partial one */ while (ret) { diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index b6fa8657dcbc..ffb093e72b6c 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -120,6 +120,21 @@ config SQUASHFS_ZLIB If unsure, say Y. +config SQUASHFS_LZ4 + bool "Include support for LZ4 compressed file systems" + depends on SQUASHFS + select LZ4_DECOMPRESS + help + Saying Y here includes support for reading Squashfs file systems + compressed with LZ4 compression. LZ4 compression is mainly + aimed at embedded systems with slower CPUs where the overheads + of zlib are too high. + + LZ4 is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config SQUASHFS_LZO bool "Include support for LZO compressed file systems" depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 4132520b4ff2..246a6f329d89 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -11,6 +11,7 @@ squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o +squashfs-$(CONFIG_SQUASHFS_LZ4) += lz4_wrapper.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index ac22fe73b0ad..e9034bf6e5ae 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -41,6 +41,12 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 }; +#ifndef CONFIG_SQUASHFS_LZ4 +static const struct squashfs_decompressor squashfs_lz4_comp_ops = { + NULL, NULL, NULL, NULL, LZ4_COMPRESSION, "lz4", 0 +}; +#endif + #ifndef CONFIG_SQUASHFS_LZO static const struct squashfs_decompressor squashfs_lzo_comp_ops = { NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 @@ -65,6 +71,7 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = { static const struct squashfs_decompressor *decompressor[] = { &squashfs_zlib_comp_ops, + &squashfs_lz4_comp_ops, &squashfs_lzo_comp_ops, &squashfs_xz_comp_ops, &squashfs_lzma_unsupported_comp_ops, diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index af0985321808..a25713c031a5 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -46,6 +46,10 @@ static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk, extern const struct squashfs_decompressor squashfs_xz_comp_ops; #endif +#ifdef CONFIG_SQUASHFS_LZ4 +extern const struct squashfs_decompressor squashfs_lz4_comp_ops; +#endif + #ifdef CONFIG_SQUASHFS_LZO extern const struct squashfs_decompressor squashfs_lzo_comp_ops; #endif diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c new file mode 100644 index 000000000000..c31e2bc9c081 --- /dev/null +++ b/fs/squashfs/lz4_wrapper.c @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2013, 2014 + * Phillip Lougher <phillip@squashfs.org.uk> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/buffer_head.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/lz4.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" +#include "page_actor.h" + +#define LZ4_LEGACY 1 + +struct lz4_comp_opts { + __le32 version; + __le32 flags; +}; + +struct squashfs_lz4 { + void *input; + void *output; +}; + + +static void *lz4_comp_opts(struct squashfs_sb_info *msblk, + void *buff, int len) +{ + struct lz4_comp_opts *comp_opts = buff; + + /* LZ4 compressed filesystems always have compression options */ + if (comp_opts == NULL || len < sizeof(*comp_opts)) + return ERR_PTR(-EIO); + + if (le32_to_cpu(comp_opts->version) != LZ4_LEGACY) { + /* LZ4 format currently used by the kernel is the 'legacy' + * format */ + ERROR("Unknown LZ4 version\n"); + return ERR_PTR(-EINVAL); + } + + return NULL; +} + + +static void *lz4_init(struct squashfs_sb_info *msblk, void *buff) +{ + int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); + struct squashfs_lz4 *stream; + + stream = kzalloc(sizeof(*stream), GFP_KERNEL); + if (stream == NULL) + goto failed; + stream->input = vmalloc(block_size); + if (stream->input == NULL) + goto failed2; + stream->output = vmalloc(block_size); + if (stream->output == NULL) + goto failed3; + + return stream; + +failed3: + vfree(stream->input); +failed2: + kfree(stream); +failed: + ERROR("Failed to initialise LZ4 decompressor\n"); + return ERR_PTR(-ENOMEM); +} + + +static void lz4_free(void *strm) +{ + struct squashfs_lz4 *stream = strm; + + if (stream) { + vfree(stream->input); + vfree(stream->output); + } + kfree(stream); +} + + +static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) +{ + struct squashfs_lz4 *stream = strm; + void *buff = stream->input, *data; + int avail, i, bytes = length, res; + size_t dest_len = output->length; + + for (i = 0; i < b; i++) { + avail = min(bytes, msblk->devblksize - offset); + memcpy(buff, bh[i]->b_data + offset, avail); + buff += avail; + bytes -= avail; + offset = 0; + put_bh(bh[i]); + } + + res = lz4_decompress_unknownoutputsize(stream->input, length, + stream->output, &dest_len); + if (res) + return -EIO; + + bytes = dest_len; + data = squashfs_first_page(output); + buff = stream->output; + while (data) { + if (bytes <= PAGE_CACHE_SIZE) { + memcpy(data, buff, bytes); + break; + } + memcpy(data, buff, PAGE_CACHE_SIZE); + buff += PAGE_CACHE_SIZE; + bytes -= PAGE_CACHE_SIZE; + data = squashfs_next_page(output); + } + squashfs_finish_page(output); + + return dest_len; +} + +const struct squashfs_decompressor squashfs_lz4_comp_ops = { + .init = lz4_init, + .comp_opts = lz4_comp_opts, + .free = lz4_free, + .decompress = lz4_uncompress, + .id = LZ4_COMPRESSION, + .name = "lz4", + .supported = 1 +}; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 4b2beda49498..506f4ba5b983 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -240,6 +240,7 @@ struct meta_index { #define LZMA_COMPRESSION 2 #define LZO_COMPRESSION 3 #define XZ_COMPRESSION 4 +#define LZ4_COMPRESSION 5 struct squashfs_super_block { __le32 s_magic; diff --git a/fs/super.c b/fs/super.c index eae088f6aaae..2b7dc90ccdbb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -36,8 +36,8 @@ #include "internal.h" -LIST_HEAD(super_blocks); -DEFINE_SPINLOCK(sb_lock); +static LIST_HEAD(super_blocks); +static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", @@ -71,14 +71,14 @@ static unsigned long super_cache_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - if (!grab_super_passive(sb)) + if (!trylock_super(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) - fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); + fs_objects = sb->s_op->nr_cached_objects(sb, sc); - inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); - dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); + inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); + dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; @@ -86,22 +86,26 @@ static unsigned long super_cache_scan(struct shrinker *shrink, /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); + fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches + * + * Ensure that we always scan at least one object - memcg kmem + * accounting uses this to fully empty the caches. */ - freed = prune_dcache_sb(sb, dentries, sc->nid); - freed += prune_icache_sb(sb, inodes, sc->nid); + sc->nr_to_scan = dentries + 1; + freed = prune_dcache_sb(sb, sc); + sc->nr_to_scan = inodes + 1; + freed += prune_icache_sb(sb, sc); if (fs_objects) { - fs_objects = mult_frac(sc->nr_to_scan, fs_objects, - total_objects); - freed += sb->s_op->free_cached_objects(sb, fs_objects, - sc->nid); + sc->nr_to_scan = fs_objects + 1; + freed += sb->s_op->free_cached_objects(sb, sc); } - drop_super(sb); + up_read(&sb->s_umount); return freed; } @@ -114,21 +118,18 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); /* - * Don't call grab_super_passive as it is a potential + * Don't call trylock_super as it is a potential * scalability bottleneck. The counts could get updated * between super_cache_count and super_cache_scan anyway. * Call to super_cache_count with shrinker_rwsem held - * ensures the safety of call to list_lru_count_node() and + * ensures the safety of call to list_lru_shrink_count() and * s_op->nr_cached_objects(). */ if (sb->s_op && sb->s_op->nr_cached_objects) - total_objects = sb->s_op->nr_cached_objects(sb, - sc->nid); + total_objects = sb->s_op->nr_cached_objects(sb, sc); - total_objects += list_lru_count_node(&sb->s_dentry_lru, - sc->nid); - total_objects += list_lru_count_node(&sb->s_inode_lru, - sc->nid); + total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); + total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); total_objects = vfs_pressure_ratio(total_objects); return total_objects; @@ -185,15 +186,15 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) } init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); + s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; - s->s_bdi = &default_backing_dev_info; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); - if (list_lru_init(&s->s_dentry_lru)) + if (list_lru_init_memcg(&s->s_dentry_lru)) goto fail; - if (list_lru_init(&s->s_inode_lru)) + if (list_lru_init_memcg(&s->s_inode_lru)) goto fail; init_rwsem(&s->s_umount); @@ -229,7 +230,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) s->s_shrink.scan_objects = super_cache_scan; s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; - s->s_shrink.flags = SHRINKER_NUMA_AWARE; + s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; return s; fail: @@ -284,6 +285,14 @@ void deactivate_locked_super(struct super_block *s) unregister_shrinker(&s->s_shrink); fs->kill_sb(s); + /* + * Since list_lru_destroy() may sleep, we cannot call it from + * put_super(), where we hold the sb_lock. Therefore we destroy + * the lru lists right now. + */ + list_lru_destroy(&s->s_dentry_lru); + list_lru_destroy(&s->s_inode_lru); + put_filesystem(fs); put_super(s); } else { @@ -339,35 +348,31 @@ static int grab_super(struct super_block *s) __releases(sb_lock) } /* - * grab_super_passive - acquire a passive reference + * trylock_super - try to grab ->s_umount shared * @sb: reference we are trying to grab * - * Tries to acquire a passive reference. This is used in places where we + * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the - * superblock does not go away while we are working on it. It returns - * false if a reference was not gained, and returns true with the s_umount - * lock held in read mode if a reference is gained. On successful return, - * the caller must drop the s_umount lock and the passive reference when - * done. + * filesystem is not shut down while we are working on it. It returns + * false if we cannot acquire s_umount or if we lose the race and + * filesystem already got into shutdown, and returns true with the s_umount + * lock held in read mode in case of success. On successful return, + * the caller must drop the s_umount lock when done. + * + * Note that unlike get_super() et.al. this one does *not* bump ->s_count. + * The reason why it's safe is that we are OK with doing trylock instead + * of down_read(). There's a couple of places that are OK with that, but + * it's very much not a general-purpose interface. */ -bool grab_super_passive(struct super_block *sb) +bool trylock_super(struct super_block *sb) { - spin_lock(&sb_lock); - if (hlist_unhashed(&sb->s_instances)) { - spin_unlock(&sb_lock); - return false; - } - - sb->s_count++; - spin_unlock(&sb_lock); - if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root && (sb->s_flags & MS_BORN)) + if (!hlist_unhashed(&sb->s_instances) && + sb->s_root && (sb->s_flags & MS_BORN)) return true; up_read(&sb->s_umount); } - put_super(sb); return false; } @@ -706,9 +711,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); if (remount_ro) { - if (sb->s_pins.first) { + if (!hlist_empty(&sb->s_pins)) { up_write(&sb->s_umount); - sb_pin_kill(sb); + group_pin_kill(&sb->s_pins); down_write(&sb->s_umount); if (!sb->s_root) return 0; @@ -863,10 +868,7 @@ EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { - int error = get_anon_bdev(&s->s_dev); - if (!error) - s->s_bdi = &noop_backing_dev_info; - return error; + return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); @@ -1111,7 +1113,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) sb = root->d_sb; BUG_ON(!sb); WARN_ON(!sb->s_bdi); - WARN_ON(sb->s_bdi == &default_backing_dev_info); sb->s_flags |= MS_BORN; error = security_sb_kern_mount(sb, flags, secdata); diff --git a/fs/sync.c b/fs/sync.c index bdc729d80e5e..fbc98ee62044 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -154,7 +154,7 @@ SYSCALL_DEFINE1(syncfs, int, fd) if (!f.file) return -EBADF; - sb = f.file->f_dentry->d_sb; + sb = f.file->f_path.dentry->d_sb; down_read(&sb->s_umount); ret = sync_filesystem(sb); @@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd) */ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) { + struct inode *inode = file->f_mapping->host; + if (!file->f_op->fsync) return -EINVAL; + if (!datasync && (inode->i_state & I_DIRTY_TIME)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + mark_inode_dirty_sync(inode); + } return file->f_op->fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index e9ef59b3abb1..7c2867b44141 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -102,6 +102,22 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, return battr->read(of->file, kobj, battr, buf, pos, count); } +/* kernfs read callback for regular sysfs files with pre-alloc */ +static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, + size_t count, loff_t pos) +{ + const struct sysfs_ops *ops = sysfs_file_ops(of->kn); + struct kobject *kobj = of->kn->parent->priv; + + /* + * If buf != of->prealloc_buf, we don't know how + * large it is, so cannot safely pass it to ->show + */ + if (pos || WARN_ON_ONCE(buf != of->prealloc_buf)) + return 0; + return ops->show(kobj, of->kn->priv, buf); +} + /* kernfs write callback for regular sysfs files */ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) @@ -125,7 +141,7 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, if (size) { if (size <= pos) - return 0; + return -EFBIG; count = min_t(ssize_t, count, size - pos); } if (!count) @@ -184,6 +200,22 @@ static const struct kernfs_ops sysfs_file_kfops_rw = { .write = sysfs_kf_write, }; +static const struct kernfs_ops sysfs_prealloc_kfops_ro = { + .read = sysfs_kf_read, + .prealloc = true, +}; + +static const struct kernfs_ops sysfs_prealloc_kfops_wo = { + .write = sysfs_kf_write, + .prealloc = true, +}; + +static const struct kernfs_ops sysfs_prealloc_kfops_rw = { + .read = sysfs_kf_read, + .write = sysfs_kf_write, + .prealloc = true, +}; + static const struct kernfs_ops sysfs_bin_kfops_ro = { .read = sysfs_kf_bin_read, }; @@ -222,13 +254,22 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, kobject_name(kobj))) return -EINVAL; - if (sysfs_ops->show && sysfs_ops->store) - ops = &sysfs_file_kfops_rw; - else if (sysfs_ops->show) - ops = &sysfs_file_kfops_ro; - else if (sysfs_ops->store) - ops = &sysfs_file_kfops_wo; - else + if (sysfs_ops->show && sysfs_ops->store) { + if (mode & SYSFS_PREALLOC) + ops = &sysfs_prealloc_kfops_rw; + else + ops = &sysfs_file_kfops_rw; + } else if (sysfs_ops->show) { + if (mode & SYSFS_PREALLOC) + ops = &sysfs_prealloc_kfops_ro; + else + ops = &sysfs_file_kfops_ro; + } else if (sysfs_ops->store) { + if (mode & SYSFS_PREALLOC) + ops = &sysfs_prealloc_kfops_wo; + else + ops = &sysfs_file_kfops_wo; + } else ops = &sysfs_file_kfops_empty; size = PAGE_SIZE; @@ -253,8 +294,8 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, if (!attr->ignore_lockdep) key = attr->key ?: (struct lock_class_key *)&attr->skey; #endif - kn = __kernfs_create_file(parent, attr->name, mode, size, ops, - (void *)attr, ns, true, key); + kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops, + (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 7d2a860ba788..b400c04371f0 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -41,7 +41,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, if (grp->attrs) { for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { - umode_t mode = 0; + umode_t mode = (*attr)->mode; /* * In update mode, we're changing the permissions or @@ -55,9 +55,14 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, if (!mode) continue; } + + WARN(mode & ~(SYSFS_PREALLOC | 0664), + "Attribute %s: Invalid permissions 0%o\n", + (*attr)->name, mode); + + mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, *attr, false, - (*attr)->mode | mode, - NULL); + mode, NULL); if (unlikely(error)) break; } @@ -99,7 +104,7 @@ static int internal_create_group(struct kobject *kobj, int update, return -EINVAL; if (!grp->attrs && !grp->bin_attrs) { WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n", - kobj->name, grp->name ? "" : grp->name); + kobj->name, grp->name ?: ""); return -EINVAL; } if (grp->name) { diff --git a/fs/timerfd.c b/fs/timerfd.c index b46ffa94372a..b94fa6c3c6eb 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -288,7 +288,7 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, } #ifdef CONFIG_PROC_FS -static int timerfd_show(struct seq_file *m, struct file *file) +static void timerfd_show(struct seq_file *m, struct file *file) { struct timerfd_ctx *ctx = file->private_data; struct itimerspec t; @@ -298,18 +298,19 @@ static int timerfd_show(struct seq_file *m, struct file *file) t.it_interval = ktime_to_timespec(ctx->tintv); spin_unlock_irq(&ctx->wqh.lock); - return seq_printf(m, - "clockid: %d\n" - "ticks: %llu\n" - "settime flags: 0%o\n" - "it_value: (%llu, %llu)\n" - "it_interval: (%llu, %llu)\n", - ctx->clockid, (unsigned long long)ctx->ticks, - ctx->settime_flags, - (unsigned long long)t.it_value.tv_sec, - (unsigned long long)t.it_value.tv_nsec, - (unsigned long long)t.it_interval.tv_sec, - (unsigned long long)t.it_interval.tv_nsec); + seq_printf(m, + "clockid: %d\n" + "ticks: %llu\n" + "settime flags: 0%o\n" + "it_value: (%llu, %llu)\n" + "it_interval: (%llu, %llu)\n", + ctx->clockid, + (unsigned long long)ctx->ticks, + ctx->settime_flags, + (unsigned long long)t.it_value.tv_sec, + (unsigned long long)t.it_value.tv_nsec, + (unsigned long long)t.it_interval.tv_sec, + (unsigned long long)t.it_interval.tv_nsec); } #else #define timerfd_show NULL diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 7ed13e1e216a..4cfb3e82c56f 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -2032,6 +2032,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, long long blk_offs; struct ubifs_data_node *dn = node; + ubifs_assert(zbr->len >= UBIFS_DATA_NODE_SZ); + /* * Search the inode node this data node belongs to and insert * it to the RB-tree of inodes. @@ -2060,6 +2062,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, struct ubifs_dent_node *dent = node; struct fsck_inode *fscki1; + ubifs_assert(zbr->len >= UBIFS_DENT_NODE_SZ); + err = ubifs_validate_entry(c, dent); if (err) goto out_dump; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ea41649e4ca5..0fa6c803992e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, inode->i_mtime = inode->i_atime = inode->i_ctime = ubifs_current_time(inode); inode->i_mapping->nrpages = 0; - /* Disable readahead */ - inode->i_mapping->backing_dev_info = &c->bdi; switch (mode & S_IFMT) { case S_IFREG: @@ -272,6 +270,10 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, goto out_budg; } + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_cancel; + mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; @@ -728,6 +730,10 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_budg; } + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_cancel; + mutex_lock(&dir_ui->ui_mutex); insert_inode_hash(inode); inc_nlink(inode); @@ -808,6 +814,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, ui->data = dev; ui->data_len = devlen; + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_cancel; + mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; @@ -884,6 +894,10 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, ui->data_len = len; inode->i_size = ubifs_inode(inode)->ui_size = len; + err = ubifs_init_security(dir, inode, &dentry->d_name); + if (err) + goto out_cancel; + mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b5b593c45270..e627c0acf626 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -262,6 +262,7 @@ static int write_begin_slow(struct address_space *mapping, if (err) { unlock_page(page); page_cache_release(page); + ubifs_release_budget(c, &req); return err; } } @@ -1535,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -1573,6 +1573,10 @@ const struct inode_operations ubifs_symlink_inode_operations = { .follow_link = ubifs_follow_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, }; const struct file_operations ubifs_file_operations = { diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index fb166e204441..f6ac3f29323c 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -571,7 +571,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, aligned_dlen = ALIGN(dlen, 8); aligned_ilen = ALIGN(ilen, 8); + len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; + /* Make sure to also account for extended attributes */ + len += host_ui->data_len; + dent = kmalloc(len, GFP_NOFS); if (!dent) return -ENOMEM; @@ -648,7 +652,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, ino_key_init(c, &ino_key, dir->i_ino); ino_offs += aligned_ilen; - err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ); + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, + UBIFS_INO_NODE_SZ + host_ui->data_len); if (err) goto out_ro; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 3187925e9879..9b40a1c5e160 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -1028,9 +1028,22 @@ int ubifs_replay_journal(struct ubifs_info *c) do { err = replay_log_leb(c, lnum, 0, c->sbuf); - if (err == 1) - /* We hit the end of the log */ - break; + if (err == 1) { + if (lnum != c->lhead_lnum) + /* We hit the end of the log */ + break; + + /* + * The head of the log must always start with the + * "commit start" node on a properly formatted UBIFS. + * But we found no nodes at all, which means that + * someting went wrong and we cannot proceed mounting + * the file-system. + */ + ubifs_err("no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted", + lnum, 0); + err = -EINVAL; + } if (err) goto out; lnum = ubifs_next_log_lnum(c, lnum); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 106bf20629ce..93e946561c5c 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) if (err) goto out_invalid; - /* Disable read-ahead */ - inode->i_mapping->backing_dev_info = &c->bdi; - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; @@ -2017,7 +2014,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ c->bdi.name = "ubifs", - c->bdi.capabilities = BDI_CAP_MAP_COPY; + c->bdi.capabilities = 0; err = bdi_init(&c->bdi); if (err) goto out_close; @@ -2039,6 +2036,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) if (c->max_inode_sz > MAX_LFS_FILESIZE) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; sb->s_op = &ubifs_super_operations; + sb->s_xattr = ubifs_xattr_handlers; mutex_lock(&c->umount_mutex); err = mount_ubifs(c); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index c4fe900c67ab..bc04b9c69891 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -36,6 +36,7 @@ #include <linux/mtd/ubi.h> #include <linux/pagemap.h> #include <linux/backing-dev.h> +#include <linux/security.h> #include "ubifs-media.h" /* Version of this UBIFS implementation */ @@ -1465,6 +1466,7 @@ extern spinlock_t ubifs_infos_lock; extern atomic_long_t ubifs_clean_zn_cnt; extern struct kmem_cache *ubifs_inode_slab; extern const struct super_operations ubifs_super_operations; +extern const struct xattr_handler *ubifs_xattr_handlers[]; extern const struct address_space_operations ubifs_file_address_operations; extern const struct file_operations ubifs_file_operations; extern const struct inode_operations ubifs_file_inode_operations; @@ -1754,6 +1756,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, size_t size); ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); int ubifs_removexattr(struct dentry *dentry, const char *name); +int ubifs_init_security(struct inode *dentry, struct inode *inode, + const struct qstr *qstr); /* super.c */ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 5e0a63b1b0d5..a92be244a6fb 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -100,24 +100,30 @@ static const struct file_operations empty_fops; static int create_xattr(struct ubifs_info *c, struct inode *host, const struct qstr *nm, const void *value, int size) { - int err; + int err, names_len; struct inode *inode; struct ubifs_inode *ui, *host_ui = ubifs_inode(host); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1, .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; - if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) + if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) { + ubifs_err("inode %lu already has too many xattrs (%d), cannot create more", + host->i_ino, host_ui->xattr_cnt); return -ENOSPC; + } /* * Linux limits the maximum size of the extended attribute names list * to %XATTR_LIST_MAX. This means we should not allow creating more * extended attributes if the name list becomes larger. This limitation * is artificial for UBIFS, though. */ - if (host_ui->xattr_names + host_ui->xattr_cnt + - nm->len + 1 > XATTR_LIST_MAX) + names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1; + if (names_len > XATTR_LIST_MAX) { + ubifs_err("cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d", + host->i_ino, names_len, XATTR_LIST_MAX); return -ENOSPC; + } err = ubifs_budget_space(c, &req); if (err) @@ -293,18 +299,16 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) return ERR_PTR(-EINVAL); } -int ubifs_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int setxattr(struct inode *host, const char *name, const void *value, + size_t size, int flags) { - struct inode *inode, *host = dentry->d_inode; + struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_dent_node *xent; union ubifs_key key; int err, type; - dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name, - host->i_ino, dentry, size); ubifs_assert(mutex_is_locked(&host->i_mutex)); if (size > UBIFS_MAX_INO_DATA) @@ -356,6 +360,15 @@ out_free: return err; } +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", + name, dentry->d_inode->i_ino, dentry, size); + + return setxattr(dentry->d_inode, name, value, size, flags); +} + ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, size_t size) { @@ -568,3 +581,84 @@ out_free: kfree(xent); return err; } + +static size_t security_listxattr(struct dentry *d, char *list, size_t list_size, + const char *name, size_t name_len, int flags) +{ + const int prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + + return total_len; +} + +static int security_getxattr(struct dentry *d, const char *name, void *buffer, + size_t size, int flags) +{ + return ubifs_getxattr(d, name, buffer, size); +} + +static int security_setxattr(struct dentry *d, const char *name, + const void *value, size_t size, int flags, + int handler_flags) +{ + return ubifs_setxattr(d, name, value, size, flags); +} + +static const struct xattr_handler ubifs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = security_listxattr, + .get = security_getxattr, + .set = security_setxattr, +}; + +const struct xattr_handler *ubifs_xattr_handlers[] = { + &ubifs_xattr_security_handler, + NULL, +}; + +static int init_xattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + char *name; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + err = setxattr(inode, name, xattr->value, xattr->value_len, 0); + kfree(name); + if (err < 0) + break; + } + + return err; +} + +int ubifs_init_security(struct inode *dentry, struct inode *inode, + const struct qstr *qstr) +{ + int err; + + mutex_lock(&inode->i_mutex); + err = security_inode_init_security(inode, dentry, qstr, + &init_xattrs, 0); + mutex_unlock(&inode->i_mutex); + + if (err) + ubifs_err("cannot initialize security for inode %lu, error %d", + inode->i_ino, err); + return err; +} diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index 0e0e99bd6bce..c6e17a744c3b 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -2,10 +2,12 @@ config UDF_FS tristate "UDF file system support" select CRC_ITU_T help - This is the new file system used on some CD-ROMs and DVDs. Say Y if - you intend to mount DVD discs or CDRW's written in packet mode, or - if written to by other UDF utilities, such as DirectCD. - Please read <file:Documentation/filesystems/udf.txt>. + This is a file system used on some CD-ROMs and DVDs. Since the + file system is supported by multiple operating systems and is more + compatible with standard unix file systems, it is also suitable for + removable USB disks. Say Y if you intend to mount DVD discs or CDRW's + written in packet mode, or if you want to use UDF for removable USB + disks. Please read <file:Documentation/filesystems/udf.txt>. To compile this file system support as a module, choose M here: the module will be called udf. diff --git a/fs/udf/dir.c b/fs/udf/dir.c index a012c51caffd..05e90edd1992 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -57,6 +57,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) sector_t offset; int i, num, ret = 0; struct extent_position epos = { NULL, 0, {0, 0} }; + struct super_block *sb = dir->i_sb; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) @@ -76,16 +77,16 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) if (nf_pos == 0) nf_pos = udf_ext0_offset(dir); - fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); + fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, + if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { ret = -ENOENT; goto out; } - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); - if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + block = udf_get_lb_pblock(sb, &eloc, offset); + if ((++offset << sb->s_blocksize_bits) < elen) { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) epos.offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == @@ -95,18 +96,18 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) offset = 0; } - if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) { + if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) { ret = -EIO; goto out; } - if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) { - i = 16 >> (dir->i_sb->s_blocksize_bits - 9); - if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) - i = (elen >> dir->i_sb->s_blocksize_bits) - offset; + if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) { + i = 16 >> (sb->s_blocksize_bits - 9); + if (i + offset > (elen >> sb->s_blocksize_bits)) + i = (elen >> sb->s_blocksize_bits) - offset; for (num = 0; i > 0; i--) { - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i); - tmp = udf_tgetblk(dir->i_sb, block); + block = udf_get_lb_pblock(sb, &eloc, offset + i); + tmp = udf_tgetblk(sb, block); if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) bha[num++] = tmp; else @@ -152,12 +153,12 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { - if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { - if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } @@ -167,12 +168,12 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) continue; } - flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); + flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); if (!flen) continue; tloc = lelb_to_cpu(cfi.icb.extLocation); - iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); + iblock = udf_get_lb_pblock(sb, &tloc, 0); if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) goto out; } /* end while */ diff --git a/fs/udf/file.c b/fs/udf/file.c index bb15771b92ae..08f3555fbeac 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -224,7 +224,7 @@ out: static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE && - atomic_read(&inode->i_writecount) > 1) { + atomic_read(&inode->i_writecount) == 1) { /* * Grab i_mutex to avoid races with writes changing i_size * while we are running. diff --git a/fs/udf/inode.c b/fs/udf/inode.c index c9b4df5810d5..a445d599098d 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -750,7 +750,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, /* Are we beyond EOF? */ if (etype == -1) { int ret; - isBeyondEOF = 1; + isBeyondEOF = true; if (count) { if (c) laarr[0] = laarr[1]; @@ -792,7 +792,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block, endnum = c + 1; lastblock = 1; } else { - isBeyondEOF = 0; + isBeyondEOF = false; endnum = startnum = ((count > 2) ? 2 : count); /* if the current extent is in position 0, @@ -1288,6 +1288,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) struct kernel_lb_addr *iloc = &iinfo->i_location; unsigned int link_count; unsigned int indirections = 0; + int bs = inode->i_sb->s_blocksize; int ret = -EIO; reread: @@ -1374,38 +1375,35 @@ reread: if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { iinfo->i_efe = 1; iinfo->i_use = 0; - ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + ret = udf_alloc_i_data(inode, bs - sizeof(struct extendedFileEntry)); if (ret) goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct extendedFileEntry), - inode->i_sb->s_blocksize - - sizeof(struct extendedFileEntry)); + bs - sizeof(struct extendedFileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { iinfo->i_efe = 0; iinfo->i_use = 0; - ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - - sizeof(struct fileEntry)); + ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry)); if (ret) goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct fileEntry), - inode->i_sb->s_blocksize - sizeof(struct fileEntry)); + bs - sizeof(struct fileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { iinfo->i_efe = 0; iinfo->i_use = 1; iinfo->i_lenAlloc = le32_to_cpu( ((struct unallocSpaceEntry *)bh->b_data)-> lengthAllocDescs); - ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + ret = udf_alloc_i_data(inode, bs - sizeof(struct unallocSpaceEntry)); if (ret) goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct unallocSpaceEntry), - inode->i_sb->s_blocksize - - sizeof(struct unallocSpaceEntry)); + bs - sizeof(struct unallocSpaceEntry)); return 0; } @@ -1489,6 +1487,28 @@ reread: } inode->i_generation = iinfo->i_unique; + /* + * Sanity check length of allocation descriptors and extended attrs to + * avoid integer overflows + */ + if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs) + goto out; + /* Now do exact checks */ + if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs) + goto out; + /* Sanity checks for files in ICB so that we don't get confused later */ + if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { + /* + * For file in ICB data is stored in allocation descriptor + * so sizes should match + */ + if (iinfo->i_lenAlloc != inode->i_size) + goto out; + /* File in ICB has to fit in there... */ + if (inode->i_size > bs - udf_file_entry_alloc_offset(inode)) + goto out; + } + switch (fe->icbTag.fileType) { case ICBTAG_FILE_TYPE_DIRECTORY: inode->i_op = &udf_dir_inode_operations; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index c12e260fd6c4..33b246b82c98 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -159,18 +159,19 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, struct udf_inode_info *dinfo = UDF_I(dir); int isdotdot = child->len == 2 && child->name[0] == '.' && child->name[1] == '.'; + struct super_block *sb = dir->i_sb; size = udf_ext0_offset(dir) + dir->i_size; f_pos = udf_ext0_offset(dir); fibh->sbh = fibh->ebh = NULL; - fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); + fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1); if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { - if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, + if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos, &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) goto out_err; - block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); - if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { + block = udf_get_lb_pblock(sb, &eloc, offset); + if ((++offset << sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) epos.offset -= sizeof(struct short_ad); else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) @@ -178,7 +179,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, } else offset = 0; - fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); + fibh->sbh = fibh->ebh = udf_tread(sb, block); if (!fibh->sbh) goto out_err; } @@ -217,12 +218,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, } if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { - if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE)) continue; } if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { - if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE)) continue; } @@ -233,7 +234,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, if (!lfi) continue; - flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); + flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); if (flen && udf_match(flen, fname, child->len, child->name)) goto out_ok; } diff --git a/fs/udf/super.c b/fs/udf/super.c index e229315bbf7a..f169411c4ea0 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1599,7 +1599,7 @@ static noinline int udf_process_sequence( struct udf_vds_record *curr; struct generic_desc *gd; struct volDescPtr *vdp; - int done = 0; + bool done = false; uint32_t vdsn; uint16_t ident; long next_s = 0, next_e = 0; @@ -1680,7 +1680,7 @@ static noinline int udf_process_sequence( lastblock = next_e; next_s = next_e = 0; } else - done = 1; + done = true; break; } brelse(bh); @@ -2082,12 +2082,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) mutex_init(&sbi->s_alloc_mutex); if (!udf_parse_options((char *)options, &uopt, false)) - goto error_out; + goto parse_options_failure; if (uopt.flags & (1 << UDF_FLAG_UTF8) && uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { udf_err(sb, "utf8 cannot be combined with iocharset\n"); - goto error_out; + goto parse_options_failure; } #ifdef CONFIG_UDF_NLS if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) { @@ -2237,8 +2237,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) return 0; error_out: - if (sbi->s_vat_inode) - iput(sbi->s_vat_inode); + iput(sbi->s_vat_inode); +parse_options_failure: #ifdef CONFIG_UDF_NLS if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); @@ -2291,8 +2291,7 @@ static void udf_put_super(struct super_block *sb) sbi = UDF_SB(sb); - if (sbi->s_vat_inode) - iput(sbi->s_vat_inode); + iput(sbi->s_vat_inode); #ifdef CONFIG_UDF_NLS if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); @@ -2301,6 +2300,7 @@ static void udf_put_super(struct super_block *sb) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); udf_sb_free_partitions(sb); + mutex_destroy(&sbi->s_alloc_mutex); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 6fb7945c1e6e..ac10ca939f26 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -30,49 +30,73 @@ #include <linux/buffer_head.h> #include "udf_i.h" -static void udf_pc_to_char(struct super_block *sb, unsigned char *from, - int fromlen, unsigned char *to) +static int udf_pc_to_char(struct super_block *sb, unsigned char *from, + int fromlen, unsigned char *to, int tolen) { struct pathComponent *pc; int elen = 0; + int comp_len; unsigned char *p = to; + /* Reserve one byte for terminating \0 */ + tolen--; while (elen < fromlen) { pc = (struct pathComponent *)(from + elen); + elen += sizeof(struct pathComponent); switch (pc->componentType) { case 1: /* * Symlink points to some place which should be agreed * upon between originator and receiver of the media. Ignore. */ - if (pc->lengthComponentIdent > 0) + if (pc->lengthComponentIdent > 0) { + elen += pc->lengthComponentIdent; break; + } /* Fall through */ case 2: + if (tolen == 0) + return -ENAMETOOLONG; p = to; *p++ = '/'; + tolen--; break; case 3: + if (tolen < 3) + return -ENAMETOOLONG; memcpy(p, "../", 3); p += 3; + tolen -= 3; break; case 4: + if (tolen < 2) + return -ENAMETOOLONG; memcpy(p, "./", 2); p += 2; + tolen -= 2; /* that would be . - just ignore */ break; case 5: - p += udf_get_filename(sb, pc->componentIdent, p, - pc->lengthComponentIdent); + elen += pc->lengthComponentIdent; + if (elen > fromlen) + return -EIO; + comp_len = udf_get_filename(sb, pc->componentIdent, + pc->lengthComponentIdent, + p, tolen); + p += comp_len; + tolen -= comp_len; + if (tolen == 0) + return -ENAMETOOLONG; *p++ = '/'; + tolen--; break; } - elen += sizeof(struct pathComponent) + pc->lengthComponentIdent; } if (p > to + 1) p[-1] = '\0'; else p[0] = '\0'; + return 0; } static int udf_symlink_filler(struct file *file, struct page *page) @@ -80,11 +104,17 @@ static int udf_symlink_filler(struct file *file, struct page *page) struct inode *inode = page->mapping->host; struct buffer_head *bh = NULL; unsigned char *symlink; - int err = -EIO; + int err; unsigned char *p = kmap(page); struct udf_inode_info *iinfo; uint32_t pos; + /* We don't support symlinks longer than one block */ + if (inode->i_size > inode->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto out_unmap; + } + iinfo = UDF_I(inode); pos = udf_block_map(inode, 0); @@ -94,14 +124,18 @@ static int udf_symlink_filler(struct file *file, struct page *page) } else { bh = sb_bread(inode->i_sb, pos); - if (!bh) - goto out; + if (!bh) { + err = -EIO; + goto out_unlock_inode; + } symlink = bh->b_data; } - udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p); + err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE); brelse(bh); + if (err) + goto out_unlock_inode; up_read(&iinfo->i_data_sem); SetPageUptodate(page); @@ -109,9 +143,10 @@ static int udf_symlink_filler(struct file *file, struct page *page) unlock_page(page); return 0; -out: +out_unlock_inode: up_read(&iinfo->i_data_sem); SetPageError(page); +out_unmap: kunmap(page); unlock_page(page); return err; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 1cc3c993ebd0..47bb3f5ca360 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -211,7 +211,8 @@ udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc, } /* unicode.c */ -extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int); +extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *, + int); extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, int); extern int udf_build_ustr(struct ustr *, dstring *, int); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index afd470e588ff..b84fee372734 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -28,7 +28,8 @@ #include "udf_sb.h" -static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int); +static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *, + int); static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen) { @@ -333,8 +334,8 @@ try_again: return u_len + 1; } -int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, - int flen) +int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen, + uint8_t *dname, int dlen) { struct ustr *filename, *unifilename; int len = 0; @@ -347,7 +348,7 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, if (!unifilename) goto out1; - if (udf_build_ustr_exact(unifilename, sname, flen)) + if (udf_build_ustr_exact(unifilename, sname, slen)) goto out2; if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { @@ -366,7 +367,8 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname, } else goto out2; - len = udf_translate_to_linux(dname, filename->u_name, filename->u_len, + len = udf_translate_to_linux(dname, dlen, + filename->u_name, filename->u_len, unifilename->u_name, unifilename->u_len); out2: kfree(unifilename); @@ -403,10 +405,12 @@ int udf_put_filename(struct super_block *sb, const uint8_t *sname, #define EXT_MARK '.' #define CRC_MARK '#' #define EXT_SIZE 5 +/* Number of chars we need to store generated CRC to make filename unique */ +#define CRC_LEN 5 -static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, - int udfLen, uint8_t *fidName, - int fidNameLen) +static int udf_translate_to_linux(uint8_t *newName, int newLen, + uint8_t *udfName, int udfLen, + uint8_t *fidName, int fidNameLen) { int index, newIndex = 0, needsCRC = 0; int extIndex = 0, newExtIndex = 0, hasExt = 0; @@ -439,7 +443,7 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, newExtIndex = newIndex; } } - if (newIndex < 256) + if (newIndex < newLen) newName[newIndex++] = curr; else needsCRC = 1; @@ -467,13 +471,13 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName, } ext[localExtIndex++] = curr; } - maxFilenameLen = 250 - localExtIndex; + maxFilenameLen = newLen - CRC_LEN - localExtIndex; if (newIndex > maxFilenameLen) newIndex = maxFilenameLen; else newIndex = newExtIndex; - } else if (newIndex > 250) - newIndex = 250; + } else if (newIndex > newLen - CRC_LEN) + newIndex = newLen - CRC_LEN; newName[newIndex++] = CRC_MARK; valueCRC = crc_itu_t(0, fidName, fidNameLen); newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index da73801301d5..8092d3759a5e 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -95,22 +95,18 @@ void lock_ufs(struct super_block *sb) { -#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT) struct ufs_sb_info *sbi = UFS_SB(sb); mutex_lock(&sbi->mutex); sbi->mutex_owner = current; -#endif } void unlock_ufs(struct super_block *sb) { -#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT) struct ufs_sb_info *sbi = UFS_SB(sb); sbi->mutex_owner = NULL; mutex_unlock(&sbi->mutex); -#endif } static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) @@ -1415,9 +1411,11 @@ static struct kmem_cache * ufs_inode_cachep; static struct inode *ufs_alloc_inode(struct super_block *sb) { struct ufs_inode_info *ei; - ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS); + + ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS); if (!ei) return NULL; + ei->vfs_inode.i_version = 1; return &ei->vfs_inode; } diff --git a/fs/xattr.c b/fs/xattr.c index 64e83efb742d..4ef698549e31 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -405,16 +405,14 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { struct fd f = fdget(fd); - struct dentry *dentry; int error = -EBADF; if (!f.file) return error; - dentry = f.file->f_path.dentry; - audit_inode(NULL, dentry, 0); + audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = setxattr(dentry, name, value, size, flags); + error = setxattr(f.file->f_path.dentry, name, value, size, flags); mnt_drop_write_file(f.file); } fdput(f); @@ -509,7 +507,7 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, if (!f.file) return error; - audit_inode(NULL, f.file->f_path.dentry, 0); + audit_file(f.file); error = getxattr(f.file->f_path.dentry, name, value, size); fdput(f); return error; @@ -590,7 +588,7 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) if (!f.file) return error; - audit_inode(NULL, f.file->f_path.dentry, 0); + audit_file(f.file); error = listxattr(f.file->f_path.dentry, list, size); fdput(f); return error; @@ -651,16 +649,14 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { struct fd f = fdget(fd); - struct dentry *dentry; int error = -EBADF; if (!f.file) return error; - dentry = f.file->f_path.dentry; - audit_inode(NULL, dentry, 0); + audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = removexattr(dentry, name); + error = removexattr(f.file->f_path.dentry, name); mnt_drop_write_file(f.file); } fdput(f); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d61799949580..df6828570e87 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -121,3 +121,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_PROC_FS) += xfs_stats.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o +xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 53e95b2a1369..a7a3a63bb360 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -91,16 +91,6 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) return ptr; } -void -kmem_free(const void *ptr) -{ - if (!is_vmalloc_addr(ptr)) { - kfree(ptr); - } else { - vfree(ptr); - } -} - void * kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, xfs_km_flags_t flags) diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 64db0e53edea..cc6b768fc068 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -63,7 +63,10 @@ kmem_flags_convert(xfs_km_flags_t flags) extern void *kmem_alloc(size_t, xfs_km_flags_t); extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); -extern void kmem_free(const void *); +static inline void kmem_free(const void *ptr) +{ + kvfree(ptr); +} extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h deleted file mode 100644 index 6e247a99f5db..000000000000 --- a/fs/xfs/libxfs/xfs_ag.h +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_AG_H__ -#define __XFS_AG_H__ - -/* - * Allocation group header - * This is divided into three structures, placed in sequential 512-byte - * buffers after a copy of the superblock (also in a 512-byte buffer). - */ - -struct xfs_buf; -struct xfs_mount; -struct xfs_trans; - -#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ -#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ -#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ -#define XFS_AGF_VERSION 1 -#define XFS_AGI_VERSION 1 - -#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION) -#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) - -/* - * Btree number 0 is bno, 1 is cnt. This value gives the size of the - * arrays below. - */ -#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) - -/* - * The second word of agf_levels in the first a.g. overlaps the EFS - * superblock's magic number. Since the magic numbers valid for EFS - * are > 64k, our value cannot be confused for an EFS superblock's. - */ - -typedef struct xfs_agf { - /* - * Common allocation group header information - */ - __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */ - __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */ - __be32 agf_seqno; /* sequence # starting from 0 */ - __be32 agf_length; /* size in blocks of a.g. */ - /* - * Freespace information - */ - __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ - __be32 agf_spare0; /* spare field */ - __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ - __be32 agf_spare1; /* spare field */ - - __be32 agf_flfirst; /* first freelist block's index */ - __be32 agf_fllast; /* last freelist block's index */ - __be32 agf_flcount; /* count of blocks in freelist */ - __be32 agf_freeblks; /* total free blocks */ - - __be32 agf_longest; /* longest free space */ - __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ - uuid_t agf_uuid; /* uuid of filesystem */ - - /* - * reserve some contiguous space for future logged fields before we add - * the unlogged fields. This makes the range logging via flags and - * structure offsets much simpler. - */ - __be64 agf_spare64[16]; - - /* unlogged fields, written during buffer writeback. */ - __be64 agf_lsn; /* last write sequence */ - __be32 agf_crc; /* crc of agf sector */ - __be32 agf_spare2; - - /* structure must be padded to 64 bit alignment */ -} xfs_agf_t; - -#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) - -#define XFS_AGF_MAGICNUM 0x00000001 -#define XFS_AGF_VERSIONNUM 0x00000002 -#define XFS_AGF_SEQNO 0x00000004 -#define XFS_AGF_LENGTH 0x00000008 -#define XFS_AGF_ROOTS 0x00000010 -#define XFS_AGF_LEVELS 0x00000020 -#define XFS_AGF_FLFIRST 0x00000040 -#define XFS_AGF_FLLAST 0x00000080 -#define XFS_AGF_FLCOUNT 0x00000100 -#define XFS_AGF_FREEBLKS 0x00000200 -#define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_BTREEBLKS 0x00000800 -#define XFS_AGF_UUID 0x00001000 -#define XFS_AGF_NUM_BITS 13 -#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) - -#define XFS_AGF_FLAGS \ - { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ - { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ - { XFS_AGF_SEQNO, "SEQNO" }, \ - { XFS_AGF_LENGTH, "LENGTH" }, \ - { XFS_AGF_ROOTS, "ROOTS" }, \ - { XFS_AGF_LEVELS, "LEVELS" }, \ - { XFS_AGF_FLFIRST, "FLFIRST" }, \ - { XFS_AGF_FLLAST, "FLLAST" }, \ - { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ - { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ - { XFS_AGF_LONGEST, "LONGEST" }, \ - { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ - { XFS_AGF_UUID, "UUID" } - -/* disk block (xfs_daddr_t) in the AG */ -#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) -#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) - -extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); - -/* - * Size of the unlinked inode hash table in the agi. - */ -#define XFS_AGI_UNLINKED_BUCKETS 64 - -typedef struct xfs_agi { - /* - * Common allocation group header information - */ - __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */ - __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */ - __be32 agi_seqno; /* sequence # starting from 0 */ - __be32 agi_length; /* size in blocks of a.g. */ - /* - * Inode information - * Inodes are mapped by interpreting the inode number, so no - * mapping data is needed here. - */ - __be32 agi_count; /* count of allocated inodes */ - __be32 agi_root; /* root of inode btree */ - __be32 agi_level; /* levels in inode btree */ - __be32 agi_freecount; /* number of free inodes */ - - __be32 agi_newino; /* new inode just allocated */ - __be32 agi_dirino; /* last directory inode chunk */ - /* - * Hash table of inodes which have been unlinked but are - * still being referenced. - */ - __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; - /* - * This marks the end of logging region 1 and start of logging region 2. - */ - uuid_t agi_uuid; /* uuid of filesystem */ - __be32 agi_crc; /* crc of agi sector */ - __be32 agi_pad32; - __be64 agi_lsn; /* last write sequence */ - - __be32 agi_free_root; /* root of the free inode btree */ - __be32 agi_free_level;/* levels in free inode btree */ - - /* structure must be padded to 64 bit alignment */ -} xfs_agi_t; - -#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) - -#define XFS_AGI_MAGICNUM (1 << 0) -#define XFS_AGI_VERSIONNUM (1 << 1) -#define XFS_AGI_SEQNO (1 << 2) -#define XFS_AGI_LENGTH (1 << 3) -#define XFS_AGI_COUNT (1 << 4) -#define XFS_AGI_ROOT (1 << 5) -#define XFS_AGI_LEVEL (1 << 6) -#define XFS_AGI_FREECOUNT (1 << 7) -#define XFS_AGI_NEWINO (1 << 8) -#define XFS_AGI_DIRINO (1 << 9) -#define XFS_AGI_UNLINKED (1 << 10) -#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ -#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) -#define XFS_AGI_FREE_ROOT (1 << 11) -#define XFS_AGI_FREE_LEVEL (1 << 12) -#define XFS_AGI_NUM_BITS_R2 13 - -/* disk block (xfs_daddr_t) in the AG */ -#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) -#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) - -extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf **bpp); - -/* - * The third a.g. block contains the a.g. freelist, an array - * of block pointers to blocks owned by the allocation btree code. - */ -#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) -#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) - -#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ - &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ - (__be32 *)(bp)->b_addr) - -/* - * Size of the AGFL. For CRC-enabled filesystes we steal a couple of - * slots in the beginning of the block for a proper header with the - * location information and CRC. - */ -#define XFS_AGFL_SIZE(mp) \ - (((mp)->m_sb.sb_sectsize - \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ - sizeof(struct xfs_agfl) : 0)) / \ - sizeof(xfs_agblock_t)) - -typedef struct xfs_agfl { - __be32 agfl_magicnum; - __be32 agfl_seqno; - uuid_t agfl_uuid; - __be64 agfl_lsn; - __be32 agfl_crc; - __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ -} xfs_agfl_t; - -#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) - -/* - * tags for inode radix tree - */ -#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup - in xfs_inode_ag_iterator */ -#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ -#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ - -#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) -#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ - (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) -#define XFS_MIN_FREELIST(a,mp) \ - (XFS_MIN_FREELIST_RAW( \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) -#define XFS_MIN_FREELIST_PAG(pag,mp) \ - (XFS_MIN_FREELIST_RAW( \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) - -#define XFS_AGB_TO_FSB(mp,agno,agbno) \ - (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) -#define XFS_FSB_TO_AGNO(mp,fsbno) \ - ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) -#define XFS_FSB_TO_AGBNO(mp,fsbno) \ - ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog))) -#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ - ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ - (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) -#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) - -/* - * For checking for bad ranges of xfs_daddr_t's, covering multiple - * allocation groups or a single xfs_daddr_t that's a superblock copy. - */ -#define XFS_AG_CHECK_DADDR(mp,d,len) \ - ((len) == 1 ? \ - ASSERT((d) == XFS_SB_DADDR || \ - xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ - ASSERT(xfs_daddr_to_agno(mp, d) == \ - xfs_daddr_to_agno(mp, (d) + (len) - 1))) - -#endif /* __XFS_AG_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index eff34218f405..a6fbf4472017 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -23,7 +23,6 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index feacb061bab7..d1b4b6a5c894 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -231,4 +231,7 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ +int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index e0e83e24d3ef..59d521c09a17 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -22,7 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_alloc_btree.h" diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 353fb425faef..0a472fbe06d4 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -42,7 +40,6 @@ #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_trace.h" -#include "xfs_dinode.h" /* * xfs_attr.c diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index b1f73dbbf3d8..15105dbc9e28 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -24,7 +24,6 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -41,7 +40,6 @@ #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" -#include "xfs_dinode.h" #include "xfs_dir2.h" @@ -405,7 +403,7 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) if (!xfs_sb_version_hasattr2(&mp->m_sb)) { xfs_sb_version_addattr2(&mp->m_sb); spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); + xfs_log_sb(tp); } else spin_unlock(&mp->m_sb_lock); } diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 7510ab8058a4..20de88d1bf86 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -23,8 +23,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 79c981984dca..61ec015dca16 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -22,9 +22,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -46,7 +44,6 @@ #include "xfs_trace.h" #include "xfs_symlink.h" #include "xfs_attr_leaf.h" -#include "xfs_dinode.h" #include "xfs_filestream.h" @@ -976,7 +973,11 @@ xfs_bmap_local_to_extents( *firstblock = args.fsbno; bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - /* initialise the block and copy the data */ + /* + * Initialise the block and copy the data + * + * Note: init_fn must set the buffer log item type correctly! + */ init_fn(tp, bp, ip, ifp); /* account for the change in fork size and log everything */ @@ -1224,22 +1225,20 @@ xfs_bmap_add_attrfork( goto bmap_cancel; if (!xfs_sb_version_hasattr(&mp->m_sb) || (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { - __int64_t sbfields = 0; + bool log_sb = false; spin_lock(&mp->m_sb_lock); if (!xfs_sb_version_hasattr(&mp->m_sb)) { xfs_sb_version_addattr(&mp->m_sb); - sbfields |= XFS_SB_VERSIONNUM; + log_sb = true; } if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { xfs_sb_version_addattr2(&mp->m_sb); - sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); + log_sb = true; } - if (sbfields) { - spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, sbfields); - } else - spin_unlock(&mp->m_sb_lock); + spin_unlock(&mp->m_sb_lock); + if (log_sb) + xfs_log_sb(tp); } error = xfs_bmap_finish(&tp, &flist, &committed); @@ -5450,13 +5449,11 @@ xfs_bmse_merge( struct xfs_btree_cur *cur, int *logflags) /* output */ { - struct xfs_ifork *ifp; struct xfs_bmbt_irec got; struct xfs_bmbt_irec left; xfs_filblks_t blockcount; int error, i; - ifp = XFS_IFORK_PTR(ip, whichfork); xfs_bmbt_get_all(gotp, &got); xfs_bmbt_get_all(leftp, &left); blockcount = left.br_blockcount + got.br_blockcount; @@ -5489,32 +5486,25 @@ xfs_bmse_merge( error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, got.br_blockcount, &i); if (error) - goto out_error; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); error = xfs_btree_delete(cur, &i); if (error) - goto out_error; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, left.br_blockcount, &i); if (error) - goto out_error; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); left.br_blockcount = blockcount; - error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, - left.br_blockcount, left.br_state); - if (error) - goto out_error; - - return 0; - -out_error: - return error; + return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, + left.br_blockcount, left.br_state); } /* @@ -5544,35 +5534,29 @@ xfs_bmse_shift_one( startoff = got.br_startoff - offset_shift_fsb; /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock), - out_error); + XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock)); /* - * If this is the first extent in the file, make sure there's enough - * room at the start of the file and jump right to the shift as there's - * no left extent to merge. + * Check for merge if we've got an extent to the left, otherwise make + * sure there's enough room at the start of the file for the shift. */ - if (*current_ext == 0) { - if (got.br_startoff < offset_shift_fsb) - return -EINVAL; - goto shift_extent; - } + if (*current_ext) { + /* grab the left extent and check for a large enough hole */ + leftp = xfs_iext_get_ext(ifp, *current_ext - 1); + xfs_bmbt_get_all(leftp, &left); - /* grab the left extent and check for a large enough hole */ - leftp = xfs_iext_get_ext(ifp, *current_ext - 1); - xfs_bmbt_get_all(leftp, &left); + if (startoff < left.br_startoff + left.br_blockcount) + return -EINVAL; - if (startoff < left.br_startoff + left.br_blockcount) + /* check whether to merge the extent or shift it down */ + if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { + return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + *current_ext, gotp, leftp, cur, + logflags); + } + } else if (got.br_startoff < offset_shift_fsb) return -EINVAL; - /* check whether to merge the extent or shift it down */ - if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) - goto shift_extent; - - return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext, - gotp, leftp, cur, logflags); - -shift_extent: /* * Increment the extent index for the next iteration, update the start * offset of the in-core extent and update the btree if applicable. @@ -5589,18 +5573,11 @@ shift_extent: got.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + XFS_WANT_CORRUPTED_RETURN(i == 1); got.br_startoff = startoff; - error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, + return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, got.br_blockcount, got.br_state); - if (error) - return error; - - return 0; - -out_error: - return error; } /* diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 44db6db86402..b9d8a499d2c4 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -28,6 +28,37 @@ struct xfs_trans; extern kmem_zone_t *xfs_bmap_free_item_zone; /* + * Argument structure for xfs_bmap_alloc. + */ +struct xfs_bmalloca { + xfs_fsblock_t *firstblock; /* i/o first block allocated */ + struct xfs_bmap_free *flist; /* bmap freelist */ + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bmbt_irec prev; /* extent before the new one */ + struct xfs_bmbt_irec got; /* extent after, or delayed */ + + xfs_fileoff_t offset; /* offset in file filling in */ + xfs_extlen_t length; /* i/o length asked/allocated */ + xfs_fsblock_t blkno; /* starting block of new extent */ + + struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extnum_t idx; /* current extent index */ + int nallocs;/* number of extents alloc'd */ + int logflags;/* flags for transaction logging */ + + xfs_extlen_t total; /* total blocks needed for xaction */ + xfs_extlen_t minlen; /* minimum allocation size (blocks) */ + xfs_extlen_t minleft; /* amount must be left after alloc */ + bool eof; /* set if allocating past last extent */ + bool wasdel; /* replacing a delayed allocation */ + bool userdata;/* set if is user data */ + bool aeof; /* allocated space at eof */ + bool conv; /* overwriting unwritten extents */ + int flags; +}; + +/* * List of extents to be free "later". * The list is kept sorted on xbf_startblock. */ @@ -149,6 +180,8 @@ void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_bmap_free *flist, struct xfs_mount *mp); void xfs_bmap_cancel(struct xfs_bmap_free *flist); +int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, + int *committed); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index fba753308f31..2c44c8e50782 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -36,7 +34,6 @@ #include "xfs_quota.h" #include "xfs_trace.h" #include "xfs_cksum.h" -#include "xfs_dinode.h" /* * Determine the extent state. diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 8fe6a93ff473..81cad433df85 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index fd827530afec..9cb0115c6bd1 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -23,8 +23,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -514,7 +512,6 @@ xfs_da3_root_split( struct xfs_buf *bp; struct xfs_inode *dp; struct xfs_trans *tp; - struct xfs_mount *mp; struct xfs_dir2_leaf *leaf; xfs_dablk_t blkno; int level; @@ -534,7 +531,6 @@ xfs_da3_root_split( dp = args->dp; tp = args->trans; - mp = state->mp; error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); if (error) return error; @@ -2342,14 +2338,12 @@ xfs_da_shrink_inode( xfs_inode_t *dp; int done, error, w, count; xfs_trans_t *tp; - xfs_mount_t *mp; trace_xfs_da_shrink_inode(args); dp = args->dp; w = args->whichfork; tp = args->trans; - mp = dp->i_mount; count = args->geo->fsbcount; for (;;) { /* diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index 7e42fdfd2f1d..9d624a622946 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -22,8 +22,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/libxfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h deleted file mode 100644 index 623bbe8fd921..000000000000 --- a/fs/xfs/libxfs/xfs_dinode.h +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DINODE_H__ -#define __XFS_DINODE_H__ - -#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) - -typedef struct xfs_timestamp { - __be32 t_sec; /* timestamp seconds */ - __be32 t_nsec; /* timestamp nanoseconds */ -} xfs_timestamp_t; - -/* - * On-disk inode structure. - * - * This is just the header or "dinode core", the inode is expanded to fill a - * variable size the leftover area split into a data and an attribute fork. - * The format of the data and attribute fork depends on the format of the - * inode as indicated by di_format and di_aformat. To access the data and - * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros - * below. - * - * There is a very similar struct icdinode in xfs_inode which matches the - * layout of the first 96 bytes of this structure, but is kept in native - * format instead of big endian. - * - * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed - * padding field for v3 inodes. - */ -typedef struct xfs_dinode { - __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ - __be16 di_mode; /* mode and type of file */ - __u8 di_version; /* inode version */ - __u8 di_format; /* format of di_c data */ - __be16 di_onlink; /* old number of links to file */ - __be32 di_uid; /* owner's user id */ - __be32 di_gid; /* owner's group id */ - __be32 di_nlink; /* number of links to file */ - __be16 di_projid_lo; /* lower part of owner's project id */ - __be16 di_projid_hi; /* higher part owner's project id */ - __u8 di_pad[6]; /* unused, zeroed space */ - __be16 di_flushiter; /* incremented on flush */ - xfs_timestamp_t di_atime; /* time last accessed */ - xfs_timestamp_t di_mtime; /* time last modified */ - xfs_timestamp_t di_ctime; /* time created/inode modified */ - __be64 di_size; /* number of bytes in file */ - __be64 di_nblocks; /* # of direct & btree blocks used */ - __be32 di_extsize; /* basic/minimum extent size for file */ - __be32 di_nextents; /* number of extents in data fork */ - __be16 di_anextents; /* number of extents in attribute fork*/ - __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ - __s8 di_aformat; /* format of attr fork's data */ - __be32 di_dmevmask; /* DMIG event mask */ - __be16 di_dmstate; /* DMIG state info */ - __be16 di_flags; /* random flags, XFS_DIFLAG_... */ - __be32 di_gen; /* generation number */ - - /* di_next_unlinked is the only non-core field in the old dinode */ - __be32 di_next_unlinked;/* agi unlinked list ptr */ - - /* start of the extended dinode, writable fields */ - __le32 di_crc; /* CRC of the inode */ - __be64 di_changecount; /* number of attribute changes */ - __be64 di_lsn; /* flush sequence */ - __be64 di_flags2; /* more random flags */ - __u8 di_pad2[16]; /* more padding for future expansion */ - - /* fields only written to during inode creation */ - xfs_timestamp_t di_crtime; /* time created */ - __be64 di_ino; /* inode number */ - uuid_t di_uuid; /* UUID of the filesystem */ - - /* structure must be padded to 64 bit alignment */ -} xfs_dinode_t; - -#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) - -#define DI_MAX_FLUSH 0xffff - -/* - * Size of the core inode on disk. Version 1 and 2 inodes have - * the same size, but version 3 has grown a few additional fields. - */ -static inline uint xfs_dinode_size(int version) -{ - if (version == 3) - return sizeof(struct xfs_dinode); - return offsetof(struct xfs_dinode, di_crc); -} - -/* - * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. - * Since the pathconf interface is signed, we use 2^31 - 1 instead. - * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. - */ -#define XFS_MAXLINK ((1U << 31) - 1U) -#define XFS_MAXLINK_1 65535U - -/* - * Values for di_format - */ -typedef enum xfs_dinode_fmt { - XFS_DINODE_FMT_DEV, /* xfs_dev_t */ - XFS_DINODE_FMT_LOCAL, /* bulk data */ - XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ - XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ - XFS_DINODE_FMT_UUID /* uuid_t */ -} xfs_dinode_fmt_t; - -/* - * Inode minimum and maximum sizes. - */ -#define XFS_DINODE_MIN_LOG 8 -#define XFS_DINODE_MAX_LOG 11 -#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG) -#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG) - -/* - * Inode size for given fs. - */ -#define XFS_LITINO(mp, version) \ - ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) - -/* - * Inode data & attribute fork sizes, per inode. - */ -#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) -#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) - -#define XFS_DFORK_DSIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_DFORK_BOFF(dip) : \ - XFS_LITINO(mp, (dip)->di_version)) -#define XFS_DFORK_ASIZE(dip,mp) \ - (XFS_DFORK_Q(dip) ? \ - XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ - 0) -#define XFS_DFORK_SIZE(dip,mp,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_DFORK_DSIZE(dip, mp) : \ - XFS_DFORK_ASIZE(dip, mp)) - -/* - * Return pointers to the data or attribute forks. - */ -#define XFS_DFORK_DPTR(dip) \ - ((char *)dip + xfs_dinode_size(dip->di_version)) -#define XFS_DFORK_APTR(dip) \ - (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) -#define XFS_DFORK_PTR(dip,w) \ - ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) - -#define XFS_DFORK_FORMAT(dip,w) \ - ((w) == XFS_DATA_FORK ? \ - (dip)->di_format : \ - (dip)->di_aformat) -#define XFS_DFORK_NEXTENTS(dip,w) \ - ((w) == XFS_DATA_FORK ? \ - be32_to_cpu((dip)->di_nextents) : \ - be16_to_cpu((dip)->di_anextents)) - -#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr)) - -/* - * For block and character special files the 32bit dev_t is stored at the - * beginning of the data fork. - */ -static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) -{ - return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); -} - -static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) -{ - *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); -} - -/* - * Values for di_flags - * There should be a one-to-one correspondence between these flags and the - * XFS_XFLAG_s. - */ -#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ -#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ -#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ -#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */ -#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */ -#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */ -#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ -#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ -#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ -#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ -#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ -#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ -#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ -#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ -#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ -#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) -#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) -#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) -#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT) -#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT) -#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT) -#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT) -#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT) -#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) -#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) -#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) -#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) -#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) -#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) -#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) - -#ifdef CONFIG_XFS_RT -#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) -#else -#define XFS_IS_REALTIME_INODE(ip) (0) -#endif - -#define XFS_DIFLAG_ANY \ - (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ - XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ - XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ - XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ - XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) - -#endif /* __XFS_DINODE_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 7075aaf131f4..a69fb3a1e161 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -20,9 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -34,10 +31,25 @@ #include "xfs_dir2_priv.h" #include "xfs_error.h" #include "xfs_trace.h" -#include "xfs_dinode.h" struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; +/* + * @mode, if set, indicates that the type field needs to be set up. + * This uses the transformation from file mode to DT_* as defined in linux/fs.h + * for file type specification. This will be propagated into the directory + * structure if appropriate for the given operation and filesystem config. + */ +const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { + [0] = XFS_DIR3_FT_UNKNOWN, + [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, + [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, +}; /* * ASCII case-insensitive (ie. A-Z) support for directories that was diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 4dff261e6ed5..e55353651f5b 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -32,6 +32,12 @@ struct xfs_dir2_data_unused; extern struct xfs_name xfs_name_dotdot; /* + * directory filetype conversion tables. + */ +#define S_SHIFT 12 +extern const unsigned char xfs_mode_to_ftype[]; + +/* * directory operations vector for encode/decode routines */ struct xfs_dir_ops { @@ -177,4 +183,138 @@ extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; +/* + * Directory offset/block conversion functions. + * + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t)(by >> geo->blklog); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)db << geo->blklog) + o; +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog)); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by)); +} + +/* + * Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog)); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0); +} + +/* + * Directory tail pointer accessor functions. Based on block geometry. + */ +static inline struct xfs_dir2_block_tail * +xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir2_block_tail *) + ((char *)hdr + geo->blksize)) - 1; +} + +static inline struct xfs_dir2_leaf_tail * +xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) +{ + return (struct xfs_dir2_leaf_tail *) + ((char *)lp + geo->blksize - + sizeof(struct xfs_dir2_leaf_tail)); +} + #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9628ceccfa02..9354e190b82e 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -36,7 +34,6 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" -#include "xfs_dinode.h" /* * Local function prototypes. @@ -353,7 +350,6 @@ xfs_dir2_block_addname( int low; /* low index for binary srch */ int lowstale; /* low stale index */ int mid=0; /* midpoint for binary srch */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log header */ int needscan; /* need to rescan freespace */ __be16 *tagp; /* pointer to tag value */ @@ -363,7 +359,6 @@ xfs_dir2_block_addname( dp = args->dp; tp = args->trans; - mp = dp->i_mount; /* Read the (one and only) directory block into bp. */ error = xfs_dir3_block_read(tp, dp, &bp); @@ -618,7 +613,6 @@ xfs_dir2_block_lookup( xfs_inode_t *dp; /* incore inode */ int ent; /* entry index */ int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ trace_xfs_dir2_block_lookup(args); @@ -629,7 +623,6 @@ xfs_dir2_block_lookup( if ((error = xfs_dir2_block_lookup_int(args, &bp, &ent))) return error; dp = args->dp; - mp = dp->i_mount; hdr = bp->b_addr; xfs_dir3_data_check(dp, bp); btp = xfs_dir2_block_tail_p(args->geo, hdr); @@ -770,7 +763,6 @@ xfs_dir2_block_removename( xfs_inode_t *dp; /* incore inode */ int ent; /* block leaf entry index */ int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log block header */ int needscan; /* need to fixup bestfree */ xfs_dir2_sf_hdr_t sfh; /* shortform header */ @@ -788,7 +780,6 @@ xfs_dir2_block_removename( } dp = args->dp; tp = args->trans; - mp = dp->i_mount; hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); @@ -852,7 +843,6 @@ xfs_dir2_block_replace( xfs_inode_t *dp; /* incore inode */ int ent; /* leaf entry index */ int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ trace_xfs_dir2_block_replace(args); @@ -864,7 +854,6 @@ xfs_dir2_block_replace( return error; } dp = args->dp; - mp = dp->i_mount; hdr = bp->b_addr; btp = xfs_dir2_block_tail_p(args->geo, hdr); blp = xfs_dir2_block_leaf_p(btp); diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index fdd803fecb8e..5ff31be9b1cd 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index a19174eb3cb2..106119955400 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -384,7 +382,6 @@ xfs_dir2_block_to_leaf( xfs_dir2_db_t ldb; /* leaf block's bno */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log block header */ int needscan; /* need to rescan bestfree */ xfs_trans_t *tp; /* transaction pointer */ @@ -395,7 +392,6 @@ xfs_dir2_block_to_leaf( trace_xfs_dir2_block_to_leaf(args); dp = args->dp; - mp = dp->i_mount; tp = args->trans; /* * Add the leaf block to the inode. @@ -626,7 +622,6 @@ xfs_dir2_leaf_addname( int lfloghigh; /* high leaf logging index */ int lowstale; /* index of prev stale leaf */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ - xfs_mount_t *mp; /* filesystem mount point */ int needbytes; /* leaf block bytes needed */ int needlog; /* need to log data header */ int needscan; /* need to rescan data free */ @@ -641,7 +636,6 @@ xfs_dir2_leaf_addname( dp = args->dp; tp = args->trans; - mp = dp->i_mount; error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) @@ -1356,11 +1350,9 @@ xfs_dir2_leaf_removename( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log data header */ int needscan; /* need to rescan data frees */ xfs_dir2_data_off_t oldbest; /* old value of best free */ - xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir2_data_free *bf; /* bestfree table */ struct xfs_dir2_leaf_entry *ents; struct xfs_dir3_icleaf_hdr leafhdr; @@ -1374,8 +1366,6 @@ xfs_dir2_leaf_removename( return error; } dp = args->dp; - tp = args->trans; - mp = dp->i_mount; leaf = lbp->b_addr; hdr = dbp->b_addr; xfs_dir3_data_check(dp, dbp); @@ -1607,11 +1597,9 @@ xfs_dir2_leaf_trim_data( int error; /* error return value */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ dp = args->dp; - mp = dp->i_mount; tp = args->trans; /* * Read the offending data block. We need its buffer. diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 2ae6ac2c11ae..41b80d3d3877 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -297,7 +295,6 @@ xfs_dir2_leaf_to_node( int i; /* leaf freespace index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ int n; /* count of live freespc ents */ xfs_dir2_data_off_t off; /* freespace entry value */ __be16 *to; /* pointer to freespace entry */ @@ -307,7 +304,6 @@ xfs_dir2_leaf_to_node( trace_xfs_dir2_leaf_to_node(args); dp = args->dp; - mp = dp->i_mount; tp = args->trans; /* * Add a freespace block to the directory. @@ -387,16 +383,12 @@ xfs_dir2_leafn_add( int lfloghigh; /* high leaf entry logging */ int lfloglow; /* low leaf entry logging */ int lowstale; /* previous stale entry */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir3_icleaf_hdr leafhdr; struct xfs_dir2_leaf_entry *ents; trace_xfs_dir2_leafn_add(args, index); dp = args->dp; - mp = dp->i_mount; - tp = args->trans; leaf = bp->b_addr; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); @@ -1170,7 +1162,6 @@ xfs_dir2_leafn_remove( xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int longest; /* longest data free entry */ int off; /* data block entry offset */ - xfs_mount_t *mp; /* filesystem mount point */ int needlog; /* need to log data header */ int needscan; /* need to rescan data frees */ xfs_trans_t *tp; /* transaction pointer */ @@ -1182,7 +1173,6 @@ xfs_dir2_leafn_remove( dp = args->dp; tp = args->trans; - mp = dp->i_mount; leaf = bp->b_addr; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); @@ -1323,7 +1313,6 @@ xfs_dir2_leafn_split( xfs_da_args_t *args; /* operation arguments */ xfs_dablk_t blkno; /* new leaf block number */ int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ struct xfs_inode *dp; /* @@ -1331,7 +1320,6 @@ xfs_dir2_leafn_split( */ args = state->args; dp = args->dp; - mp = dp->i_mount; ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); error = xfs_da_grow_inode(args, &blkno); if (error) { @@ -2231,12 +2219,10 @@ xfs_dir2_node_trim_free( xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_free_t *free; /* freespace structure */ - xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ struct xfs_dir3_icfree_hdr freehdr; dp = args->dp; - mp = dp->i_mount; tp = args->trans; /* * Read the freespace block. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 27ce0794d196..ef9f6ead96a4 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -20,140 +20,6 @@ struct dir_context; -/* - * Directory offset/block conversion functions. - * - * DB blocks here are logical directory block numbers, not filesystem blocks. - */ - -/* - * Convert dataptr to byte in file space - */ -static inline xfs_dir2_off_t -xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) -{ - return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; -} - -/* - * Convert byte in file space to dataptr. It had better be aligned. - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) -{ - return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); -} - -/* - * Convert byte in space to (DB) block - */ -static inline xfs_dir2_db_t -xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by) -{ - return (xfs_dir2_db_t)(by >> geo->blklog); -} - -/* - * Convert dataptr to a block number - */ -static inline xfs_dir2_db_t -xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp)); -} - -/* - * Convert byte in space to offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by) -{ - return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1)); -} - -/* - * Convert dataptr to a byte offset in a block - */ -static inline xfs_dir2_data_aoff_t -xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) -{ - return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp)); -} - -/* - * Convert block and offset to byte in space - */ -static inline xfs_dir2_off_t -xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return ((xfs_dir2_off_t)db << geo->blklog) + o; -} - -/* - * Convert block (DB) to block (dablk) - */ -static inline xfs_dablk_t -xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db) -{ - return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog)); -} - -/* - * Convert byte in space to (DA) block - */ -static inline xfs_dablk_t -xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by) -{ - return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by)); -} - -/* - * Convert block and offset to dataptr - */ -static inline xfs_dir2_dataptr_t -xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o)); -} - -/* - * Convert block (dablk) to block (DB) - */ -static inline xfs_dir2_db_t -xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da) -{ - return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog)); -} - -/* - * Convert block (dablk) to byte offset in space - */ -static inline xfs_dir2_off_t -xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da) -{ - return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0); -} - -/* - * Directory tail pointer accessor functions. Based on block geometry. - */ -static inline struct xfs_dir2_block_tail * -xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) -{ - return ((struct xfs_dir2_block_tail *) - ((char *)hdr + geo->blksize)) - 1; -} - -static inline struct xfs_dir2_leaf_tail * -xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) -{ - return (struct xfs_dir2_leaf_tail *) - ((char *)lp + geo->blksize - - sizeof(struct xfs_dir2_leaf_tail)); -} - /* xfs_dir2.c */ extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, @@ -161,12 +27,6 @@ extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const unsigned char *name, int len); -#define S_SHIFT 12 -extern const unsigned char xfs_mode_to_ftype[]; - -extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, - __uint8_t filetype); - /* xfs_dir2_block.c */ extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 5079e051ef08..974d62e677f4 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -32,7 +30,6 @@ #include "xfs_dir2.h" #include "xfs_dir2_priv.h" #include "xfs_trace.h" -#include "xfs_dinode.h" /* * Prototypes for internal functions. @@ -455,13 +452,11 @@ xfs_dir2_sf_addname_hard( xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ - struct xfs_mount *mp; /* * Copy the old directory to the stack buffer. */ dp = args->dp; - mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_d.di_size; @@ -542,7 +537,6 @@ xfs_dir2_sf_addname_pick( xfs_inode_t *dp; /* incore directory inode */ int holefit; /* found hole it will fit in */ int i; /* entry number */ - xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_data_aoff_t offset; /* data block offset */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ @@ -550,7 +544,6 @@ xfs_dir2_sf_addname_pick( int used; /* data bytes used */ dp = args->dp; - mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = dp->d_ops->data_entsize(args->namelen); @@ -616,10 +609,8 @@ xfs_dir2_sf_check( int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - struct xfs_mount *mp; dp = args->dp; - mp = dp->i_mount; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; offset = dp->d_ops->data_first_offset; @@ -1016,12 +1007,10 @@ xfs_dir2_sf_toino4( int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ - struct xfs_mount *mp; trace_xfs_dir2_sf_toino4(args); dp = args->dp; - mp = dp->i_mount; /* * Copy the old directory to the buffer. @@ -1094,12 +1083,10 @@ xfs_dir2_sf_toino8( int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ - struct xfs_mount *mp; trace_xfs_dir2_sf_toino8(args); dp = args->dp; - mp = dp->i_mount; /* * Copy the old directory to the buffer. diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index bb969337efc8..6fbf2d853a54 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -22,8 +22,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_quota.h" diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 7e42bba9a420..8eb718979383 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -34,6 +34,1077 @@ struct xfs_buf; struct xfs_ifork; /* + * Super block + * Fits into a sector-sized buffer at address 0 of each allocation group. + * Only the first of these is ever updated except during growfs. + */ +#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ +#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ +#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ +#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ +#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ +#define XFS_SB_VERSION_NUMBITS 0x000f +#define XFS_SB_VERSION_ALLFBITS 0xfff0 +#define XFS_SB_VERSION_ATTRBIT 0x0010 +#define XFS_SB_VERSION_NLINKBIT 0x0020 +#define XFS_SB_VERSION_QUOTABIT 0x0040 +#define XFS_SB_VERSION_ALIGNBIT 0x0080 +#define XFS_SB_VERSION_DALIGNBIT 0x0100 +#define XFS_SB_VERSION_SHAREDBIT 0x0200 +#define XFS_SB_VERSION_LOGV2BIT 0x0400 +#define XFS_SB_VERSION_SECTORBIT 0x0800 +#define XFS_SB_VERSION_EXTFLGBIT 0x1000 +#define XFS_SB_VERSION_DIRV2BIT 0x2000 +#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ +#define XFS_SB_VERSION_MOREBITSBIT 0x8000 + +/* + * Supported feature bit list is just all bits in the versionnum field because + * we've used them all up and understand them all. Except, of course, for the + * shared superblock bit, which nobody knows what it does and so is unsupported. + */ +#define XFS_SB_VERSION_OKBITS \ + ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ + ~XFS_SB_VERSION_SHAREDBIT) + +/* + * There are two words to hold XFS "feature" bits: the original + * word, sb_versionnum, and sb_features2. Whenever a bit is set in + * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set. + * + * These defines represent bits in sb_features2. + */ +#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 +#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ +#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 +#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ +#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ +#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ +#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ +#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ + +#define XFS_SB_VERSION2_OKBITS \ + (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_FTYPE) + +/* + * Superblock - in core version. Must match the ondisk version below. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_sb { + __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __uint32_t sb_blocksize; /* logical block size, bytes */ + xfs_rfsblock_t sb_dblocks; /* number of data blocks */ + xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ + xfs_rtblock_t sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + xfs_fsblock_t sb_logstart; /* starting block of log if internal */ + xfs_ino_t sb_rootino; /* root inode number */ + xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ + xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ + xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */ + xfs_agblock_t sb_agblocks; /* size of an allocation group */ + xfs_agnumber_t sb_agcount; /* number of allocation groups */ + xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ + xfs_extlen_t sb_logblocks; /* number of log blocks */ + __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ + __uint16_t sb_sectsize; /* volume sector size, bytes */ + __uint16_t sb_inodesize; /* inode size, bytes */ + __uint16_t sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __uint8_t sb_blocklog; /* log2 of sb_blocksize */ + __uint8_t sb_sectlog; /* log2 of sb_sectsize */ + __uint8_t sb_inodelog; /* log2 of sb_inodesize */ + __uint8_t sb_inopblog; /* log2 of sb_inopblock */ + __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __uint8_t sb_rextslog; /* log2 of sb_rextents */ + __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ + __uint8_t sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __uint64_t sb_icount; /* allocated inodes */ + __uint64_t sb_ifree; /* free inodes */ + __uint64_t sb_fdblocks; /* free data blocks */ + __uint64_t sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + xfs_ino_t sb_uquotino; /* user quota inode */ + xfs_ino_t sb_gquotino; /* group quota inode */ + __uint16_t sb_qflags; /* quota flags */ + __uint8_t sb_flags; /* misc. flags */ + __uint8_t sb_shared_vn; /* shared version number */ + xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __uint32_t sb_unit; /* stripe or raid unit */ + __uint32_t sb_width; /* stripe or raid width */ + __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ + __uint8_t sb_logsectlog; /* log2 of the log sector size */ + __uint16_t sb_logsectsize; /* sector size for the log, bytes */ + __uint32_t sb_logsunit; /* stripe unit size for the log */ + __uint32_t sb_features2; /* additional feature bits */ + + /* + * bad features2 field as a result of failing to pad the sb structure to + * 64 bits. Some machines will be using this field for features2 bits. + * Easiest just to mark it bad and not use it for anything else. + * + * This is not kept up to date in memory; it is always overwritten by + * the value in sb_features2 when formatting the incore superblock to + * the disk buffer. + */ + __uint32_t sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __uint32_t sb_features_compat; + __uint32_t sb_features_ro_compat; + __uint32_t sb_features_incompat; + __uint32_t sb_features_log_incompat; + + __uint32_t sb_crc; /* superblock crc */ + __uint32_t sb_pad; + + xfs_ino_t sb_pquotino; /* project quota inode */ + xfs_lsn_t sb_lsn; /* last write sequence */ + + /* must be padded to 64 bit alignment */ +} xfs_sb_t; + +#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) + +/* + * Superblock - on disk version. Must match the in core version above. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_dsb { + __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __be32 sb_blocksize; /* logical block size, bytes */ + __be64 sb_dblocks; /* number of data blocks */ + __be64 sb_rblocks; /* number of realtime blocks */ + __be64 sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + __be64 sb_logstart; /* starting block of log if internal */ + __be64 sb_rootino; /* root inode number */ + __be64 sb_rbmino; /* bitmap inode for realtime extents */ + __be64 sb_rsumino; /* summary inode for rt bitmap */ + __be32 sb_rextsize; /* realtime extent size, blocks */ + __be32 sb_agblocks; /* size of an allocation group */ + __be32 sb_agcount; /* number of allocation groups */ + __be32 sb_rbmblocks; /* number of rt bitmap blocks */ + __be32 sb_logblocks; /* number of log blocks */ + __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ + __be16 sb_sectsize; /* volume sector size, bytes */ + __be16 sb_inodesize; /* inode size, bytes */ + __be16 sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __u8 sb_blocklog; /* log2 of sb_blocksize */ + __u8 sb_sectlog; /* log2 of sb_sectsize */ + __u8 sb_inodelog; /* log2 of sb_inodesize */ + __u8 sb_inopblog; /* log2 of sb_inopblock */ + __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __u8 sb_rextslog; /* log2 of sb_rextents */ + __u8 sb_inprogress; /* mkfs is in progress, don't mount */ + __u8 sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __be64 sb_icount; /* allocated inodes */ + __be64 sb_ifree; /* free inodes */ + __be64 sb_fdblocks; /* free data blocks */ + __be64 sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + __be64 sb_uquotino; /* user quota inode */ + __be64 sb_gquotino; /* group quota inode */ + __be16 sb_qflags; /* quota flags */ + __u8 sb_flags; /* misc. flags */ + __u8 sb_shared_vn; /* shared version number */ + __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __be32 sb_unit; /* stripe or raid unit */ + __be32 sb_width; /* stripe or raid width */ + __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ + __u8 sb_logsectlog; /* log2 of the log sector size */ + __be16 sb_logsectsize; /* sector size for the log, bytes */ + __be32 sb_logsunit; /* stripe unit size for the log */ + __be32 sb_features2; /* additional feature bits */ + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __be32 sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __be32 sb_features_compat; + __be32 sb_features_ro_compat; + __be32 sb_features_incompat; + __be32 sb_features_log_incompat; + + __le32 sb_crc; /* superblock crc */ + __be32 sb_pad; + + __be64 sb_pquotino; /* project quota inode */ + __be64 sb_lsn; /* last write sequence */ + + /* must be padded to 64 bit alignment */ +} xfs_dsb_t; + +/* + * Sequence number values for the fields. + */ +typedef enum { + XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, + XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, + XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, + XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, + XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, + XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, + XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, + XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, + XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, + XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, + XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, + XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, + XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, + XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, + XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, + XFS_SBS_PQUOTINO, XFS_SBS_LSN, + XFS_SBS_FIELDCOUNT +} xfs_sb_field_t; + +/* + * Mask values, defined based on the xfs_sb_field_t values. + * Only define the ones we're using. + */ +#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) +#define XFS_SB_UUID XFS_SB_MVAL(UUID) +#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) +#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) +#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) +#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) +#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) +#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) +#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) +#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) +#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) +#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) +#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) +#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) +#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) +#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) +#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \ + XFS_SB_MVAL(BAD_FEATURES2)) +#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) +#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) +#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) +#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) +#define XFS_SB_CRC XFS_SB_MVAL(CRC) +#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) +#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) +#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) +#define XFS_SB_MOD_BITS \ + (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ + XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ + XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ + XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ + XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \ + XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \ + XFS_SB_PQUOTINO) + + +/* + * Misc. Flags - warning - these will be cleared by xfs_repair unless + * a feature bit is set when the flag is used. + */ +#define XFS_SBF_NOFLAGS 0x00 /* no flags set */ +#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */ + +/* + * define max. shared version we can interoperate with + */ +#define XFS_SB_MAX_SHARED_VN 0 + +#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) + +/* + * The first XFS version we support is a v4 superblock with V2 directories. + */ +static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) +{ + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; + + /* check for unknown features in the fs */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; + + return true; +} + +static inline bool xfs_sb_good_version(struct xfs_sb *sbp) +{ + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return true; + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + return xfs_sb_good_v4_features(sbp); + return false; +} + +/* + * Detect a mismatched features2 field. Older kernels read/wrote + * this into the wrong slot, so to be safe we keep them in sync. + */ +static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) +{ + return sbp->sb_bad_features2 != sbp->sb_features2; +} + +static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); +} + +static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; +} + +static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); +} + +static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; +} + +static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); +} + +static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); +} + +static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); +} + +static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); +} + +static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); +} + +static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) +{ + return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); +} + +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); +} + +/* + * sb_features2 bit version macros. + */ +static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); +} + +static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); +} + +static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; +} + +static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) +{ + sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + if (!sbp->sb_features2) + sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; +} + +static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); +} + +static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) +{ + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; +} + +/* + * Extended v5 superblock feature masks. These are to be used for new v5 + * superblock features only. + * + * Compat features are new features that old kernels will not notice or affect + * and so can mount read-write without issues. + * + * RO-Compat (read only) are features that old kernels can read but will break + * if they write. Hence only read-only mounts of such filesystems are allowed on + * kernels that don't support the feature bit. + * + * InCompat features are features which old kernels will not understand and so + * must not mount. + * + * Log-InCompat features are for changes to log formats or new transactions that + * can't be replayed on older kernels. The fields are set when the filesystem is + * mounted, and a clean unmount clears the fields. + */ +#define XFS_SB_FEAT_COMPAT_ALL 0 +#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL +static inline bool +xfs_sb_has_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_compat & feature) != 0; +} + +#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_ALL \ + (XFS_SB_FEAT_RO_COMPAT_FINOBT) +#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL +static inline bool +xfs_sb_has_ro_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_ro_compat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_ALL \ + (XFS_SB_FEAT_INCOMPAT_FTYPE) + +#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL +static inline bool +xfs_sb_has_incompat_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_incompat & feature) != 0; +} + +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL +static inline bool +xfs_sb_has_incompat_log_feature( + struct xfs_sb *sbp, + __uint32_t feature) +{ + return (sbp->sb_features_log_incompat & feature) != 0; +} + +/* + * V5 superblock specific feature checks + */ +static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); +} + +static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) +{ + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); +} + +/* + * end of superblock version macros + */ + +static inline bool +xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) +{ + return (ino == sbp->sb_uquotino || + ino == sbp->sb_gquotino || + ino == sbp->sb_pquotino); +} + +#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ +#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) + +#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) +#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ + xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) +#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ + XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) + +/* + * File system sector to basic block conversions. + */ +#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) + +/* + * File system block to basic block conversions. + */ +#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log) +#define XFS_BB_TO_FSB(mp,bb) \ + (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) +#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) + +/* + * File system block to byte conversions. + */ +#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSB(mp,b) \ + ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) +#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) + +/* + * Allocation group header + * + * This is divided into three structures, placed in sequential 512-byte + * buffers after a copy of the superblock (also in a 512-byte buffer). + */ +#define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ +#define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ +#define XFS_AGF_VERSION 1 +#define XFS_AGI_VERSION 1 + +#define XFS_AGF_GOOD_VERSION(v) ((v) == XFS_AGF_VERSION) +#define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) + +/* + * Btree number 0 is bno, 1 is cnt. This value gives the size of the + * arrays below. + */ +#define XFS_BTNUM_AGF ((int)XFS_BTNUM_CNTi + 1) + +/* + * The second word of agf_levels in the first a.g. overlaps the EFS + * superblock's magic number. Since the magic numbers valid for EFS + * are > 64k, our value cannot be confused for an EFS superblock's. + */ + +typedef struct xfs_agf { + /* + * Common allocation group header information + */ + __be32 agf_magicnum; /* magic number == XFS_AGF_MAGIC */ + __be32 agf_versionnum; /* header version == XFS_AGF_VERSION */ + __be32 agf_seqno; /* sequence # starting from 0 */ + __be32 agf_length; /* size in blocks of a.g. */ + /* + * Freespace information + */ + __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ + __be32 agf_spare0; /* spare field */ + __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ + __be32 agf_spare1; /* spare field */ + + __be32 agf_flfirst; /* first freelist block's index */ + __be32 agf_fllast; /* last freelist block's index */ + __be32 agf_flcount; /* count of blocks in freelist */ + __be32 agf_freeblks; /* total free blocks */ + + __be32 agf_longest; /* longest free space */ + __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ + uuid_t agf_uuid; /* uuid of filesystem */ + + /* + * reserve some contiguous space for future logged fields before we add + * the unlogged fields. This makes the range logging via flags and + * structure offsets much simpler. + */ + __be64 agf_spare64[16]; + + /* unlogged fields, written during buffer writeback. */ + __be64 agf_lsn; /* last write sequence */ + __be32 agf_crc; /* crc of agf sector */ + __be32 agf_spare2; + + /* structure must be padded to 64 bit alignment */ +} xfs_agf_t; + +#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) + +#define XFS_AGF_MAGICNUM 0x00000001 +#define XFS_AGF_VERSIONNUM 0x00000002 +#define XFS_AGF_SEQNO 0x00000004 +#define XFS_AGF_LENGTH 0x00000008 +#define XFS_AGF_ROOTS 0x00000010 +#define XFS_AGF_LEVELS 0x00000020 +#define XFS_AGF_FLFIRST 0x00000040 +#define XFS_AGF_FLLAST 0x00000080 +#define XFS_AGF_FLCOUNT 0x00000100 +#define XFS_AGF_FREEBLKS 0x00000200 +#define XFS_AGF_LONGEST 0x00000400 +#define XFS_AGF_BTREEBLKS 0x00000800 +#define XFS_AGF_UUID 0x00001000 +#define XFS_AGF_NUM_BITS 13 +#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) + +#define XFS_AGF_FLAGS \ + { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ + { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ + { XFS_AGF_SEQNO, "SEQNO" }, \ + { XFS_AGF_LENGTH, "LENGTH" }, \ + { XFS_AGF_ROOTS, "ROOTS" }, \ + { XFS_AGF_LEVELS, "LEVELS" }, \ + { XFS_AGF_FLFIRST, "FLFIRST" }, \ + { XFS_AGF_FLLAST, "FLLAST" }, \ + { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ + { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ + { XFS_AGF_LONGEST, "LONGEST" }, \ + { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ + { XFS_AGF_UUID, "UUID" } + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) +#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) + +/* + * Size of the unlinked inode hash table in the agi. + */ +#define XFS_AGI_UNLINKED_BUCKETS 64 + +typedef struct xfs_agi { + /* + * Common allocation group header information + */ + __be32 agi_magicnum; /* magic number == XFS_AGI_MAGIC */ + __be32 agi_versionnum; /* header version == XFS_AGI_VERSION */ + __be32 agi_seqno; /* sequence # starting from 0 */ + __be32 agi_length; /* size in blocks of a.g. */ + /* + * Inode information + * Inodes are mapped by interpreting the inode number, so no + * mapping data is needed here. + */ + __be32 agi_count; /* count of allocated inodes */ + __be32 agi_root; /* root of inode btree */ + __be32 agi_level; /* levels in inode btree */ + __be32 agi_freecount; /* number of free inodes */ + + __be32 agi_newino; /* new inode just allocated */ + __be32 agi_dirino; /* last directory inode chunk */ + /* + * Hash table of inodes which have been unlinked but are + * still being referenced. + */ + __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; + /* + * This marks the end of logging region 1 and start of logging region 2. + */ + uuid_t agi_uuid; /* uuid of filesystem */ + __be32 agi_crc; /* crc of agi sector */ + __be32 agi_pad32; + __be64 agi_lsn; /* last write sequence */ + + __be32 agi_free_root; /* root of the free inode btree */ + __be32 agi_free_level;/* levels in free inode btree */ + + /* structure must be padded to 64 bit alignment */ +} xfs_agi_t; + +#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) + +#define XFS_AGI_MAGICNUM (1 << 0) +#define XFS_AGI_VERSIONNUM (1 << 1) +#define XFS_AGI_SEQNO (1 << 2) +#define XFS_AGI_LENGTH (1 << 3) +#define XFS_AGI_COUNT (1 << 4) +#define XFS_AGI_ROOT (1 << 5) +#define XFS_AGI_LEVEL (1 << 6) +#define XFS_AGI_FREECOUNT (1 << 7) +#define XFS_AGI_NEWINO (1 << 8) +#define XFS_AGI_DIRINO (1 << 9) +#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ +#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1 << 11) +#define XFS_AGI_FREE_LEVEL (1 << 12) +#define XFS_AGI_NUM_BITS_R2 13 + +/* disk block (xfs_daddr_t) in the AG */ +#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) +#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) + +/* + * The third a.g. block contains the a.g. freelist, an array + * of block pointers to blocks owned by the allocation btree code. + */ +#define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) +#define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) + +#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ + (__be32 *)(bp)->b_addr) + +/* + * Size of the AGFL. For CRC-enabled filesystes we steal a couple of + * slots in the beginning of the block for a proper header with the + * location information and CRC. + */ +#define XFS_AGFL_SIZE(mp) \ + (((mp)->m_sb.sb_sectsize - \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + sizeof(struct xfs_agfl) : 0)) / \ + sizeof(xfs_agblock_t)) + +typedef struct xfs_agfl { + __be32 agfl_magicnum; + __be32 agfl_seqno; + uuid_t agfl_uuid; + __be64 agfl_lsn; + __be32 agfl_crc; + __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ +} xfs_agfl_t; + +#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) + + +#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) +#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ + (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) +#define XFS_MIN_FREELIST(a,mp) \ + (XFS_MIN_FREELIST_RAW( \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ + be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) +#define XFS_MIN_FREELIST_PAG(pag,mp) \ + (XFS_MIN_FREELIST_RAW( \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) + +#define XFS_AGB_TO_FSB(mp,agno,agbno) \ + (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) +#define XFS_FSB_TO_AGNO(mp,fsbno) \ + ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) +#define XFS_FSB_TO_AGBNO(mp,fsbno) \ + ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog))) +#define XFS_AGB_TO_DADDR(mp,agno,agbno) \ + ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ + (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) +#define XFS_AG_DADDR(mp,agno,d) (XFS_AGB_TO_DADDR(mp, agno, 0) + (d)) + +/* + * For checking for bad ranges of xfs_daddr_t's, covering multiple + * allocation groups or a single xfs_daddr_t that's a superblock copy. + */ +#define XFS_AG_CHECK_DADDR(mp,d,len) \ + ((len) == 1 ? \ + ASSERT((d) == XFS_SB_DADDR || \ + xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ + ASSERT(xfs_daddr_to_agno(mp, d) == \ + xfs_daddr_to_agno(mp, (d) + (len) - 1))) + +typedef struct xfs_timestamp { + __be32 t_sec; /* timestamp seconds */ + __be32 t_nsec; /* timestamp nanoseconds */ +} xfs_timestamp_t; + +/* + * On-disk inode structure. + * + * This is just the header or "dinode core", the inode is expanded to fill a + * variable size the leftover area split into a data and an attribute fork. + * The format of the data and attribute fork depends on the format of the + * inode as indicated by di_format and di_aformat. To access the data and + * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros + * below. + * + * There is a very similar struct icdinode in xfs_inode which matches the + * layout of the first 96 bytes of this structure, but is kept in native + * format instead of big endian. + * + * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed + * padding field for v3 inodes. + */ +#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ +#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) +typedef struct xfs_dinode { + __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __be16 di_mode; /* mode and type of file */ + __u8 di_version; /* inode version */ + __u8 di_format; /* format of di_c data */ + __be16 di_onlink; /* old number of links to file */ + __be32 di_uid; /* owner's user id */ + __be32 di_gid; /* owner's group id */ + __be32 di_nlink; /* number of links to file */ + __be16 di_projid_lo; /* lower part of owner's project id */ + __be16 di_projid_hi; /* higher part owner's project id */ + __u8 di_pad[6]; /* unused, zeroed space */ + __be16 di_flushiter; /* incremented on flush */ + xfs_timestamp_t di_atime; /* time last accessed */ + xfs_timestamp_t di_mtime; /* time last modified */ + xfs_timestamp_t di_ctime; /* time created/inode modified */ + __be64 di_size; /* number of bytes in file */ + __be64 di_nblocks; /* # of direct & btree blocks used */ + __be32 di_extsize; /* basic/minimum extent size for file */ + __be32 di_nextents; /* number of extents in data fork */ + __be16 di_anextents; /* number of extents in attribute fork*/ + __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ + __s8 di_aformat; /* format of attr fork's data */ + __be32 di_dmevmask; /* DMIG event mask */ + __be16 di_dmstate; /* DMIG state info */ + __be16 di_flags; /* random flags, XFS_DIFLAG_... */ + __be32 di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + __be32 di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __le32 di_crc; /* CRC of the inode */ + __be64 di_changecount; /* number of attribute changes */ + __be64 di_lsn; /* flush sequence */ + __be64 di_flags2; /* more random flags */ + __u8 di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_timestamp_t di_crtime; /* time created */ + __be64 di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_dinode_t; + +#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) + +#define DI_MAX_FLUSH 0xffff + +/* + * Size of the core inode on disk. Version 1 and 2 inodes have + * the same size, but version 3 has grown a few additional fields. + */ +static inline uint xfs_dinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_dinode); + return offsetof(struct xfs_dinode, di_crc); +} + +/* + * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. + * Since the pathconf interface is signed, we use 2^31 - 1 instead. + * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. + */ +#define XFS_MAXLINK ((1U << 31) - 1U) +#define XFS_MAXLINK_1 65535U + +/* + * Values for di_format + */ +typedef enum xfs_dinode_fmt { + XFS_DINODE_FMT_DEV, /* xfs_dev_t */ + XFS_DINODE_FMT_LOCAL, /* bulk data */ + XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ + XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ + XFS_DINODE_FMT_UUID /* uuid_t */ +} xfs_dinode_fmt_t; + +/* + * Inode minimum and maximum sizes. + */ +#define XFS_DINODE_MIN_LOG 8 +#define XFS_DINODE_MAX_LOG 11 +#define XFS_DINODE_MIN_SIZE (1 << XFS_DINODE_MIN_LOG) +#define XFS_DINODE_MAX_SIZE (1 << XFS_DINODE_MAX_LOG) + +/* + * Inode size for given fs. + */ +#define XFS_LITINO(mp, version) \ + ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) + +/* + * Inode data & attribute fork sizes, per inode. + */ +#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) +#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) + +#define XFS_DFORK_DSIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp, (dip)->di_version)) +#define XFS_DFORK_ASIZE(dip,mp) \ + (XFS_DFORK_Q(dip) ? \ + XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ + 0) +#define XFS_DFORK_SIZE(dip,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_DFORK_DSIZE(dip, mp) : \ + XFS_DFORK_ASIZE(dip, mp)) + +/* + * Return pointers to the data or attribute forks. + */ +#define XFS_DFORK_DPTR(dip) \ + ((char *)dip + xfs_dinode_size(dip->di_version)) +#define XFS_DFORK_APTR(dip) \ + (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) +#define XFS_DFORK_PTR(dip,w) \ + ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) + +#define XFS_DFORK_FORMAT(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + (dip)->di_format : \ + (dip)->di_aformat) +#define XFS_DFORK_NEXTENTS(dip,w) \ + ((w) == XFS_DATA_FORK ? \ + be32_to_cpu((dip)->di_nextents) : \ + be16_to_cpu((dip)->di_anextents)) + +/* + * For block and character special files the 32bit dev_t is stored at the + * beginning of the data fork. + */ +static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) +{ + return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); +} + +static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) +{ + *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); +} + +/* + * Values for di_flags + * There should be a one-to-one correspondence between these flags and the + * XFS_XFLAG_s. + */ +#define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ +#define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ +#define XFS_DIFLAG_NEWRTBM_BIT 2 /* for rtbitmap inode, new format */ +#define XFS_DIFLAG_IMMUTABLE_BIT 3 /* inode is immutable */ +#define XFS_DIFLAG_APPEND_BIT 4 /* inode is append-only */ +#define XFS_DIFLAG_SYNC_BIT 5 /* inode is written synchronously */ +#define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ +#define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ +#define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ +#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ +#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ +#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ +#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ +#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) +#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) +#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) +#define XFS_DIFLAG_IMMUTABLE (1 << XFS_DIFLAG_IMMUTABLE_BIT) +#define XFS_DIFLAG_APPEND (1 << XFS_DIFLAG_APPEND_BIT) +#define XFS_DIFLAG_SYNC (1 << XFS_DIFLAG_SYNC_BIT) +#define XFS_DIFLAG_NOATIME (1 << XFS_DIFLAG_NOATIME_BIT) +#define XFS_DIFLAG_NODUMP (1 << XFS_DIFLAG_NODUMP_BIT) +#define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) +#define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) +#define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) +#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) +#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) +#define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) +#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) + +#define XFS_DIFLAG_ANY \ + (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ + XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ + XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ + XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) + +/* + * Inode number format: + * low inopblog bits - offset in block + * next agblklog bits - block number in ag + * next agno_log bits - ag number + * high agno_log-agblklog-inopblog bits - 0 + */ +#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) +#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog +#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog +#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log +#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log +#define XFS_INO_BITS(mp) \ + XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp) +#define XFS_INO_TO_AGNO(mp,i) \ + ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGINO(mp,i) \ + ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp))) +#define XFS_INO_TO_AGBNO(mp,i) \ + (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \ + XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp))) +#define XFS_INO_TO_OFFSET(mp,i) \ + ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_INO_TO_FSB(mp,i) \ + XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i)) +#define XFS_AGINO_TO_INO(mp,a,i) \ + (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i)) +#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp)) +#define XFS_AGINO_TO_OFFSET(mp,i) \ + ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) +#define XFS_OFFBNO_TO_AGINO(mp,b,o) \ + ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) + +#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) +#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) + +/* * RealTime Device format definitions */ @@ -413,4 +1484,40 @@ struct xfs_btree_block { #define XFS_BTREE_LBLOCK_CRC_OFF \ offsetof(struct xfs_btree_block, bb_u.l.bb_crc) +/* + * On-disk XFS access control list structure. + */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; + +struct xfs_acl { + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; +}; + +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + +/* On-disk XFS extended attribute names */ +#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" +#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) +#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) + #endif /* __XFS_FORMAT_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 18dc721ca19f..18dc721ca19f 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 23dcb72fc5e6..116ef1ddb3e3 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -22,9 +22,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -39,7 +37,6 @@ #include "xfs_buf_item.h" #include "xfs_icreate_item.h" #include "xfs_icache.h" -#include "xfs_dinode.h" #include "xfs_trace.h" @@ -48,12 +45,12 @@ */ static inline int xfs_ialloc_cluster_alignment( - xfs_alloc_arg_t *args) + struct xfs_mount *mp) { - if (xfs_sb_version_hasalign(&args->mp->m_sb) && - args->mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size)) - return args->mp->m_sb.sb_inoalignmt; + if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + return mp->m_sb.sb_inoalignmt; return 1; } @@ -412,7 +409,7 @@ xfs_ialloc_ag_alloc( * but not to use them in the actual exact allocation. */ args.alignment = 1; - args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; + args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1; /* Allow space for the inode btree to split. */ args.minleft = args.mp->m_in_maxlevels - 1; @@ -448,7 +445,7 @@ xfs_ialloc_ag_alloc( args.alignment = args.mp->m_dalign; isaligned = 1; } else - args.alignment = xfs_ialloc_cluster_alignment(&args); + args.alignment = xfs_ialloc_cluster_alignment(args.mp); /* * Need to figure out where to allocate the inode blocks. * Ideally they should be spaced out through the a.g. @@ -477,7 +474,7 @@ xfs_ialloc_ag_alloc( args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); - args.alignment = xfs_ialloc_cluster_alignment(&args); + args.alignment = xfs_ialloc_cluster_alignment(args.mp); if ((error = xfs_alloc_vextent(&args))) return error; } @@ -632,10 +629,24 @@ xfs_ialloc_ag_select( } /* - * Is there enough free space for the file plus a block of - * inodes? (if we need to allocate some)? + * Check that there is enough free space for the file plus a + * chunk of inodes if we need to allocate some. If this is the + * first pass across the AGs, take into account the potential + * space needed for alignment of inode chunks when checking the + * longest contiguous free space in the AG - this prevents us + * from getting ENOSPC because we have free space larger than + * m_ialloc_blks but alignment constraints prevent us from using + * it. + * + * If we can't find an AG with space for full alignment slack to + * be taken into account, we must be near ENOSPC in all AGs. + * Hence we don't include alignment for the second pass and so + * if we fail allocation due to alignment issues then it is most + * likely a real ENOSPC condition. */ ineed = mp->m_ialloc_blks; + if (flags && ineed > 1) + ineed += xfs_ialloc_cluster_alignment(mp); longest = pag->pagf_longest; if (!longest) longest = pag->pagf_flcount > 0; @@ -1137,11 +1148,7 @@ xfs_dialloc_ag_update_inobt( XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && (rec.ir_freecount == frec->ir_freecount)); - error = xfs_inobt_update(cur, &rec); - if (error) - return error; - - return 0; + return xfs_inobt_update(cur, &rec); } /* diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 95ad1c002d60..100007d56449 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -160,4 +160,8 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); +int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, struct xfs_buf **bpp); + + #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index c9b06f30fe86..964c465ca69c 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f18fd2da49f7..002b6b3a1988 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_error.h" @@ -30,7 +28,6 @@ #include "xfs_icache.h" #include "xfs_trans.h" #include "xfs_ialloc.h" -#include "xfs_dinode.h" /* * Check that none of the inode's in the buffer have a next diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 6a00f7fed69d..0defbd02f62d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -22,9 +22,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -34,7 +31,6 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_attr_sf.h" -#include "xfs_dinode.h" kmem_zone_t *xfs_ifork_zone; diff --git a/fs/xfs/libxfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h deleted file mode 100644 index 4ff2278e147a..000000000000 --- a/fs/xfs/libxfs/xfs_inum.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_INUM_H__ -#define __XFS_INUM_H__ - -/* - * Inode number format: - * low inopblog bits - offset in block - * next agblklog bits - block number in ag - * next agno_log bits - ag number - * high agno_log-agblklog-inopblog bits - 0 - */ - -struct xfs_mount; - -#define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) -#define XFS_INO_OFFSET_BITS(mp) (mp)->m_sb.sb_inopblog -#define XFS_INO_AGBNO_BITS(mp) (mp)->m_sb.sb_agblklog -#define XFS_INO_AGINO_BITS(mp) (mp)->m_agino_log -#define XFS_INO_AGNO_BITS(mp) (mp)->m_agno_log -#define XFS_INO_BITS(mp) \ - XFS_INO_AGNO_BITS(mp) + XFS_INO_AGINO_BITS(mp) -#define XFS_INO_TO_AGNO(mp,i) \ - ((xfs_agnumber_t)((i) >> XFS_INO_AGINO_BITS(mp))) -#define XFS_INO_TO_AGINO(mp,i) \ - ((xfs_agino_t)(i) & XFS_INO_MASK(XFS_INO_AGINO_BITS(mp))) -#define XFS_INO_TO_AGBNO(mp,i) \ - (((xfs_agblock_t)(i) >> XFS_INO_OFFSET_BITS(mp)) & \ - XFS_INO_MASK(XFS_INO_AGBNO_BITS(mp))) -#define XFS_INO_TO_OFFSET(mp,i) \ - ((int)(i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) -#define XFS_INO_TO_FSB(mp,i) \ - XFS_AGB_TO_FSB(mp, XFS_INO_TO_AGNO(mp,i), XFS_INO_TO_AGBNO(mp,i)) -#define XFS_AGINO_TO_INO(mp,a,i) \ - (((xfs_ino_t)(a) << XFS_INO_AGINO_BITS(mp)) | (i)) -#define XFS_AGINO_TO_AGBNO(mp,i) ((i) >> XFS_INO_OFFSET_BITS(mp)) -#define XFS_AGINO_TO_OFFSET(mp,i) \ - ((i) & XFS_INO_MASK(XFS_INO_OFFSET_BITS(mp))) -#define XFS_OFFBNO_TO_AGINO(mp,b,o) \ - ((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o))) - -#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) -#define XFS_MAXINUMBER_32 ((xfs_ino_t)((1ULL << 32) - 1ULL)) - -#endif /* __XFS_INUM_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index aff12f2d4428..265314690415 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -361,7 +361,7 @@ typedef struct xfs_ictimestamp { /* * NOTE: This structure must be kept identical to struct xfs_dinode - * in xfs_dinode.h except for the endianness annotations. + * except for the endianness annotations. */ typedef struct xfs_icdinode { __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index ee7e0e80246b..c10597973333 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_ag.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_trans_space.h" diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 7c818f1e4484..9b59ffa1fc19 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" @@ -36,7 +34,6 @@ #include "xfs_trace.h" #include "xfs_buf.h" #include "xfs_icache.h" -#include "xfs_dinode.h" #include "xfs_rtalloc.h" diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 5f902fa7913f..b0a5fe95a3e2 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -23,7 +23,6 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_ialloc.h" @@ -33,7 +32,6 @@ #include "xfs_cksum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" -#include "xfs_dinode.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" @@ -42,69 +40,6 @@ * Physical superblock buffer manipulations. Shared with libxfs in userspace. */ -static const struct { - short offset; - short type; /* 0 = integer - * 1 = binary / string (no translation) - */ -} xfs_sb_info[] = { - { offsetof(xfs_sb_t, sb_magicnum), 0 }, - { offsetof(xfs_sb_t, sb_blocksize), 0 }, - { offsetof(xfs_sb_t, sb_dblocks), 0 }, - { offsetof(xfs_sb_t, sb_rblocks), 0 }, - { offsetof(xfs_sb_t, sb_rextents), 0 }, - { offsetof(xfs_sb_t, sb_uuid), 1 }, - { offsetof(xfs_sb_t, sb_logstart), 0 }, - { offsetof(xfs_sb_t, sb_rootino), 0 }, - { offsetof(xfs_sb_t, sb_rbmino), 0 }, - { offsetof(xfs_sb_t, sb_rsumino), 0 }, - { offsetof(xfs_sb_t, sb_rextsize), 0 }, - { offsetof(xfs_sb_t, sb_agblocks), 0 }, - { offsetof(xfs_sb_t, sb_agcount), 0 }, - { offsetof(xfs_sb_t, sb_rbmblocks), 0 }, - { offsetof(xfs_sb_t, sb_logblocks), 0 }, - { offsetof(xfs_sb_t, sb_versionnum), 0 }, - { offsetof(xfs_sb_t, sb_sectsize), 0 }, - { offsetof(xfs_sb_t, sb_inodesize), 0 }, - { offsetof(xfs_sb_t, sb_inopblock), 0 }, - { offsetof(xfs_sb_t, sb_fname[0]), 1 }, - { offsetof(xfs_sb_t, sb_blocklog), 0 }, - { offsetof(xfs_sb_t, sb_sectlog), 0 }, - { offsetof(xfs_sb_t, sb_inodelog), 0 }, - { offsetof(xfs_sb_t, sb_inopblog), 0 }, - { offsetof(xfs_sb_t, sb_agblklog), 0 }, - { offsetof(xfs_sb_t, sb_rextslog), 0 }, - { offsetof(xfs_sb_t, sb_inprogress), 0 }, - { offsetof(xfs_sb_t, sb_imax_pct), 0 }, - { offsetof(xfs_sb_t, sb_icount), 0 }, - { offsetof(xfs_sb_t, sb_ifree), 0 }, - { offsetof(xfs_sb_t, sb_fdblocks), 0 }, - { offsetof(xfs_sb_t, sb_frextents), 0 }, - { offsetof(xfs_sb_t, sb_uquotino), 0 }, - { offsetof(xfs_sb_t, sb_gquotino), 0 }, - { offsetof(xfs_sb_t, sb_qflags), 0 }, - { offsetof(xfs_sb_t, sb_flags), 0 }, - { offsetof(xfs_sb_t, sb_shared_vn), 0 }, - { offsetof(xfs_sb_t, sb_inoalignmt), 0 }, - { offsetof(xfs_sb_t, sb_unit), 0 }, - { offsetof(xfs_sb_t, sb_width), 0 }, - { offsetof(xfs_sb_t, sb_dirblklog), 0 }, - { offsetof(xfs_sb_t, sb_logsectlog), 0 }, - { offsetof(xfs_sb_t, sb_logsectsize), 0 }, - { offsetof(xfs_sb_t, sb_logsunit), 0 }, - { offsetof(xfs_sb_t, sb_features2), 0 }, - { offsetof(xfs_sb_t, sb_bad_features2), 0 }, - { offsetof(xfs_sb_t, sb_features_compat), 0 }, - { offsetof(xfs_sb_t, sb_features_ro_compat), 0 }, - { offsetof(xfs_sb_t, sb_features_incompat), 0 }, - { offsetof(xfs_sb_t, sb_features_log_incompat), 0 }, - { offsetof(xfs_sb_t, sb_crc), 0 }, - { offsetof(xfs_sb_t, sb_pad), 0 }, - { offsetof(xfs_sb_t, sb_pquotino), 0 }, - { offsetof(xfs_sb_t, sb_lsn), 0 }, - { sizeof(xfs_sb_t), 0 } -}; - /* * Reference counting access wrappers to the perag structures. * Because we never free per-ag structures, the only thing we @@ -463,58 +398,49 @@ xfs_sb_from_disk( __xfs_sb_from_disk(to, from, true); } -static inline void +static void xfs_sb_quota_to_disk( - xfs_dsb_t *to, - xfs_sb_t *from, - __int64_t *fields) + struct xfs_dsb *to, + struct xfs_sb *from) { __uint16_t qflags = from->sb_qflags; + to->sb_uquotino = cpu_to_be64(from->sb_uquotino); + if (xfs_sb_version_has_pquotino(from)) { + to->sb_qflags = cpu_to_be16(from->sb_qflags); + to->sb_gquotino = cpu_to_be64(from->sb_gquotino); + to->sb_pquotino = cpu_to_be64(from->sb_pquotino); + return; + } + /* - * We need to do these manipilations only if we are working - * with an older version of on-disk superblock. + * The in-core version of sb_qflags do not have XFS_OQUOTA_* + * flags, whereas the on-disk version does. So, convert incore + * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. */ - if (xfs_sb_version_has_pquotino(from)) - return; + qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | + XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); - if (*fields & XFS_SB_QFLAGS) { - /* - * The in-core version of sb_qflags do not have - * XFS_OQUOTA_* flags, whereas the on-disk version - * does. So, convert incore XFS_{PG}QUOTA_* flags - * to on-disk XFS_OQUOTA_* flags. - */ - qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | - XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); - - if (from->sb_qflags & - (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) - qflags |= XFS_OQUOTA_ENFD; - if (from->sb_qflags & - (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) - qflags |= XFS_OQUOTA_CHKD; - to->sb_qflags = cpu_to_be16(qflags); - *fields &= ~XFS_SB_QFLAGS; - } + if (from->sb_qflags & + (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) + qflags |= XFS_OQUOTA_ENFD; + if (from->sb_qflags & + (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) + qflags |= XFS_OQUOTA_CHKD; + to->sb_qflags = cpu_to_be16(qflags); /* - * GQUOTINO and PQUOTINO cannot be used together in versions of - * superblock that do not have pquotino. from->sb_flags tells us which - * quota is active and should be copied to disk. If neither are active, - * make sure we write NULLFSINO to the sb_gquotino field as a quota - * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature - * bit is set. + * GQUOTINO and PQUOTINO cannot be used together in versions + * of superblock that do not have pquotino. from->sb_flags + * tells us which quota is active and should be copied to + * disk. If neither are active, we should NULL the inode. * - * Note that we don't need to handle the sb_uquotino or sb_pquotino here - * as they do not require any translation. Hence the main sb field loop - * will write them appropriately from the in-core superblock. + * In all cases, the separate pquotino must remain 0 because it + * it beyond the "end" of the valid non-pquotino superblock. */ - if ((*fields & XFS_SB_GQUOTINO) && - (from->sb_qflags & XFS_GQUOTA_ACCT)) + if (from->sb_qflags & XFS_GQUOTA_ACCT) to->sb_gquotino = cpu_to_be64(from->sb_gquotino); - else if ((*fields & XFS_SB_PQUOTINO) && - (from->sb_qflags & XFS_PQUOTA_ACCT)) + else if (from->sb_qflags & XFS_PQUOTA_ACCT) to->sb_gquotino = cpu_to_be64(from->sb_pquotino); else { /* @@ -528,63 +454,78 @@ xfs_sb_quota_to_disk( to->sb_gquotino = cpu_to_be64(NULLFSINO); } - *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO); + to->sb_pquotino = 0; } -/* - * Copy in core superblock to ondisk one. - * - * The fields argument is mask of superblock fields to copy. - */ void xfs_sb_to_disk( - xfs_dsb_t *to, - xfs_sb_t *from, - __int64_t fields) + struct xfs_dsb *to, + struct xfs_sb *from) { - xfs_caddr_t to_ptr = (xfs_caddr_t)to; - xfs_caddr_t from_ptr = (xfs_caddr_t)from; - xfs_sb_field_t f; - int first; - int size; - - ASSERT(fields); - if (!fields) - return; + xfs_sb_quota_to_disk(to, from); - /* We should never write the crc here, it's updated in the IO path */ - fields &= ~XFS_SB_CRC; - - xfs_sb_quota_to_disk(to, from, &fields); - while (fields) { - f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); - first = xfs_sb_info[f].offset; - size = xfs_sb_info[f + 1].offset - first; - - ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); - - if (size == 1 || xfs_sb_info[f].type == 1) { - memcpy(to_ptr + first, from_ptr + first, size); - } else { - switch (size) { - case 2: - *(__be16 *)(to_ptr + first) = - cpu_to_be16(*(__u16 *)(from_ptr + first)); - break; - case 4: - *(__be32 *)(to_ptr + first) = - cpu_to_be32(*(__u32 *)(from_ptr + first)); - break; - case 8: - *(__be64 *)(to_ptr + first) = - cpu_to_be64(*(__u64 *)(from_ptr + first)); - break; - default: - ASSERT(0); - } - } + to->sb_magicnum = cpu_to_be32(from->sb_magicnum); + to->sb_blocksize = cpu_to_be32(from->sb_blocksize); + to->sb_dblocks = cpu_to_be64(from->sb_dblocks); + to->sb_rblocks = cpu_to_be64(from->sb_rblocks); + to->sb_rextents = cpu_to_be64(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = cpu_to_be64(from->sb_logstart); + to->sb_rootino = cpu_to_be64(from->sb_rootino); + to->sb_rbmino = cpu_to_be64(from->sb_rbmino); + to->sb_rsumino = cpu_to_be64(from->sb_rsumino); + to->sb_rextsize = cpu_to_be32(from->sb_rextsize); + to->sb_agblocks = cpu_to_be32(from->sb_agblocks); + to->sb_agcount = cpu_to_be32(from->sb_agcount); + to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks); + to->sb_logblocks = cpu_to_be32(from->sb_logblocks); + to->sb_versionnum = cpu_to_be16(from->sb_versionnum); + to->sb_sectsize = cpu_to_be16(from->sb_sectsize); + to->sb_inodesize = cpu_to_be16(from->sb_inodesize); + to->sb_inopblock = cpu_to_be16(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = cpu_to_be64(from->sb_icount); + to->sb_ifree = cpu_to_be64(from->sb_ifree); + to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks); + to->sb_frextents = cpu_to_be64(from->sb_frextents); - fields &= ~(1LL << f); + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt); + to->sb_unit = cpu_to_be32(from->sb_unit); + to->sb_width = cpu_to_be32(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize); + to->sb_logsunit = cpu_to_be32(from->sb_logsunit); + + /* + * We need to ensure that bad_features2 always matches features2. + * Hence we enforce that here rather than having to remember to do it + * everywhere else that updates features2. + */ + from->sb_bad_features2 = from->sb_features2; + to->sb_features2 = cpu_to_be32(from->sb_features2); + to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2); + + if (xfs_sb_version_hascrc(from)) { + to->sb_features_compat = cpu_to_be32(from->sb_features_compat); + to->sb_features_ro_compat = + cpu_to_be32(from->sb_features_ro_compat); + to->sb_features_incompat = + cpu_to_be32(from->sb_features_incompat); + to->sb_features_log_incompat = + cpu_to_be32(from->sb_features_log_incompat); + to->sb_pad = 0; + to->sb_lsn = cpu_to_be64(from->sb_lsn); } } @@ -818,42 +759,51 @@ xfs_initialize_perag_data( } /* - * xfs_mod_sb() can be used to copy arbitrary changes to the - * in-core superblock into the superblock buffer to be logged. - * It does not provide the higher level of locking that is - * needed to protect the in-core superblock from concurrent - * access. + * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock + * into the superblock buffer to be logged. It does not provide the higher + * level of locking that is needed to protect the in-core superblock from + * concurrent access. */ void -xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) +xfs_log_sb( + struct xfs_trans *tp) { - xfs_buf_t *bp; - int first; - int last; - xfs_mount_t *mp; - xfs_sb_field_t f; - - ASSERT(fields); - if (!fields) - return; - mp = tp->t_mountp; - bp = xfs_trans_getsb(tp, mp, 0); - first = sizeof(xfs_sb_t); - last = 0; - - /* translate/copy */ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); +} - /* find modified range */ - f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - last = xfs_sb_info[f + 1].offset - 1; +/* + * xfs_sync_sb + * + * Sync the superblock to disk. + * + * Note that the caller is responsible for checking the frozen state of the + * filesystem. This procedure uses the non-blocking transaction allocator and + * thus will allow modifications to a frozen fs. This is required because this + * code can be called during the process of freezing where use of the high-level + * allocator would deadlock. + */ +int +xfs_sync_sb( + struct xfs_mount *mp, + bool wait) +{ + struct xfs_trans *tp; + int error; - f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - first = xfs_sb_info[f].offset; + tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); - xfs_trans_log_buf(tp, bp, first, last); + xfs_log_sb(tp); + if (wait) + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp, 0); } diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 2e739708afd3..b25bb9a343f3 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -19,590 +19,6 @@ #define __XFS_SB_H__ /* - * Super block - * Fits into a sector-sized buffer at address 0 of each allocation group. - * Only the first of these is ever updated except during growfs. - */ - -struct xfs_buf; -struct xfs_mount; -struct xfs_trans; - -#define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ -#define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ -#define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ -#define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ -#define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ -#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ -#define XFS_SB_VERSION_NUMBITS 0x000f -#define XFS_SB_VERSION_ALLFBITS 0xfff0 -#define XFS_SB_VERSION_ATTRBIT 0x0010 -#define XFS_SB_VERSION_NLINKBIT 0x0020 -#define XFS_SB_VERSION_QUOTABIT 0x0040 -#define XFS_SB_VERSION_ALIGNBIT 0x0080 -#define XFS_SB_VERSION_DALIGNBIT 0x0100 -#define XFS_SB_VERSION_SHAREDBIT 0x0200 -#define XFS_SB_VERSION_LOGV2BIT 0x0400 -#define XFS_SB_VERSION_SECTORBIT 0x0800 -#define XFS_SB_VERSION_EXTFLGBIT 0x1000 -#define XFS_SB_VERSION_DIRV2BIT 0x2000 -#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ -#define XFS_SB_VERSION_MOREBITSBIT 0x8000 - -/* - * Supported feature bit list is just all bits in the versionnum field because - * we've used them all up and understand them all. Except, of course, for the - * shared superblock bit, which nobody knows what it does and so is unsupported. - */ -#define XFS_SB_VERSION_OKBITS \ - ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ - ~XFS_SB_VERSION_SHAREDBIT) - -/* - * There are two words to hold XFS "feature" bits: the original - * word, sb_versionnum, and sb_features2. Whenever a bit is set in - * sb_features2, the feature bit XFS_SB_VERSION_MOREBITSBIT must be set. - * - * These defines represent bits in sb_features2. - */ -#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 -#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ -#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 -#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ -#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ -#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ -#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ -#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ - -#define XFS_SB_VERSION2_OKBITS \ - (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ - XFS_SB_VERSION2_ATTR2BIT | \ - XFS_SB_VERSION2_PROJID32BIT | \ - XFS_SB_VERSION2_FTYPE) - -/* - * Superblock - in core version. Must match the ondisk version below. - * Must be padded to 64 bit alignment. - */ -typedef struct xfs_sb { - __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ - __uint32_t sb_blocksize; /* logical block size, bytes */ - xfs_rfsblock_t sb_dblocks; /* number of data blocks */ - xfs_rfsblock_t sb_rblocks; /* number of realtime blocks */ - xfs_rtblock_t sb_rextents; /* number of realtime extents */ - uuid_t sb_uuid; /* file system unique id */ - xfs_fsblock_t sb_logstart; /* starting block of log if internal */ - xfs_ino_t sb_rootino; /* root inode number */ - xfs_ino_t sb_rbmino; /* bitmap inode for realtime extents */ - xfs_ino_t sb_rsumino; /* summary inode for rt bitmap */ - xfs_agblock_t sb_rextsize; /* realtime extent size, blocks */ - xfs_agblock_t sb_agblocks; /* size of an allocation group */ - xfs_agnumber_t sb_agcount; /* number of allocation groups */ - xfs_extlen_t sb_rbmblocks; /* number of rt bitmap blocks */ - xfs_extlen_t sb_logblocks; /* number of log blocks */ - __uint16_t sb_versionnum; /* header version == XFS_SB_VERSION */ - __uint16_t sb_sectsize; /* volume sector size, bytes */ - __uint16_t sb_inodesize; /* inode size, bytes */ - __uint16_t sb_inopblock; /* inodes per block */ - char sb_fname[12]; /* file system name */ - __uint8_t sb_blocklog; /* log2 of sb_blocksize */ - __uint8_t sb_sectlog; /* log2 of sb_sectsize */ - __uint8_t sb_inodelog; /* log2 of sb_inodesize */ - __uint8_t sb_inopblog; /* log2 of sb_inopblock */ - __uint8_t sb_agblklog; /* log2 of sb_agblocks (rounded up) */ - __uint8_t sb_rextslog; /* log2 of sb_rextents */ - __uint8_t sb_inprogress; /* mkfs is in progress, don't mount */ - __uint8_t sb_imax_pct; /* max % of fs for inode space */ - /* statistics */ - /* - * These fields must remain contiguous. If you really - * want to change their layout, make sure you fix the - * code in xfs_trans_apply_sb_deltas(). - */ - __uint64_t sb_icount; /* allocated inodes */ - __uint64_t sb_ifree; /* free inodes */ - __uint64_t sb_fdblocks; /* free data blocks */ - __uint64_t sb_frextents; /* free realtime extents */ - /* - * End contiguous fields. - */ - xfs_ino_t sb_uquotino; /* user quota inode */ - xfs_ino_t sb_gquotino; /* group quota inode */ - __uint16_t sb_qflags; /* quota flags */ - __uint8_t sb_flags; /* misc. flags */ - __uint8_t sb_shared_vn; /* shared version number */ - xfs_extlen_t sb_inoalignmt; /* inode chunk alignment, fsblocks */ - __uint32_t sb_unit; /* stripe or raid unit */ - __uint32_t sb_width; /* stripe or raid width */ - __uint8_t sb_dirblklog; /* log2 of dir block size (fsbs) */ - __uint8_t sb_logsectlog; /* log2 of the log sector size */ - __uint16_t sb_logsectsize; /* sector size for the log, bytes */ - __uint32_t sb_logsunit; /* stripe unit size for the log */ - __uint32_t sb_features2; /* additional feature bits */ - - /* - * bad features2 field as a result of failing to pad the sb - * structure to 64 bits. Some machines will be using this field - * for features2 bits. Easiest just to mark it bad and not use - * it for anything else. - */ - __uint32_t sb_bad_features2; - - /* version 5 superblock fields start here */ - - /* feature masks */ - __uint32_t sb_features_compat; - __uint32_t sb_features_ro_compat; - __uint32_t sb_features_incompat; - __uint32_t sb_features_log_incompat; - - __uint32_t sb_crc; /* superblock crc */ - __uint32_t sb_pad; - - xfs_ino_t sb_pquotino; /* project quota inode */ - xfs_lsn_t sb_lsn; /* last write sequence */ - - /* must be padded to 64 bit alignment */ -} xfs_sb_t; - -#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) - -/* - * Superblock - on disk version. Must match the in core version above. - * Must be padded to 64 bit alignment. - */ -typedef struct xfs_dsb { - __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ - __be32 sb_blocksize; /* logical block size, bytes */ - __be64 sb_dblocks; /* number of data blocks */ - __be64 sb_rblocks; /* number of realtime blocks */ - __be64 sb_rextents; /* number of realtime extents */ - uuid_t sb_uuid; /* file system unique id */ - __be64 sb_logstart; /* starting block of log if internal */ - __be64 sb_rootino; /* root inode number */ - __be64 sb_rbmino; /* bitmap inode for realtime extents */ - __be64 sb_rsumino; /* summary inode for rt bitmap */ - __be32 sb_rextsize; /* realtime extent size, blocks */ - __be32 sb_agblocks; /* size of an allocation group */ - __be32 sb_agcount; /* number of allocation groups */ - __be32 sb_rbmblocks; /* number of rt bitmap blocks */ - __be32 sb_logblocks; /* number of log blocks */ - __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ - __be16 sb_sectsize; /* volume sector size, bytes */ - __be16 sb_inodesize; /* inode size, bytes */ - __be16 sb_inopblock; /* inodes per block */ - char sb_fname[12]; /* file system name */ - __u8 sb_blocklog; /* log2 of sb_blocksize */ - __u8 sb_sectlog; /* log2 of sb_sectsize */ - __u8 sb_inodelog; /* log2 of sb_inodesize */ - __u8 sb_inopblog; /* log2 of sb_inopblock */ - __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ - __u8 sb_rextslog; /* log2 of sb_rextents */ - __u8 sb_inprogress; /* mkfs is in progress, don't mount */ - __u8 sb_imax_pct; /* max % of fs for inode space */ - /* statistics */ - /* - * These fields must remain contiguous. If you really - * want to change their layout, make sure you fix the - * code in xfs_trans_apply_sb_deltas(). - */ - __be64 sb_icount; /* allocated inodes */ - __be64 sb_ifree; /* free inodes */ - __be64 sb_fdblocks; /* free data blocks */ - __be64 sb_frextents; /* free realtime extents */ - /* - * End contiguous fields. - */ - __be64 sb_uquotino; /* user quota inode */ - __be64 sb_gquotino; /* group quota inode */ - __be16 sb_qflags; /* quota flags */ - __u8 sb_flags; /* misc. flags */ - __u8 sb_shared_vn; /* shared version number */ - __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ - __be32 sb_unit; /* stripe or raid unit */ - __be32 sb_width; /* stripe or raid width */ - __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ - __u8 sb_logsectlog; /* log2 of the log sector size */ - __be16 sb_logsectsize; /* sector size for the log, bytes */ - __be32 sb_logsunit; /* stripe unit size for the log */ - __be32 sb_features2; /* additional feature bits */ - /* - * bad features2 field as a result of failing to pad the sb - * structure to 64 bits. Some machines will be using this field - * for features2 bits. Easiest just to mark it bad and not use - * it for anything else. - */ - __be32 sb_bad_features2; - - /* version 5 superblock fields start here */ - - /* feature masks */ - __be32 sb_features_compat; - __be32 sb_features_ro_compat; - __be32 sb_features_incompat; - __be32 sb_features_log_incompat; - - __le32 sb_crc; /* superblock crc */ - __be32 sb_pad; - - __be64 sb_pquotino; /* project quota inode */ - __be64 sb_lsn; /* last write sequence */ - - /* must be padded to 64 bit alignment */ -} xfs_dsb_t; - -/* - * Sequence number values for the fields. - */ -typedef enum { - XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, - XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, - XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, - XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, - XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, - XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, - XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, - XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, - XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, - XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, - XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, - XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, - XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, - XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, - XFS_SBS_PQUOTINO, XFS_SBS_LSN, - XFS_SBS_FIELDCOUNT -} xfs_sb_field_t; - -/* - * Mask values, defined based on the xfs_sb_field_t values. - * Only define the ones we're using. - */ -#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) -#define XFS_SB_UUID XFS_SB_MVAL(UUID) -#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) -#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) -#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) -#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) -#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) -#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) -#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) -#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) -#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) -#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) -#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) -#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) -#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) -#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) -#define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) -#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) -#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) -#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) -#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) -#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) -#define XFS_SB_CRC XFS_SB_MVAL(CRC) -#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) -#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) -#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) -#define XFS_SB_MOD_BITS \ - (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ - XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ - XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ - XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ - XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \ - XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \ - XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO) - - -/* - * Misc. Flags - warning - these will be cleared by xfs_repair unless - * a feature bit is set when the flag is used. - */ -#define XFS_SBF_NOFLAGS 0x00 /* no flags set */ -#define XFS_SBF_READONLY 0x01 /* only read-only mounts allowed */ - -/* - * define max. shared version we can interoperate with - */ -#define XFS_SB_MAX_SHARED_VN 0 - -#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) - -/* - * The first XFS version we support is a v4 superblock with V2 directories. - */ -static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) -{ - if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) - return false; - - /* check for unknown features in the fs */ - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) - return false; - - return true; -} - -static inline bool xfs_sb_good_version(struct xfs_sb *sbp) -{ - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) - return true; - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) - return xfs_sb_good_v4_features(sbp); - return false; -} - -/* - * Detect a mismatched features2 field. Older kernels read/wrote - * this into the wrong slot, so to be safe we keep them in sync. - */ -static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) -{ - return sbp->sb_bad_features2 != sbp->sb_features2; -} - -static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); -} - -static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) -{ - sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; -} - -static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); -} - -static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) -{ - sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; -} - -static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); -} - -static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); -} - -static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); -} - -static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); -} - -static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); -} - -static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); -} - -static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); -} - -/* - * sb_features2 bit version macros. - */ -static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); -} - -static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); -} - -static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) -{ - sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; - sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; - sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT; -} - -static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) -{ - sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; - sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; - if (!sbp->sb_features2) - sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; -} - -static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); -} - -static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) -{ - sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; - sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; - sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT; -} - -/* - * Extended v5 superblock feature masks. These are to be used for new v5 - * superblock features only. - * - * Compat features are new features that old kernels will not notice or affect - * and so can mount read-write without issues. - * - * RO-Compat (read only) are features that old kernels can read but will break - * if they write. Hence only read-only mounts of such filesystems are allowed on - * kernels that don't support the feature bit. - * - * InCompat features are features which old kernels will not understand and so - * must not mount. - * - * Log-InCompat features are for changes to log formats or new transactions that - * can't be replayed on older kernels. The fields are set when the filesystem is - * mounted, and a clean unmount clears the fields. - */ -#define XFS_SB_FEAT_COMPAT_ALL 0 -#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL -static inline bool -xfs_sb_has_compat_feature( - struct xfs_sb *sbp, - __uint32_t feature) -{ - return (sbp->sb_features_compat & feature) != 0; -} - -#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ -#define XFS_SB_FEAT_RO_COMPAT_ALL \ - (XFS_SB_FEAT_RO_COMPAT_FINOBT) -#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL -static inline bool -xfs_sb_has_ro_compat_feature( - struct xfs_sb *sbp, - __uint32_t feature) -{ - return (sbp->sb_features_ro_compat & feature) != 0; -} - -#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ -#define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE) - -#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL -static inline bool -xfs_sb_has_incompat_feature( - struct xfs_sb *sbp, - __uint32_t feature) -{ - return (sbp->sb_features_incompat & feature) != 0; -} - -#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 -#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL -static inline bool -xfs_sb_has_incompat_log_feature( - struct xfs_sb *sbp, - __uint32_t feature) -{ - return (sbp->sb_features_log_incompat & feature) != 0; -} - -/* - * V5 superblock specific feature checks - */ -static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); -} - -static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); -} - -/* - * end of superblock version macros - */ - -static inline bool -xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) -{ - return (ino == sbp->sb_uquotino || - ino == sbp->sb_gquotino || - ino == sbp->sb_pquotino); -} - -#define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ -#define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) - -#define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) -#define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ - xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) -#define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ - XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) - -/* - * File system sector to basic block conversions. - */ -#define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) - -/* - * File system block to basic block conversions. - */ -#define XFS_FSB_TO_BB(mp,fsbno) ((fsbno) << (mp)->m_blkbb_log) -#define XFS_BB_TO_FSB(mp,bb) \ - (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) -#define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) - -/* - * File system block to byte conversions. - */ -#define XFS_FSB_TO_B(mp,fsbno) ((xfs_fsize_t)(fsbno) << (mp)->m_sb.sb_blocklog) -#define XFS_B_TO_FSB(mp,b) \ - ((((__uint64_t)(b)) + (mp)->m_blockmask) >> (mp)->m_sb.sb_blocklog) -#define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) -#define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) - -/* * perag get/put wrappers for ref counting */ extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); @@ -611,11 +27,12 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, extern void xfs_perag_put(struct xfs_perag *pag); extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); -extern void xfs_sb_calc_crc(struct xfs_buf *); -extern void xfs_mod_sb(struct xfs_trans *, __int64_t); -extern void xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *); -extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); -extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); +extern void xfs_sb_calc_crc(struct xfs_buf *bp); +extern void xfs_log_sb(struct xfs_trans *tp); +extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); +extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); +extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); +extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 82404da2ca67..8dda4b321343 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -82,7 +82,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; #define XFS_TRANS_ATTR_RM 23 #define XFS_TRANS_ATTR_FLAG 24 #define XFS_TRANS_CLEAR_AGI_BUCKET 25 -#define XFS_TRANS_QM_SBCHANGE 26 +#define XFS_TRANS_SB_CHANGE 26 /* * Dummy entries since we use the transaction type to index into the * trans_type[] in xlog_recover_print_trans_head() @@ -95,17 +95,15 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; #define XFS_TRANS_QM_DQCLUSTER 32 #define XFS_TRANS_QM_QINOCREATE 33 #define XFS_TRANS_QM_QUOTAOFF_END 34 -#define XFS_TRANS_SB_UNIT 35 -#define XFS_TRANS_FSYNC_TS 36 -#define XFS_TRANS_GROWFSRT_ALLOC 37 -#define XFS_TRANS_GROWFSRT_ZERO 38 -#define XFS_TRANS_GROWFSRT_FREE 39 -#define XFS_TRANS_SWAPEXT 40 -#define XFS_TRANS_SB_COUNT 41 -#define XFS_TRANS_CHECKPOINT 42 -#define XFS_TRANS_ICREATE 43 -#define XFS_TRANS_CREATE_TMPFILE 44 -#define XFS_TRANS_TYPE_MAX 44 +#define XFS_TRANS_FSYNC_TS 35 +#define XFS_TRANS_GROWFSRT_ALLOC 36 +#define XFS_TRANS_GROWFSRT_ZERO 37 +#define XFS_TRANS_GROWFSRT_FREE 38 +#define XFS_TRANS_SWAPEXT 39 +#define XFS_TRANS_CHECKPOINT 40 +#define XFS_TRANS_ICREATE 41 +#define XFS_TRANS_CREATE_TMPFILE 42 +#define XFS_TRANS_TYPE_MAX 43 /* new transaction types need to be reflected in xfs_logprint(8) */ #define XFS_TRANS_TYPES \ @@ -113,7 +111,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ { XFS_TRANS_INACTIVE, "INACTIVE" }, \ { XFS_TRANS_CREATE, "CREATE" }, \ - { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ { XFS_TRANS_REMOVE, "REMOVE" }, \ @@ -134,23 +131,23 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops; { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ - { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ + { XFS_TRANS_SB_CHANGE, "SBCHANGE" }, \ + { XFS_TRANS_DUMMY1, "DUMMY1" }, \ + { XFS_TRANS_DUMMY2, "DUMMY2" }, \ { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ - { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ - { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ - { XFS_TRANS_DUMMY1, "DUMMY1" }, \ - { XFS_TRANS_DUMMY2, "DUMMY2" }, \ + { XFS_TRANS_ICREATE, "ICREATE" }, \ + { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } /* diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 5782f037eab4..e7e26bd6468f 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" -#include "xfs_ag.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_bmap_btree.h" #include "xfs_inode.h" @@ -180,6 +178,8 @@ xfs_symlink_local_to_remote( struct xfs_mount *mp = ip->i_mount; char *buf; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); + if (!xfs_sb_version_hascrc(&mp->m_sb)) { bp->b_ops = NULL; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index f2bda7c76b8a..68cb1e7bf2bb 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -22,8 +22,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -718,17 +716,6 @@ xfs_calc_clear_agi_bucket_reservation( } /* - * Clearing the quotaflags in the superblock. - * the super block for changing quota flags: sector size - */ -STATIC uint -xfs_calc_qm_sbchange_reservation( - struct xfs_mount *mp) -{ - return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); -} - -/* * Adjusting quota limits. * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) */ @@ -866,9 +853,6 @@ xfs_trans_resv_calc( * The following transactions are logged in logical format with * a default log count. */ - resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp); - resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT; - resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp); resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 1097d14cd583..2d5bdfce6d8f 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -56,7 +56,6 @@ struct xfs_trans_resv { struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */ struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */ struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ - struct xfs_trans_res tr_qm_sbchange; /* change quota flags */ struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ diff --git a/fs/xfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index b79dc66b2ecd..b79dc66b2ecd 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index a65fa5dde6e9..4b641676f258 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -19,8 +19,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_ag.h" -#include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_acl.h" diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 5dc163744511..3841b07f27bf 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -22,42 +22,6 @@ struct inode; struct posix_acl; struct xfs_inode; -#define XFS_ACL_NOT_PRESENT (-1) - -/* On-disk XFS access control list structure */ -struct xfs_acl_entry { - __be32 ae_tag; - __be32 ae_id; - __be16 ae_perm; - __be16 ae_pad; /* fill the implicit hole in the structure */ -}; - -struct xfs_acl { - __be32 acl_cnt; - struct xfs_acl_entry acl_entry[0]; -}; - -/* - * The number of ACL entries allowed is defined by the on-disk format. - * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is - * limited only by the maximum size of the xattr that stores the information. - */ -#define XFS_ACL_MAX_ENTRIES(mp) \ - (xfs_sb_version_hascrc(&mp->m_sb) \ - ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ - sizeof(struct xfs_acl_entry) \ - : 25) - -#define XFS_ACL_MAX_SIZE(mp) \ - (sizeof(struct xfs_acl) + \ - sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) - -/* On-disk XFS extended attribute names */ -#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" -#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" -#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) -#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) - #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index f5b2453a43b2..3a9b7a1b8704 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -33,7 +31,6 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> @@ -138,30 +135,22 @@ xfs_setfilesize_trans_alloc( */ STATIC int xfs_setfilesize( - struct xfs_ioend *ioend) + struct xfs_inode *ip, + struct xfs_trans *tp, + xfs_off_t offset, + size_t size) { - struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_trans *tp = ioend->io_append_trans; xfs_fsize_t isize; - /* - * The transaction may have been allocated in the I/O submission thread, - * thus we need to mark ourselves as beeing in a transaction manually. - * Similarly for freeze protection. - */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); - rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); - xfs_ilock(ip, XFS_ILOCK_EXCL); - isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); + isize = xfs_new_eof(ip, offset + size); if (!isize) { xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_trans_cancel(tp, 0); return 0; } - trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); + trace_xfs_setfilesize(ip, offset, size); ip->i_d.di_size = isize; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); @@ -170,6 +159,25 @@ xfs_setfilesize( return xfs_trans_commit(tp, 0); } +STATIC int +xfs_setfilesize_ioend( + struct xfs_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_trans *tp = ioend->io_append_trans; + + /* + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as being in a transaction manually. + * Similarly for freeze protection. + */ + current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + + return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); +} + /* * Schedule IO completion handling on the final put of an ioend. * @@ -185,8 +193,7 @@ xfs_finish_ioend( if (ioend->io_type == XFS_IO_UNWRITTEN) queue_work(mp->m_unwritten_workqueue, &ioend->io_work); - else if (ioend->io_append_trans || - (ioend->io_isdirect && xfs_ioend_is_append(ioend))) + else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); else xfs_destroy_ioend(ioend); @@ -218,22 +225,8 @@ xfs_end_io( if (ioend->io_type == XFS_IO_UNWRITTEN) { error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); - } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { - /* - * For direct I/O we do not know if we need to allocate blocks - * or not so we can't preallocate an append transaction as that - * results in nested reservations and log space deadlocks. Hence - * allocate the transaction here. While this is sub-optimal and - * can block IO completion for some time, we're stuck with doing - * it this way until we can pass the ioend to the direct IO - * allocation callbacks and avoid nesting that way. - */ - error = xfs_setfilesize_trans_alloc(ioend); - if (error) - goto done; - error = xfs_setfilesize(ioend); } else if (ioend->io_append_trans) { - error = xfs_setfilesize(ioend); + error = xfs_setfilesize_ioend(ioend); } else { ASSERT(!xfs_ioend_is_append(ioend)); } @@ -245,17 +238,6 @@ done: } /* - * Call IO completion handling in caller context on the final put of an ioend. - */ -STATIC void -xfs_finish_ioend_sync( - struct xfs_ioend *ioend) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) - xfs_end_io(&ioend->io_work); -} - -/* * Allocate and initialise an IO completion structure. * We need to track unwritten extent write completion here initially. * We'll need to extend this for updating the ondisk inode size later @@ -276,7 +258,6 @@ xfs_alloc_ioend( * all the I/O from calling the completion routine too early. */ atomic_set(&ioend->io_remaining, 1); - ioend->io_isdirect = 0; ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; @@ -1462,11 +1443,7 @@ xfs_get_blocks_direct( * * If the private argument is non-NULL __xfs_get_blocks signals us that we * need to issue a transaction to convert the range from unwritten to written - * extents. In case this is regular synchronous I/O we just call xfs_end_io - * to do this and we are done. But in case this was a successful AIO - * request this handler is called from interrupt context, from which we - * can't start transactions. In that case offload the I/O completion to - * the workqueues we also use for buffered I/O completion. + * extents. */ STATIC void xfs_end_io_direct_write( @@ -1475,7 +1452,12 @@ xfs_end_io_direct_write( ssize_t size, void *private) { - struct xfs_ioend *ioend = iocb->private; + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + if (XFS_FORCED_SHUTDOWN(mp)) + return; /* * While the generic direct I/O code updates the inode size, it does @@ -1483,22 +1465,33 @@ xfs_end_io_direct_write( * end_io handler thinks the on-disk size is outside the in-core * size. To prevent this just update it a little bit earlier here. */ - if (offset + size > i_size_read(ioend->io_inode)) - i_size_write(ioend->io_inode, offset + size); + if (offset + size > i_size_read(inode)) + i_size_write(inode, offset + size); /* - * blockdev_direct_IO can return an error even after the I/O - * completion handler was called. Thus we need to protect - * against double-freeing. + * For direct I/O we do not know if we need to allocate blocks or not, + * so we can't preallocate an append transaction, as that results in + * nested reservations and log space deadlocks. Hence allocate the + * transaction here. While this is sub-optimal and can block IO + * completion for some time, we're stuck with doing it this way until + * we can pass the ioend to the direct IO allocation callbacks and + * avoid nesting that way. */ - iocb->private = NULL; - - ioend->io_offset = offset; - ioend->io_size = size; - if (private && size > 0) - ioend->io_type = XFS_IO_UNWRITTEN; + if (private && size > 0) { + xfs_iomap_write_unwritten(ip, offset, size); + } else if (offset + size > ip->i_d.di_size) { + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return; + } - xfs_finish_ioend_sync(ioend); + xfs_setfilesize(ip, tp, offset, size); + } } STATIC ssize_t @@ -1510,39 +1503,16 @@ xfs_vm_direct_IO( { struct inode *inode = iocb->ki_filp->f_mapping->host; struct block_device *bdev = xfs_find_bdev_for_inode(inode); - struct xfs_ioend *ioend = NULL; - ssize_t ret; if (rw & WRITE) { - size_t size = iov_iter_count(iter); - - /* - * We cannot preallocate a size update transaction here as we - * don't know whether allocation is necessary or not. Hence we - * can only tell IO completion that one is necessary if we are - * not doing unwritten extent conversion. - */ - iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); - if (offset + size > XFS_I(inode)->i_d.di_size) - ioend->io_isdirect = 1; - - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + return __blockdev_direct_IO(rw, iocb, inode, bdev, iter, offset, xfs_get_blocks_direct, xfs_end_io_direct_write, NULL, DIO_ASYNC_EXTEND); - if (ret != -EIOCBQUEUED && iocb->private) - goto out_destroy_ioend; - } else { - ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, - offset, xfs_get_blocks_direct, - NULL, NULL, 0); } - - return ret; - -out_destroy_ioend: - xfs_destroy_ioend(ioend); - return ret; + return __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, + NULL, NULL, 0); } /* diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index f94dd459dff9..ac644e0137a4 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -24,14 +24,12 @@ extern mempool_t *xfs_ioend_pool; * Types of I/O for bmap clustering and I/O completion tracking. */ enum { - XFS_IO_DIRECT = 0, /* special case for direct I/O ioends */ XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ }; #define XFS_IO_TYPES \ - { 0, "" }, \ { XFS_IO_DELALLOC, "delalloc" }, \ { XFS_IO_UNWRITTEN, "unwritten" }, \ { XFS_IO_OVERWRITE, "overwrite" } @@ -45,7 +43,6 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ - unsigned int io_isdirect : 1;/* direct I/O */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index aa2a8b1838a2..83af4c149635 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -23,8 +23,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -39,7 +37,6 @@ #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trace.h" -#include "xfs_dinode.h" #include "xfs_dir2.h" /* diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 62db83ab6cbc..a43d370d2c58 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -39,7 +37,6 @@ #include "xfs_trace.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" -#include "xfs_dinode.h" #include "xfs_dir2.h" STATIC int diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 281002689d64..22a5dcb70b32 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -23,8 +23,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" @@ -42,7 +40,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" -#include "xfs_dinode.h" /* Kernel only BMAP related definitions and functions */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 2fdb72d2c908..736429a72a12 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -26,43 +26,8 @@ struct xfs_ifork; struct xfs_inode; struct xfs_mount; struct xfs_trans; +struct xfs_bmalloca; -/* - * Argument structure for xfs_bmap_alloc. - */ -struct xfs_bmalloca { - xfs_fsblock_t *firstblock; /* i/o first block allocated */ - struct xfs_bmap_free *flist; /* bmap freelist */ - struct xfs_trans *tp; /* transaction pointer */ - struct xfs_inode *ip; /* incore inode pointer */ - struct xfs_bmbt_irec prev; /* extent before the new one */ - struct xfs_bmbt_irec got; /* extent after, or delayed */ - - xfs_fileoff_t offset; /* offset in file filling in */ - xfs_extlen_t length; /* i/o length asked/allocated */ - xfs_fsblock_t blkno; /* starting block of new extent */ - - struct xfs_btree_cur *cur; /* btree cursor */ - xfs_extnum_t idx; /* current extent index */ - int nallocs;/* number of extents alloc'd */ - int logflags;/* flags for transaction logging */ - - xfs_extlen_t total; /* total blocks needed for xaction */ - xfs_extlen_t minlen; /* minimum allocation size (blocks) */ - xfs_extlen_t minleft; /* amount must be left after alloc */ - bool eof; /* set if allocating past last extent */ - bool wasdel; /* replacing a delayed allocation */ - bool userdata;/* set if is user data */ - bool aeof; /* allocated space at eof */ - bool conv; /* overwriting unwritten extents */ - int flags; - struct completion *done; - struct work_struct work; - int result; -}; - -int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, - int *committed); int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, int whichfork, int *eof); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 24b4ebea0d4d..1790b00bea7a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -34,18 +34,16 @@ #include <linux/backing-dev.h> #include <linux/freezer.h> +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trace.h" #include "xfs_log.h" static kmem_zone_t *xfs_buf_zone; -static struct workqueue_struct *xfslogd_workqueue; - #ifdef XFS_BUF_LOCK_TRACKING # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) # define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) @@ -463,7 +461,7 @@ _xfs_buf_find( * have to check that the buffer falls within the filesystem bounds. */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (blkno >= eofs) { + if (blkno < 0 || blkno >= eofs) { /* * XXX (dgc): we should really be returning -EFSCORRUPTED here, * but none of the higher level infrastructure supports @@ -1043,7 +1041,7 @@ xfs_buf_ioend_work( struct work_struct *work) { struct xfs_buf *bp = - container_of(work, xfs_buf_t, b_iodone_work); + container_of(work, xfs_buf_t, b_ioend_work); xfs_buf_ioend(bp); } @@ -1052,8 +1050,8 @@ void xfs_buf_ioend_async( struct xfs_buf *bp) { - INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work); - queue_work(xfslogd_workqueue, &bp->b_iodone_work); + INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); + queue_work(bp->b_ioend_wq, &bp->b_ioend_work); } void @@ -1222,6 +1220,13 @@ _xfs_buf_ioapply( */ bp->b_error = 0; + /* + * Initialize the I/O completion workqueue if we haven't yet or the + * submitter has not opted to specify a custom one. + */ + if (!bp->b_ioend_wq) + bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; + if (bp->b_flags & XBF_WRITE) { if (bp->b_flags & XBF_SYNCIO) rw = WRITE_SYNC; @@ -1483,6 +1488,7 @@ xfs_buf_iomove( static enum lru_status xfs_buftarg_wait_rele( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) @@ -1504,7 +1510,7 @@ xfs_buftarg_wait_rele( */ atomic_set(&bp->b_lru_ref, 0); bp->b_state |= XFS_BSTATE_DISPOSE; - list_move(item, dispose); + list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lock); return LRU_REMOVED; } @@ -1541,6 +1547,7 @@ xfs_wait_buftarg( static enum lru_status xfs_buftarg_isolate( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { @@ -1564,7 +1571,7 @@ xfs_buftarg_isolate( } bp->b_state |= XFS_BSTATE_DISPOSE; - list_move(item, dispose); + list_lru_isolate_move(lru, item, dispose); spin_unlock(&bp->b_lock); return LRU_REMOVED; } @@ -1578,10 +1585,9 @@ xfs_buftarg_shrink_scan( struct xfs_buftarg, bt_shrinker); LIST_HEAD(dispose); unsigned long freed; - unsigned long nr_to_scan = sc->nr_to_scan; - freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, - &dispose, &nr_to_scan); + freed = list_lru_shrink_walk(&btp->bt_lru, sc, + xfs_buftarg_isolate, &dispose); while (!list_empty(&dispose)) { struct xfs_buf *bp; @@ -1600,7 +1606,7 @@ xfs_buftarg_shrink_count( { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); - return list_lru_count_node(&btp->bt_lru, sc->nid); + return list_lru_shrink_count(&btp->bt_lru, sc); } void @@ -1882,15 +1888,8 @@ xfs_buf_init(void) if (!xfs_buf_zone) goto out; - xfslogd_workqueue = alloc_workqueue("xfslogd", - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1); - if (!xfslogd_workqueue) - goto out_free_buf_zone; - return 0; - out_free_buf_zone: - kmem_zone_destroy(xfs_buf_zone); out: return -ENOMEM; } @@ -1898,6 +1897,5 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - destroy_workqueue(xfslogd_workqueue); kmem_zone_destroy(xfs_buf_zone); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 82002c00af90..75ff5d5a7d2e 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -164,7 +164,8 @@ typedef struct xfs_buf { struct xfs_perag *b_pag; /* contains rbtree root */ xfs_buftarg_t *b_target; /* buffer target (device) */ void *b_addr; /* virtual address of buffer */ - struct work_struct b_iodone_work; + struct work_struct b_ioend_work; + struct workqueue_struct *b_ioend_wq; /* I/O completion wq */ xfs_buf_iodone_t b_iodone; /* I/O completion function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f15969543326..507d96a57ac7 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -17,11 +17,11 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_buf_item.h" @@ -319,6 +319,10 @@ xfs_buf_item_format( ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || (bip->bli_flags & XFS_BLI_STALE)); + ASSERT((bip->bli_flags & XFS_BLI_STALE) || + (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF + && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); + /* * If it is an inode buffer, transfer the in-memory state to the @@ -535,7 +539,7 @@ xfs_buf_item_push( if ((bp->b_flags & XBF_WRITE_FAIL) && ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { xfs_warn(bp->b_target->bt_mount, -"Detected failing async write on buffer block 0x%llx. Retrying async write.\n", +"Detected failing async write on buffer block 0x%llx. Retrying async write.", (long long)bp->b_bn); } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index f1b69edcdf31..098cd78fe708 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -34,7 +32,6 @@ #include "xfs_trace.h" #include "xfs_bmap.h" #include "xfs_trans.h" -#include "xfs_dinode.h" /* * Directory file type support functions @@ -44,7 +41,7 @@ static unsigned char xfs_dir3_filetype_table[] = { DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, }; -unsigned char +static unsigned char xfs_dir3_get_dtype( struct xfs_mount *mp, __uint8_t filetype) @@ -57,22 +54,6 @@ xfs_dir3_get_dtype( return xfs_dir3_filetype_table[filetype]; } -/* - * @mode, if set, indicates that the type field needs to be set up. - * This uses the transformation from file mode to DT_* as defined in linux/fs.h - * for file type specification. This will be propagated into the directory - * structure if appropriate for the given operation and filesystem config. - */ -const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { - [0] = XFS_DIR3_FT_UNKNOWN, - [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, - [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, - [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, -}; STATIC int xfs_dir2_sf_getdents( diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 13d08a1b390e..799e5a2d334d 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -20,7 +20,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_quota.h" #include "xfs_inode.h" diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 63c2de49f61d..02c01bbbc789 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -22,8 +22,6 @@ #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index c24c67e22a2a..2f536f33cd26 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -86,7 +86,7 @@ static inline void xfs_dqflock(xfs_dquot_t *dqp) wait_for_completion(&dqp->q_flush); } -static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp) +static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp) { return try_wait_for_completion(&dqp->q_flush); } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index f33fbaaa4d8a..814cff94e78f 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_quota.h" diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index b92fd7bc49e3..3ee186ac1093 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -20,8 +20,6 @@ #include "xfs_fs.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 5a6bd5d8779a..b97359ba2648 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -19,10 +19,9 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_export.h" #include "xfs_inode.h" @@ -31,6 +30,7 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" +#include "xfs_pnfs.h" /* * Note that we only accept fileids which are long enough rather than allow @@ -246,4 +246,9 @@ const struct export_operations xfs_export_operations = { .fh_to_parent = xfs_fs_fh_to_parent, .get_parent = xfs_fs_get_parent, .commit_metadata = xfs_fs_nfs_commit_metadata, +#ifdef CONFIG_NFSD_PNFS + .get_uuid = xfs_fs_get_uuid, + .map_blocks = xfs_fs_map_blocks, + .commit_blocks = xfs_fs_commit_blocks, +#endif }; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index fd22f69049d4..c263e079273e 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -24,7 +24,6 @@ #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index c4327419dc5c..cb7fe64cdbfa 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -17,10 +17,9 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index eb596b419942..a2e1cb8a568b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -37,8 +35,8 @@ #include "xfs_ioctl.h" #include "xfs_trace.h" #include "xfs_log.h" -#include "xfs_dinode.h" #include "xfs_icache.h" +#include "xfs_pnfs.h" #include <linux/aio.h> #include <linux/dcache.h> @@ -130,6 +128,42 @@ xfs_iozero( return (-status); } +int +xfs_update_prealloc_flags( + struct xfs_inode *ip, + enum xfs_prealloc_flags flags) +{ + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); + error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + if (!(flags & XFS_PREALLOC_INVISIBLE)) { + ip->i_d.di_mode &= ~S_ISUID; + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } + + if (flags & XFS_PREALLOC_SET) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + if (flags & XFS_PREALLOC_CLEAR) + ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + if (flags & XFS_PREALLOC_SYNC) + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp, 0); +} + /* * Fsync operations on directories are much simpler than on regular files, * as there is no file data to flush, and thus also no need for explicit @@ -363,7 +397,8 @@ STATIC int /* error (positive) */ xfs_zero_last_block( struct xfs_inode *ip, xfs_fsize_t offset, - xfs_fsize_t isize) + xfs_fsize_t isize, + bool *did_zeroing) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); @@ -391,6 +426,7 @@ xfs_zero_last_block( zero_len = mp->m_sb.sb_blocksize - zero_offset; if (isize + zero_len > offset) zero_len = offset - isize; + *did_zeroing = true; return xfs_iozero(ip, isize, zero_len); } @@ -409,7 +445,8 @@ int /* error (positive) */ xfs_zero_eof( struct xfs_inode *ip, xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize) /* current inode size */ + xfs_fsize_t isize, /* current inode size */ + bool *did_zeroing) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_zero_fsb; @@ -431,7 +468,7 @@ xfs_zero_eof( * We only zero a part of that block so it is handled specially. */ if (XFS_B_FSB_OFFSET(mp, isize) != 0) { - error = xfs_zero_last_block(ip, offset, isize); + error = xfs_zero_last_block(ip, offset, isize, did_zeroing); if (error) return error; } @@ -491,6 +528,7 @@ xfs_zero_eof( if (error) return error; + *did_zeroing = true; start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); } @@ -521,6 +559,10 @@ restart: if (error) return error; + error = xfs_break_layouts(inode, iolock); + if (error) + return error; + /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this @@ -529,13 +571,15 @@ restart: * having to redo all checks before. */ if (*pos > i_size_read(inode)) { + bool zero = false; + if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); goto restart; } - error = xfs_zero_eof(ip, *pos, i_size_read(inode)); + error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero); if (error) return error; } @@ -702,7 +746,7 @@ xfs_file_buffered_aio_write( iov_iter_truncate(from, count); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); @@ -787,8 +831,9 @@ xfs_file_fallocate( { struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); - struct xfs_trans *tp; long error; + enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; if (!S_ISREG(inode->i_mode)) @@ -797,7 +842,11 @@ xfs_file_fallocate( FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; - xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock); + if (error) + goto out_unlock; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -825,6 +874,8 @@ xfs_file_fallocate( if (error) goto out_unlock; } else { + flags |= XFS_PREALLOC_SET; + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; @@ -842,28 +893,10 @@ xfs_file_fallocate( goto out_unlock; } - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); - error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - goto out_unlock; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - ip->i_d.di_mode &= ~S_ISUID; - if (ip->i_d.di_mode & S_IXGRP) - ip->i_d.di_mode &= ~S_ISGID; - - if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE))) - ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; - - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (file->f_flags & O_DSYNC) - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + flags |= XFS_PREALLOC_SYNC; + + error = xfs_update_prealloc_flags(ip, flags); if (error) goto out_unlock; @@ -877,7 +910,7 @@ xfs_file_fallocate( } out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, iolock); return error; } @@ -933,7 +966,6 @@ xfs_file_readdir( { struct inode *inode = file_inode(file); xfs_inode_t *ip = XFS_I(inode); - int error; size_t bufsize; /* @@ -950,10 +982,7 @@ xfs_file_readdir( */ bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); - error = xfs_readdir(ip, ctx, bufsize); - if (error) - return error; - return 0; + return xfs_readdir(ip, ctx, bufsize); } STATIC int @@ -1391,5 +1420,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index e92730c1d3ca..a2e86e8a0fea 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -20,16 +20,13 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_ag.h" #include "xfs_sb.h" #include "xfs_mount.h" -#include "xfs_inum.h" #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_alloc.h" #include "xfs_mru_cache.h" -#include "xfs_dinode.h" #include "xfs_filestream.h" #include "xfs_trace.h" diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c05ac8b70fa9..74efe5b760dc 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -22,7 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -40,7 +39,6 @@ #include "xfs_rtalloc.h" #include "xfs_trace.h" #include "xfs_log.h" -#include "xfs_dinode.h" #include "xfs_filestream.h" /* @@ -490,6 +488,7 @@ xfs_growfs_data_private( xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); if (dpct) xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); if (error) return error; @@ -543,7 +542,7 @@ xfs_growfs_data_private( saved_error = error; continue; } - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); error = xfs_bwrite(bp); xfs_buf_relse(bp); @@ -603,6 +602,12 @@ xfs_growfs_data( if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; error = xfs_growfs_data_private(mp, in); + /* + * Increment the generation unconditionally, the error could be from + * updating the secondary superblocks, in which case the new size + * is live already. + */ + mp->m_generation++; mutex_unlock(&mp->m_growlock); return error; } @@ -758,37 +763,6 @@ out: return 0; } -/* - * Dump a transaction into the log that contains no real change. This is needed - * to be able to make the log dirty or stamp the current tail LSN into the log - * during the covering operation. - * - * We cannot use an inode here for this - that will push dirty state back up - * into the VFS and then periodic inode flushing will prevent log covering from - * making progress. Hence we log a field in the superblock instead and use a - * synchronous transaction to ensure the superblock is immediately unpinned - * and can be written back. - */ -int -xfs_fs_log_dummy( - xfs_mount_t *mp) -{ - xfs_trans_t *tp; - int error; - - tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - /* log the UUID because it is an unchanging field */ - xfs_mod_sb(tp, XFS_SB_UUID); - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); -} - int xfs_fs_goingdown( xfs_mount_t *mp, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index b45f7b27b5df..9771b7ef62ed 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -20,9 +20,7 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_error.h" @@ -65,6 +63,7 @@ xfs_inode_alloc( return NULL; } + XFS_STATS_INC(vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!xfs_isiflocked(ip)); @@ -130,6 +129,7 @@ xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!xfs_isiflocked(ip)); + XFS_STATS_DEC(vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 46748b86b12f..62f1f91c32cb 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -34,6 +34,14 @@ struct xfs_eofblocks { #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ /* + * tags for inode radix tree + */ +#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup + in xfs_inode_ag_iterator */ +#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ + +/* * Flags for xfs_iget() */ #define XFS_IGET_CREATE 0x1 diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 7e4549233251..d45ca72af6fb 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -18,11 +18,10 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8ed049d1e332..6163767aa856 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -23,9 +23,7 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_da_format.h" @@ -1082,7 +1080,7 @@ xfs_create( struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; struct xfs_dquot *pdqp = NULL; - struct xfs_trans_res tres; + struct xfs_trans_res *tres; uint resblks; trace_xfs_create(dp, name); @@ -1105,13 +1103,11 @@ xfs_create( if (is_dir) { rdev = 0; resblks = XFS_MKDIR_SPACE_RES(mp, name->len); - tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres; - tres.tr_logcount = XFS_MKDIR_LOG_COUNT; + tres = &M_RES(mp)->tr_mkdir; tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); } else { resblks = XFS_CREATE_SPACE_RES(mp, name->len); - tres.tr_logres = M_RES(mp)->tr_create.tr_logres; - tres.tr_logcount = XFS_CREATE_LOG_COUNT; + tres = &M_RES(mp)->tr_create; tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } @@ -1123,17 +1119,16 @@ xfs_create( * the case we'll drop the one we have and get a more * appropriate transaction later. */ - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(tp, &tres, resblks, 0); + error = xfs_trans_reserve(tp, tres, resblks, 0); if (error == -ENOSPC) { /* flush outstanding delalloc blocks and retry */ xfs_flush_inodes(mp); - error = xfs_trans_reserve(tp, &tres, resblks, 0); + error = xfs_trans_reserve(tp, tres, resblks, 0); } if (error == -ENOSPC) { /* No space at all so try a "no-allocation" reservation */ resblks = 0; - error = xfs_trans_reserve(tp, &tres, 0, 0); + error = xfs_trans_reserve(tp, tres, 0, 0); } if (error) { cancel_flags = 0; @@ -2000,6 +1995,7 @@ xfs_iunlink( agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); return 0; @@ -2091,6 +2087,7 @@ xfs_iunlink_remove( agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); + xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); } else { @@ -2488,9 +2485,7 @@ xfs_remove( xfs_fsblock_t first_block; int cancel_flags; int committed; - int link_zero; uint resblks; - uint log_count; trace_xfs_remove(dp, name); @@ -2505,13 +2500,10 @@ xfs_remove( if (error) goto std_return; - if (is_dir) { + if (is_dir) tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); - log_count = XFS_DEFAULT_LOG_COUNT; - } else { + else tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - log_count = XFS_REMOVE_LOG_COUNT; - } cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* @@ -2579,9 +2571,6 @@ xfs_remove( if (error) goto out_trans_cancel; - /* Determine if this is the last link while the inode is locked */ - link_zero = (ip->i_d.di_nlink == 0); - xfs_bmap_init(&free_list, &first_block); error = xfs_dir_removename(tp, dp, name, ip->i_ino, &first_block, &free_list, resblks); @@ -2669,6 +2658,124 @@ xfs_sort_for_rename( } /* + * xfs_cross_rename() + * + * responsible for handling RENAME_EXCHANGE flag in renameat2() sytemcall + */ +STATIC int +xfs_cross_rename( + struct xfs_trans *tp, + struct xfs_inode *dp1, + struct xfs_name *name1, + struct xfs_inode *ip1, + struct xfs_inode *dp2, + struct xfs_name *name2, + struct xfs_inode *ip2, + struct xfs_bmap_free *free_list, + xfs_fsblock_t *first_block, + int spaceres) +{ + int error = 0; + int ip1_flags = 0; + int ip2_flags = 0; + int dp2_flags = 0; + + /* Swap inode number for dirent in first parent */ + error = xfs_dir_replace(tp, dp1, name1, + ip2->i_ino, + first_block, free_list, spaceres); + if (error) + goto out; + + /* Swap inode number for dirent in second parent */ + error = xfs_dir_replace(tp, dp2, name2, + ip1->i_ino, + first_block, free_list, spaceres); + if (error) + goto out; + + /* + * If we're renaming one or more directories across different parents, + * update the respective ".." entries (and link counts) to match the new + * parents. + */ + if (dp1 != dp2) { + dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + + if (S_ISDIR(ip2->i_d.di_mode)) { + error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, + dp1->i_ino, first_block, + free_list, spaceres); + if (error) + goto out; + + /* transfer ip2 ".." reference to dp1 */ + if (!S_ISDIR(ip1->i_d.di_mode)) { + error = xfs_droplink(tp, dp2); + if (error) + goto out; + error = xfs_bumplink(tp, dp1); + if (error) + goto out; + } + + /* + * Although ip1 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + } + + if (S_ISDIR(ip1->i_d.di_mode)) { + error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, + dp2->i_ino, first_block, + free_list, spaceres); + if (error) + goto out; + + /* transfer ip1 ".." reference to dp2 */ + if (!S_ISDIR(ip2->i_d.di_mode)) { + error = xfs_droplink(tp, dp1); + if (error) + goto out; + error = xfs_bumplink(tp, dp2); + if (error) + goto out; + } + + /* + * Although ip2 isn't changed here, userspace needs + * to be warned about the change, so that applications + * relying on it (like backup ones), will properly + * notify the change + */ + ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + ip2_flags |= XFS_ICHGTIME_CHG; + } + } + + if (ip1_flags) { + xfs_trans_ichgtime(tp, ip1, ip1_flags); + xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); + } + if (ip2_flags) { + xfs_trans_ichgtime(tp, ip2, ip2_flags); + xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); + } + if (dp2_flags) { + xfs_trans_ichgtime(tp, dp2, dp2_flags); + xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); + } + xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); +out: + return error; +} + +/* * xfs_rename */ int @@ -2678,7 +2785,8 @@ xfs_rename( xfs_inode_t *src_ip, xfs_inode_t *target_dp, struct xfs_name *target_name, - xfs_inode_t *target_ip) + xfs_inode_t *target_ip, + unsigned int flags) { xfs_trans_t *tp = NULL; xfs_mount_t *mp = src_dp->i_mount; @@ -2756,6 +2864,22 @@ xfs_rename( } /* + * Handle RENAME_EXCHANGE flags + */ + if (flags & RENAME_EXCHANGE) { + if (target_ip == NULL) { + error = -EINVAL; + goto error_return; + } + error = xfs_cross_rename(tp, src_dp, src_name, src_ip, + target_dp, target_name, target_ip, + &free_list, &first_block, spaceres); + if (error) + goto abort_return; + goto finish_rename; + } + + /* * Set up the target. */ if (target_ip == NULL) { @@ -2894,6 +3018,7 @@ xfs_rename( if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); +finish_rename: /* * If this is a synchronous mount, make sure that the * rename transaction goes to disk before returning to diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 9af2882e1f4c..a1cd55f3f351 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -20,7 +20,6 @@ #include "xfs_inode_buf.h" #include "xfs_inode_fork.h" -#include "xfs_dinode.h" /* * Kernel only inode definitions @@ -324,7 +323,6 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ ((pip)->i_d.di_mode & S_ISGID)) - int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, @@ -340,7 +338,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, - struct xfs_inode *target_ip); + struct xfs_inode *target_ip, unsigned int flags); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); @@ -379,8 +377,18 @@ int xfs_droplink(struct xfs_trans *, struct xfs_inode *); int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); /* from xfs_file.c */ -int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); -int xfs_iozero(struct xfs_inode *, loff_t, size_t); +enum xfs_prealloc_flags { + XFS_PREALLOC_SET = (1 << 1), + XFS_PREALLOC_CLEAR = (1 << 2), + XFS_PREALLOC_SYNC = (1 << 3), + XFS_PREALLOC_INVISIBLE = (1 << 4), +}; + +int xfs_update_prealloc_flags(struct xfs_inode *ip, + enum xfs_prealloc_flags flags); +int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, + xfs_fsize_t isize, bool *did_zeroing); +int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); #define IHOLD(ip) \ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 63de0b0acc32..bf13a5a7e2f4 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -29,7 +27,6 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" -#include "xfs_dinode.h" #include "xfs_log.h" diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 24c926b6fe85..ac4feae45eb3 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_ioctl.h" @@ -40,8 +38,8 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_symlink.h" -#include "xfs_dinode.h" #include "xfs_trans.h" +#include "xfs_pnfs.h" #include <linux/capability.h> #include <linux/dcache.h> @@ -289,7 +287,7 @@ xfs_readlink_by_handle( return PTR_ERR(dentry); /* Restrict this handle operation to symlinks only. */ - if (!S_ISLNK(dentry->d_inode->i_mode)) { + if (!d_is_symlink(dentry)) { error = -EINVAL; goto out_dput; } @@ -609,11 +607,9 @@ xfs_ioc_space( unsigned int cmd, xfs_flock64_t *bf) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; struct iattr iattr; - bool setprealloc = false; - bool clrprealloc = false; + enum xfs_prealloc_flags flags = 0; + uint iolock = XFS_IOLOCK_EXCL; int error; /* @@ -633,11 +629,19 @@ xfs_ioc_space( if (!S_ISREG(inode->i_mode)) return -EINVAL; + if (filp->f_flags & O_DSYNC) + flags |= XFS_PREALLOC_SYNC; + if (ioflags & XFS_IO_INVIS) + flags |= XFS_PREALLOC_INVISIBLE; + error = mnt_want_write_file(filp); if (error) return error; - xfs_ilock(ip, XFS_IOLOCK_EXCL); + xfs_ilock(ip, iolock); + error = xfs_break_layouts(inode, &iolock); + if (error) + goto out_unlock; switch (bf->l_whence) { case 0: /*SEEK_SET*/ @@ -676,25 +680,23 @@ xfs_ioc_space( } if (bf->l_start < 0 || - bf->l_start > mp->m_super->s_maxbytes || + bf->l_start > inode->i_sb->s_maxbytes || bf->l_start + bf->l_len < 0 || - bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) { + bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) { error = -EINVAL; goto out_unlock; } switch (cmd) { case XFS_IOC_ZERO_RANGE: + flags |= XFS_PREALLOC_SET; error = xfs_zero_file_space(ip, bf->l_start, bf->l_len); - if (!error) - setprealloc = true; break; case XFS_IOC_RESVSP: case XFS_IOC_RESVSP64: + flags |= XFS_PREALLOC_SET; error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len, XFS_BMAPI_PREALLOC); - if (!error) - setprealloc = true; break; case XFS_IOC_UNRESVSP: case XFS_IOC_UNRESVSP64: @@ -704,6 +706,7 @@ xfs_ioc_space( case XFS_IOC_ALLOCSP64: case XFS_IOC_FREESP: case XFS_IOC_FREESP64: + flags |= XFS_PREALLOC_CLEAR; if (bf->l_start > XFS_ISIZE(ip)) { error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), bf->l_start - XFS_ISIZE(ip), 0); @@ -715,8 +718,6 @@ xfs_ioc_space( iattr.ia_size = bf->l_start; error = xfs_setattr_size(ip, &iattr); - if (!error) - clrprealloc = true; break; default: ASSERT(0); @@ -726,35 +727,10 @@ xfs_ioc_space( if (error) goto out_unlock; - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - goto out_unlock; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - if (!(ioflags & XFS_IO_INVIS)) { - ip->i_d.di_mode &= ~S_ISUID; - if (ip->i_d.di_mode & S_IXGRP) - ip->i_d.di_mode &= ~S_ISGID; - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } - - if (setprealloc) - ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; - else if (clrprealloc) - ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (filp->f_flags & O_DSYNC) - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_update_prealloc_flags(ip, flags); out_unlock: - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(ip, iolock); mnt_drop_write_file(filp); return error; } @@ -1016,20 +992,182 @@ xfs_diflags_to_linux( inode->i_flags &= ~S_NOATIME; } -#define FSX_PROJID 1 -#define FSX_EXTSIZE 2 -#define FSX_XFLAGS 4 -#define FSX_NONBLOCK 8 +static int +xfs_ioctl_setattr_xflags( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + /* Can't change realtime flag if any extents are allocated. */ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) + return -EINVAL; + + /* If realtime flag is set then must have realtime device */ + if (fa->fsx_xflags & XFS_XFLAG_REALTIME) { + if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || + (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) + return -EINVAL; + } + + /* + * Can't modify an immutable/append-only file unless + * we have appropriate permission. + */ + if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || + (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + xfs_set_diflags(ip, fa->fsx_xflags); + xfs_diflags_to_linux(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + XFS_STATS_INC(xs_ig_attrchg); + return 0; +} + +/* + * Set up the transaction structure for the setattr operation, checking that we + * have permission to do so. On success, return a clean transaction and the + * inode locked exclusively ready for further operation specific checks. On + * failure, return an error without modifying or locking the inode. + */ +static struct xfs_trans * +xfs_ioctl_setattr_get_trans( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return ERR_PTR(-EROFS); + if (XFS_FORCED_SHUTDOWN(mp)) + return ERR_PTR(-EIO); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) + goto out_cancel; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + /* + * CAP_FOWNER overrides the following restrictions: + * + * The user ID of the calling process must be equal to the file owner + * ID, except in cases where the CAP_FSETID capability is applicable. + */ + if (!inode_owner_or_capable(VFS_I(ip))) { + error = -EPERM; + goto out_cancel; + } + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + return tp; + +out_cancel: + xfs_trans_cancel(tp, 0); + return ERR_PTR(error); +} + +/* + * extent size hint validation is somewhat cumbersome. Rules are: + * + * 1. extent size hint is only valid for directories and regular files + * 2. XFS_XFLAG_EXTSIZE is only valid for regular files + * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories. + * 4. can only be changed on regular files if no extents are allocated + * 5. can be changed on directories at any time + * 6. extsize hint of 0 turns off hints, clears inode flags. + * 7. Extent size must be a multiple of the appropriate block size. + * 8. for non-realtime files, the extent size hint must be limited + * to half the AG size to avoid alignment extending the extent beyond the + * limits of the AG. + */ +static int +xfs_ioctl_setattr_check_extsize( + struct xfs_inode *ip, + struct fsxattr *fa) +{ + struct xfs_mount *mp = ip->i_mount; + + if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) + return -EINVAL; + + if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) && + !S_ISDIR(ip->i_d.di_mode)) + return -EINVAL; + + if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents && + ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize)) + return -EINVAL; + + if (fa->fsx_extsize != 0) { + xfs_extlen_t size; + xfs_fsblock_t extsize_fsb; + + extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); + if (extsize_fsb > MAXEXTLEN) + return -EINVAL; + + if (XFS_IS_REALTIME_INODE(ip) || + (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + } else { + size = mp->m_sb.sb_blocksize; + if (extsize_fsb > mp->m_sb.sb_agblocks / 2) + return -EINVAL; + } + + if (fa->fsx_extsize % size) + return -EINVAL; + } else + fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT); + + return 0; +} + +static int +xfs_ioctl_setattr_check_projid( + struct xfs_inode *ip, + struct fsxattr *fa) +{ + /* Disallow 32bit project ids if projid32bit feature is not enabled. */ + if (fa->fsx_projid > (__uint16_t)-1 && + !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) + return -EINVAL; + + /* + * Project Quota ID state is only allowed to change from within the init + * namespace. Enforce that restriction only if we are trying to change + * the quota ID state. Everything else is allowed in user namespaces. + */ + if (current_user_ns() == &init_user_ns) + return 0; + + if (xfs_get_projid(ip) != fa->fsx_projid) + return -EINVAL; + if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) != + (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) + return -EINVAL; + + return 0; +} STATIC int xfs_ioctl_setattr( xfs_inode_t *ip, - struct fsxattr *fa, - int mask) + struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - unsigned int lock_flags = 0; struct xfs_dquot *udqp = NULL; struct xfs_dquot *pdqp = NULL; struct xfs_dquot *olddquot = NULL; @@ -1037,17 +1175,9 @@ xfs_ioctl_setattr( trace_xfs_ioctl_setattr(ip); - if (mp->m_flags & XFS_MOUNT_RDONLY) - return -EROFS; - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - /* - * Disallow 32bit project ids when projid32bit feature is not enabled. - */ - if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && - !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) - return -EINVAL; + code = xfs_ioctl_setattr_check_projid(ip, fa); + if (code) + return code; /* * If disk quotas is on, we make sure that the dquots do exist on disk, @@ -1057,7 +1187,7 @@ xfs_ioctl_setattr( * If the IDs do change before we take the ilock, we're covered * because the i_*dquot fields will get updated anyway. */ - if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { + if (XFS_IS_QUOTA_ON(mp)) { code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, ip->i_d.di_gid, fa->fsx_projid, XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); @@ -1065,175 +1195,49 @@ xfs_ioctl_setattr( return code; } - /* - * For the other attributes, we acquire the inode lock and - * first do an error checking pass. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); - if (code) - goto error_return; - - lock_flags = XFS_ILOCK_EXCL; - xfs_ilock(ip, lock_flags); - - /* - * CAP_FOWNER overrides the following restrictions: - * - * The user ID of the calling process must be equal - * to the file owner ID, except in cases where the - * CAP_FSETID capability is applicable. - */ - if (!inode_owner_or_capable(VFS_I(ip))) { - code = -EPERM; - goto error_return; - } - - /* - * Do a quota reservation only if projid is actually going to change. - * Only allow changing of projid from init_user_ns since it is a - * non user namespace aware identifier. - */ - if (mask & FSX_PROJID) { - if (current_user_ns() != &init_user_ns) { - code = -EINVAL; - goto error_return; - } - - if (XFS_IS_QUOTA_RUNNING(mp) && - XFS_IS_PQUOTA_ON(mp) && - xfs_get_projid(ip) != fa->fsx_projid) { - ASSERT(tp); - code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, - pdqp, capable(CAP_FOWNER) ? - XFS_QMOPT_FORCE_RES : 0); - if (code) /* out of quota */ - goto error_return; - } + tp = xfs_ioctl_setattr_get_trans(ip); + if (IS_ERR(tp)) { + code = PTR_ERR(tp); + goto error_free_dquots; } - if (mask & FSX_EXTSIZE) { - /* - * Can't change extent size if any extents are allocated. - */ - if (ip->i_d.di_nextents && - ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != - fa->fsx_extsize)) { - code = -EINVAL; /* EFBIG? */ - goto error_return; - } - /* - * Extent size must be a multiple of the appropriate block - * size, if set at all. It must also be smaller than the - * maximum extent size supported by the filesystem. - * - * Also, for non-realtime files, limit the extent size hint to - * half the size of the AGs in the filesystem so alignment - * doesn't result in extents larger than an AG. - */ - if (fa->fsx_extsize != 0) { - xfs_extlen_t size; - xfs_fsblock_t extsize_fsb; - - extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); - if (extsize_fsb > MAXEXTLEN) { - code = -EINVAL; - goto error_return; - } - - if (XFS_IS_REALTIME_INODE(ip) || - ((mask & FSX_XFLAGS) && - (fa->fsx_xflags & XFS_XFLAG_REALTIME))) { - size = mp->m_sb.sb_rextsize << - mp->m_sb.sb_blocklog; - } else { - size = mp->m_sb.sb_blocksize; - if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { - code = -EINVAL; - goto error_return; - } - } - - if (fa->fsx_extsize % size) { - code = -EINVAL; - goto error_return; - } - } + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && + xfs_get_projid(ip) != fa->fsx_projid) { + code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp, + capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_trans_cancel; } + code = xfs_ioctl_setattr_check_extsize(ip, fa); + if (code) + goto error_trans_cancel; - if (mask & FSX_XFLAGS) { - /* - * Can't change realtime flag if any extents are allocated. - */ - if ((ip->i_d.di_nextents || ip->i_delayed_blks) && - (XFS_IS_REALTIME_INODE(ip)) != - (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { - code = -EINVAL; /* EFBIG? */ - goto error_return; - } - - /* - * If realtime flag is set then must have realtime data. - */ - if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) { - if ((mp->m_sb.sb_rblocks == 0) || - (mp->m_sb.sb_rextsize == 0) || - (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { - code = -EINVAL; - goto error_return; - } - } - - /* - * Can't modify an immutable/append-only file unless - * we have appropriate permission. - */ - if ((ip->i_d.di_flags & - (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || - (fa->fsx_xflags & - (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && - !capable(CAP_LINUX_IMMUTABLE)) { - code = -EPERM; - goto error_return; - } - } - - xfs_trans_ijoin(tp, ip, 0); + code = xfs_ioctl_setattr_xflags(tp, ip, fa); + if (code) + goto error_trans_cancel; /* - * Change file ownership. Must be the owner or privileged. + * Change file ownership. Must be the owner or privileged. CAP_FSETID + * overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be cleared upon + * successful return from chown() */ - if (mask & FSX_PROJID) { - /* - * CAP_FSETID overrides the following restrictions: - * - * The set-user-ID and set-group-ID bits of a file will be - * cleared upon successful return from chown() - */ - if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) - ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); - - /* - * Change the ownerships and register quota modifications - * in the transaction. - */ - if (xfs_get_projid(ip) != fa->fsx_projid) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { - olddquot = xfs_qm_vop_chown(tp, ip, - &ip->i_pdquot, pdqp); - } - ASSERT(ip->i_d.di_version > 1); - xfs_set_projid(ip, fa->fsx_projid); - } - } + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); - if (mask & FSX_XFLAGS) { - xfs_set_diflags(ip, fa->fsx_xflags); - xfs_diflags_to_linux(ip); + /* Change the ownerships and register project quota modifications */ + if (xfs_get_projid(ip) != fa->fsx_projid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { + olddquot = xfs_qm_vop_chown(tp, ip, + &ip->i_pdquot, pdqp); + } + ASSERT(ip->i_d.di_version > 1); + xfs_set_projid(ip, fa->fsx_projid); } /* @@ -1241,34 +1245,12 @@ xfs_ioctl_setattr( * extent size hint should be set on the inode. If no extent size flags * are set on the inode then unconditionally clear the extent size hint. */ - if (mask & FSX_EXTSIZE) { - int extsize = 0; - - if (ip->i_d.di_flags & - (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) - extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; - ip->i_d.di_extsize = extsize; - } - - xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - XFS_STATS_INC(xs_ig_attrchg); + if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) + ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; + else + ip->i_d.di_extsize = 0; - /* - * If this is a synchronous mount, make sure that the - * transaction goes to disk before returning to the user. - * This is slightly sub-optimal in that truncates require - * two sync transactions instead of one for wsync filesystems. - * One for the truncate and one for the timestamps since we - * don't want to change the timestamps unless we're sure the - * truncate worked. Truncates are less than 1% of the laddis - * mix so this probably isn't worth the trouble to optimize. - */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(tp); code = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, lock_flags); /* * Release any dquot(s) the inode had kept before chown. @@ -1279,12 +1261,11 @@ xfs_ioctl_setattr( return code; - error_return: +error_trans_cancel: + xfs_trans_cancel(tp, 0); +error_free_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); - xfs_trans_cancel(tp, 0); - if (lock_flags) - xfs_iunlock(ip, lock_flags); return code; } @@ -1295,20 +1276,15 @@ xfs_ioc_fssetxattr( void __user *arg) { struct fsxattr fa; - unsigned int mask; int error; if (copy_from_user(&fa, arg, sizeof(fa))) return -EFAULT; - mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID; - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - mask |= FSX_NONBLOCK; - error = mnt_want_write_file(filp); if (error) return error; - error = xfs_ioctl_setattr(ip, &fa, mask); + error = xfs_ioctl_setattr(ip, &fa); mnt_drop_write_file(filp); return error; } @@ -1328,14 +1304,14 @@ xfs_ioc_getxflags( STATIC int xfs_ioc_setxflags( - xfs_inode_t *ip, + struct xfs_inode *ip, struct file *filp, void __user *arg) { + struct xfs_trans *tp; struct fsxattr fa; unsigned int flags; - unsigned int mask; - int error; + int error; if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; @@ -1345,15 +1321,26 @@ xfs_ioc_setxflags( FS_SYNC_FL)) return -EOPNOTSUPP; - mask = FSX_XFLAGS; - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - mask |= FSX_NONBLOCK; fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); error = mnt_want_write_file(filp); if (error) return error; - error = xfs_ioctl_setattr(ip, &fa, mask); + + tp = xfs_ioctl_setattr_get_trans(ip); + if (IS_ERR(tp)) { + error = PTR_ERR(tp); + goto out_drop_write; + } + + error = xfs_ioctl_setattr_xflags(tp, ip, &fa); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_drop_write; + } + + error = xfs_trans_commit(tp, 0); +out_drop_write: mnt_drop_write_file(filp); return error; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 94ce027e28e3..bfc7c7c8a0c8 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -25,8 +25,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_itable.h" @@ -425,7 +423,7 @@ xfs_compat_attrmulti_by_handle( ops = memdup_user(compat_ptr(am_hreq.ops), size); if (IS_ERR(ops)) { - error = -PTR_ERR(ops); + error = PTR_ERR(ops); goto out_dput; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index afcf3c926565..ccb1dd0d509e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -38,7 +36,6 @@ #include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" -#include "xfs_dinode.h" #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ @@ -52,7 +49,6 @@ xfs_iomap_eof_align_last_fsb( xfs_extlen_t extsize, xfs_fileoff_t *last_fsb) { - xfs_fileoff_t new_last_fsb = 0; xfs_extlen_t align = 0; int eof, error; @@ -70,8 +66,8 @@ xfs_iomap_eof_align_last_fsb( else if (mp->m_dalign) align = mp->m_dalign; - if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) - new_last_fsb = roundup_64(*last_fsb, align); + if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align)) + align = 0; } /* @@ -79,14 +75,14 @@ xfs_iomap_eof_align_last_fsb( * (when file on a real-time subvolume or has di_extsize hint). */ if (extsize) { - if (new_last_fsb) - align = roundup_64(new_last_fsb, extsize); + if (align) + align = roundup_64(align, extsize); else align = extsize; - new_last_fsb = roundup_64(*last_fsb, align); } - if (new_last_fsb) { + if (align) { + xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align); error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); if (error) return error; @@ -264,7 +260,6 @@ xfs_iomap_eof_want_preallocate( { xfs_fileoff_t start_fsb; xfs_filblks_t count_fsb; - xfs_fsblock_t firstblock; int n, error, imaps; int found_delalloc = 0; @@ -289,7 +284,6 @@ xfs_iomap_eof_want_preallocate( count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); while (count_fsb > 0) { imaps = nimaps; - firstblock = NULLFSBLOCK; error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, 0); if (error) @@ -808,7 +802,7 @@ int xfs_iomap_write_unwritten( xfs_inode_t *ip, xfs_off_t offset, - size_t count) + xfs_off_t count) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 411fbb8919ef..8688e663d744 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -27,6 +27,6 @@ int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *); int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); -int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ec6dcdc181ee..e53a90331422 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" @@ -37,9 +35,9 @@ #include "xfs_icache.h" #include "xfs_symlink.h" #include "xfs_da_btree.h" -#include "xfs_dir2_priv.h" -#include "xfs_dinode.h" +#include "xfs_dir2.h" #include "xfs_trans_space.h" +#include "xfs_pnfs.h" #include <linux/capability.h> #include <linux/xattr.h> @@ -383,18 +381,27 @@ xfs_vn_rename( struct inode *odir, struct dentry *odentry, struct inode *ndir, - struct dentry *ndentry) + struct dentry *ndentry, + unsigned int flags) { struct inode *new_inode = ndentry->d_inode; + int omode = 0; struct xfs_name oname; struct xfs_name nname; - xfs_dentry_to_name(&oname, odentry, 0); + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + /* if we are exchanging files, we need to set i_mode of both files */ + if (flags & RENAME_EXCHANGE) + omode = ndentry->d_inode->i_mode; + + xfs_dentry_to_name(&oname, odentry, omode); xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), - XFS_I(ndir), &nname, new_inode ? - XFS_I(new_inode) : NULL); + XFS_I(ndir), &nname, + new_inode ? XFS_I(new_inode) : NULL, flags); } /* @@ -499,7 +506,7 @@ xfs_setattr_mode( inode->i_mode |= mode & ~S_IFMT; } -static void +void xfs_setattr_time( struct xfs_inode *ip, struct iattr *iattr) @@ -744,6 +751,7 @@ xfs_setattr_size( int error; uint lock_flags = 0; uint commit_flags = 0; + bool did_zeroing = false; trace_xfs_setattr(ip); @@ -787,20 +795,16 @@ xfs_setattr_size( return error; /* - * Now we can make the changes. Before we join the inode to the - * transaction, take care of the part of the truncation that must be - * done without the inode lock. This needs to be done before joining - * the inode to the transaction, because the inode cannot be unlocked - * once it is a part of the transaction. + * File data changes must be complete before we start the transaction to + * modify the inode. This needs to be done before joining the inode to + * the transaction because the inode cannot be unlocked once it is a + * part of the transaction. + * + * Start with zeroing any data block beyond EOF that we may expose on + * file extension. */ if (newsize > oldsize) { - /* - * Do the first part of growing a file: zero any data in the - * last block that is beyond the old EOF. We need to do this - * before the inode is joined to the transaction to modify - * i_size. - */ - error = xfs_zero_eof(ip, newsize, oldsize); + error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); if (error) return error; } @@ -810,23 +814,18 @@ xfs_setattr_size( * any previous writes that are beyond the on disk EOF and the new * EOF that have not been written out need to be written here. If we * do not write the data out, we expose ourselves to the null files - * problem. - * - * Only flush from the on disk size to the smaller of the in memory - * file size or the new size as that's the range we really care about - * here and prevents waiting for other data not within the range we - * care about here. + * problem. Note that this includes any block zeroing we did above; + * otherwise those blocks may not be zeroed after a crash. */ - if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { + if (newsize > ip->i_d.di_size && + (oldsize != ip->i_d.di_size || did_zeroing)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) return error; } - /* - * Wait for all direct I/O to complete. - */ + /* Now wait for all direct I/O to complete. */ inode_dio_wait(inode); /* @@ -973,9 +972,13 @@ xfs_vn_setattr( int error; if (iattr->ia_valid & ATTR_SIZE) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - error = xfs_setattr_size(ip, iattr); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); + uint iolock = XFS_IOLOCK_EXCL; + + xfs_ilock(ip, iolock); + error = xfs_break_layouts(dentry->d_inode, &iolock); + if (!error) + error = xfs_setattr_size(ip, iattr); + xfs_iunlock(ip, iolock); } else { error = xfs_setattr_nonsize(ip, iattr, 0); } @@ -1147,7 +1150,7 @@ static const struct inode_operations xfs_dir_inode_operations = { */ .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, - .rename = xfs_vn_rename, + .rename2 = xfs_vn_rename, .get_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, @@ -1175,7 +1178,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { */ .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, - .rename = xfs_vn_rename, + .rename2 = xfs_vn_rename, .get_acl = xfs_get_acl, .set_acl = xfs_set_acl, .getattr = xfs_vn_getattr, diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 1c34e4335920..ea7a98e9cb70 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -32,6 +32,7 @@ extern void xfs_setup_inode(struct xfs_inode *); */ #define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ +extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, int flags); extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 894924a5129b..82e314258f73 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -21,9 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" @@ -33,7 +30,6 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_dinode.h" STATIC int xfs_internal_inum( @@ -352,7 +348,6 @@ xfs_bulkstat( int *done) /* 1 if there are more stats to get */ { xfs_buf_t *agbp; /* agi header buffer */ - xfs_agi_t *agi; /* agi header data */ xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ @@ -403,7 +398,6 @@ xfs_bulkstat( error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); if (error) break; - agi = XFS_BUF_TO_AGI(agbp); /* * Allocate and initialize a btree cursor for ialloc btree. */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 6a51619d8690..c31d2c2eadc4 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -384,4 +384,10 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #endif /* XFS_WARN */ #endif /* DEBUG */ +#ifdef CONFIG_XFS_RT +#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +#else +#define XFS_IS_REALTIME_INODE(ip) (0) +#endif + #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index fe88ef67f93a..bcc7cfabb787 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_trans.h" @@ -35,6 +33,7 @@ #include "xfs_fsops.h" #include "xfs_cksum.h" #include "xfs_sysfs.h" +#include "xfs_sb.h" kmem_zone_t *xfs_log_ticket_zone; @@ -1031,7 +1030,7 @@ xfs_log_need_covered(xfs_mount_t *mp) struct xlog *log = mp->m_log; int needed = 0; - if (!xfs_fs_writable(mp)) + if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) return 0; if (!xlog_cil_empty(log)) @@ -1292,9 +1291,20 @@ xfs_log_worker( struct xfs_mount *mp = log->l_mp; /* dgc: errors ignored - not fatal and nowhere to report them */ - if (xfs_log_need_covered(mp)) - xfs_fs_log_dummy(mp); - else + if (xfs_log_need_covered(mp)) { + /* + * Dump a transaction into the log that contains no real change. + * This is needed to stamp the current tail LSN into the log + * during the covering operation. + * + * We cannot use an inode here for this - that will push dirty + * state back up into the VFS and then periodic inode flushing + * will prevent log covering from making progress. Hence we + * synchronously log the superblock instead to ensure the + * superblock is immediately unpinned and can be written back. + */ + xfs_sync_sb(mp, true); + } else xfs_log_force(mp, 0); /* start pushing all the metadata that is currently dirty */ @@ -1397,6 +1407,8 @@ xlog_alloc_log( ASSERT(xfs_buf_islocked(bp)); xfs_buf_unlock(bp); + /* use high priority wq for log I/O completion */ + bp->b_ioend_wq = mp->m_log_workqueue; bp->b_iodone = xlog_iodone; log->l_xbuf = bp; @@ -1429,6 +1441,8 @@ xlog_alloc_log( ASSERT(xfs_buf_islocked(bp)); xfs_buf_unlock(bp); + /* use high priority wq for log I/O completion */ + bp->b_ioend_wq = mp->m_log_workqueue; bp->b_iodone = xlog_iodone; iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; @@ -2025,7 +2039,7 @@ xlog_print_tic_res( " total reg = %u bytes (o/flow = %u bytes)\n" " ophdrs = %u (ophdr space = %u bytes)\n" " ophdr + reg = %u bytes\n" - " num regions = %u\n", + " num regions = %u", ((ticket->t_trans_type <= 0 || ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f506c457011e..45cc0ce18adf 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -17,11 +17,10 @@ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_error.h" #include "xfs_alloc.h" diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 00cd7f3a8f59..a5a945fc3bdc 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -22,11 +22,10 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_log.h" @@ -42,7 +41,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_bmap_btree.h" -#include "xfs_dinode.h" #include "xfs_error.h" #include "xfs_dir2.h" diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 63ca2f0420b1..d8b67547ab34 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -17,10 +17,9 @@ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 51435dbce9c4..4fa80e63eea2 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -22,11 +22,10 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" #include "xfs_dir2.h" #include "xfs_ialloc.h" @@ -41,7 +40,6 @@ #include "xfs_fsops.h" #include "xfs_trace.h" #include "xfs_icache.h" -#include "xfs_dinode.h" #include "xfs_sysfs.h" @@ -410,11 +408,11 @@ xfs_update_alignment(xfs_mount_t *mp) if (xfs_sb_version_hasdalign(sbp)) { if (sbp->sb_unit != mp->m_dalign) { sbp->sb_unit = mp->m_dalign; - mp->m_update_flags |= XFS_SB_UNIT; + mp->m_update_sb = true; } if (sbp->sb_width != mp->m_swidth) { sbp->sb_width = mp->m_swidth; - mp->m_update_flags |= XFS_SB_WIDTH; + mp->m_update_sb = true; } } else { xfs_warn(mp, @@ -585,38 +583,19 @@ int xfs_mount_reset_sbqflags( struct xfs_mount *mp) { - int error; - struct xfs_trans *tp; - mp->m_qflags = 0; - /* - * It is OK to look at sb_qflags here in mount path, - * without m_sb_lock. - */ + /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */ if (mp->m_sb.sb_qflags == 0) return 0; spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags = 0; spin_unlock(&mp->m_sb_lock); - /* - * If the fs is readonly, let the incore superblock run - * with quotas off but don't flush the update out to disk - */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (!xfs_fs_writable(mp, SB_FREEZE_WRITE)) return 0; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - xfs_alert(mp, "%s: Superblock update failed!", __func__); - return error; - } - - xfs_mod_sb(tp, XFS_SB_QFLAGS); - return xfs_trans_commit(tp, 0); + return xfs_sync_sb(mp, false); } __uint64_t @@ -661,26 +640,25 @@ xfs_mountfs( xfs_sb_mount_common(mp, sbp); /* - * Check for a mismatched features2 values. Older kernels - * read & wrote into the wrong sb offset for sb_features2 - * on some platforms due to xfs_sb_t not being 64bit size aligned - * when sb_features2 was added, which made older superblock - * reading/writing routines swap it as a 64-bit value. + * Check for a mismatched features2 values. Older kernels read & wrote + * into the wrong sb offset for sb_features2 on some platforms due to + * xfs_sb_t not being 64bit size aligned when sb_features2 was added, + * which made older superblock reading/writing routines swap it as a + * 64-bit value. * * For backwards compatibility, we make both slots equal. * - * If we detect a mismatched field, we OR the set bits into the - * existing features2 field in case it has already been modified; we - * don't want to lose any features. We then update the bad location - * with the ORed value so that older kernels will see any features2 - * flags, and mark the two fields as needing updates once the - * transaction subsystem is online. + * If we detect a mismatched field, we OR the set bits into the existing + * features2 field in case it has already been modified; we don't want + * to lose any features. We then update the bad location with the ORed + * value so that older kernels will see any features2 flags. The + * superblock writeback code ensures the new sb_features2 is copied to + * sb_bad_features2 before it is logged or written to disk. */ if (xfs_sb_has_mismatched_features2(sbp)) { xfs_warn(mp, "correcting sb_features alignment problem"); sbp->sb_features2 |= sbp->sb_bad_features2; - sbp->sb_bad_features2 = sbp->sb_features2; - mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; + mp->m_update_sb = true; /* * Re-check for ATTR2 in case it was found in bad_features2 @@ -694,17 +672,17 @@ xfs_mountfs( if (xfs_sb_version_hasattr2(&mp->m_sb) && (mp->m_flags & XFS_MOUNT_NOATTR2)) { xfs_sb_version_removeattr2(&mp->m_sb); - mp->m_update_flags |= XFS_SB_FEATURES2; + mp->m_update_sb = true; /* update sb_versionnum for the clearing of the morebits */ if (!sbp->sb_features2) - mp->m_update_flags |= XFS_SB_VERSIONNUM; + mp->m_update_sb = true; } /* always use v2 inodes by default now */ if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; - mp->m_update_flags |= XFS_SB_VERSIONNUM; + mp->m_update_sb = true; } /* @@ -897,8 +875,8 @@ xfs_mountfs( * the next remount into writeable mode. Otherwise we would never * perform the update e.g. for the root filesystem. */ - if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { - error = xfs_mount_log_sb(mp, mp->m_update_flags); + if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + error = xfs_sync_sb(mp, false); if (error) { xfs_warn(mp, "failed to write sb changes"); goto out_rtunmount; @@ -1074,11 +1052,23 @@ xfs_unmountfs( xfs_sysfs_del(&mp->m_kobj); } -int -xfs_fs_writable(xfs_mount_t *mp) +/* + * Determine whether modifications can proceed. The caller specifies the minimum + * freeze level for which modifications should not be allowed. This allows + * certain operations to proceed while the freeze sequence is in progress, if + * necessary. + */ +bool +xfs_fs_writable( + struct xfs_mount *mp, + int level) { - return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || - (mp->m_flags & XFS_MOUNT_RDONLY)); + ASSERT(level > SB_UNFROZEN); + if ((mp->m_super->s_writers.frozen >= level) || + XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)) + return false; + + return true; } /* @@ -1086,17 +1076,15 @@ xfs_fs_writable(xfs_mount_t *mp) * * Sync the superblock counters to disk. * - * Note this code can be called during the process of freezing, so - * we may need to use the transaction allocator which does not - * block when the transaction subsystem is in its frozen state. + * Note this code can be called during the process of freezing, so we use the + * transaction allocator that does not block when the transaction subsystem is + * in its frozen state. */ int xfs_log_sbcount(xfs_mount_t *mp) { - xfs_trans_t *tp; - int error; - - if (!xfs_fs_writable(mp)) + /* allow this to proceed during the freeze sequence... */ + if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) return 0; xfs_icsb_sync_counters(mp, 0); @@ -1108,17 +1096,7 @@ xfs_log_sbcount(xfs_mount_t *mp) if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) return 0; - tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - return error; + return xfs_sync_sb(mp, true); } /* @@ -1412,34 +1390,6 @@ xfs_freesb( } /* - * Used to log changes to the superblock unit and width fields which could - * be altered by the mount options, as well as any potential sb_features2 - * fixup. Only the first superblock is updated. - */ -int -xfs_mount_log_sb( - xfs_mount_t *mp, - __int64_t fields) -{ - xfs_trans_t *tp; - int error; - - ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | - XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 | - XFS_SB_VERSIONNUM)); - - tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - xfs_mod_sb(tp, fields); - error = xfs_trans_commit(tp, 0); - return error; -} - -/* * If the underlying (data/log/rt) device is readonly, there are some * operations that cannot proceed. */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b0447c86e7e2..0d8abd6364d9 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -162,18 +162,29 @@ typedef struct xfs_mount { struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks trimming */ - __int64_t m_update_flags; /* sb flags we need to update - on the next remount,rw */ + bool m_update_sb; /* sb needs update in mount */ int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ struct xfs_kobj m_kobj; + struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; struct workqueue_struct *m_eofblocks_workqueue; + + /* + * Generation of the filesysyem layout. This is incremented by each + * growfs, and used by the pNFS server to ensure the client updates + * its view of the block device once it gets a layout that might + * reference the newly added blocks. Does not need to be persistent + * as long as we only allow file system size increments, but if we + * ever support shrinks it would have to be persisted in addition + * to various other kinds of pain inflicted on the pNFS server. + */ + __uint32_t m_generation; } xfs_mount_t; /* @@ -320,10 +331,7 @@ typedef struct xfs_mod_sb { /* * Per-ag incore structure, copies of information in agf and agi, to improve the - * performance of allocation group selection. This is defined for the kernel - * only, and hence is defined here instead of in xfs_ag.h. You need the struct - * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree), - * so this doesn't introduce any strange header file dependencies. + * performance of allocation group selection. */ typedef struct xfs_perag { struct xfs_mount *pag_mount; /* owner filesystem */ @@ -380,11 +388,11 @@ extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); -extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); +extern int xfs_mount_log_sb(xfs_mount_t *); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); -extern int xfs_fs_writable(xfs_mount_t *); +extern bool xfs_fs_writable(struct xfs_mount *mp, int level); extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c new file mode 100644 index 000000000000..365dd57ea760 --- /dev/null +++ b/fs/xfs/xfs_pnfs.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_iomap.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_pnfs.h" + +/* + * Ensure that we do not have any outstanding pNFS layouts that can be used by + * clients to directly read from or write to this inode. This must be called + * before every operation that can remove blocks from the extent map. + * Additionally we call it during the write operation, where aren't concerned + * about exposing unallocated blocks but just want to provide basic + * synchronization between a local writer and pNFS clients. mmap writes would + * also benefit from this sort of synchronization, but due to the tricky locking + * rules in the page fault path we don't bother. + */ +int +xfs_break_layouts( + struct inode *inode, + uint *iolock) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); + + while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { + xfs_iunlock(ip, *iolock); + error = break_layout(inode, true); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + } + + return error; +} + +/* + * Get a unique ID including its location so that the client can identify + * the exported device. + */ +int +xfs_fs_get_uuid( + struct super_block *sb, + u8 *buf, + u32 *len, + u64 *offset) +{ + struct xfs_mount *mp = XFS_M(sb); + + printk_once(KERN_NOTICE +"XFS (%s): using experimental pNFS feature, use at your own risk!\n", + mp->m_fsname); + + if (*len < sizeof(uuid_t)) + return -EINVAL; + + memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + *len = sizeof(uuid_t); + *offset = offsetof(struct xfs_dsb, sb_uuid); + return 0; +} + +static void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = + XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); +} + +/* + * Get a layout for the pNFS client. + */ +int +xfs_fs_map_blocks( + struct inode *inode, + loff_t offset, + u64 length, + struct iomap *iomap, + bool write, + u32 *device_generation) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + loff_t limit; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + uint lock_flags; + int error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * We can't export inodes residing on the realtime device. The realtime + * device doesn't have a UUID to identify it, so the client has no way + * to find it. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return -ENXIO; + + /* + * Lock out any other I/O before we flush and invalidate the pagecache, + * and then hand out a layout to the remote system. This is very + * similar to direct I/O, except that the synchronization is much more + * complicated. See the comment near xfs_break_layouts for a detailed + * explanation. + */ + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + error = -EINVAL; + limit = mp->m_super->s_maxbytes; + if (!write) + limit = max(limit, round_up(i_size_read(inode), + inode->i_sb->s_blocksize)); + if (offset > limit) + goto out_unlock; + if (offset > limit - length) + length = limit - offset; + + error = filemap_write_and_wait(inode->i_mapping); + if (error) + goto out_unlock; + error = invalidate_inode_pages2(inode->i_mapping); + if (WARN_ON_ONCE(error)) + return error; + + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + lock_flags = xfs_ilock_data_map_shared(ip); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, bmapi_flags); + xfs_iunlock(ip, lock_flags); + + if (error) + goto out_unlock; + + if (write) { + enum xfs_prealloc_flags flags = 0; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { + error = xfs_iomap_write_direct(ip, offset, length, + &imap, nimaps); + if (error) + goto out_unlock; + + /* + * Ensure the next transaction is committed + * synchronously so that the blocks allocated and + * handed out to the client are guaranteed to be + * present even after a server crash. + */ + flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC; + } + + error = xfs_update_prealloc_flags(ip, flags); + if (error) + goto out_unlock; + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + + xfs_bmbt_to_iomap(ip, iomap, &imap); + *device_generation = mp->m_generation; + return error; +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} + +/* + * Ensure the size update falls into a valid allocated block. + */ +static int +xfs_pnfs_validate_isize( + struct xfs_inode *ip, + xfs_off_t isize) +{ + struct xfs_bmbt_irec imap; + int nimaps = 1; + int error = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1, + &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (error) + return error; + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) + return -EIO; + return 0; +} + +/* + * Make sure the blocks described by maps are stable on disk. This includes + * converting any unwritten extents, flushing the disk cache and updating the + * time stamps. + * + * Note that we rely on the caller to always send us a timestamp update so that + * we always commit a transaction here. If that stops being true we will have + * to manually flush the cache here similar to what the fsync code path does + * for datasyncs on files that have no dirty metadata. + */ +int +xfs_fs_commit_blocks( + struct inode *inode, + struct iomap *maps, + int nr_maps, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + bool update_isize = false; + int error, i; + loff_t size; + + ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)); + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + size = i_size_read(inode); + if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) { + update_isize = true; + size = iattr->ia_size; + } + + for (i = 0; i < nr_maps; i++) { + u64 start, length, end; + + start = maps[i].offset; + if (start > size) + continue; + + end = start + maps[i].length; + if (end > size) + end = size; + + length = end - start; + if (!length) + continue; + + /* + * Make sure reads through the pagecache see the new data. + */ + error = invalidate_inode_pages2_range(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + (end - 1) >> PAGE_CACHE_SHIFT); + WARN_ON_ONCE(error); + + error = xfs_iomap_write_unwritten(ip, start, length); + if (error) + goto out_drop_iolock; + } + + if (update_isize) { + error = xfs_pnfs_validate_isize(ip, size); + if (error) + goto out_drop_iolock; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_drop_iolock; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + xfs_setattr_time(ip, iattr); + if (update_isize) { + i_size_write(inode, iattr->ia_size); + ip->i_d.di_size = iattr->ia_size; + } + + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +out_drop_iolock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; +} diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h new file mode 100644 index 000000000000..b7fbfce660f6 --- /dev/null +++ b/fs/xfs/xfs_pnfs.h @@ -0,0 +1,18 @@ +#ifndef _XFS_PNFS_H +#define _XFS_PNFS_H 1 + +#ifdef CONFIG_NFSD_PNFS +int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); +int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, + struct iomap *iomap, bool write, u32 *device_generation); +int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, + struct iattr *iattr); + +int xfs_break_layouts(struct inode *inode, uint *iolock); +#else +static inline int xfs_break_layouts(struct inode *inode, uint *iolock) +{ + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _XFS_PNFS_H */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index d68f23021af3..fbbb9e62e274 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -23,7 +23,6 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_ialloc.h" @@ -38,7 +37,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_cksum.h" -#include "xfs_dinode.h" /* * The global quota manager. There is only one of these for the entire @@ -432,6 +430,7 @@ struct xfs_qm_isolate { static enum lru_status xfs_qm_dquot_isolate( struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) __releases(lru_lock) __acquires(lru_lock) @@ -452,7 +451,7 @@ xfs_qm_dquot_isolate( XFS_STATS_INC(xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); - list_del_init(&dqp->q_lru); + list_lru_isolate(lru, &dqp->q_lru); XFS_STATS_DEC(xs_qm_dquot_unused); return LRU_REMOVED; } @@ -496,7 +495,7 @@ xfs_qm_dquot_isolate( xfs_dqunlock(dqp); ASSERT(dqp->q_nrefs == 0); - list_move_tail(&dqp->q_lru, &isol->dispose); + list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); XFS_STATS_DEC(xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); XFS_STATS_INC(xs_qm_dqreclaims); @@ -525,7 +524,6 @@ xfs_qm_shrink_scan( struct xfs_qm_isolate isol; unsigned long freed; int error; - unsigned long nr_to_scan = sc->nr_to_scan; if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) return 0; @@ -533,8 +531,8 @@ xfs_qm_shrink_scan( INIT_LIST_HEAD(&isol.buffers); INIT_LIST_HEAD(&isol.dispose); - freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, - &nr_to_scan); + freed = list_lru_shrink_walk(&qi->qi_lru, sc, + xfs_qm_dquot_isolate, &isol); error = xfs_buf_delwri_submit(&isol.buffers); if (error) @@ -559,7 +557,7 @@ xfs_qm_shrink_count( struct xfs_quotainfo *qi = container_of(shrink, struct xfs_quotainfo, qi_shrinker); - return list_lru_count_node(&qi->qi_lru, sc->nid); + return list_lru_shrink_count(&qi->qi_lru, sc); } /* @@ -716,7 +714,6 @@ STATIC int xfs_qm_qino_alloc( xfs_mount_t *mp, xfs_inode_t **ip, - __int64_t sbfields, uint flags) { xfs_trans_t *tp; @@ -779,11 +776,6 @@ xfs_qm_qino_alloc( spin_lock(&mp->m_sb_lock); if (flags & XFS_QMOPT_SBVERSION) { ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); - ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) == - (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | - XFS_SB_QFLAGS)); xfs_sb_version_addquota(&mp->m_sb); mp->m_sb.sb_uquotino = NULLFSINO; @@ -800,7 +792,7 @@ xfs_qm_qino_alloc( else mp->m_sb.sb_pquotino = (*ip)->i_ino; spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, sbfields); + xfs_log_sb(tp); if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { xfs_alert(mp, "%s failed (error %d)!", __func__, error); @@ -844,6 +836,11 @@ xfs_qm_reset_dqcounts( */ xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, "xfs_quotacheck"); + /* + * Reset type in case we are reusing group quota file for + * project quotas or vice versa + */ + ddq->d_flags = type; ddq->d_bcount = 0; ddq->d_icount = 0; ddq->d_rtbcount = 0; @@ -1453,7 +1450,7 @@ xfs_qm_mount_quotas( spin_unlock(&mp->m_sb_lock); if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { - if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { + if (xfs_sync_sb(mp, false)) { /* * We could only have been turning quotas off. * We aren't in very good shape actually because @@ -1484,7 +1481,6 @@ xfs_qm_init_quotainos( struct xfs_inode *gip = NULL; struct xfs_inode *pip = NULL; int error; - __int64_t sbflags = 0; uint flags = 0; ASSERT(mp->m_quotainfo); @@ -1519,9 +1515,6 @@ xfs_qm_init_quotainos( } } else { flags |= XFS_QMOPT_SBVERSION; - sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | - XFS_SB_QFLAGS); } /* @@ -1532,7 +1525,6 @@ xfs_qm_init_quotainos( */ if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { error = xfs_qm_qino_alloc(mp, &uip, - sbflags | XFS_SB_UQUOTINO, flags | XFS_QMOPT_UQUOTA); if (error) goto error_rele; @@ -1541,7 +1533,6 @@ xfs_qm_init_quotainos( } if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) { error = xfs_qm_qino_alloc(mp, &gip, - sbflags | XFS_SB_GQUOTINO, flags | XFS_QMOPT_GQUOTA); if (error) goto error_rele; @@ -1550,7 +1541,6 @@ xfs_qm_init_quotainos( } if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { error = xfs_qm_qino_alloc(mp, &pip, - sbflags | XFS_SB_PQUOTINO, flags | XFS_QMOPT_PQUOTA); if (error) goto error_rele; @@ -1589,32 +1579,6 @@ xfs_qm_dqfree_one( xfs_qm_dqdestroy(dqp); } -/* - * Start a transaction and write the incore superblock changes to - * disk. flags parameter indicates which fields have changed. - */ -int -xfs_qm_write_sb_changes( - xfs_mount_t *mp, - __int64_t flags) -{ - xfs_trans_t *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_mod_sb(tp, flags); - error = xfs_trans_commit(tp, 0); - - return error; -} - - /* --------------- utility functions for vnodeops ---------------- */ @@ -1749,23 +1713,21 @@ xfs_qm_vop_dqalloc( xfs_iunlock(ip, lockflags); if (O_udqpp) *O_udqpp = uq; - else if (uq) + else xfs_qm_dqrele(uq); if (O_gdqpp) *O_gdqpp = gq; - else if (gq) + else xfs_qm_dqrele(gq); if (O_pdqpp) *O_pdqpp = pq; - else if (pq) + else xfs_qm_dqrele(pq); return 0; error_rele: - if (gq) - xfs_qm_dqrele(gq); - if (uq) - xfs_qm_dqrele(uq); + xfs_qm_dqrele(gq); + xfs_qm_dqrele(uq); return error; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 3a07a937e232..0d4d3590cf85 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -157,7 +157,6 @@ struct xfs_dquot_acct { #define XFS_QM_RTBWARNLIMIT 5 extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); -extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t); /* dquot stuff */ extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); @@ -166,9 +165,9 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, - uint, struct fs_disk_quota *); + uint, struct qc_dqblk *); extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, - struct fs_disk_quota *); + struct qc_dqblk *); extern int xfs_qm_scall_getqstat(struct xfs_mount *, struct fs_quota_stat *); extern int xfs_qm_scall_getqstatv(struct xfs_mount *, diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 2c61e61b0205..3e52d5de7ae1 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_inode.h" diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 80f2d77d929a..9b965db45800 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -26,7 +26,6 @@ #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -40,7 +39,6 @@ STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, uint); STATIC uint xfs_qm_export_flags(uint); -STATIC uint xfs_qm_export_qtype_flags(uint); /* * Turn off quota accounting and/or enforcement for all udquots and/or @@ -93,8 +91,7 @@ xfs_qm_scall_quotaoff( mutex_unlock(&q->qi_quotaofflock); /* XXX what to do if error ? Revert back to old vals incore ? */ - error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); - return error; + return xfs_sync_sb(mp, false); } dqtype = 0; @@ -315,7 +312,6 @@ xfs_qm_scall_quotaon( { int error; uint qf; - __int64_t sbflags; flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); /* @@ -323,30 +319,22 @@ xfs_qm_scall_quotaon( */ flags &= ~(XFS_ALL_QUOTA_ACCT); - sbflags = 0; - if (flags == 0) { xfs_debug(mp, "%s: zero flags, m_qflags=%x", __func__, mp->m_qflags); return -EINVAL; } - /* No fs can turn on quotas with a delayed effect */ - ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0); - /* * Can't enforce without accounting. We check the superblock * qflags here instead of m_qflags because rootfs can have * quota acct on ondisk without m_qflags' knowing. */ - if (((flags & XFS_UQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && + if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && (flags & XFS_UQUOTA_ENFD)) || - ((flags & XFS_GQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && + ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && (flags & XFS_GQUOTA_ENFD)) || - ((flags & XFS_PQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && + ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && (flags & XFS_PQUOTA_ENFD))) { xfs_debug(mp, "%s: Can't enforce without acct, flags=%x sbflags=%x", @@ -371,11 +359,11 @@ xfs_qm_scall_quotaon( /* * There's nothing to change if it's the same. */ - if ((qf & flags) == flags && sbflags == 0) + if ((qf & flags) == flags) return -EEXIST; - sbflags |= XFS_SB_QFLAGS; - if ((error = xfs_qm_write_sb_changes(mp, sbflags))) + error = xfs_sync_sb(mp, false); + if (error) return error; /* * If we aren't trying to switch on quota enforcement, we are done. @@ -385,8 +373,7 @@ xfs_qm_scall_quotaon( ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) != (mp->m_qflags & XFS_PQUOTA_ACCT)) || ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != - (mp->m_qflags & XFS_GQUOTA_ACCT)) || - (flags & XFS_ALL_QUOTA_ENFD) == 0) + (mp->m_qflags & XFS_GQUOTA_ACCT))) return 0; if (! XFS_IS_QUOTA_RUNNING(mp)) @@ -423,20 +410,12 @@ xfs_qm_scall_getqstat( memset(out, 0, sizeof(fs_quota_stat_t)); out->qs_version = FS_QSTAT_VERSION; - if (!xfs_sb_version_hasquota(&mp->m_sb)) { - out->qs_uquota.qfs_ino = NULLFSINO; - out->qs_gquota.qfs_ino = NULLFSINO; - return 0; - } - out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & (XFS_ALL_QUOTA_ACCT| XFS_ALL_QUOTA_ENFD)); - if (q) { - uip = q->qi_uquotaip; - gip = q->qi_gquotaip; - pip = q->qi_pquotaip; - } + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; + pip = q->qi_pquotaip; if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &uip) == 0) @@ -482,14 +461,13 @@ xfs_qm_scall_getqstat( if (temppqip) IRELE(pip); } - if (q) { - out->qs_incoredqs = q->qi_dquots; - out->qs_btimelimit = q->qi_btimelimit; - out->qs_itimelimit = q->qi_itimelimit; - out->qs_rtbtimelimit = q->qi_rtbtimelimit; - out->qs_bwarnlimit = q->qi_bwarnlimit; - out->qs_iwarnlimit = q->qi_iwarnlimit; - } + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; + return 0; } @@ -510,13 +488,6 @@ xfs_qm_scall_getqstatv( bool tempgqip = false; bool temppqip = false; - if (!xfs_sb_version_hasquota(&mp->m_sb)) { - out->qs_uquota.qfs_ino = NULLFSINO; - out->qs_gquota.qfs_ino = NULLFSINO; - out->qs_pquota.qfs_ino = NULLFSINO; - return 0; - } - out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & (XFS_ALL_QUOTA_ACCT| XFS_ALL_QUOTA_ENFD)); @@ -524,11 +495,9 @@ xfs_qm_scall_getqstatv( out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino; - if (q) { - uip = q->qi_uquotaip; - gip = q->qi_gquotaip; - pip = q->qi_pquotaip; - } + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; + pip = q->qi_pquotaip; if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &uip) == 0) @@ -563,19 +532,18 @@ xfs_qm_scall_getqstatv( if (temppqip) IRELE(pip); } - if (q) { - out->qs_incoredqs = q->qi_dquots; - out->qs_btimelimit = q->qi_btimelimit; - out->qs_itimelimit = q->qi_itimelimit; - out->qs_rtbtimelimit = q->qi_rtbtimelimit; - out->qs_bwarnlimit = q->qi_bwarnlimit; - out->qs_iwarnlimit = q->qi_iwarnlimit; - } + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; + return 0; } -#define XFS_DQ_MASK \ - (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) +#define XFS_QC_MASK \ + (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) /* * Adjust quota limits, and start/stop timers accordingly. @@ -585,7 +553,7 @@ xfs_qm_scall_setqlim( struct xfs_mount *mp, xfs_dqid_t id, uint type, - fs_disk_quota_t *newlim) + struct qc_dqblk *newlim) { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_disk_dquot *ddq; @@ -594,9 +562,9 @@ xfs_qm_scall_setqlim( int error; xfs_qcnt_t hard, soft; - if (newlim->d_fieldmask & ~XFS_DQ_MASK) + if (newlim->d_fieldmask & ~XFS_QC_MASK) return -EINVAL; - if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) + if ((newlim->d_fieldmask & XFS_QC_MASK) == 0) return 0; /* @@ -634,11 +602,11 @@ xfs_qm_scall_setqlim( /* * Make sure that hardlimits are >= soft limits before changing. */ - hard = (newlim->d_fieldmask & FS_DQ_BHARD) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) : + hard = (newlim->d_fieldmask & QC_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) : be64_to_cpu(ddq->d_blk_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) : + soft = (newlim->d_fieldmask & QC_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) : be64_to_cpu(ddq->d_blk_softlimit); if (hard == 0 || hard >= soft) { ddq->d_blk_hardlimit = cpu_to_be64(hard); @@ -651,11 +619,11 @@ xfs_qm_scall_setqlim( } else { xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); } - hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : + hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) : be64_to_cpu(ddq->d_rtb_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) : + soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) : be64_to_cpu(ddq->d_rtb_softlimit); if (hard == 0 || hard >= soft) { ddq->d_rtb_hardlimit = cpu_to_be64(hard); @@ -668,10 +636,10 @@ xfs_qm_scall_setqlim( xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); } - hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? + hard = (newlim->d_fieldmask & QC_INO_HARD) ? (xfs_qcnt_t) newlim->d_ino_hardlimit : be64_to_cpu(ddq->d_ino_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ? + soft = (newlim->d_fieldmask & QC_INO_SOFT) ? (xfs_qcnt_t) newlim->d_ino_softlimit : be64_to_cpu(ddq->d_ino_softlimit); if (hard == 0 || hard >= soft) { @@ -688,12 +656,12 @@ xfs_qm_scall_setqlim( /* * Update warnings counter(s) if requested */ - if (newlim->d_fieldmask & FS_DQ_BWARNS) - ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns); - if (newlim->d_fieldmask & FS_DQ_IWARNS) - ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns); - if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns); + if (newlim->d_fieldmask & QC_SPC_WARNS) + ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns); + if (newlim->d_fieldmask & QC_INO_WARNS) + ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns); + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns); if (id == 0) { /* @@ -703,24 +671,24 @@ xfs_qm_scall_setqlim( * soft and hard limit values (already done, above), and * for warnings. */ - if (newlim->d_fieldmask & FS_DQ_BTIMER) { - q->qi_btimelimit = newlim->d_btimer; - ddq->d_btimer = cpu_to_be32(newlim->d_btimer); + if (newlim->d_fieldmask & QC_SPC_TIMER) { + q->qi_btimelimit = newlim->d_spc_timer; + ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); } - if (newlim->d_fieldmask & FS_DQ_ITIMER) { - q->qi_itimelimit = newlim->d_itimer; - ddq->d_itimer = cpu_to_be32(newlim->d_itimer); + if (newlim->d_fieldmask & QC_INO_TIMER) { + q->qi_itimelimit = newlim->d_ino_timer; + ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); } - if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { - q->qi_rtbtimelimit = newlim->d_rtbtimer; - ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) { + q->qi_rtbtimelimit = newlim->d_rt_spc_timer; + ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); } - if (newlim->d_fieldmask & FS_DQ_BWARNS) - q->qi_bwarnlimit = newlim->d_bwarns; - if (newlim->d_fieldmask & FS_DQ_IWARNS) - q->qi_iwarnlimit = newlim->d_iwarns; - if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - q->qi_rtbwarnlimit = newlim->d_rtbwarns; + if (newlim->d_fieldmask & QC_SPC_WARNS) + q->qi_bwarnlimit = newlim->d_spc_warns; + if (newlim->d_fieldmask & QC_INO_WARNS) + q->qi_iwarnlimit = newlim->d_ino_warns; + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + q->qi_rtbwarnlimit = newlim->d_rt_spc_warns; } else { /* * If the user is now over quota, start the timelimit. @@ -784,23 +752,25 @@ xfs_qm_log_quotaoff( { xfs_trans_t *tp; int error; - xfs_qoff_logitem_t *qoffi=NULL; - uint oldsbqflag=0; + xfs_qoff_logitem_t *qoffi; + + *qoffstartp = NULL; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); - if (error) - goto error0; + if (error) { + xfs_trans_cancel(tp, 0); + goto out; + } qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); xfs_trans_log_quotaoff_item(tp, qoffi); spin_lock(&mp->m_sb_lock); - oldsbqflag = mp->m_sb.sb_qflags; mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; spin_unlock(&mp->m_sb_lock); - xfs_mod_sb(tp, XFS_SB_QFLAGS); + xfs_log_sb(tp); /* * We have to make sure that the transaction is secure on disk before we @@ -809,19 +779,11 @@ xfs_qm_log_quotaoff( */ xfs_trans_set_sync(tp); error = xfs_trans_commit(tp, 0); + if (error) + goto out; -error0: - if (error) { - xfs_trans_cancel(tp, 0); - /* - * No one else is modifying sb_qflags, so this is OK. - * We still hold the quotaofflock. - */ - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = oldsbqflag; - spin_unlock(&mp->m_sb_lock); - } *qoffstartp = qoffi; +out: return error; } @@ -831,7 +793,7 @@ xfs_qm_scall_getquota( struct xfs_mount *mp, xfs_dqid_t id, uint type, - struct fs_disk_quota *dst) + struct qc_dqblk *dst) { struct xfs_dquot *dqp; int error; @@ -855,28 +817,25 @@ xfs_qm_scall_getquota( } memset(dst, 0, sizeof(*dst)); - dst->d_version = FS_DQUOT_VERSION; - dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags); - dst->d_id = be32_to_cpu(dqp->q_core.d_id); - dst->d_blk_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); - dst->d_blk_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); + dst->d_spc_hardlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); + dst->d_spc_softlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); - dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount); - dst->d_icount = dqp->q_res_icount; - dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer); - dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer); - dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns); - dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns); - dst->d_rtb_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); - dst->d_rtb_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); - dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount); - dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer); - dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns); + dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount); + dst->d_ino_count = dqp->q_res_icount; + dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer); + dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); + dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns); + dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns); + dst->d_rt_spc_hardlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); + dst->d_rt_spc_softlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); + dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount); + dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns); /* * Internally, we don't reset all the timers when quota enforcement @@ -889,23 +848,23 @@ xfs_qm_scall_getquota( dqp->q_core.d_flags == XFS_DQ_GROUP) || (!XFS_IS_PQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_PROJ)) { - dst->d_btimer = 0; - dst->d_itimer = 0; - dst->d_rtbtimer = 0; + dst->d_spc_timer = 0; + dst->d_ino_timer = 0; + dst->d_rt_spc_timer = 0; } #ifdef DEBUG - if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || - (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) || - (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) && - dst->d_id != 0) { - if ((dst->d_bcount > dst->d_blk_softlimit) && - (dst->d_blk_softlimit > 0)) { - ASSERT(dst->d_btimer != 0); + if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || + (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || + (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && + id != 0) { + if ((dst->d_space > dst->d_spc_softlimit) && + (dst->d_spc_softlimit > 0)) { + ASSERT(dst->d_spc_timer != 0); } - if ((dst->d_icount > dst->d_ino_softlimit) && + if ((dst->d_ino_count > dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { - ASSERT(dst->d_itimer != 0); + ASSERT(dst->d_ino_timer != 0); } } #endif @@ -915,26 +874,6 @@ out_put: } STATIC uint -xfs_qm_export_qtype_flags( - uint flags) -{ - /* - * Can't be more than one, or none. - */ - ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) != - (FS_PROJ_QUOTA | FS_USER_QUOTA)); - ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) != - (FS_PROJ_QUOTA | FS_GROUP_QUOTA)); - ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) != - (FS_USER_QUOTA | FS_GROUP_QUOTA)); - ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0); - - return (flags & XFS_DQ_USER) ? - FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? - FS_PROJ_QUOTA : FS_GROUP_QUOTA; -} - -STATIC uint xfs_qm_export_flags( uint flags) { diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index b238027df987..6923905ab33d 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -19,8 +19,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_quota.h" @@ -66,19 +64,10 @@ xfs_fs_get_xstatev( return xfs_qm_scall_getqstatv(mp, fqs); } -STATIC int -xfs_fs_set_xstate( - struct super_block *sb, - unsigned int uflags, - int op) +static unsigned int +xfs_quota_flags(unsigned int uflags) { - struct xfs_mount *mp = XFS_M(sb); - unsigned int flags = 0; - - if (sb->s_flags & MS_RDONLY) - return -EROFS; - if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; + unsigned int flags = 0; if (uflags & FS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; @@ -93,16 +82,39 @@ xfs_fs_set_xstate( if (uflags & FS_QUOTA_PDQ_ENFD) flags |= XFS_PQUOTA_ENFD; - switch (op) { - case Q_XQUOTAON: - return xfs_qm_scall_quotaon(mp, flags); - case Q_XQUOTAOFF: - if (!XFS_IS_QUOTA_ON(mp)) - return -EINVAL; - return xfs_qm_scall_quotaoff(mp, flags); - } + return flags; +} + +STATIC int +xfs_quota_enable( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + + return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags)); +} + +STATIC int +xfs_quota_disable( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -EINVAL; - return -EINVAL; + return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags)); } STATIC int @@ -133,7 +145,7 @@ STATIC int xfs_fs_get_dqblk( struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *qdq) { struct xfs_mount *mp = XFS_M(sb); @@ -143,14 +155,14 @@ xfs_fs_get_dqblk( return -ESRCH; return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), - xfs_quota_type(qid.type), fdq); + xfs_quota_type(qid.type), qdq); } STATIC int xfs_fs_set_dqblk( struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *qdq) { struct xfs_mount *mp = XFS_M(sb); @@ -162,13 +174,14 @@ xfs_fs_set_dqblk( return -ESRCH; return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), - xfs_quota_type(qid.type), fdq); + xfs_quota_type(qid.type), qdq); } const struct quotactl_ops xfs_quotactl_operations = { .get_xstatev = xfs_fs_get_xstatev, .get_xstate = xfs_fs_get_xstate, - .set_xstate = xfs_fs_set_xstate, + .quota_enable = xfs_quota_enable, + .quota_disable = xfs_quota_disable, .rm_xquota = xfs_fs_rm_xquota, .get_dqblk = xfs_fs_get_dqblk, .set_dqblk = xfs_fs_set_dqblk, diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index e1175ea9b551..f2079b6911cc 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -22,8 +22,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_bmap.h" @@ -36,7 +34,6 @@ #include "xfs_trace.h" #include "xfs_buf.h" #include "xfs_icache.h" -#include "xfs_dinode.h" #include "xfs_rtalloc.h" diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9f622feda6a4..8fcc4ccc5c79 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -21,9 +21,7 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_inum.h" #include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" @@ -44,7 +42,6 @@ #include "xfs_icache.h" #include "xfs_trace.h" #include "xfs_icreate_item.h" -#include "xfs_dinode.h" #include "xfs_filestream.h" #include "xfs_quota.h" #include "xfs_sysfs.h" @@ -688,7 +685,7 @@ xfs_blkdev_get( mp); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); - xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); + xfs_warn(mp, "Invalid device [%s], error=%d", name, error); } return error; @@ -796,8 +793,7 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp, mp->m_ddev_targp); out_close_rtdev: - if (rtdev) - xfs_blkdev_put(rtdev); + xfs_blkdev_put(rtdev); out_close_logdev: if (logdev && logdev != ddev) xfs_blkdev_put(logdev); @@ -842,10 +838,15 @@ STATIC int xfs_init_mount_workqueues( struct xfs_mount *mp) { + mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 1, mp->m_fsname); + if (!mp->m_buf_workqueue) + goto out; + mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_data_workqueue) - goto out; + goto out_destroy_buf; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); @@ -863,7 +864,7 @@ xfs_init_mount_workqueues( goto out_destroy_cil; mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", - WQ_FREEZABLE, 0, mp->m_fsname); + WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname); if (!mp->m_log_workqueue) goto out_destroy_reclaim; @@ -884,6 +885,8 @@ out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: destroy_workqueue(mp->m_data_workqueue); +out_destroy_buf: + destroy_workqueue(mp->m_buf_workqueue); out: return -ENOMEM; } @@ -898,6 +901,7 @@ xfs_destroy_mount_workqueues( destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); + destroy_workqueue(mp->m_buf_workqueue); } /* @@ -1000,7 +1004,6 @@ xfs_fs_evict_inode( clear_inode(inode); XFS_STATS_INC(vn_rele); XFS_STATS_INC(vn_remove); - XFS_STATS_DEC(vn_active); xfs_inactive(ip); } @@ -1108,6 +1111,11 @@ xfs_fs_statfs( statp->f_files, mp->m_maxicount); + /* If sb_icount overshot maxicount, report actual allocation */ + statp->f_files = max_t(typeof(statp->f_files), + statp->f_files, + sbp->sb_icount); + /* make sure statp->f_ffree does not underflow */ ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); statp->f_ffree = max_t(__int64_t, ffree, 0); @@ -1254,13 +1262,13 @@ xfs_fs_remount( * If this is the first remount to writeable state we * might have some superblock changes to update. */ - if (mp->m_update_flags) { - error = xfs_mount_log_sb(mp, mp->m_update_flags); + if (mp->m_update_sb) { + error = xfs_sync_sb(mp, false); if (error) { xfs_warn(mp, "failed to write sb changes"); return error; } - mp->m_update_flags = 0; + mp->m_update_sb = false; } /* @@ -1290,8 +1298,9 @@ xfs_fs_remount( /* * Second stage of a freeze. The data is already frozen so we only - * need to take care of the metadata. Once that's done write a dummy - * record to dirty the log in case of a crash while frozen. + * need to take care of the metadata. Once that's done sync the superblock + * to the log to dirty it in case of a crash while frozen. This ensures that we + * will recover the unlinked inode lists on the next mount. */ STATIC int xfs_fs_freeze( @@ -1301,7 +1310,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return xfs_fs_log_dummy(mp); + return xfs_sync_sb(mp, true); } STATIC int @@ -1425,6 +1434,7 @@ xfs_fs_fill_super( sb->s_export_op = &xfs_export_operations; #ifdef CONFIG_XFS_QUOTA sb->s_qcop = &xfs_quotactl_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif sb->s_op = &xfs_super_operations; @@ -1527,7 +1537,7 @@ xfs_fs_mount( static long xfs_fs_nr_cached_objects( struct super_block *sb, - int nid) + struct shrink_control *sc) { return xfs_reclaim_inodes_count(XFS_M(sb)); } @@ -1535,10 +1545,9 @@ xfs_fs_nr_cached_objects( static long xfs_fs_free_cached_objects( struct super_block *sb, - long nr_to_scan, - int nid) + struct shrink_control *sc) { - return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); + return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); } static const struct super_operations xfs_super_operations = { diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 02ae62a998e0..25791df6f638 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -23,8 +23,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" @@ -42,7 +40,6 @@ #include "xfs_symlink.h" #include "xfs_trans.h" #include "xfs_log.h" -#include "xfs_dinode.h" /* ----- Kernel only functions below ----- */ STATIC int diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 1743b9f8e23d..a0c8067cea6f 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -149,24 +149,6 @@ static struct ctl_table xfs_table[] = { .extra2 = &xfs_params.inherit_noatim.max }, { - .procname = "xfsbufd_centisecs", - .data = &xfs_params.xfs_buf_timer.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &xfs_params.xfs_buf_timer.min, - .extra2 = &xfs_params.xfs_buf_timer.max - }, - { - .procname = "age_buffer_centisecs", - .data = &xfs_params.xfs_buf_age.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &xfs_params.xfs_buf_age.min, - .extra2 = &xfs_params.xfs_buf_age.max - }, - { .procname = "inherit_nosymlinks", .data = &xfs_params.inherit_nosym.val, .maxlen = sizeof(int), diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 1e85bcd0e418..13a029806805 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 30e8e3410955..eb90cd59a0ec 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -22,8 +22,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_extent_busy.h" @@ -474,6 +472,7 @@ xfs_trans_apply_sb_deltas( whole = 1; } + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); if (whole) /* * Log the whole thing, the fields are noncontiguous. diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 859482f53b5a..573aefb5a573 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -18,10 +18,9 @@ */ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index e2b2216b1635..75798412859a 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" @@ -229,13 +227,6 @@ xfs_trans_getsb(xfs_trans_t *tp, return bp; } -#ifdef DEBUG -xfs_buftarg_t *xfs_error_target; -int xfs_do_error; -int xfs_req_num; -int xfs_error_mod = 33; -#endif - /* * Get and lock the buffer for the caller if it is not already * locked within the given transaction. If it has not yet been @@ -257,46 +248,11 @@ xfs_trans_read_buf_map( struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { - xfs_buf_t *bp; - xfs_buf_log_item_t *bip; + struct xfs_buf *bp = NULL; + struct xfs_buf_log_item *bip; int error; *bpp = NULL; - if (!tp) { - bp = xfs_buf_read_map(target, map, nmaps, flags, ops); - if (!bp) - return (flags & XBF_TRYLOCK) ? - -EAGAIN : -ENOMEM; - - if (bp->b_error) { - error = bp->b_error; - xfs_buf_ioerror_alert(bp, __func__); - XFS_BUF_UNDONE(bp); - xfs_buf_stale(bp); - xfs_buf_relse(bp); - - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; - return error; - } -#ifdef DEBUG - if (xfs_do_error) { - if (xfs_error_target == target) { - if (((xfs_req_num++) % xfs_error_mod) == 0) { - xfs_buf_relse(bp); - xfs_debug(mp, "Returning error!"); - return -EIO; - } - } - } -#endif - if (XFS_FORCED_SHUTDOWN(mp)) - goto shutdown_abort; - *bpp = bp; - return 0; - } - /* * If we find the buffer in the cache with this transaction * pointer in its b_fsprivate2 field, then we know we already @@ -305,49 +261,24 @@ xfs_trans_read_buf_map( * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ - bp = xfs_trans_buf_item_match(tp, target, map, nmaps); - if (bp != NULL) { + if (tp) + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); + if (bp) { ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); ASSERT(bp->b_fspriv != NULL); ASSERT(!bp->b_error); - if (!(XFS_BUF_ISDONE(bp))) { - trace_xfs_trans_read_buf_io(bp, _RET_IP_); - ASSERT(!XFS_BUF_ISASYNC(bp)); - ASSERT(bp->b_iodone == NULL); - XFS_BUF_READ(bp); - bp->b_ops = ops; - - error = xfs_buf_submit_wait(bp); - if (error) { - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_buf_ioerror_alert(bp, __func__); - xfs_buf_relse(bp); - /* - * We can gracefully recover from most read - * errors. Ones we can't are those that happen - * after the transaction's already dirty. - */ - if (tp->t_flags & XFS_TRANS_DIRTY) - xfs_force_shutdown(tp->t_mountp, - SHUTDOWN_META_IO_ERROR); - /* bad CRC means corrupted metadata */ - if (error == -EFSBADCRC) - error = -EFSCORRUPTED; - return error; - } - } + ASSERT(bp->b_flags & XBF_DONE); + /* * We never locked this buf ourselves, so we shouldn't * brelse it either. Just get out. */ if (XFS_FORCED_SHUTDOWN(mp)) { trace_xfs_trans_read_buf_shut(bp, _RET_IP_); - *bpp = NULL; return -EIO; } - bip = bp->b_fspriv; bip->bli_recur++; @@ -358,17 +289,29 @@ xfs_trans_read_buf_map( } bp = xfs_buf_read_map(target, map, nmaps, flags, ops); - if (bp == NULL) { - *bpp = NULL; - return (flags & XBF_TRYLOCK) ? - 0 : -ENOMEM; + if (!bp) { + if (!(flags & XBF_TRYLOCK)) + return -ENOMEM; + return tp ? 0 : -EAGAIN; } + + /* + * If we've had a read error, then the contents of the buffer are + * invalid and should not be used. To ensure that a followup read tries + * to pull the buffer from disk again, we clear the XBF_DONE flag and + * mark the buffer stale. This ensures that anyone who has a current + * reference to the buffer will interpret it's contents correctly and + * future cache lookups will also treat it as an empty, uninitialised + * buffer. + */ if (bp->b_error) { error = bp->b_error; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_buf_ioerror_alert(bp, __func__); + bp->b_flags &= ~XBF_DONE; xfs_buf_stale(bp); - XFS_BUF_DONE(bp); - xfs_buf_ioerror_alert(bp, __func__); - if (tp->t_flags & XFS_TRANS_DIRTY) + + if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); @@ -377,33 +320,20 @@ xfs_trans_read_buf_map( error = -EFSCORRUPTED; return error; } -#ifdef DEBUG - if (xfs_do_error && !(tp->t_flags & XFS_TRANS_DIRTY)) { - if (xfs_error_target == target) { - if (((xfs_req_num++) % xfs_error_mod) == 0) { - xfs_force_shutdown(tp->t_mountp, - SHUTDOWN_META_IO_ERROR); - xfs_buf_relse(bp); - xfs_debug(mp, "Returning trans error!"); - return -EIO; - } - } - } -#endif - if (XFS_FORCED_SHUTDOWN(mp)) - goto shutdown_abort; - _xfs_trans_bjoin(tp, bp, 1); - trace_xfs_trans_read_buf(bp->b_fspriv); + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_buf_relse(bp); + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); + return -EIO; + } + if (tp) { + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_read_buf(bp->b_fspriv); + } *bpp = bp; return 0; -shutdown_abort: - trace_xfs_trans_read_buf_shut(bp, _RET_IP_); - xfs_buf_relse(bp); - *bpp = NULL; - return -EIO; } /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 846e061c2e98..76a16df55ef7 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_error.h" diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 47978ba89dae..284397dd7990 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -18,10 +18,9 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index cdb4d86520e1..17280cd71934 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -21,8 +21,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 93455b998041..69f6e475de97 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -20,8 +20,6 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" -#include "xfs_sb.h" -#include "xfs_ag.h" #include "xfs_mount.h" #include "xfs_da_format.h" #include "xfs_inode.h" |