Diffstat (limited to 'fs')
75 files changed, 2534 insertions, 1239 deletions
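The largest addition below is the new DAX code in fs/dax.c (dax_do_io(), dax_fault(), dax_zero_page_range(), dax_truncate_page()), guarded by the new FS_DAX Kconfig option. As a minimal sketch of how a filesystem might wire these helpers into its direct I/O and page-fault paths -- illustrative only, not part of this diff: "myfs_get_block" stands in for the filesystem's own get_block_t implementation, and IS_DAX() is assumed from the wider series (it is defined outside fs/, so it does not appear in the diffstat above):

#include <linux/fs.h>
#include <linux/mm.h>

/*
 * Illustrative sketch, not taken from this diff.  "myfs_get_block" is a
 * placeholder for the filesystem's own get_block_t, and IS_DAX() is
 * assumed from the rest of this series rather than the fs/ changes
 * shown here.
 */
static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
			      struct iov_iter *iter, loff_t offset)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (IS_DAX(inode))
		/* DAX copies directly to/from the memory-backed block
		 * device, bypassing the page cache. */
		return dax_do_io(rw, iocb, inode, iter, offset,
				 myfs_get_block, NULL, DIO_LOCKING);
	return blockdev_direct_IO(rw, iocb, inode, iter, offset,
				  myfs_get_block);
}

static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* dax_fault() itself handles sb_start_pagefault() and
	 * file_update_time() for write faults. */
	return dax_fault(vma, vmf, myfs_get_block);
}

At runtime this path is only taken when the block device supports direct access and the filesystem is mounted with the dax option described in the FS_DAX help text below.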
diff --git a/fs/Kconfig b/fs/Kconfig index a6bb530b1ec5..ec35851e5b71 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -13,13 +13,6 @@ if BLOCK source "fs/ext2/Kconfig" source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" - -config FS_XIP -# execute in place - bool - depends on EXT2_FS_XIP - default y - source "fs/jbd/Kconfig" source "fs/jbd2/Kconfig" @@ -40,6 +33,21 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" +config FS_DAX + bool "Direct Access (DAX) support" + depends on MMU + depends on !(ARM || MIPS || SPARC) + help + Direct Access (DAX) can be used on memory-backed block devices. + If the block device supports DAX and the filesystem supports DAX, + then you can avoid using the pagecache to buffer I/Os. Turning + on this option will compile in support for DAX; you will need to + mount the filesystem using the -o dax option. + + If you do not have a block device that is capable of using this, + or if unsure, say N. Saying Y will increase the size of the kernel + by about 5kB. + endif # BLOCK # Posix ACL utility routines diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index c055d56ec63d..270c48148f79 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -149,13 +149,6 @@ config BINFMT_EM86 later load the module when you want to use a Linux/Intel binary. The module will be called binfmt_em86. If unsure, say Y. -config BINFMT_SOM - tristate "Kernel support for SOM binaries" - depends on PARISC && HPUX - help - SOM is a binary executable format inherited from HP/UX. Say - Y here to be able to load and execute SOM binaries directly. - config BINFMT_MISC tristate "Kernel support for MISC binaries" ---help--- diff --git a/fs/Makefile b/fs/Makefile index bedff48e8fdc..a88ac4838c9e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o @@ -37,7 +38,6 @@ obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o -obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o obj-$(CONFIG_FS_MBCACHE) += mbcache.o diff --git a/fs/affs/affs.h b/fs/affs/affs.h index ff44ff3ff015..c8764bd7497d 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -30,6 +30,8 @@ #define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2) #define AFFS_AC_MASK (AFFS_AC_SIZE-1) +#define AFFSNAMEMAX 30U + struct affs_ext_key { u32 ext; /* idx of the extended block */ u32 key; /* block number */ diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index c852f2fa1710..388da1ea815d 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -30,7 +30,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) ino = bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]); - pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino); + pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino); dir_bh = affs_bread(sb, dir->i_ino); if (!dir_bh) @@ -80,8 +80,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) sb = dir->i_sb; rem_ino = rem_bh->b_blocknr; offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]); - 
pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n", - __func__, (u32)dir->i_ino, rem_ino, offset); + pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino, + rem_ino, offset); bh = affs_bread(sb, dir->i_ino); if (!bh) @@ -483,11 +483,10 @@ affs_check_name(const unsigned char *name, int len, bool notruncate) { int i; - if (len > 30) { + if (len > AFFSNAMEMAX) { if (notruncate) return -ENAMETOOLONG; - else - len = 30; + len = AFFSNAMEMAX; } for (i = 0; i < len; i++) { if (name[i] < ' ' || name[i] == ':' @@ -508,7 +507,7 @@ affs_check_name(const unsigned char *name, int len, bool notruncate) int affs_copy_name(unsigned char *bstr, struct dentry *dentry) { - int len = min(dentry->d_name.len, 30u); + u32 len = min(dentry->d_name.len, AFFSNAMEMAX); *bstr++ = len; memcpy(bstr, dentry->d_name.name, len); diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index c8de51185c23..675148950fed 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -99,7 +99,6 @@ err_bh_read: err_range: affs_error(sb, "affs_free_block","Block %u outside partition", block); - return; } /* diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 59f07bec92a6..ac4f318aafba 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -54,8 +54,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) u32 ino; int error = 0; - pr_debug("%s(ino=%lu,f_pos=%lx)\n", - __func__, inode->i_ino, (unsigned long)ctx->pos); + pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); if (ctx->pos < 2) { file->private_data = (void *)0; @@ -115,11 +114,11 @@ inside: break; } - namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30); + namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], + (u8)AFFSNAMEMAX); name = AFFS_TAIL(sb, fh_bh)->name + 1; - pr_debug("readdir(): dir_emit(\"%.*s\", " - "ino=%u), hash=%d, f_pos=%x\n", - namelen, name, ino, hash_pos, (u32)ctx->pos); + pr_debug("readdir(): dir_emit(\"%.*s\", ino=%u), hash=%d, f_pos=%llx\n", + namelen, name, ino, hash_pos, ctx->pos); if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN)) goto done; diff --git a/fs/affs/file.c b/fs/affs/file.c index 8faa6593ca6d..d2468bf95669 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -180,8 +180,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext) ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension); if (ext < AFFS_I(inode)->i_extcnt) goto read_ext; - if (ext > AFFS_I(inode)->i_extcnt) - BUG(); + BUG_ON(ext > AFFS_I(inode)->i_extcnt); bh = affs_alloc_extblock(inode, bh, ext); if (IS_ERR(bh)) return bh; @@ -198,8 +197,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext) struct buffer_head *prev_bh; /* allocate a new extended block */ - if (ext > AFFS_I(inode)->i_extcnt) - BUG(); + BUG_ON(ext > AFFS_I(inode)->i_extcnt); /* get previous extended block */ prev_bh = affs_get_extblock(inode, ext - 1); @@ -299,8 +297,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul struct buffer_head *ext_bh; u32 ext; - pr_debug("%s(%u, %lu)\n", - __func__, (u32)inode->i_ino, (unsigned long)block); + pr_debug("%s(%lu, %llu)\n", __func__, inode->i_ino, + (unsigned long long)block); BUG_ON(block > (sector_t)0x7fffffffUL); @@ -330,8 +328,9 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul /* store new block */ if (bh_result->b_blocknr) - affs_warning(sb, "get_block", "block already set (%lx)", - (unsigned long)bh_result->b_blocknr); + affs_warning(sb, "get_block", + "block already set (%llx)", + (unsigned long long)bh_result->b_blocknr); AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr); 
AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1); affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1); @@ -353,8 +352,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul return 0; err_big: - affs_error(inode->i_sb, "get_block", "strange block request %d", - (int)block); + affs_error(inode->i_sb, "get_block", "strange block request %llu", + (unsigned long long)block); return -EIO; err_ext: // unlock cache @@ -399,6 +398,13 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; + if (rw == WRITE) { + loff_t size = offset + count; + + if (AFFS_I(inode)->mmu_private < size) + return 0; + } + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block); if (ret < 0 && (rw & WRITE)) affs_write_failed(mapping, offset + count); @@ -503,7 +509,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to) u32 bidx, boff, bsize; u32 tmp; - pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino, + pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, page->index, to); BUG_ON(to > PAGE_CACHE_SIZE); kmap(page); @@ -539,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) u32 size, bsize; u32 tmp; - pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize); + pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize); bsize = AFFS_SB(sb)->s_data_blksize; bh = NULL; size = AFFS_I(inode)->mmu_private; @@ -608,7 +614,7 @@ affs_readpage_ofs(struct file *file, struct page *page) u32 to; int err; - pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index); + pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index); to = PAGE_CACHE_SIZE; if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) { to = inode->i_size & ~PAGE_CACHE_MASK; @@ -631,8 +637,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping pgoff_t index; int err = 0; - pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino, - (unsigned long long)pos, (unsigned long long)pos + len); + pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, + pos + len); if (pos > AFFS_I(inode)->mmu_private) { /* XXX: this probably leaves a too-big i_size in case of * failure. Should really be updating i_size at write_end time @@ -681,9 +687,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, * due to write_begin. 
*/ - pr_debug("%s(%u, %llu, %llu)\n", - __func__, (u32)inode->i_ino, (unsigned long long)pos, - (unsigned long long)pos + len); + pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, + pos + len); bsize = AFFS_SB(sb)->s_data_blksize; data = page_address(page); @@ -831,8 +836,8 @@ affs_truncate(struct inode *inode) struct buffer_head *ext_bh; int i; - pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n", - (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size); + pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n", + inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size); last_blk = 0; ext = 0; @@ -863,7 +868,7 @@ affs_truncate(struct inode *inode) if (IS_ERR(ext_bh)) { affs_warning(sb, "truncate", "unexpected read error for ext block %u (%ld)", - (unsigned int)ext, PTR_ERR(ext_bh)); + ext, PTR_ERR(ext_bh)); return; } if (AFFS_I(inode)->i_lc) { @@ -911,7 +916,7 @@ affs_truncate(struct inode *inode) if (IS_ERR(bh)) { affs_warning(sb, "truncate", "unexpected read error for last block %u (%ld)", - (unsigned int)ext, PTR_ERR(bh)); + ext, PTR_ERR(bh)); return; } tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next); diff --git a/fs/affs/inode.c b/fs/affs/inode.c index d0609a282e1d..6f34510449e8 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -13,8 +13,6 @@ #include <linux/gfp.h> #include "affs.h" -extern const struct inode_operations affs_symlink_inode_operations; - struct inode *affs_iget(struct super_block *sb, unsigned long ino) { struct affs_sb_info *sbi = AFFS_SB(sb); @@ -348,9 +346,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 u32 block = 0; int retval; - pr_debug("%s(dir=%u, inode=%u, \"%pd\", type=%d)\n", - __func__, (u32)dir->i_ino, - (u32)inode->i_ino, dentry, type); + pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__, + dir->i_ino, inode->i_ino, dentry, type); retval = -EIO; bh = affs_bread(sb, inode->i_ino); diff --git a/fs/affs/namei.c b/fs/affs/namei.c index bbc38530e924..ffb7bd82c2a5 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -64,15 +64,16 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; - int i; + int retval; + u32 len; - i = affs_check_name(qstr->name, qstr->len, notruncate); - if (i) - return i; + retval = affs_check_name(qstr->name, qstr->len, notruncate); + if (retval) + return retval; hash = init_name_hash(); - i = min(qstr->len, 30u); - for (; i > 0; name++, i--) + len = min(qstr->len, AFFSNAMEMAX); + for (; len > 0; name++, len--) hash = partial_name_hash(toupper(*name), hash); qstr->hash = end_name_hash(hash); @@ -114,10 +115,10 @@ static inline int __affs_compare_dentry(unsigned int len, * If the names are longer than the allowed 30 chars, * the excess is ignored, so their length may differ. 
*/ - if (len >= 30) { - if (name->len < 30) + if (len >= AFFSNAMEMAX) { + if (name->len < AFFSNAMEMAX) return 1; - len = 30; + len = AFFSNAMEMAX; } else if (len != name->len) return 1; @@ -156,10 +157,10 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) const u8 *name = dentry->d_name.name; int len = dentry->d_name.len; - if (len >= 30) { - if (*name2 < 30) + if (len >= AFFSNAMEMAX) { + if (*name2 < AFFSNAMEMAX) return 0; - len = 30; + len = AFFSNAMEMAX; } else if (len != *name2) return 0; @@ -173,9 +174,9 @@ int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len) { toupper_t toupper = affs_get_toupper(sb); - int hash; + u32 hash; - hash = len = min(len, 30u); + hash = len = min(len, AFFSNAMEMAX); for (; len > 0; len--) hash = (hash * 13 + toupper(*name++)) & 0x7ff; @@ -248,9 +249,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) int affs_unlink(struct inode *dir, struct dentry *dentry) { - pr_debug("%s(dir=%d, %lu \"%pd\")\n", - __func__, (u32)dir->i_ino, dentry->d_inode->i_ino, - dentry); + pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, + dentry->d_inode->i_ino, dentry); return affs_remove_header(dentry); } @@ -317,9 +317,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) int affs_rmdir(struct inode *dir, struct dentry *dentry) { - pr_debug("%s(dir=%u, %lu \"%pd\")\n", - __func__, (u32)dir->i_ino, dentry->d_inode->i_ino, - dentry); + pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, + dentry->d_inode->i_ino, dentry); return affs_remove_header(dentry); } @@ -404,8 +403,7 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - pr_debug("%s(%u, %u, \"%pd\")\n", - __func__, (u32)inode->i_ino, (u32)dir->i_ino, + pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino, dentry); return affs_add_entry(dir, inode, dentry, ST_LINKFILE); @@ -419,9 +417,8 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, struct buffer_head *bh = NULL; int retval; - pr_debug("%s(old=%u,\"%pd\" to new=%u,\"%pd\")\n", - __func__, (u32)old_dir->i_ino, old_dentry, - (u32)new_dir->i_ino, new_dentry); + pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, + old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); retval = affs_check_name(new_dentry->d_name.name, new_dentry->d_name.len, diff --git a/fs/affs/super.c b/fs/affs/super.c index f754ab68a840..4cf0e9113fb6 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -432,39 +432,39 @@ got_root: sb->s_flags |= MS_RDONLY; } switch (chksum) { - case MUFS_FS: - case MUFS_INTLFFS: - case MUFS_DCFFS: - sbi->s_flags |= SF_MUFS; - /* fall thru */ - case FS_INTLFFS: - case FS_DCFFS: - sbi->s_flags |= SF_INTL; - break; - case MUFS_FFS: - sbi->s_flags |= SF_MUFS; - break; - case FS_FFS: - break; - case MUFS_OFS: - sbi->s_flags |= SF_MUFS; - /* fall thru */ - case FS_OFS: - sbi->s_flags |= SF_OFS; - sb->s_flags |= MS_NOEXEC; - break; - case MUFS_DCOFS: - case MUFS_INTLOFS: - sbi->s_flags |= SF_MUFS; - case FS_DCOFS: - case FS_INTLOFS: - sbi->s_flags |= SF_INTL | SF_OFS; - sb->s_flags |= MS_NOEXEC; - break; - default: - pr_err("Unknown filesystem on device %s: %08X\n", - sb->s_id, chksum); - return -EINVAL; + case MUFS_FS: + case MUFS_INTLFFS: + case MUFS_DCFFS: + sbi->s_flags |= SF_MUFS; + /* fall thru */ + case FS_INTLFFS: + case FS_DCFFS: + sbi->s_flags |= SF_INTL; + break; + case MUFS_FFS: + sbi->s_flags |= SF_MUFS; + break; + case FS_FFS: + break; + 
case MUFS_OFS: + sbi->s_flags |= SF_MUFS; + /* fall thru */ + case FS_OFS: + sbi->s_flags |= SF_OFS; + sb->s_flags |= MS_NOEXEC; + break; + case MUFS_DCOFS: + case MUFS_INTLOFS: + sbi->s_flags |= SF_MUFS; + case FS_DCOFS: + case FS_INTLOFS: + sbi->s_flags |= SF_INTL | SF_OFS; + sb->s_flags |= MS_NOEXEC; + break; + default: + pr_err("Unknown filesystem on device %s: %08X\n", + sb->s_id, chksum); + return -EINVAL; } if (mount_flags & SF_VERBOSE) { @@ -584,7 +584,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = free; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); - buf->f_namelen = 30; + buf->f_namelen = AFFSNAMEMAX; return 0; } @@ -602,6 +602,7 @@ static void affs_kill_sb(struct super_block *sb) affs_free_bitmap(sb); affs_brelse(sbi->s_root_bh); kfree(sbi->s_prefix); + mutex_destroy(&sbi->s_bmlock); kfree(sbi); } } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index edf47774b03d..e089f1985fca 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -274,9 +274,9 @@ more: static struct inode * befs_alloc_inode(struct super_block *sb) { - struct befs_inode_info *bi; - bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep, - GFP_KERNEL); + struct befs_inode_info *bi; + + bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c deleted file mode 100644 index 4e00ed68d4a6..000000000000 --- a/fs/binfmt_som.c +++ /dev/null @@ -1,299 +0,0 @@ -/* - * linux/fs/binfmt_som.c - * - * These are the functions used to load SOM format executables as used - * by HP-UX. - * - * Copyright 1999 Matthew Wilcox <willy@bofh.ai> - * based on binfmt_elf which is - * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). - */ - -#include <linux/module.h> - -#include <linux/fs.h> -#include <linux/stat.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/mman.h> -#include <linux/errno.h> -#include <linux/signal.h> -#include <linux/binfmts.h> -#include <linux/som.h> -#include <linux/string.h> -#include <linux/file.h> -#include <linux/fcntl.h> -#include <linux/ptrace.h> -#include <linux/slab.h> -#include <linux/shm.h> -#include <linux/personality.h> -#include <linux/init.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> - - -#include <linux/elf.h> - -static int load_som_binary(struct linux_binprm * bprm); -static int load_som_library(struct file *); - -/* - * If we don't support core dumping, then supply a NULL so we - * don't even try. - */ -#if 0 -static int som_core_dump(struct coredump_params *cprm); -#else -#define som_core_dump NULL -#endif - -#define SOM_PAGESTART(_v) ((_v) & ~(unsigned long)(SOM_PAGESIZE-1)) -#define SOM_PAGEOFFSET(_v) ((_v) & (SOM_PAGESIZE-1)) -#define SOM_PAGEALIGN(_v) (((_v) + SOM_PAGESIZE - 1) & ~(SOM_PAGESIZE - 1)) - -static struct linux_binfmt som_format = { - .module = THIS_MODULE, - .load_binary = load_som_binary, - .load_shlib = load_som_library, - .core_dump = som_core_dump, - .min_coredump = SOM_PAGESIZE -}; - -/* - * create_som_tables() parses the env- and arg-strings in new user - * memory and creates the pointer tables from them, and puts their - * addresses on the "stack", returning the new stack pointer value. 
- */ -static void create_som_tables(struct linux_binprm *bprm) -{ - char **argv, **envp; - int argc = bprm->argc; - int envc = bprm->envc; - unsigned long p; - unsigned long *sp; - - /* Word-align the stack pointer */ - sp = (unsigned long *)((bprm->p + 3) & ~3); - - envp = (char **) sp; - sp += envc + 1; - argv = (char **) sp; - sp += argc + 1; - - __put_user((unsigned long) envp,++sp); - __put_user((unsigned long) argv,++sp); - - __put_user(argc, ++sp); - - bprm->p = (unsigned long) sp; - - p = current->mm->arg_start; - while (argc-- > 0) { - __put_user((char *)p,argv++); - p += strlen_user((char *)p); - } - __put_user(NULL, argv); - current->mm->arg_end = current->mm->env_start = p; - while (envc-- > 0) { - __put_user((char *)p,envp++); - p += strlen_user((char *)p); - } - __put_user(NULL, envp); - current->mm->env_end = p; -} - -static int check_som_header(struct som_hdr *som_ex) -{ - int *buf = (int *)som_ex; - int i, ck; - - if (som_ex->system_id != SOM_SID_PARISC_1_0 && - som_ex->system_id != SOM_SID_PARISC_1_1 && - som_ex->system_id != SOM_SID_PARISC_2_0) - return -ENOEXEC; - - if (som_ex->a_magic != SOM_EXEC_NONSHARE && - som_ex->a_magic != SOM_EXEC_SHARE && - som_ex->a_magic != SOM_EXEC_DEMAND) - return -ENOEXEC; - - if (som_ex->version_id != SOM_ID_OLD && - som_ex->version_id != SOM_ID_NEW) - return -ENOEXEC; - - ck = 0; - for (i=0; i<32; i++) - ck ^= buf[i]; - if (ck != 0) - return -ENOEXEC; - - return 0; -} - -static int map_som_binary(struct file *file, - const struct som_exec_auxhdr *hpuxhdr) -{ - unsigned long code_start, code_size, data_start, data_size; - unsigned long bss_start, som_brk; - int retval; - int prot = PROT_READ | PROT_EXEC; - int flags = MAP_FIXED|MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE; - - mm_segment_t old_fs = get_fs(); - set_fs(get_ds()); - - code_start = SOM_PAGESTART(hpuxhdr->exec_tmem); - code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize); - current->mm->start_code = code_start; - current->mm->end_code = code_start + code_size; - retval = vm_mmap(file, code_start, code_size, prot, - flags, SOM_PAGESTART(hpuxhdr->exec_tfile)); - if (retval < 0 && retval > -1024) - goto out; - - data_start = SOM_PAGESTART(hpuxhdr->exec_dmem); - data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize); - current->mm->start_data = data_start; - current->mm->end_data = bss_start = data_start + data_size; - retval = vm_mmap(file, data_start, data_size, - prot | PROT_WRITE, flags, - SOM_PAGESTART(hpuxhdr->exec_dfile)); - if (retval < 0 && retval > -1024) - goto out; - - som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize); - current->mm->start_brk = current->mm->brk = som_brk; - retval = vm_mmap(NULL, bss_start, som_brk - bss_start, - prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0); - if (retval > 0 || retval < -1024) - retval = 0; -out: - set_fs(old_fs); - return retval; -} - - -/* - * These are the functions used to load SOM executables and shared - * libraries. There is no binary dependent code anywhere else. 
- */ - -static int -load_som_binary(struct linux_binprm * bprm) -{ - int retval; - unsigned int size; - unsigned long som_entry; - struct som_hdr *som_ex; - struct som_exec_auxhdr *hpuxhdr; - struct pt_regs *regs = current_pt_regs(); - - /* Get the exec-header */ - som_ex = (struct som_hdr *) bprm->buf; - - retval = check_som_header(som_ex); - if (retval != 0) - goto out; - - /* Now read in the auxiliary header information */ - - retval = -ENOMEM; - size = som_ex->aux_header_size; - if (size > SOM_PAGESIZE) - goto out; - hpuxhdr = kmalloc(size, GFP_KERNEL); - if (!hpuxhdr) - goto out; - - retval = kernel_read(bprm->file, som_ex->aux_header_location, - (char *) hpuxhdr, size); - if (retval != size) { - if (retval >= 0) - retval = -EIO; - goto out_free; - } - - /* Flush all traces of the currently running executable */ - retval = flush_old_exec(bprm); - if (retval) - goto out_free; - - /* OK, This is the point of no return */ - current->personality = PER_HPUX; - setup_new_exec(bprm); - - /* Set the task size for HP-UX processes such that - * the gateway page is outside the address space. - * This can be fixed later, but for now, this is much - * easier. - */ - - current->thread.task_size = 0xc0000000; - - /* Set map base to allow enough room for hp-ux heap growth */ - - current->thread.map_base = 0x80000000; - - retval = map_som_binary(bprm->file, hpuxhdr); - if (retval < 0) - goto out_free; - - som_entry = hpuxhdr->exec_entry; - kfree(hpuxhdr); - - set_binfmt(&som_format); - install_exec_creds(bprm); - setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - - create_som_tables(bprm); - - current->mm->start_stack = bprm->p; - -#if 0 - printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); - printk("(end_code) %08lx\n" , (unsigned long) current->mm->end_code); - printk("(start_code) %08lx\n" , (unsigned long) current->mm->start_code); - printk("(end_data) %08lx\n" , (unsigned long) current->mm->end_data); - printk("(start_stack) %08lx\n" , (unsigned long) current->mm->start_stack); - printk("(brk) %08lx\n" , (unsigned long) current->mm->brk); -#endif - - map_hpux_gateway_page(current,current->mm); - - start_thread_som(regs, som_entry, bprm->p); - return 0; - - /* error cleanup */ -out_free: - kfree(hpuxhdr); -out: - return retval; -} - -static int load_som_library(struct file *f) -{ -/* No lib support in SOM yet. gizza chance.. */ - return -ENOEXEC; -} - /* Install the SOM loader. - * N.B. We *rely* on the table being the right size with the - * right number of free slots... - */ - -static int __init init_som_binfmt(void) -{ - register_binfmt(&som_format); - return 0; -} - -static void __exit exit_som_binfmt(void) -{ - /* Remove the SOM loader. 
*/ - unregister_binfmt(&som_format); -} - -core_initcall(init_som_binfmt); -module_exit(exit_som_binfmt); - -MODULE_LICENSE("GPL"); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 86c893884eb9..281ee011bb6a 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -28,29 +28,6 @@ #include "coda_int.h" -/* dir inode-ops */ -static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, bool excl); -static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, unsigned int flags); -static int coda_link(struct dentry *old_dentry, struct inode *dir_inode, - struct dentry *entry); -static int coda_unlink(struct inode *dir_inode, struct dentry *entry); -static int coda_symlink(struct inode *dir_inode, struct dentry *entry, - const char *symname); -static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode); -static int coda_rmdir(struct inode *dir_inode, struct dentry *entry); -static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, - struct inode *new_inode, struct dentry *new_dentry); - -/* dir file-ops */ -static int coda_readdir(struct file *file, struct dir_context *ctx); - -/* dentry ops */ -static int coda_dentry_revalidate(struct dentry *de, unsigned int flags); -static int coda_dentry_delete(const struct dentry *); - -/* support routines */ -static int coda_venus_readdir(struct file *, struct dir_context *); - /* same as fs/bad_inode.c */ static int coda_return_EIO(void) { @@ -58,38 +35,6 @@ static int coda_return_EIO(void) } #define CODA_EIO_ERROR ((void *) (coda_return_EIO)) -const struct dentry_operations coda_dentry_operations = -{ - .d_revalidate = coda_dentry_revalidate, - .d_delete = coda_dentry_delete, -}; - -const struct inode_operations coda_dir_inode_operations = -{ - .create = coda_create, - .lookup = coda_lookup, - .link = coda_link, - .unlink = coda_unlink, - .symlink = coda_symlink, - .mkdir = coda_mkdir, - .rmdir = coda_rmdir, - .mknod = CODA_EIO_ERROR, - .rename = coda_rename, - .permission = coda_permission, - .getattr = coda_getattr, - .setattr = coda_setattr, -}; - -const struct file_operations coda_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .iterate = coda_readdir, - .open = coda_open, - .release = coda_release, - .fsync = coda_fsync, -}; - - /* inode operations for directories */ /* access routines: lookup, readlink, permission */ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) @@ -374,33 +319,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, return error; } - -/* file operations for directories */ -static int coda_readdir(struct file *coda_file, struct dir_context *ctx) -{ - struct coda_file_info *cfi; - struct file *host_file; - int ret; - - cfi = CODA_FTOC(coda_file); - BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - host_file = cfi->cfi_container; - - if (host_file->f_op->iterate) { - struct inode *host_inode = file_inode(host_file); - mutex_lock(&host_inode->i_mutex); - ret = -ENOENT; - if (!IS_DEADDIR(host_inode)) { - ret = host_file->f_op->iterate(host_file, ctx); - file_accessed(host_file); - } - mutex_unlock(&host_inode->i_mutex); - return ret; - } - /* Venus: we must read Venus dirents from a file */ - return coda_venus_readdir(coda_file, ctx); -} - static inline unsigned int CDT2DT(unsigned char cdt) { unsigned int dt; @@ -495,6 +413,33 @@ out: return 0; } +/* file operations for directories */ +static int coda_readdir(struct file *coda_file, struct dir_context *ctx) +{ + struct 
coda_file_info *cfi; + struct file *host_file; + int ret; + + cfi = CODA_FTOC(coda_file); + BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); + host_file = cfi->cfi_container; + + if (host_file->f_op->iterate) { + struct inode *host_inode = file_inode(host_file); + + mutex_lock(&host_inode->i_mutex); + ret = -ENOENT; + if (!IS_DEADDIR(host_inode)) { + ret = host_file->f_op->iterate(host_file, ctx); + file_accessed(host_file); + } + mutex_unlock(&host_inode->i_mutex); + return ret; + } + /* Venus: we must read Venus dirents from a file */ + return coda_venus_readdir(coda_file, ctx); +} + /* called when a cache lookup succeeds */ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) { @@ -603,3 +548,32 @@ int coda_revalidate_inode(struct inode *inode) } return 0; } + +const struct dentry_operations coda_dentry_operations = { + .d_revalidate = coda_dentry_revalidate, + .d_delete = coda_dentry_delete, +}; + +const struct inode_operations coda_dir_inode_operations = { + .create = coda_create, + .lookup = coda_lookup, + .link = coda_link, + .unlink = coda_unlink, + .symlink = coda_symlink, + .mkdir = coda_mkdir, + .rmdir = coda_rmdir, + .mknod = CODA_EIO_ERROR, + .rename = coda_rename, + .permission = coda_permission, + .getattr = coda_getattr, + .setattr = coda_setattr, +}; + +const struct file_operations coda_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate = coda_readdir, + .open = coda_open, + .release = coda_release, + .fsync = coda_fsync, +}; diff --git a/fs/dax.c b/fs/dax.c new file mode 100644 index 000000000000..ed1619ec6537 --- /dev/null +++ b/fs/dax.c @@ -0,0 +1,534 @@ +/* + * fs/dax.c - Direct Access filesystem code + * Copyright (c) 2013-2014 Intel Corporation + * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> + * Author: Ross Zwisler <ross.zwisler@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include <linux/atomic.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/highmem.h> +#include <linux/memcontrol.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/uio.h> +#include <linux/vmstat.h> + +int dax_clear_blocks(struct inode *inode, sector_t block, long size) +{ + struct block_device *bdev = inode->i_sb->s_bdev; + sector_t sector = block << (inode->i_blkbits - 9); + + might_sleep(); + do { + void *addr; + unsigned long pfn; + long count; + + count = bdev_direct_access(bdev, sector, &addr, &pfn, size); + if (count < 0) + return count; + BUG_ON(size < count); + while (count > 0) { + unsigned pgsz = PAGE_SIZE - offset_in_page(addr); + if (pgsz > count) + pgsz = count; + if (pgsz < PAGE_SIZE) + memset(addr, 0, pgsz); + else + clear_page(addr); + addr += pgsz; + size -= pgsz; + count -= pgsz; + BUG_ON(pgsz & 511); + sector += pgsz / 512; + cond_resched(); + } + } while (size); + + return 0; +} +EXPORT_SYMBOL_GPL(dax_clear_blocks); + +static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) +{ + unsigned long pfn; + sector_t sector = bh->b_blocknr << (blkbits - 9); + return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); +} + +static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, + loff_t end) +{ + loff_t final = end - pos + first; /* The final byte of the buffer */ + + if (first > 0) + memset(addr, 0, first); + if (final < size) + memset(addr + final, 0, size - final); +} + +static bool buffer_written(struct buffer_head *bh) +{ + return buffer_mapped(bh) && !buffer_unwritten(bh); +} + +/* + * When ext4 encounters a hole, it returns without modifying the buffer_head + * which means that we can't trust b_size. To cope with this, we set b_state + * to 0 before calling get_block and, if any bit is set, we know we can trust + * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is + * and would save us time calling get_block repeatedly. 
+ */ +static bool buffer_size_valid(struct buffer_head *bh) +{ + return bh->b_state != 0; +} + +static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, + loff_t start, loff_t end, get_block_t get_block, + struct buffer_head *bh) +{ + ssize_t retval = 0; + loff_t pos = start; + loff_t max = start; + loff_t bh_max = start; + void *addr; + bool hole = false; + + if (rw != WRITE) + end = min(end, i_size_read(inode)); + + while (pos < end) { + unsigned len; + if (pos == max) { + unsigned blkbits = inode->i_blkbits; + sector_t block = pos >> blkbits; + unsigned first = pos - (block << blkbits); + long size; + + if (pos == bh_max) { + bh->b_size = PAGE_ALIGN(end - pos); + bh->b_state = 0; + retval = get_block(inode, block, bh, + rw == WRITE); + if (retval) + break; + if (!buffer_size_valid(bh)) + bh->b_size = 1 << blkbits; + bh_max = pos - first + bh->b_size; + } else { + unsigned done = bh->b_size - + (bh_max - (pos - first)); + bh->b_blocknr += done >> blkbits; + bh->b_size -= done; + } + + hole = (rw != WRITE) && !buffer_written(bh); + if (hole) { + addr = NULL; + size = bh->b_size - first; + } else { + retval = dax_get_addr(bh, &addr, blkbits); + if (retval < 0) + break; + if (buffer_unwritten(bh) || buffer_new(bh)) + dax_new_buf(addr, retval, first, pos, + end); + addr += first; + size = retval - first; + } + max = min(pos + size, end); + } + + if (rw == WRITE) + len = copy_from_iter(addr, max - pos, iter); + else if (!hole) + len = copy_to_iter(addr, max - pos, iter); + else + len = iov_iter_zero(max - pos, iter); + + if (!len) + break; + + pos += len; + addr += len; + } + + return (pos == start) ? retval : pos - start; +} + +/** + * dax_do_io - Perform I/O to a DAX file + * @rw: READ to read or WRITE to write + * @iocb: The control block for this I/O + * @inode: The file which the I/O is directed at + * @iter: The addresses to do I/O from or to + * @pos: The file offset where the I/O starts + * @get_block: The filesystem method used to translate file offsets to blocks + * @end_io: A filesystem callback for I/O completion + * @flags: See below + * + * This function uses the same locking scheme as do_blockdev_direct_IO: + * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the + * caller for writes. For reads, we take and release the i_mutex ourselves. + * If DIO_LOCKING is not set, the filesystem takes care of its own locking. + * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O + * is in progress. + */ +ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, + struct iov_iter *iter, loff_t pos, + get_block_t get_block, dio_iodone_t end_io, int flags) +{ + struct buffer_head bh; + ssize_t retval = -EINVAL; + loff_t end = pos + iov_iter_count(iter); + + memset(&bh, 0, sizeof(bh)); + + if ((flags & DIO_LOCKING) && (rw == READ)) { + struct address_space *mapping = inode->i_mapping; + mutex_lock(&inode->i_mutex); + retval = filemap_write_and_wait_range(mapping, pos, end - 1); + if (retval) { + mutex_unlock(&inode->i_mutex); + goto out; + } + } + + /* Protects against truncate */ + atomic_inc(&inode->i_dio_count); + + retval = dax_io(rw, inode, iter, pos, end, get_block, &bh); + + if ((flags & DIO_LOCKING) && (rw == READ)) + mutex_unlock(&inode->i_mutex); + + if ((retval > 0) && end_io) + end_io(iocb, pos, retval, bh.b_private); + + inode_dio_done(inode); + out: + return retval; +} +EXPORT_SYMBOL_GPL(dax_do_io); + +/* + * The user has performed a load from a hole in the file. 
Allocating + * a new page in the file would cause excessive storage usage for + * workloads with sparse files. We allocate a page cache page instead. + * We'll kick it out of the page cache if it's ever written to, + * otherwise it will simply fall out of the page cache under memory + * pressure without ever having been dirtied. + */ +static int dax_load_hole(struct address_space *mapping, struct page *page, + struct vm_fault *vmf) +{ + unsigned long size; + struct inode *inode = mapping->host; + if (!page) + page = find_or_create_page(mapping, vmf->pgoff, + GFP_KERNEL | __GFP_ZERO); + if (!page) + return VM_FAULT_OOM; + /* Recheck i_size under page lock to avoid truncate race */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) { + unlock_page(page); + page_cache_release(page); + return VM_FAULT_SIGBUS; + } + + vmf->page = page; + return VM_FAULT_LOCKED; +} + +static int copy_user_bh(struct page *to, struct buffer_head *bh, + unsigned blkbits, unsigned long vaddr) +{ + void *vfrom, *vto; + if (dax_get_addr(bh, &vfrom, blkbits) < 0) + return -EIO; + vto = kmap_atomic(to); + copy_user_page(vto, vfrom, vaddr, to); + kunmap_atomic(vto); + return 0; +} + +static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct address_space *mapping = inode->i_mapping; + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + unsigned long vaddr = (unsigned long)vmf->virtual_address; + void *addr; + unsigned long pfn; + pgoff_t size; + int error; + + i_mmap_lock_read(mapping); + + /* + * Check truncate didn't happen while we were allocating a block. + * If it did, this block may or may not be still allocated to the + * file. We can't tell the filesystem to free it because we can't + * take i_mutex here. In the worst case, the file still has blocks + * allocated past the end of the file. 
+ */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + error = -EIO; + goto out; + } + + error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); + if (error < 0) + goto out; + if (error < PAGE_SIZE) { + error = -EIO; + goto out; + } + + if (buffer_unwritten(bh) || buffer_new(bh)) + clear_page(addr); + + error = vm_insert_mixed(vma, vaddr, pfn); + + out: + i_mmap_unlock_read(mapping); + + if (bh->b_end_io) + bh->b_end_io(bh, 1); + + return error; +} + +static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head bh; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned blkbits = inode->i_blkbits; + sector_t block; + pgoff_t size; + int error; + int major = 0; + + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + return VM_FAULT_SIGBUS; + + memset(&bh, 0, sizeof(bh)); + block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_size = PAGE_SIZE; + + repeat: + page = find_get_page(mapping, vmf->pgoff); + if (page) { + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return VM_FAULT_RETRY; + } + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + /* + * We have a struct page covering a hole in the file + * from a read fault and we've raced with a truncate + */ + error = -EIO; + goto unlock_page; + } + } + + error = get_block(inode, block, &bh, 0); + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; /* fs corruption? 
*/ + if (error) + goto unlock_page; + + if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_page; + } else { + return dax_load_hole(mapping, page, vmf); + } + } + + if (vmf->cow_page) { + struct page *new_page = vmf->cow_page; + if (buffer_written(&bh)) + error = copy_user_bh(new_page, &bh, blkbits, vaddr); + else + clear_user_highpage(new_page, vaddr); + if (error) + goto unlock_page; + vmf->page = page; + if (!page) { + i_mmap_lock_read(mapping); + /* Check we didn't race with truncate */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + if (vmf->pgoff >= size) { + i_mmap_unlock_read(mapping); + error = -EIO; + goto out; + } + } + return VM_FAULT_LOCKED; + } + + /* Check we didn't race with a read fault installing a new page */ + if (!page && major) + page = find_lock_page(mapping, vmf->pgoff); + + if (page) { + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_CACHE_SIZE, 0); + delete_from_page_cache(page); + unlock_page(page); + page_cache_release(page); + } + + error = dax_insert_mapping(inode, &bh, vma, vmf); + + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if ((error < 0) && (error != -EBUSY)) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; + + unlock_page: + if (page) { + unlock_page(page); + page_cache_release(page); + } + goto out; +} + +/** + * dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. + */ +int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block) +{ + int result; + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + if (vmf->flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + } + result = do_dax_fault(vma, vmf, get_block); + if (vmf->flags & FAULT_FLAG_WRITE) + sb_end_pagefault(sb); + + return result; +} +EXPORT_SYMBOL_GPL(dax_fault); + +/** + * dax_zero_page_range - zero a range within a page of a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @length: The number of bytes to zero + * @get_block: The filesystem method used to translate file offsets to blocks + * + * This function can be called by a filesystem when it is zeroing part of a + * page in a DAX file. This is intended for hole-punch operations. If + * you are truncating a file, the helper function dax_truncate_page() may be + * more convenient. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. 
+ */ +int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, + get_block_t get_block) +{ + struct buffer_head bh; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + int err; + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + BUG_ON((offset + length) > PAGE_CACHE_SIZE); + + memset(&bh, 0, sizeof(bh)); + bh.b_size = PAGE_CACHE_SIZE; + err = get_block(inode, index, &bh, 0); + if (err < 0) + return err; + if (buffer_written(&bh)) { + void *addr; + err = dax_get_addr(&bh, &addr, inode->i_blkbits); + if (err < 0) + return err; + memset(addr + offset, 0, length); + } + + return 0; +} +EXPORT_SYMBOL_GPL(dax_zero_page_range); + +/** + * dax_truncate_page - handle a partial page being truncated in a DAX file + * @inode: The file being truncated + * @from: The file offset that is being truncated to + * @get_block: The filesystem method used to translate file offsets to blocks + * + * Similar to block_truncate_page(), this function can be called by a + * filesystem when it is truncating a DAX file to handle the partial page. + * + * We work in terms of PAGE_CACHE_SIZE here for commonality with + * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem + * took care of disposing of the unnecessary blocks. Even if the filesystem + * block size is smaller than PAGE_SIZE, we have to zero the rest of the page + * since the file might be mmapped. + */ +int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +{ + unsigned length = PAGE_CACHE_ALIGN(from) - from; + return dax_zero_page_range(inode, from, length, get_block); +} +EXPORT_SYMBOL_GPL(dax_truncate_page); diff --git a/fs/dcache.c b/fs/dcache.c index 7d34f04ec7aa..dc400fd29f4d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -511,7 +511,7 @@ static void __dentry_kill(struct dentry *dentry) * dentry_iput drops the locks, at which point nobody (except * transient RCU lookups) can reach this dentry. */ - BUG_ON((int)dentry->d_lockref.count > 0); + BUG_ON(dentry->d_lockref.count > 0); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); @@ -564,7 +564,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry) struct dentry *parent = dentry->d_parent; if (IS_ROOT(dentry)) return NULL; - if (unlikely((int)dentry->d_lockref.count < 0)) + if (unlikely(dentry->d_lockref.count < 0)) return NULL; if (likely(spin_trylock(&parent->d_lock))) return parent; @@ -593,6 +593,110 @@ again: return parent; } +/* + * Try to do a lockless dput(), and return whether that was successful. + * + * If unsuccessful, we return false, having already taken the dentry lock. + * + * The caller needs to hold the RCU read lock, so that the dentry is + * guaranteed to stay around even if the refcount goes down to zero! + */ +static inline bool fast_dput(struct dentry *dentry) +{ + int ret; + unsigned int d_flags; + + /* + * If we have a d_op->d_delete() operation, we sould not + * let the dentry count go to zero, so use "put__or_lock". + */ + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) + return lockref_put_or_lock(&dentry->d_lockref); + + /* + * .. otherwise, we can try to just decrement the + * lockref optimistically. + */ + ret = lockref_put_return(&dentry->d_lockref); + + /* + * If the lockref_put_return() failed due to the lock being held + * by somebody else, the fast path has failed. We will need to + * get the lock, and then check the count again. 
+ */ + if (unlikely(ret < 0)) { + spin_lock(&dentry->d_lock); + if (dentry->d_lockref.count > 1) { + dentry->d_lockref.count--; + spin_unlock(&dentry->d_lock); + return 1; + } + return 0; + } + + /* + * If we weren't the last ref, we're done. + */ + if (ret) + return 1; + + /* + * Careful, careful. The reference count went down + * to zero, but we don't hold the dentry lock, so + * somebody else could get it again, and do another + * dput(), and we need to not race with that. + * + * However, there is a very special and common case + * where we don't care, because there is nothing to + * do: the dentry is still hashed, it does not have + * a 'delete' op, and it's referenced and already on + * the LRU list. + * + * NOTE! Since we aren't locked, these values are + * not "stable". However, it is sufficient that at + * some point after we dropped the reference the + * dentry was hashed and the flags had the proper + * value. Other dentry users may have re-gotten + * a reference to the dentry and change that, but + * our work is done - we can leave the dentry + * around with a zero refcount. + */ + smp_rmb(); + d_flags = ACCESS_ONCE(dentry->d_flags); + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST; + + /* Nothing to do? Dropping the reference was all we needed? */ + if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) + return 1; + + /* + * Not the fast normal case? Get the lock. We've already decremented + * the refcount, but we'll need to re-check the situation after + * getting the lock. + */ + spin_lock(&dentry->d_lock); + + /* + * Did somebody else grab a reference to it in the meantime, and + * we're no longer the last user after all? Alternatively, somebody + * else could have killed it and marked it dead. Either way, we + * don't need to do anything else. + */ + if (dentry->d_lockref.count) { + spin_unlock(&dentry->d_lock); + return 1; + } + + /* + * Re-get the reference we optimistically dropped. We hold the + * lock, and we just tested that it was zero, so we can just + * set it to 1. + */ + dentry->d_lockref.count = 1; + return 0; +} + + /* * This is dput * @@ -625,8 +729,14 @@ void dput(struct dentry *dentry) return; repeat: - if (lockref_put_or_lock(&dentry->d_lockref)) + rcu_read_lock(); + if (likely(fast_dput(dentry))) { + rcu_read_unlock(); return; + } + + /* Slow case: now with the dentry lock held */ + rcu_read_unlock(); /* Unreachable? Get rid of it */ if (unlikely(d_unhashed(dentry))) @@ -813,7 +923,7 @@ static void shrink_dentry_list(struct list_head *list) * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. */ - if ((int)dentry->d_lockref.count > 0) { + if (dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); if (parent) spin_unlock(&parent->d_lock); @@ -2191,37 +2301,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) } EXPORT_SYMBOL(d_hash_and_lookup); -/** - * d_validate - verify dentry provided from insecure source (deprecated) - * @dentry: The dentry alleged to be valid child of @dparent - * @dparent: The parent dentry (known to be valid) - * - * An insecure source has sent us a dentry, here we verify it and dget() it. - * This is used by ncpfs in its readdir implementation. - * Zero is returned in the dentry is invalid. - * - * This function is slow for big directories, and deprecated, do not use it. 
- */ -int d_validate(struct dentry *dentry, struct dentry *dparent) -{ - struct dentry *child; - - spin_lock(&dparent->d_lock); - list_for_each_entry(child, &dparent->d_subdirs, d_child) { - if (dentry == child) { - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - __dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dparent->d_lock); - return 1; - } - } - spin_unlock(&dparent->d_lock); - - return 0; -} -EXPORT_SYMBOL(d_validate); - /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 05f2960ed7c3..45b18a5e225c 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -34,93 +34,16 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev, - void *data, const struct file_operations *fops) - +static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); - if (inode) { inode->i_ino = get_next_ino(); - inode->i_mode = mode; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - switch (mode & S_IFMT) { - default: - init_special_inode(inode, mode, dev); - break; - case S_IFREG: - inode->i_fop = fops ? fops : &debugfs_file_operations; - inode->i_private = data; - break; - case S_IFLNK: - inode->i_op = &debugfs_link_operations; - inode->i_private = data; - break; - case S_IFDIR: - inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* directory inodes start off with i_nlink == 2 - * (for "." entry) */ - inc_nlink(inode); - break; - } } return inode; } -/* SMP-safe */ -static int debugfs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t dev, void *data, - const struct file_operations *fops) -{ - struct inode *inode; - int error = -EPERM; - - if (dentry->d_inode) - return -EEXIST; - - inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops); - if (inode) { - d_instantiate(dentry, inode); - dget(dentry); - error = 0; - } - return error; -} - -static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - int res; - - mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR; - res = debugfs_mknod(dir, dentry, mode, 0, NULL, NULL); - if (!res) { - inc_nlink(dir); - fsnotify_mkdir(dir, dentry); - } - return res; -} - -static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode, - void *data) -{ - mode = (mode & S_IALLUGO) | S_IFLNK; - return debugfs_mknod(dir, dentry, mode, 0, data, NULL); -} - -static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - void *data, const struct file_operations *fops) -{ - int res; - - mode = (mode & S_IALLUGO) | S_IFREG; - res = debugfs_mknod(dir, dentry, mode, 0, data, fops); - if (!res) - fsnotify_create(dir, dentry); - return res; -} - static inline int debugfs_positive(struct dentry *dentry) { return dentry->d_inode && !d_unhashed(dentry); @@ -252,6 +175,18 @@ static const struct super_operations debugfs_super_operations = { .show_options = debugfs_show_options, }; +static struct vfsmount *debugfs_automount(struct path *path) +{ + struct vfsmount *(*f)(void *); + f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; + return f(path->dentry->d_inode->i_private); +} + +static const struct dentry_operations debugfs_dops = { + .d_delete = always_delete_dentry, + .d_automount = debugfs_automount, +}; + static int debug_fill_super(struct 
super_block *sb, void *data, int silent) { static struct tree_descr debug_files[] = {{""}}; @@ -276,6 +211,7 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent) goto fail; sb->s_op = &debugfs_super_operations; + sb->s_d_op = &debugfs_dops; debugfs_apply_options(sb); @@ -302,11 +238,9 @@ static struct file_system_type debug_fs_type = { }; MODULE_ALIAS_FS("debugfs"); -static struct dentry *__create_file(const char *name, umode_t mode, - struct dentry *parent, void *data, - const struct file_operations *fops) +static struct dentry *start_creating(const char *name, struct dentry *parent) { - struct dentry *dentry = NULL; + struct dentry *dentry; int error; pr_debug("debugfs: creating file '%s'\n",name); @@ -314,7 +248,7 @@ static struct dentry *__create_file(const char *name, umode_t mode, error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) - goto exit; + return ERR_PTR(error); /* If the parent is not specified, we create it in the root. * We need the root dentry to do this, which is in the super @@ -326,31 +260,26 @@ static struct dentry *__create_file(const char *name, umode_t mode, mutex_lock(&parent->d_inode->i_mutex); dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(dentry)) { - switch (mode & S_IFMT) { - case S_IFDIR: - error = debugfs_mkdir(parent->d_inode, dentry, mode); - - break; - case S_IFLNK: - error = debugfs_link(parent->d_inode, dentry, mode, - data); - break; - default: - error = debugfs_create(parent->d_inode, dentry, mode, - data, fops); - break; - } + if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); - } else - error = PTR_ERR(dentry); - mutex_unlock(&parent->d_inode->i_mutex); - - if (error) { - dentry = NULL; - simple_release_fs(&debugfs_mount, &debugfs_mount_count); + dentry = ERR_PTR(-EEXIST); } -exit: + if (IS_ERR(dentry)) + mutex_unlock(&parent->d_inode->i_mutex); + return dentry; +} + +static struct dentry *failed_creating(struct dentry *dentry) +{ + mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + dput(dentry); + simple_release_fs(&debugfs_mount, &debugfs_mount_count); + return NULL; +} + +static struct dentry *end_creating(struct dentry *dentry) +{ + mutex_unlock(&dentry->d_parent->d_inode->i_mutex); return dentry; } @@ -384,19 +313,71 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { - switch (mode & S_IFMT) { - case S_IFREG: - case 0: - break; - default: - BUG(); - } + struct dentry *dentry; + struct inode *inode; + + if (!(mode & S_IFMT)) + mode |= S_IFREG; + BUG_ON(!S_ISREG(mode)); + dentry = start_creating(name, parent); + + if (IS_ERR(dentry)) + return NULL; - return __create_file(name, mode, parent, data, fops); + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = mode; + inode->i_fop = fops ? fops : &debugfs_file_operations; + inode->i_private = data; + d_instantiate(dentry, inode); + fsnotify_create(dentry->d_parent->d_inode, dentry); + return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_file); /** + * debugfs_create_file_size - create a file in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. 
+ * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * @file_size: initial file size + * + * This is the basic "create a file" function for debugfs. It allows for a + * wide range of flexibility in creating a file, or a directory (if you want + * to create a directory, the debugfs_create_dir() function is + * recommended to be used instead.) + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If debugfs is not enabled in the kernel, the value -%ENODEV will be + * returned. + */ +struct dentry *debugfs_create_file_size(const char *name, umode_t mode, + struct dentry *parent, void *data, + const struct file_operations *fops, + loff_t file_size) +{ + struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); + + if (de) + de->d_inode->i_size = file_size; + return de; +} +EXPORT_SYMBOL_GPL(debugfs_create_file_size); + +/** * debugfs_create_dir - create a directory in the debugfs filesystem * @name: a pointer to a string containing the name of the directory to * create. @@ -416,12 +397,65 @@ EXPORT_SYMBOL_GPL(debugfs_create_file); */ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) { - return __create_file(name, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, - parent, NULL, NULL); + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); /** + * debugfs_create_automount - create automount point in the debugfs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the debugfs filesystem. + * @f: function to be called when pathname resolution steps on that one. + * @data: opaque argument to pass to f(). + * + * @f should return what ->d_automount() would. 
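[Editor's illustration, not part of this diff.] A minimal caller of the new automount hook could look like the sketch below; trace_fs_type, trace_automount() and the "instances" name are hypothetical. The callback returns what ->d_automount() would, i.e. a vfsmount (or an ERR_PTR), and the dentry returned by debugfs_create_automount() is cleaned up with debugfs_remove() like any other debugfs entry:

    #include <linux/debugfs.h>
    #include <linux/err.h>
    #include <linux/fs.h>
    #include <linux/mount.h>

    static struct file_system_type trace_fs_type;   /* hypothetical, registered elsewhere */

    /* Called when pathname resolution steps on the automount point. */
    static struct vfsmount *trace_automount(void *data)
    {
            /* @data is the opaque pointer given to debugfs_create_automount() */
            return vfs_kern_mount(&trace_fs_type, 0, "tracefs", NULL);
    }

    static int trace_create_instances(struct dentry *parent, void *opaque)
    {
            struct dentry *d;

            d = debugfs_create_automount("instances", parent,
                                         trace_automount, opaque);
            return d ? 0 : -ENOMEM;
    }
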
+ */ +struct dentry *debugfs_create_automount(const char *name, + struct dentry *parent, + struct vfsmount *(*f)(void *), + void *data) +{ + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_flags |= S_AUTOMOUNT; + inode->i_private = data; + dentry->d_fsdata = (void *)f; + d_instantiate(dentry, inode); + return end_creating(dentry); +} +EXPORT_SYMBOL(debugfs_create_automount); + +/** * debugfs_create_symlink- create a symbolic link in the debugfs filesystem * @name: a pointer to a string containing the name of the symbolic link to * create. @@ -447,17 +481,28 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, const char *target) { - struct dentry *result; - char *link; - - link = kstrdup(target, GFP_KERNEL); + struct dentry *dentry; + struct inode *inode; + char *link = kstrdup(target, GFP_KERNEL); if (!link) return NULL; - result = __create_file(name, S_IFLNK | S_IRWXUGO, parent, link, NULL); - if (!result) + dentry = start_creating(name, parent); + if (IS_ERR(dentry)) { kfree(link); - return result; + return NULL; + } + + inode = debugfs_get_inode(dentry->d_sb); + if (unlikely(!inode)) { + kfree(link); + return failed_creating(dentry); + } + inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_op = &debugfs_link_operations; + inode->i_private = link; + d_instantiate(dentry, inode); + return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); diff --git a/fs/eventfd.c b/fs/eventfd.c index 4b0a226024fa..8d0c0df01854 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -118,18 +118,18 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; unsigned int events = 0; - unsigned long flags; + u64 count; poll_wait(file, &ctx->wqh, wait); + smp_rmb(); + count = ctx->count; - spin_lock_irqsave(&ctx->wqh.lock, flags); - if (ctx->count > 0) + if (count > 0) events |= POLLIN; - if (ctx->count == ULLONG_MAX) + if (count == ULLONG_MAX) events |= POLLERR; - if (ULLONG_MAX - 1 > ctx->count) + if (ULLONG_MAX - 1 > count) events |= POLLOUT; - spin_unlock_irqrestore(&ctx->wqh.lock, flags); return events; } diff --git a/fs/exec.c b/fs/exec.c index ad8798e26be9..c7f9b733406d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -794,8 +794,14 @@ exit: struct file *open_exec(const char *name) { - struct filename tmp = { .name = name }; - return do_open_execat(AT_FDCWD, &tmp, 0); + struct filename *filename = getname_kernel(name); + struct file *f = ERR_CAST(filename); + + if (!IS_ERR(filename)) { + f = do_open_execat(AT_FDCWD, filename, 0); + putname(filename); + } + return f; } EXPORT_SYMBOL(open_exec); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 6fc91df99ff8..a198e94813fe 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = { .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ - .get_xip_mem = NULL, .migratepage = NULL, .launder_page = NULL, .is_partially_uptodate = NULL, diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 14a6780fd034..c634874e12d9 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -42,14 +42,3 @@ config EXT2_FS_SECURITY If you are not using a security module that requires using extended attributes 
for file security labels, say N. - -config EXT2_FS_XIP - bool "Ext2 execute in place support" - depends on EXT2_FS && MMU - help - Execute in place can be used on memory-backed block devices. If you - enable this option, you can select to mount block devices which are - capable of this feature without using the page cache. - - If you do not use a block device that is capable of using this, - or if unsure, say N. diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index f42af45cfd88..445b0e996a12 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile @@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o -ext2-$(CONFIG_EXT2_FS_XIP) += xip.o diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e4279ead4a05..678f9ab08c48 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -380,10 +380,15 @@ struct ext2_inode { #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ #define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */ #define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */ -#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */ +#define EXT2_MOUNT_XIP 0x010000 /* Obsolete, use DAX */ #define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */ #define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */ #define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */ +#ifdef CONFIG_FS_DAX +#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */ +#else +#define EXT2_MOUNT_DAX 0 +#endif #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt @@ -788,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; -extern const struct file_operations ext2_xip_file_operations; +extern const struct file_operations ext2_dax_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; -extern const struct address_space_operations ext2_aops_xip; extern const struct address_space_operations ext2_nobh_aops; /* namei.c */ diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 7c87b22a7228..e31701713516 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -25,6 +25,36 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_FS_DAX +static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext2_get_block); +} + +static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext2_get_block); +} + +static const struct vm_operations_struct ext2_dax_vm_ops = { + .fault = ext2_dax_fault, + .page_mkwrite = ext2_dax_mkwrite, +}; + +static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_mmap(file, vma); + + file_accessed(file); + vma->vm_ops = &ext2_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + return 0; +} +#else +#define ext2_file_mmap generic_file_mmap +#endif + /* * Called when filp is released. This happens when all file descriptors * for a single struct file are closed. 
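[Editor's illustration, not part of this diff; the device and paths are hypothetical.] The ext2_file_mmap() change above is what makes a plain userspace mmap() DAX-aware: after something like "mount -o dax /dev/pmem0 /mnt", faults on the mapping are served by ext2_dax_vm_ops, so loads and stores reach the backing memory without going through the page cache:

    #include <fcntl.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/mnt/data", O_RDWR | O_CREAT, 0644);
            char *p;

            if (fd < 0 || ftruncate(fd, 4096) < 0)
                    return 1;
            p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED)
                    return 1;
            /* This store lands in the file's blocks directly; there is no
             * dirty page cache page to write back afterwards. */
            strcpy(p, "written through a DAX mapping");
            munmap(p, 4096);
            close(fd);
            return 0;
    }
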
Note that different open() calls @@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, @@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = { .splice_write = iter_file_splice_write, }; -#ifdef CONFIG_EXT2_FS_XIP -const struct file_operations ext2_xip_file_operations = { +#ifdef CONFIG_FS_DAX +const struct file_operations ext2_dax_file_operations = { .llseek = generic_file_llseek, - .read = xip_file_read, - .write = xip_file_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .mmap = xip_file_mmap, + .mmap = ext2_file_mmap, .open = dquot_file_open, .release = ext2_release_file, .fsync = ext2_fsync, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 36d35c36311d..6434bc000125 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -34,7 +34,6 @@ #include <linux/aio.h> #include "ext2.h" #include "acl.h" -#include "xip.h" #include "xattr.h" static int __ext2_write_inode(struct inode *inode, int do_sync); @@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode, goto cleanup; } - if (ext2_use_xip(inode->i_sb)) { + if (IS_DAX(inode)) { /* - * we need to clear the block + * block must be initialised before we put it in the tree + * so that it's not found by another thread before it's + * initialised */ - err = ext2_clear_xip_target (inode, - le32_to_cpu(chain[depth-1].key)); + err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), + 1 << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; @@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block, + NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + ext2_get_block); if (ret < 0 && (rw & WRITE)) ext2_write_failed(mapping, offset + count); return ret; @@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = { .error_remove_page = generic_error_remove_page, }; -const struct address_space_operations ext2_aops_xip = { - .bmap = ext2_bmap, - .get_xip_mem = ext2_get_xip_mem, -}; - const struct address_space_operations ext2_nobh_aops = { .readpage = ext2_readpage, .readpages = ext2_readpages, @@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (mapping_is_xip(inode->i_mapping)) - error = xip_truncate_page(inode->i_mapping, newsize); + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, ext2_get_block); else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); @@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode) { unsigned int flags = EXT2_I(inode)->i_flags; - inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | + S_DIRSYNC | S_DAX); if (flags & EXT2_SYNC_FL) inode->i_flags |= S_SYNC; if (flags & EXT2_APPEND_FL) @@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= 
S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + inode->i_flags |= S_DAX; } /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ @@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index c268d0af1db9..148f6e3789ea 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -35,7 +35,6 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -#include "xip.h" static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) { @@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; @@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (ext2_use_xip(inode->i_sb)) { - inode->i_mapping->a_ops = &ext2_aops_xip; - inode->i_fop = &ext2_xip_file_operations; + if (test_opt(inode->i_sb, DAX)) { + inode->i_mapping->a_ops = &ext2_aops; + inode->i_fop = &ext2_dax_file_operations; } else if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index ae55fddc26a9..d0e746e96511 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -35,7 +35,6 @@ #include "ext2.h" #include "xattr.h" #include "acl.h" -#include "xip.h" static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, int wait); @@ -292,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",grpquota"); #endif -#if defined(CONFIG_EXT2_FS_XIP) +#ifdef CONFIG_FS_DAX if (sbi->s_mount_opt & EXT2_MOUNT_XIP) seq_puts(seq, ",xip"); + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) + seq_puts(seq, ",dax"); #endif if (!test_opt(sb, RESERVATION)) @@ -403,7 +404,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, - Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, + Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota, Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation }; @@ -432,6 +433,7 @@ static const match_table_t tokens = { {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_xip, "xip"}, + {Opt_dax, "dax"}, {Opt_grpquota, "grpquota"}, {Opt_ignore, "noquota"}, {Opt_quota, "quota"}, @@ -559,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb) break; #endif case Opt_xip: -#ifdef CONFIG_EXT2_FS_XIP - set_opt (sbi->s_mount_opt, XIP); + 
ext2_msg(sb, KERN_INFO, "use dax instead of xip"); + set_opt(sbi->s_mount_opt, XIP); + /* Fall through */ + case Opt_dax: +#ifdef CONFIG_FS_DAX + set_opt(sbi->s_mount_opt, DAX); #else - ext2_msg(sb, KERN_INFO, "xip option not supported"); + ext2_msg(sb, KERN_INFO, "dax option not supported"); #endif break; @@ -877,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); - ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset - EXT2_MOUNT_XIP if not */ - if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || @@ -909,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); - if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { - if (!silent) + if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { ext2_msg(sb, KERN_ERR, - "error: unsupported blocksize for xip"); - goto failed_mount; + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext2_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } } /* If the blocksize doesn't match, re-read the thing.. */ @@ -1259,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) { struct ext2_sb_info * sbi = EXT2_SB(sb); struct ext2_super_block * es; - unsigned long old_mount_opt = sbi->s_mount_opt; struct ext2_mount_options old_opts; unsigned long old_sb_flags; int err; @@ -1284,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 
MS_POSIXACL : 0); - ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset - EXT2_MOUNT_XIP if not */ - - if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) { - ext2_msg(sb, KERN_WARNING, - "warning: unsupported blocksize for xip"); - err = -EINVAL; - goto restore_opts; - } - es = sbi->s_es; - if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) { ext2_msg(sb, KERN_WARNING, "warning: refusing change of " - "xip flag with busy inodes while remounting"); - sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; - sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT2_MOUNT_DAX; } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { spin_unlock(&sbi->s_lock); diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c deleted file mode 100644 index bbc5fec6ff7f..000000000000 --- a/fs/ext2/xip.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * linux/fs/ext2/xip.c - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte (cotte@de.ibm.com) - */ - -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/genhd.h> -#include <linux/buffer_head.h> -#include <linux/blkdev.h> -#include "ext2.h" -#include "xip.h" - -static inline long __inode_direct_access(struct inode *inode, sector_t block, - void **kaddr, unsigned long *pfn, long size) -{ - struct block_device *bdev = inode->i_sb->s_bdev; - sector_t sector = block * (PAGE_SIZE / 512); - return bdev_direct_access(bdev, sector, kaddr, pfn, size); -} - -static inline int -__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, - sector_t *result) -{ - struct buffer_head tmp; - int rc; - - memset(&tmp, 0, sizeof(struct buffer_head)); - tmp.b_size = 1 << inode->i_blkbits; - rc = ext2_get_block(inode, pgoff, &tmp, create); - *result = tmp.b_blocknr; - - /* did we get a sparse block (hole in the file)? */ - if (!tmp.b_blocknr && !rc) { - BUG_ON(create); - rc = -ENODATA; - } - - return rc; -} - -int -ext2_clear_xip_target(struct inode *inode, sector_t block) -{ - void *kaddr; - unsigned long pfn; - long size; - - size = __inode_direct_access(inode, block, &kaddr, &pfn, PAGE_SIZE); - if (size < 0) - return size; - clear_page(kaddr); - return 0; -} - -void ext2_xip_verify_sb(struct super_block *sb) -{ - struct ext2_sb_info *sbi = EXT2_SB(sb); - - if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && - !sb->s_bdev->bd_disk->fops->direct_access) { - sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); - ext2_msg(sb, KERN_WARNING, - "warning: ignoring xip option - " - "not supported by bdev"); - } -} - -int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, - void **kmem, unsigned long *pfn) -{ - long rc; - sector_t block; - - /* first, retrieve the sector number */ - rc = __ext2_get_block(mapping->host, pgoff, create, &block); - if (rc) - return rc; - - /* retrieve address of the target data */ - rc = __inode_direct_access(mapping->host, block, kmem, pfn, PAGE_SIZE); - return (rc < 0) ? 
rc : 0; -} diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h deleted file mode 100644 index 18b34d2f31b3..000000000000 --- a/fs/ext2/xip.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * linux/fs/ext2/xip.h - * - * Copyright (C) 2005 IBM Corporation - * Author: Carsten Otte (cotte@de.ibm.com) - */ - -#ifdef CONFIG_EXT2_FS_XIP -extern void ext2_xip_verify_sb (struct super_block *); -extern int ext2_clear_xip_target (struct inode *, sector_t); - -static inline int ext2_use_xip (struct super_block *sb) -{ - struct ext2_sb_info *sbi = EXT2_SB(sb); - return (sbi->s_mount_opt & EXT2_MOUNT_XIP); -} -int ext2_get_xip_mem(struct address_space *, pgoff_t, int, - void **, unsigned long *); -#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem) -#else -#define mapping_is_xip(map) 0 -#define ext2_xip_verify_sb(sb) do { } while (0) -#define ext2_use_xip(sb) 0 -#define ext2_clear_xip_target(inode, chain) 0 -#define ext2_get_xip_mem NULL -#endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a75fba67bb1f..982d934fd9ac 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -965,6 +965,11 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ @@ -2578,6 +2583,7 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; +extern const struct file_operations ext4_dax_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 7cb592386121..33a09da16c9c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct mutex *aio_mutex = NULL; struct blk_plug plug; - int o_direct = file->f_flags & O_DIRECT; + int o_direct = io_is_direct(file); int overwrite = 0; size_t length = iov_iter_count(from); ssize_t ret; @@ -191,6 +191,26 @@ errout: return ret; } +#ifdef CONFIG_FS_DAX +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext4_get_block); + /* Is this the right get_block? 
*/ +} + +static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext4_get_block); +} + +static const struct vm_operations_struct ext4_dax_vm_ops = { + .fault = ext4_dax_fault, + .page_mkwrite = ext4_dax_mkwrite, +}; +#else +#define ext4_dax_vm_ops ext4_file_vm_ops +#endif + static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, @@ -200,7 +220,12 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); - vma->vm_ops = &ext4_file_vm_ops; + if (IS_DAX(file_inode(file))) { + vma->vm_ops = &ext4_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + } else { + vma->vm_ops = &ext4_file_vm_ops; + } return 0; } @@ -599,6 +624,26 @@ const struct file_operations ext4_file_operations = { .fallocate = ext4_fallocate, }; +#ifdef CONFIG_FS_DAX +const struct file_operations ext4_dax_file_operations = { + .llseek = ext4_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + /* Splice not yet supported with DAX */ + .fallocate = ext4_fallocate, +}; +#endif + const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 36b369697a13..6b9878a24182 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -689,14 +689,22 @@ retry: inode_dio_done(inode); goto locked; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, offset, - ext4_get_block, NULL, NULL, 0); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, 0); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: - ret = blockdev_direct_IO(rw, iocb, inode, iter, - offset, ext4_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5653fa42930b..85404f15e53a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -657,6 +657,18 @@ has_zeroout: return retval; } +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + /* Maximum number of blocks we map for direct IO at once. 
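[Editor's note.] The o_direct test in ext4_file_write_iter() above now uses io_is_direct() instead of checking O_DIRECT by hand. That helper is introduced outside this fs/-limited diffstat (in include/linux/fs.h); presumably it simply treats DAX inodes like O_DIRECT opens, along these lines:

    /* Sketch of the helper; the real definition lives in include/linux/fs.h. */
    static inline bool io_is_direct(struct file *filp)
    {
            return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp));
    }

With that, DAX writes follow the same direct-I/O locking rules in ext4_file_write_iter() without applications having to open their files with O_DIRECT.
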
*/ #define DIO_MAX_BLOCKS 4096 @@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + bh->b_assoc_map = inode->i_mapping; + bh->b_private = (void *)(unsigned long)iblock; + bh->b_end_io = ext4_end_io_unwritten; + } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, - offset, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func, + ext4_end_io_dio, dio_flags); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + get_block_func, + ext4_end_io_dio, NULL, dio_flags); /* * Put our reference to io_end. This can free the io_end structure e.g. @@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -static int ext4_block_zero_page_range(handle_t *handle, +static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; + unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle, return -ENOMEM; blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -3278,6 +3281,33 @@ unlock: } /* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + struct inode *inode = mapping->host; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + + if (IS_DAX(inode)) + return dax_zero_page_range(inode, from, length, ext4_get_block); + return __ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. 
We need to physically zero the tail end @@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + new_fl |= S_DAX; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; @@ -4139,6 +4174,65 @@ static int ext4_inode_blocks_set(handle_t *handle, return 0; } +struct other_inode { + unsigned long orig_ino; + struct ext4_inode *raw_inode; +}; + +static int other_inode_match(struct inode * inode, unsigned long ino, + void *data) +{ + struct other_inode *oi = (struct other_inode *) data; + + if ((inode->i_ino != ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | + I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + ((inode->i_state & I_DIRTY_TIME) == 0)) + return 0; + spin_lock(&inode->i_lock); + if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | + I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) && + (inode->i_state & I_DIRTY_TIME)) { + struct ext4_inode_info *ei = EXT4_I(inode); + + inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + spin_unlock(&inode->i_lock); + + spin_lock(&ei->i_raw_lock); + EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode); + ext4_inode_csum_set(inode, oi->raw_inode, ei); + spin_unlock(&ei->i_raw_lock); + trace_ext4_other_inode_update_time(inode, oi->orig_ino); + return -1; + } + spin_unlock(&inode->i_lock); + return -1; +} + +/* + * Opportunistically update the other time fields for other inodes in + * the same inode table block. + */ +static void ext4_update_other_inodes_time(struct super_block *sb, + unsigned long orig_ino, char *buf) +{ + struct other_inode oi; + unsigned long ino; + int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int inode_size = EXT4_INODE_SIZE(sb); + + oi.orig_ino = orig_ino; + ino = orig_ino & ~(inodes_per_block - 1); + for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { + if (ino == orig_ino) + continue; + oi.raw_inode = (struct ext4_inode *) buf; + (void) find_inode_nowait(sb, ino, other_inode_match, &oi); + } +} + /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the @@ -4248,10 +4342,11 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le16(ei->i_extra_isize); } } - ext4_inode_csum_set(inode, raw_inode, ei); - spin_unlock(&ei->i_raw_lock); + if (inode->i_sb->s_flags & MS_LAZYTIME) + ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, + bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); rc = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -4534,7 +4629,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. 
*/ - truncate_pagecache(inode, inode->i_size); + truncate_pagecache(inode, inode->i_size); } /* * We want to call ext4_truncate() even if attr->ia_size == @@ -4840,11 +4935,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. + * + * If only the I_DIRTY_TIME flag is set, we can skip everything. If + * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need + * to copy into the on-disk inode structure are the timestamp files. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; + if (flags == I_DIRTY_TIME) + return; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) goto out; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 2291923dae4e..28fe71a2904c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2235,7 +2235,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) @@ -2299,7 +2302,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 64c39c7c594f..1adac6868e6f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1124,8 +1124,9 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_lazytime, Opt_nolazytime, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, @@ -1187,8 +1188,11 @@ static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, + {Opt_dax, "dax"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_removed, "mblk_io_submit"}, {Opt_removed, "nomblk_io_submit"}, @@ -1371,6 +1375,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -1446,6 +1451,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, case Opt_i_version: sb->s_flags |= MS_I_VERSION; return 1; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + return 1; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + return 1; } for (m = ext4_mount_opts; m->token != Opt_err; m++) @@ -1607,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt 
= m->mount_opt; #endif +#ifndef CONFIG_FS_DAX + } else if (token == Opt_dax) { + ext4_msg(sb, KERN_INFO, "dax option not supported"); + return -1; +#endif } else { if (!args->from) arg = 1; @@ -3589,6 +3605,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + goto failed_mount; + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } @@ -3652,6 +3673,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext4_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } + } + if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { @@ -4869,6 +4903,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + err = -EINVAL; + goto restore_opts; + } + } + + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + ext4_msg(sb, KERN_WARNING, "warning: refusing change of " + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT4_MOUNT_DAX; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) @@ -5007,6 +5053,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } #endif + *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME); ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); kfree(orig_data); return 0; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 7b41a2dcdd76..497c7c5263c7 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -580,7 +580,7 @@ static void fat_set_state(struct super_block *sb, { struct buffer_head *bh; struct fat_boot_sector *b; - struct msdos_sb_info *sbi = sb->s_fs_info; + struct msdos_sb_info *sbi = MSDOS_SB(sb); /* do not change any thing if mounted read only */ if ((sb->s_flags & MS_RDONLY) && !force) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c399152de397..073657f755d4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -253,14 +253,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t) return ret; } +#define EXPIRE_DIRTY_ATIME 0x0001 + /* * Move expired (dirtied before work->older_than_this) dirty inodes from * @delaying_queue to @dispatch_queue. 
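[Editor's illustration; the device and mount point are hypothetical.] The lazytime behaviour added in the ext4 hunks above and wired into writeback below is switched on purely by a superblock flag, either with "mount -o lazytime" or from a program via MS_LAZYTIME:

    #include <sys/mount.h>

    #ifndef MS_LAZYTIME             /* may not be in older libc headers yet */
    #define MS_LAZYTIME     (1 << 25)
    #endif

    int main(void)
    {
            /* Remount an existing ext4 filesystem with lazytime enabled. */
            if (mount("/dev/sda1", "/mnt", "ext4",
                      MS_REMOUNT | MS_LAZYTIME, "") != 0)
                    return 1;
            return 0;
    }
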
*/ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, + int flags, struct wb_writeback_work *work) { + unsigned long *older_than_this = NULL; + unsigned long expire_time; LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; @@ -268,13 +273,21 @@ static int move_expired_inodes(struct list_head *delaying_queue, int do_sb_sort = 0; int moved = 0; + if ((flags & EXPIRE_DIRTY_ATIME) == 0) + older_than_this = work->older_than_this; + else if ((work->reason == WB_REASON_SYNC) == 0) { + expire_time = jiffies - (HZ * 86400); + older_than_this = &expire_time; + } while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); - if (work->older_than_this && - inode_dirtied_after(inode, *work->older_than_this)) + if (older_than_this && + inode_dirtied_after(inode, *older_than_this)) break; list_move(&inode->i_wb_list, &tmp); moved++; + if (flags & EXPIRE_DIRTY_ATIME) + set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) @@ -315,9 +328,12 @@ out: static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) { int moved; + assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); - moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); + moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); + moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, + EXPIRE_DIRTY_ATIME, work); trace_writeback_queue_io(wb, work, moved); } @@ -441,6 +457,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, * updates after data IO completion. */ redirty_tail(inode, wb); + } else if (inode->i_state & I_DIRTY_TIME) { + list_move(&inode->i_wb_list, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ list_del_init(&inode->i_wb_list); @@ -487,7 +505,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; - inode->i_state &= ~I_DIRTY; + if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) && + (inode->i_state & I_DIRTY_TIME)) || + (inode->i_state & I_DIRTY_TIME_EXPIRED)) { + dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED; + trace_writeback_lazytime(inode); + } + inode->i_state &= ~dirty; /* * Paired with smp_mb() in __mark_inode_dirty(). This allows @@ -507,8 +531,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_unlock(&inode->i_lock); + if (dirty & I_DIRTY_TIME) + mark_inode_dirty_sync(inode); /* Don't write the inode if only I_DIRTY_PAGES was set */ - if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; @@ -556,7 +582,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * make sure inode is on some writeback list and leave it there unless * we have completely cleaned the inode. */ - if (!(inode->i_state & I_DIRTY) && + if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; @@ -571,7 +597,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * If inode is clean, remove it from writeback lists. Otherwise don't * touch it. See comment above for explanation. 
*/ - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) list_del_init(&inode->i_wb_list); spin_unlock(&wb->list_lock); inode_sync_complete(inode); @@ -713,7 +739,7 @@ static long writeback_sb_inodes(struct super_block *sb, wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) wrote++; requeue_inode(inode, wb, &wbc); inode_sync_complete(inode); @@ -1151,16 +1177,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) * page->mapping->host, so the page-dirtying time is recorded in the internal * blockdev inode. */ +#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; struct backing_dev_info *bdi = NULL; + int dirtytime; + + trace_writeback_mark_inode_dirty(inode, flags); /* * Don't do this for I_DIRTY_PAGES - that doesn't actually * dirty the inode itself */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) { trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) @@ -1168,6 +1198,9 @@ void __mark_inode_dirty(struct inode *inode, int flags) trace_writeback_dirty_inode(inode, flags); } + if (flags & I_DIRTY_INODE) + flags &= ~I_DIRTY_TIME; + dirtytime = flags & I_DIRTY_TIME; /* * Paired with smp_mb() in __writeback_single_inode() for the @@ -1175,16 +1208,21 @@ void __mark_inode_dirty(struct inode *inode, int flags) */ smp_mb(); - if ((inode->i_state & flags) == flags) + if (((inode->i_state & flags) == flags) || + (dirtytime && (inode->i_state & I_DIRTY_INODE))) return; if (unlikely(block_dump)) block_dump___mark_inode_dirty(inode); spin_lock(&inode->i_lock); + if (dirtytime && (inode->i_state & I_DIRTY_INODE)) + goto out_unlock_inode; if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + if (flags & I_DIRTY_INODE) + inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; /* @@ -1231,8 +1269,10 @@ void __mark_inode_dirty(struct inode *inode, int flags) } inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + list_move(&inode->i_wb_list, dirtytime ? 
+ &bdi->wb.b_dirty_time : &bdi->wb.b_dirty); spin_unlock(&bdi->wb.list_lock); + trace_writeback_dirty_inode_enqueue(inode); if (wakeup_bdi) bdi_wakeup_thread_delayed(bdi); diff --git a/fs/fs_pin.c b/fs/fs_pin.c index 9368236ca100..b06c98796afb 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -1,78 +1,102 @@ #include <linux/fs.h> +#include <linux/sched.h> #include <linux/slab.h> -#include <linux/fs_pin.h> #include "internal.h" #include "mount.h" -static void pin_free_rcu(struct rcu_head *head) -{ - kfree(container_of(head, struct fs_pin, rcu)); -} - static DEFINE_SPINLOCK(pin_lock); -void pin_put(struct fs_pin *p) -{ - if (atomic_long_dec_and_test(&p->count)) - call_rcu(&p->rcu, pin_free_rcu); -} - void pin_remove(struct fs_pin *pin) { spin_lock(&pin_lock); hlist_del(&pin->m_list); hlist_del(&pin->s_list); spin_unlock(&pin_lock); + spin_lock_irq(&pin->wait.lock); + pin->done = 1; + wake_up_locked(&pin->wait); + spin_unlock_irq(&pin->wait.lock); } -void pin_insert(struct fs_pin *pin, struct vfsmount *m) +void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p) { spin_lock(&pin_lock); - hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); + if (p) + hlist_add_head(&pin->s_list, p); hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); spin_unlock(&pin_lock); } +void pin_insert(struct fs_pin *pin, struct vfsmount *m) +{ + pin_insert_group(pin, m, &m->mnt_sb->s_pins); +} + +void pin_kill(struct fs_pin *p) +{ + wait_queue_t wait; + + if (!p) { + rcu_read_unlock(); + return; + } + init_wait(&wait); + spin_lock_irq(&p->wait.lock); + if (likely(!p->done)) { + p->done = -1; + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + p->kill(p); + return; + } + if (p->done > 0) { + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + return; + } + __add_wait_queue(&p->wait, &wait); + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&p->wait.lock); + rcu_read_unlock(); + schedule(); + rcu_read_lock(); + if (likely(list_empty(&wait.task_list))) + break; + /* OK, we know p couldn't have been freed yet */ + spin_lock_irq(&p->wait.lock); + if (p->done > 0) { + spin_unlock_irq(&p->wait.lock); + break; + } + } + rcu_read_unlock(); +} + void mnt_pin_kill(struct mount *m) { while (1) { struct hlist_node *p; - struct fs_pin *pin; rcu_read_lock(); p = ACCESS_ONCE(m->mnt_pins.first); if (!p) { rcu_read_unlock(); break; } - pin = hlist_entry(p, struct fs_pin, m_list); - if (!atomic_long_inc_not_zero(&pin->count)) { - rcu_read_unlock(); - cpu_relax(); - continue; - } - rcu_read_unlock(); - pin->kill(pin); + pin_kill(hlist_entry(p, struct fs_pin, m_list)); } } -void sb_pin_kill(struct super_block *sb) +void group_pin_kill(struct hlist_head *p) { while (1) { - struct hlist_node *p; - struct fs_pin *pin; + struct hlist_node *q; rcu_read_lock(); - p = ACCESS_ONCE(sb->s_pins.first); - if (!p) { + q = ACCESS_ONCE(p->first); + if (!q) { rcu_read_unlock(); break; } - pin = hlist_entry(p, struct fs_pin, s_list); - if (!atomic_long_inc_not_zero(&pin->count)) { - rcu_read_unlock(); - cpu_relax(); - continue; - } - rcu_read_unlock(); - pin->kill(pin); + pin_kill(hlist_entry(q, struct fs_pin, s_list)); } } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index ec9c2d33477a..3e32bb8e2d7e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -654,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - int sync_state = inode->i_state & I_DIRTY; + int sync_state = inode->i_state & 
I_DIRTY_ALL; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; @@ -667,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (!gfs2_is_jdata(ip)) sync_state &= ~I_DIRTY_PAGES; if (datasync) - sync_state &= ~I_DIRTY_SYNC; + sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME); if (sync_state) { ret = sync_inode_metadata(inode, 1); diff --git a/fs/inode.c b/fs/inode.c index 86bfaca724db..f00b16f45507 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -18,6 +18,7 @@ #include <linux/buffer_head.h> /* for inode_has_buffers */ #include <linux/ratelimit.h> #include <linux/list_lru.h> +#include <trace/events/writeback.h> #include "internal.h" /* @@ -30,7 +31,7 @@ * inode_sb_list_lock protects: * sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * @@ -403,7 +404,8 @@ static void inode_lru_list_add(struct inode *inode) */ void inode_add_lru(struct inode *inode) { - if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && + if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | + I_FREEING | I_WILL_FREE)) && !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) inode_lru_list_add(inode); } @@ -634,7 +636,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & I_DIRTY && !kill_dirty) { + if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { spin_unlock(&inode->i_lock); busy = 1; continue; @@ -1268,6 +1270,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino) } EXPORT_SYMBOL(ilookup); +/** + * find_inode_nowait - find an inode in the inode cache + * @sb: super block of file system to search + * @hashval: hash value (usually inode number) to search for + * @match: callback used for comparisons between inodes + * @data: opaque data pointer to pass to @match + * + * Search for the inode specified by @hashval and @data in the inode + * cache, where the helper function @match will return 0 if the inode + * does not match, 1 if the inode does match, and -1 if the search + * should be stopped. The @match function must be responsible for + * taking the i_lock spin_lock and checking i_state for an inode being + * freed or being initialized, and incrementing the reference count + * before returning 1. It also must not sleep, since it is called with + * the inode_hash_lock spinlock held. + * + * This is a even more generalized version of ilookup5() when the + * function must never block --- find_inode() can block in + * __wait_on_freeing_inode() --- or when the caller can not increment + * the reference count because the resulting iput() might cause an + * inode eviction. The tradeoff is that the @match funtion must be + * very carefully implemented. 
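[Editor's sketch; the myfs_ names are hypothetical.] A @match callback for the new helper that hands the inode back to its caller has to follow the rules spelled out above: compare and check i_state under i_lock, never sleep, and take a reference before returning 1:

    static int myfs_find_match(struct inode *inode, unsigned long ino, void *data)
    {
            if (inode->i_ino != ino)
                    return 0;               /* not a match, keep scanning */
            spin_lock(&inode->i_lock);
            if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
                    spin_unlock(&inode->i_lock);
                    return -1;              /* right inode, unusable: stop */
            }
            __iget(inode);                  /* pin it before reporting a match */
            spin_unlock(&inode->i_lock);
            return 1;
    }

    /* ... */
    inode = find_inode_nowait(sb, ino, myfs_find_match, NULL);

Callers that cannot afford the eventual iput() (the case the comment is warning about) instead do their work inside the callback and return -1, which is what ext4's other_inode_match() above does.
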
+ */ +struct inode *find_inode_nowait(struct super_block *sb, + unsigned long hashval, + int (*match)(struct inode *, unsigned long, + void *), + void *data) +{ + struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *inode, *ret_inode = NULL; + int mval; + + spin_lock(&inode_hash_lock); + hlist_for_each_entry(inode, head, i_hash) { + if (inode->i_sb != sb) + continue; + mval = match(inode, hashval, data); + if (mval == 0) + continue; + if (mval == 1) + ret_inode = inode; + goto out; + } +out: + spin_unlock(&inode_hash_lock); + return ret_inode; +} +EXPORT_SYMBOL(find_inode_nowait); + int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -1418,11 +1470,20 @@ static void iput_final(struct inode *inode) */ void iput(struct inode *inode) { - if (inode) { - BUG_ON(inode->i_state & I_CLEAR); - - if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) - iput_final(inode); + if (!inode) + return; + BUG_ON(inode->i_state & I_CLEAR); +retry: + if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { + if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { + atomic_inc(&inode->i_count); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + trace_writeback_lazytime_iput(inode); + mark_inode_dirty_sync(inode); + goto retry; + } + iput_final(inode); } } EXPORT_SYMBOL(iput); @@ -1481,14 +1542,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } -/* - * This does the actual work of updating an inodes time or version. Must have - * had called mnt_want_write() before calling this. - */ -static int update_time(struct inode *inode, struct timespec *time, int flags) +int generic_update_time(struct inode *inode, struct timespec *time, int flags) { - if (inode->i_op->update_time) - return inode->i_op->update_time(inode, time, flags); + int iflags = I_DIRTY_TIME; if (flags & S_ATIME) inode->i_atime = *time; @@ -1498,9 +1554,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) inode->i_ctime = *time; if (flags & S_MTIME) inode->i_mtime = *time; - mark_inode_dirty_sync(inode); + + if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + iflags |= I_DIRTY_SYNC; + __mark_inode_dirty(inode, iflags); return 0; } +EXPORT_SYMBOL(generic_update_time); + +/* + * This does the actual work of updating an inodes time or version. Must have + * had called mnt_want_write() before calling this. + */ +static int update_time(struct inode *inode, struct timespec *time, int flags) +{ + int (*update_time)(struct inode *, struct timespec *, int); + + update_time = inode->i_op->update_time ? 
inode->i_op->update_time : + generic_update_time; + + return update_time(inode, time, flags); +} /** * touch_atime - update the access time diff --git a/fs/internal.h b/fs/internal.h index d92c346a793d..30459dab409d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -144,7 +144,7 @@ extern const struct file_operations pipefifo_fops; /* * fs_pin.c */ -extern void sb_pin_kill(struct super_block *sb); +extern void group_pin_kill(struct hlist_head *p); extern void mnt_pin_kill(struct mount *m); /* diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c index 92e0644bf867..556de100ebd5 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c @@ -84,11 +84,6 @@ static inline int pullbit(struct pushpull *pp) return bit; } -static inline int pulledbits(struct pushpull *pp) -{ - return pp->ofs; -} - static void init_rubin(struct rubin_state *rs, int div, int *bits) { diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 7654e87b0428..9ad5ba4b299b 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -510,6 +510,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo sumlen = c->sector_size - je32_to_cpu(sm->offset); sumptr = buf + buf_size - sumlen; + /* sm->offset maybe wrong but MAGIC maybe right */ + if (sumlen > c->sector_size) + goto full_scan; + /* Now, make sure the summary itself is available */ if (sumlen > buf_size) { /* Need to kmalloc for this. */ @@ -544,6 +548,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo } } +full_scan: buf_ofs = jeb->offset; if (!buf_size) { diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 33aa0cc1f8b8..10815f8dfd8b 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) return rc; mutex_lock(&inode->i_mutex); - if (!(inode->i_state & I_DIRTY) || + if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); diff --git a/fs/libfs.c b/fs/libfs.c index 005843ce5dbd..b2ffdb045be4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, mutex_lock(&inode->i_mutex); ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) + if (!(inode->i_state & I_DIRTY_ALL)) goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; diff --git a/fs/mount.h b/fs/mount.h index 0ad6f760ce52..6a61c2b3e385 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -2,6 +2,7 @@ #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/ns_common.h> +#include <linux/fs_pin.h> struct mnt_namespace { atomic_t count; @@ -62,7 +63,8 @@ struct mount { int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; - struct path mnt_ex_mountpoint; + struct fs_pin mnt_umount; + struct dentry *mnt_ex_mountpoint; }; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ diff --git a/fs/namei.c b/fs/namei.c index bc35b02883bb..96ca11dea4a2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -118,15 +118,6 @@ * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. 
*/ -void final_putname(struct filename *name) -{ - if (name->separate) { - __putname(name->name); - kfree(name); - } else { - __putname(name); - } -} #define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) @@ -145,6 +136,7 @@ getname_flags(const char __user *filename, int flags, int *empty) result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); + result->refcnt = 1; /* * First, try to embed the struct filename inside the names_cache @@ -179,6 +171,7 @@ recopy: } result->name = kname; result->separate = true; + result->refcnt = 1; max = PATH_MAX; goto recopy; } @@ -202,7 +195,7 @@ recopy: return result; error: - final_putname(result); + putname(result); return err; } @@ -212,43 +205,56 @@ getname(const char __user * filename) return getname_flags(filename, 0, NULL); } -/* - * The "getname_kernel()" interface doesn't do pathnames longer - * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user. - */ struct filename * getname_kernel(const char * filename) { struct filename *result; - char *kname; - int len; - - len = strlen(filename); - if (len >= EMBEDDED_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + int len = strlen(filename) + 1; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); - kname = (char *)result + sizeof(*result); - result->name = kname; + if (len <= EMBEDDED_NAME_MAX) { + result->name = (char *)(result) + sizeof(*result); + result->separate = false; + } else if (len <= PATH_MAX) { + struct filename *tmp; + + tmp = kmalloc(sizeof(*tmp), GFP_KERNEL); + if (unlikely(!tmp)) { + __putname(result); + return ERR_PTR(-ENOMEM); + } + tmp->name = (char *)result; + tmp->separate = true; + result = tmp; + } else { + __putname(result); + return ERR_PTR(-ENAMETOOLONG); + } + memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; - result->separate = false; + result->refcnt = 1; + audit_getname(result); - strlcpy(kname, filename, EMBEDDED_NAME_MAX); return result; } -#ifdef CONFIG_AUDITSYSCALL void putname(struct filename *name) { - if (unlikely(!audit_dummy_context())) - return audit_putname(name); - final_putname(name); + BUG_ON(name->refcnt <= 0); + + if (--name->refcnt > 0) + return; + + if (name->separate) { + __putname(name->name); + kfree(name); + } else + __putname(name); } -#endif static int check_acl(struct inode *inode, int mask) { @@ -2036,31 +2042,47 @@ static int filename_lookup(int dfd, struct filename *name, static int do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - struct filename filename = { .name = name }; + struct filename *filename = getname_kernel(name); + int retval = PTR_ERR(filename); - return filename_lookup(dfd, &filename, flags, nd); + if (!IS_ERR(filename)) { + retval = filename_lookup(dfd, filename, flags, nd); + putname(filename); + } + return retval; } /* does lookup, returns the object with parent locked */ struct dentry *kern_path_locked(const char *name, struct path *path) { + struct filename *filename = getname_kernel(name); struct nameidata nd; struct dentry *d; - int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd); - if (err) - return ERR_PTR(err); + int err; + + if (IS_ERR(filename)) + return ERR_CAST(filename); + + err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd); + if (err) { + d = ERR_PTR(err); + goto out; + } if (nd.last_type != LAST_NORM) { path_put(&nd.path); - return ERR_PTR(-EINVAL); + d = ERR_PTR(-EINVAL); + goto out; } mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); d = 
__lookup_hash(&nd.last, nd.path.dentry, 0); if (IS_ERR(d)) { mutex_unlock(&nd.path.dentry->d_inode->i_mutex); path_put(&nd.path); - return d; + goto out; } *path = nd.path; +out: + putname(filename); return d; } @@ -2351,13 +2373,17 @@ static int filename_mountpoint(int dfd, struct filename *s, struct path *path, unsigned int flags) { - int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); + int error; + if (IS_ERR(s)) + return PTR_ERR(s); + error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); if (unlikely(error == -ECHILD)) error = path_mountpoint(dfd, s->name, path, flags); if (unlikely(error == -ESTALE)) error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); if (likely(!error)) audit_inode(s, path->dentry, 0); + putname(s); return error; } @@ -2379,21 +2405,14 @@ int user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, struct path *path) { - struct filename *s = getname(name); - int error; - if (IS_ERR(s)) - return PTR_ERR(s); - error = filename_mountpoint(dfd, s, path, flags); - putname(s); - return error; + return filename_mountpoint(dfd, getname(name), path, flags); } int kern_path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) { - struct filename s = {.name = name}; - return filename_mountpoint(dfd, &s, path, flags); + return filename_mountpoint(dfd, getname_kernel(name), path, flags); } EXPORT_SYMBOL(kern_path_mountpoint); @@ -3273,7 +3292,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, { struct nameidata nd; struct file *file; - struct filename filename = { .name = name }; + struct filename *filename; int flags = op->lookup_flags | LOOKUP_ROOT; nd.root.mnt = mnt; @@ -3282,15 +3301,20 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); - file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); + filename = getname_kernel(name); + if (unlikely(IS_ERR(filename))) + return ERR_CAST(filename); + + file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) - file = path_openat(-1, &filename, &nd, op, flags); + file = path_openat(-1, filename, &nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) - file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL); + file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL); + putname(filename); return file; } -struct dentry *kern_path_create(int dfd, const char *pathname, +static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); @@ -3305,7 +3329,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, */ lookup_flags &= LOOKUP_REVAL; - error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd); + error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd); if (error) return ERR_PTR(error); @@ -3359,6 +3383,19 @@ out: path_put(&nd.path); return dentry; } + +struct dentry *kern_path_create(int dfd, const char *pathname, + struct path *path, unsigned int lookup_flags) +{ + struct filename *filename = getname_kernel(pathname); + struct dentry *res; + + if (IS_ERR(filename)) + return ERR_CAST(filename); + res = filename_create(dfd, filename, path, lookup_flags); + putname(filename); + return res; +} EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) @@ -3377,7 +3414,7 @@ struct 
dentry *user_path_create(int dfd, const char __user *pathname, struct dentry *res; if (IS_ERR(tmp)) return ERR_CAST(tmp); - res = kern_path_create(dfd, tmp->name, path, lookup_flags); + res = filename_create(dfd, tmp, path, lookup_flags); putname(tmp); return res; } diff --git a/fs/namespace.c b/fs/namespace.c index 6dae553dd69c..72a286e0d33e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -190,6 +190,14 @@ unsigned int mnt_get_count(struct mount *mnt) #endif } +static void drop_mountpoint(struct fs_pin *p) +{ + struct mount *m = container_of(p, struct mount, mnt_umount); + dput(m->mnt_ex_mountpoint); + pin_remove(p); + mntput(&m->mnt); +} + static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -229,6 +237,7 @@ static struct mount *alloc_vfsmnt(const char *name) #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif + init_fs_pin(&mnt->mnt_umount, drop_mountpoint); } return mnt; @@ -1289,7 +1298,6 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static void namespace_unlock(void) { - struct mount *mnt; struct hlist_head head = unmounted; if (likely(hlist_empty(&head))) { @@ -1299,23 +1307,11 @@ static void namespace_unlock(void) head.first->pprev = &head.first; INIT_HLIST_HEAD(&unmounted); - - /* undo decrements we'd done in umount_tree() */ - hlist_for_each_entry(mnt, &head, mnt_hash) - if (mnt->mnt_ex_mountpoint.mnt) - mntget(mnt->mnt_ex_mountpoint.mnt); - up_write(&namespace_sem); synchronize_rcu(); - while (!hlist_empty(&head)) { - mnt = hlist_entry(head.first, struct mount, mnt_hash); - hlist_del_init(&mnt->mnt_hash); - if (mnt->mnt_ex_mountpoint.mnt) - path_put(&mnt->mnt_ex_mountpoint); - mntput(&mnt->mnt); - } + group_pin_kill(&head); } static inline void namespace_lock(void) @@ -1334,7 +1330,6 @@ void umount_tree(struct mount *mnt, int how) { HLIST_HEAD(tmp_list); struct mount *p; - struct mount *last = NULL; for (p = mnt; p; p = next_mnt(p, mnt)) { hlist_del_init_rcu(&p->mnt_hash); @@ -1347,33 +1342,28 @@ void umount_tree(struct mount *mnt, int how) if (how) propagate_umount(&tmp_list); - hlist_for_each_entry(p, &tmp_list, mnt_hash) { + while (!hlist_empty(&tmp_list)) { + p = hlist_entry(tmp_list.first, struct mount, mnt_hash); + hlist_del_init_rcu(&p->mnt_hash); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; if (how < 2) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; + + pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted); if (mnt_has_parent(p)) { hlist_del_init(&p->mnt_mp_list); put_mountpoint(p->mnt_mp); mnt_add_count(p->mnt_parent, -1); - /* move the reference to mountpoint into ->mnt_ex_mountpoint */ - p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; - p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; + /* old mountpoint will be dropped when we can do that */ + p->mnt_ex_mountpoint = p->mnt_mountpoint; p->mnt_mountpoint = p->mnt.mnt_root; p->mnt_parent = p; p->mnt_mp = NULL; } change_mnt_propagation(p, MS_PRIVATE); - last = p; - } - if (last) { - last->mnt_hash.next = unmounted.first; - if (unmounted.first) - unmounted.first->pprev = &last->mnt_hash.next; - unmounted.first = tmp_list.first; - unmounted.first->pprev = &unmounted.first; } } diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 008960101520..e7ca827d7694 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -77,6 +77,7 @@ static int ncp_hash_dentry(const struct dentry *, struct qstr *); static int ncp_compare_dentry(const struct dentry *, const 
struct dentry *, unsigned int, const char *, const struct qstr *); static int ncp_delete_dentry(const struct dentry *); +static void ncp_d_prune(struct dentry *dentry); const struct dentry_operations ncp_dentry_operations = { @@ -84,6 +85,7 @@ const struct dentry_operations ncp_dentry_operations = .d_hash = ncp_hash_dentry, .d_compare = ncp_compare_dentry, .d_delete = ncp_delete_dentry, + .d_prune = ncp_d_prune, }; #define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) @@ -384,42 +386,6 @@ finished: return val; } -static struct dentry * -ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) -{ - struct dentry *dent = dentry; - - if (d_validate(dent, parent)) { - if (dent->d_name.len <= NCP_MAXPATHLEN && - (unsigned long)dent->d_fsdata == fpos) { - if (!dent->d_inode) { - dput(dent); - dent = NULL; - } - return dent; - } - dput(dent); - } - - /* If a pointer is invalid, we search the dentry. */ - spin_lock(&parent->d_lock); - list_for_each_entry(dent, &parent->d_subdirs, d_child) { - if ((unsigned long)dent->d_fsdata == fpos) { - if (dent->d_inode) - dget(dent); - else - dent = NULL; - spin_unlock(&parent->d_lock); - goto out; - } - } - spin_unlock(&parent->d_lock); - return NULL; - -out: - return dent; -} - static time_t ncp_obtain_mtime(struct dentry *dentry) { struct inode *inode = dentry->d_inode; @@ -435,6 +401,20 @@ static time_t ncp_obtain_mtime(struct dentry *dentry) return ncp_date_dos2unix(i.modifyTime, i.modifyDate); } +static inline void +ncp_invalidate_dircache_entries(struct dentry *parent) +{ + struct ncp_server *server = NCP_SERVER(parent->d_inode); + struct dentry *dentry; + + spin_lock(&parent->d_lock); + list_for_each_entry(dentry, &parent->d_subdirs, d_child) { + dentry->d_fsdata = NULL; + ncp_age_dentry(server, dentry); + } + spin_unlock(&parent->d_lock); +} + static int ncp_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; @@ -500,10 +480,21 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx) struct dentry *dent; bool over; - dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], - dentry, ctx->pos); - if (!dent) + spin_lock(&dentry->d_lock); + if (!(NCP_FINFO(inode)->flags & NCPI_DIR_CACHE)) { + spin_unlock(&dentry->d_lock); + goto invalid_cache; + } + dent = ctl.cache->dentry[ctl.idx]; + if (unlikely(!lockref_get_not_dead(&dent->d_lockref))) { + spin_unlock(&dentry->d_lock); + goto invalid_cache; + } + spin_unlock(&dentry->d_lock); + if (!dent->d_inode) { + dput(dent); goto invalid_cache; + } over = !dir_emit(ctx, dent->d_name.name, dent->d_name.len, dent->d_inode->i_ino, DT_UNKNOWN); @@ -548,6 +539,9 @@ init_cache: ctl.filled = 0; ctl.valid = 1; read_really: + spin_lock(&dentry->d_lock); + NCP_FINFO(inode)->flags |= NCPI_DIR_CACHE; + spin_unlock(&dentry->d_lock); if (ncp_is_server_root(inode)) { ncp_read_volume_list(file, ctx, &ctl); } else { @@ -573,6 +567,13 @@ out: return result; } +static void ncp_d_prune(struct dentry *dentry) +{ + if (!dentry->d_fsdata) /* not referenced from page cache */ + return; + NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE; +} + static int ncp_fill_cache(struct file *file, struct dir_context *ctx, struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, @@ -630,6 +631,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, d_instantiate(newdent, inode); if (!hashed) d_rehash(newdent); + } else { + spin_lock(&dentry->d_lock); + NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE; + 
spin_unlock(&dentry->d_lock); } } else { struct inode *inode = newdent->d_inode; @@ -639,12 +644,6 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, mutex_unlock(&inode->i_mutex); } - if (newdent->d_inode) { - ino = newdent->d_inode->i_ino; - newdent->d_fsdata = (void *) ctl.fpos; - ncp_new_dentry(newdent); - } - if (ctl.idx >= NCP_DIRCACHE_SIZE) { if (ctl.page) { kunmap(ctl.page); @@ -660,8 +659,13 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, ctl.cache = kmap(ctl.page); } if (ctl.cache) { - ctl.cache->dentry[ctl.idx] = newdent; - valid = 1; + if (newdent->d_inode) { + newdent->d_fsdata = newdent; + ctl.cache->dentry[ctl.idx] = newdent; + ino = newdent->d_inode->i_ino; + ncp_new_dentry(newdent); + } + valid = 1; } dput(newdent); end_advance: diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h index 4b0bec477846..c4794504f843 100644 --- a/fs/ncpfs/ncp_fs_i.h +++ b/fs/ncpfs/ncp_fs_i.h @@ -22,6 +22,7 @@ struct ncp_inode_info { int access; int flags; #define NCPI_KLUDGE_SYMLINK 0x0001 +#define NCPI_DIR_CACHE 0x0002 __u8 file_handle[6]; struct inode vfs_inode; }; diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index b785f74bfe3c..250e443a07f3 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -184,36 +184,6 @@ ncp_new_dentry(struct dentry* dentry) dentry->d_time = jiffies; } -static inline void -ncp_renew_dentries(struct dentry *parent) -{ - struct ncp_server *server = NCP_SERVER(parent->d_inode); - struct dentry *dentry; - - spin_lock(&parent->d_lock); - list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - if (dentry->d_fsdata == NULL) - ncp_age_dentry(server, dentry); - else - ncp_new_dentry(dentry); - } - spin_unlock(&parent->d_lock); -} - -static inline void -ncp_invalidate_dircache_entries(struct dentry *parent) -{ - struct ncp_server *server = NCP_SERVER(parent->d_inode); - struct dentry *dentry; - - spin_lock(&parent->d_lock); - list_for_each_entry(dentry, &parent->d_subdirs, d_child) { - dentry->d_fsdata = NULL; - ncp_age_dentry(server, dentry); - } - spin_unlock(&parent->d_lock); -} - struct ncp_cache_head { time_t mtime; unsigned long time; /* cache age */ diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 84cae2079d21..f22920442172 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -200,7 +200,7 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) { if (fh1->fh_fsid_type != fh2->fh_fsid_type) return false; - if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type) != 0)) + if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0) return false; return true; } diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index fedb4d620a81..d4c4453674c6 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -1,6 +1,7 @@ #ifndef _FS_NFSD_PNFS_H #define _FS_NFSD_PNFS_H 1 +#ifdef CONFIG_NFSD_V4 #include <linux/exportfs.h> #include <linux/nfsd/export.h> @@ -50,6 +51,7 @@ __be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp, int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, u32 device_generation); struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx); +#endif /* CONFIG_NFSD_V4 */ #ifdef CONFIG_NFSD_PNFS void nfsd4_setup_layout_type(struct svc_export *exp); @@ -59,6 +61,9 @@ void nfsd4_return_all_file_layouts(struct nfs4_client *clp, int nfsd4_init_pnfs(void); void nfsd4_exit_pnfs(void); #else +struct nfs4_client; +struct nfs4_file; + static inline void nfsd4_setup_layout_type(struct svc_export *exp) { } diff --git a/fs/ocfs2/aops.c 
b/fs/ocfs2/aops.c index 46d93e941f3d..44db1808cdb5 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -28,6 +28,7 @@ #include <linux/pipe_fs_i.h> #include <linux/mpage.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> #include <cluster/masklog.h> @@ -47,6 +48,9 @@ #include "ocfs2_trace.h" #include "buffer_head_io.h" +#include "dir.h" +#include "namei.h" +#include "sysfile.h" static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -506,18 +510,21 @@ bail: * * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); - * - * Note that we never bother to allocate blocks here, and thus ignore the - * create argument. */ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; + u32 cpos = 0; + int alloc_locked = 0; u64 p_blkno, inode_blocks, contig_blocks; unsigned int ext_flags; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + unsigned long len = bh_result->b_size; + unsigned int clusters_to_alloc = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); /* This function won't even be called if the request isn't all * nicely aligned and of the right size, so there's no need @@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, /* We should already CoW the refcounted extent in case of create. */ BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); + /* allocate blocks if no p_blkno is found, and create == 1 */ + if (!p_blkno && create) { + ret = ocfs2_inode_lock(inode, NULL, 1); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + alloc_locked = 1; + + /* fill hole, allocate blocks can't be larger than the size + * of the hole */ + clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); + if (clusters_to_alloc > contig_blocks) + clusters_to_alloc = contig_blocks; + + /* allocate extent and insert them into the extent tree */ + ret = ocfs2_extend_allocation(inode, cpos, + clusters_to_alloc, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); + if (ret < 0) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + } + /* * get_more_blocks() expects us to describe a hole by clearing * the mapped bit on bh_result(). 
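The hunk above teaches ocfs2_direct_IO_get_blocks() to allocate space when a direct write lands in a hole (p_blkno == 0 with create set): it takes the inode lock, clamps the allocation to the size of the hole, calls ocfs2_extend_allocation(), and then repeats the extent-map lookup so the newly added clusters can be mapped. The user-visible workload this serves is an O_DIRECT write at or beyond i_size, which ocfs2 previously bounced back to buffered I/O. Below is a minimal userspace sketch of that workload; it is not part of the patch, and the mount path and the 4096-byte alignment are assumptions, since the required O_DIRECT alignment depends on the device and filesystem.

#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        const size_t len = 4096;        /* one aligned block */
        struct stat st;
        void *buf;
        int fd;

        /* hypothetical test file on an ocfs2 mount */
        fd = open("/mnt/ocfs2/testfile", O_WRONLY | O_CREAT | O_DIRECT, 0644);
        if (fd < 0 || fstat(fd, &st) < 0) {
                perror("open/fstat");
                return 1;
        }
        /* O_DIRECT wants an aligned user buffer and an aligned length */
        if (posix_memalign(&buf, len, len) != 0) {
                fprintf(stderr, "posix_memalign failed\n");
                close(fd);
                return 1;
        }
        memset(buf, 'a', len);
        /* writing at offset == i_size is the appending direct-I/O case */
        if (pwrite(fd, buf, len, st.st_size) != (ssize_t)len)
                perror("pwrite");
        fsync(fd);
        free(buf);
        close(fd);
        return 0;
}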
@@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, contig_blocks = max_blocks; bh_result->b_size = contig_blocks << blocksize_bits; bail: + if (alloc_locked) + ocfs2_inode_unlock(inode, 1); return ret; } @@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static int ocfs2_is_overwrite(struct ocfs2_super *osb, + struct inode *inode, loff_t offset) +{ + int ret = 0; + u32 v_cpos = 0; + u32 p_cpos = 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + return 1; + + return 0; +} + +static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset) +{ + ssize_t ret = 0; + ssize_t written = 0; + bool orphaned = false; + int is_overwrite = 0; + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + size_t count = iter->count; + journal_t *journal = osb->journal->j_journal; + u32 zero_len; + int cluster_align; + loff_t final_size = offset + count; + int append_write = offset >= i_size_read(inode) ? 1 : 0; + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + + { + u64 o = offset; + + zero_len = do_div(o, 1 << osb->s_clustersize_bits); + cluster_align = !zero_len; + } + + /* + * when final_size > inode->i_size, inode->i_size will be + * updated after direct write, so add the inode to orphan + * dir first. + */ + if (final_size > i_size_read(inode)) { + ret = ocfs2_add_inode_to_orphan(osb, inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + orphaned = true; + } + + if (append_write) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, offset); + else + ret = ocfs2_extend_no_holes(inode, di_bh, offset, + offset); + if (ret < 0) { + mlog_errno(ret); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + + is_overwrite = ocfs2_is_overwrite(osb, inode, offset); + if (is_overwrite < 0) { + mlog_errno(is_overwrite); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; + } + + written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, + iter, offset, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); + if (unlikely(written < 0)) { + loff_t i_size = i_size_read(inode); + + if (offset + count > i_size) { + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + if (i_size == i_size_read(inode)) { + ret = ocfs2_truncate_file(inode, di_bh, + i_size); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + goto clean_orphan; + } + } + + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + ret = jbd2_journal_force_commit(journal); + if (ret < 0) + mlog_errno(ret); + } + } else if (written < 0 && append_write && !is_overwrite && + !cluster_align) { + u32 p_cpos = 0; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, + &num_clusters, 
&ext_flags); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + + BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); + + ret = blkdev_issue_zeroout(osb->sb->s_bdev, + p_cpos << (osb->s_clustersize_bits - 9), + zero_len >> 9, GFP_KERNEL, false); + if (ret < 0) + mlog_errno(ret); + } + +clean_orphan: + if (orphaned) { + int tmp_ret; + int update_isize = written > 0 ? 1 : 0; + loff_t end = update_isize ? offset + written : 0; + + tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, + update_isize, end); + if (tmp_ret < 0) { + ret = tmp_ret; + goto out; + } + + tmp_ret = jbd2_journal_force_commit(journal); + if (tmp_ret < 0) { + ret = tmp_ret; + mlog_errno(tmp_ret); + } + } + +out: + if (ret >= 0) + ret = written; + return ret; +} + static ssize_t ocfs2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, @@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw, { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file)->i_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * Fallback to buffered I/O if we see an inode without @@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw, if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; - /* Fallback to buffered I/O if we are appending. */ - if (i_size_read(inode) <= offset) + /* Fallback to buffered I/O if we are appending and + * concurrent O_DIRECT writes are allowed. + */ + if (i_size_read(inode) <= offset && !full_coherency) return 0; - return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, + if (rw == READ) + return __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); + else + return ocfs2_direct_IO_write(iocb, iter, offset); } static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e0f04d55fd05..46e0d4e857c7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -295,7 +295,7 @@ out: return ret; } -static int ocfs2_set_inode_size(handle_t *handle, +int ocfs2_set_inode_size(handle_t *handle, struct inode *inode, struct buffer_head *fe_bh, u64 new_i_size) @@ -441,7 +441,7 @@ out: return status; } -static int ocfs2_truncate_file(struct inode *inode, +int ocfs2_truncate_file(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size) { @@ -709,6 +709,13 @@ leave: return status; } +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) +{ + return __ocfs2_extend_allocation(inode, logical_start, + clusters_to_add, mark_unwritten); +} + /* * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. @@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file, struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; loff_t saved_pos = 0, end; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * We start with a read level meta lock and only jump to an ex @@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * one node could wind up truncating another * nodes writes. */ - if (end > i_size_read(inode)) { + if (end > i_size_read(inode) && !full_coherency) { + *direct_io = 0; + break; + } + + /* + * Fallback to old way if the feature bit is not set. 
+ */ + if (end > i_size_read(inode) && + !ocfs2_supports_append_dio(osb)) { *direct_io = 0; break; } @@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file, */ ret = ocfs2_check_range_for_holes(inode, saved_pos, count); if (ret == 1) { - *direct_io = 0; + /* + * Fallback to old way if the feature bit is not set. + * Otherwise try dio first and then complete the rest + * request through buffer io. + */ + if (!ocfs2_supports_append_dio(osb)) + *direct_io = 0; ret = 0; } else if (ret < 0) mlog_errno(ret); @@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); @@ -2357,11 +2383,51 @@ relock: iov_iter_truncate(from, count); if (direct_io) { + loff_t endbyte; + ssize_t written_buffered; written = generic_file_direct_write(iocb, from, *ppos); - if (written < 0) { + if (written < 0 || written == count) { ret = written; goto out_dio; } + + /* + * for completing the rest of the request. + */ + *ppos += written; + count -= written; + written_buffered = generic_perform_write(file, from, *ppos); + /* + * If generic_file_buffered_write() returned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (written_buffered < 0) { + ret = written_buffered; + goto out_dio; + } + + iocb->ki_pos = *ppos + written_buffered; + /* We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. 
+ */ + endbyte = *ppos + written_buffered - 1; + ret = filemap_write_and_wait_range(file->f_mapping, *ppos, + endbyte); + if (ret == 0) { + written += written_buffered; + invalidate_mapping_pages(mapping, + *ppos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } } else { current->backing_dev_info = inode_to_bdi(inode); written = generic_perform_write(file, from, *ppos); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 97bf761c9e7c..e8c62f22215c 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_set_inode_size(handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size); int ocfs2_simple_size_update(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size); +int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); +int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c8b25de9efbb..3025c0da6b8a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode, if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto bail_commit; diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ca3431ee7f24..5e86b247c821 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -81,6 +81,8 @@ struct ocfs2_inode_info tid_t i_sync_tid; tid_t i_datasync_tid; + wait_queue_head_t append_dio_wq; + struct dquot *i_dquot[MAXQUOTAS]; }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d10860fde165..ff531928269e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -50,6 +50,8 @@ #include "sysfile.h" #include "uptodate.h" #include "quota.h" +#include "file.h" +#include "namei.h" #include "buffer_head_io.h" #include "ocfs2_trace.h" @@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, static int ocfs2_trylock_journal(struct ocfs2_super *osb, int slot_num); static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot); + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type); static int ocfs2_commit_thread(void *arg); static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec); + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type); static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) { @@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) return 0; } -void ocfs2_queue_replay_slots(struct ocfs2_super *osb) +void ocfs2_queue_replay_slots(struct ocfs2_super *osb, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct 
ocfs2_replay_map *replay_map = osb->replay_map; int i; @@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb) for (i = 0; i < replay_map->rm_slots; i++) if (replay_map->rm_replay_slots[i]) ocfs2_queue_recovery_completion(osb->journal, i, NULL, - NULL, NULL); + NULL, NULL, + orphan_reco_type); replay_map->rm_state = REPLAY_DONE; } @@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item { struct ocfs2_dinode *lri_la_dinode; struct ocfs2_dinode *lri_tl_dinode; struct ocfs2_quota_recovery *lri_qrec; + enum ocfs2_orphan_reco_type lri_orphan_reco_type; }; /* Does the second half of the recovery process. By this point, the @@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work) struct ocfs2_dinode *la_dinode, *tl_dinode; struct ocfs2_la_recovery_item *item, *n; struct ocfs2_quota_recovery *qrec; + enum ocfs2_orphan_reco_type orphan_reco_type; LIST_HEAD(tmp_la_list); trace_ocfs2_complete_recovery( @@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work) la_dinode = item->lri_la_dinode; tl_dinode = item->lri_tl_dinode; qrec = item->lri_qrec; + orphan_reco_type = item->lri_orphan_reco_type; trace_ocfs2_complete_recovery_slot(item->lri_slot, la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0, @@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work) kfree(tl_dinode); } - ret = ocfs2_recover_orphans(osb, item->lri_slot); + ret = ocfs2_recover_orphans(osb, item->lri_slot, + orphan_reco_type); if (ret < 0) mlog_errno(ret); @@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, struct ocfs2_dinode *tl_dinode, - struct ocfs2_quota_recovery *qrec) + struct ocfs2_quota_recovery *qrec, + enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_la_recovery_item *item; @@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, item->lri_slot = slot_num; item->lri_tl_dinode = tl_dinode; item->lri_qrec = qrec; + item->lri_orphan_reco_type = orphan_reco_type; spin_lock(&journal->j_lock); list_add_tail(&item->lri_list, &journal->j_la_cleanups); @@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* No need to queue up our truncate_log as regular cleanup will catch * that */ ocfs2_queue_recovery_completion(journal, osb->slot_num, - osb->local_alloc_copy, NULL, NULL); + osb->local_alloc_copy, NULL, NULL, + ORPHAN_NEED_TRUNCATE); ocfs2_schedule_truncate_log_flush(osb, 0); osb->local_alloc_copy = NULL; @@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) /* queue to recover orphan slots for all offline slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); - ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); ocfs2_free_replay_slots(osb); } @@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) osb->slot_num, NULL, NULL, - osb->quota_rec); + osb->quota_rec, + ORPHAN_NEED_TRUNCATE); osb->quota_rec = NULL; } } @@ -1360,7 +1374,7 @@ restart: /* queue recovery for our own slot */ ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, - NULL, NULL); + NULL, NULL, ORPHAN_NO_NEED_TRUNCATE); spin_lock(&osb->osb_lock); while (rm->rm_used) { @@ -1419,13 +1433,14 @@ skip_recovery: continue; } ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], - NULL, NULL, qrec); + NULL, NULL, qrec, + ORPHAN_NEED_TRUNCATE); } ocfs2_super_unlock(osb, 1); /* queue recovery for offline slots */ - 
ocfs2_queue_replay_slots(osb); + ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE); bail: mutex_lock(&osb->recovery_lock); @@ -1711,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* This will kfree the memory pointed to by la_copy and tl_copy */ ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, - tl_copy, NULL); + tl_copy, NULL, ORPHAN_NEED_TRUNCATE); status = 0; done: @@ -1901,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) for (i = 0; i < osb->max_slots; i++) ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL, - NULL); + NULL, ORPHAN_NO_NEED_TRUNCATE); /* * We queued a recovery on orphan slots, increment the sequence * number and update LVB so other node will skip the scan for a while @@ -2000,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name, if (IS_ERR(iter)) return 0; + /* Skip inodes which are already added to recover list, since dio may + * happen concurrently with unlink/rename */ + if (OCFS2_I(iter)->ip_next_orphan) { + iput(iter); + return 0; + } + trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno); /* No locking is required for the next_orphan queue as there * is only ever a single process doing orphan recovery. */ @@ -2108,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, * advertising our state to ocfs2_delete_inode(). */ static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot) + int slot, + enum ocfs2_orphan_reco_type orphan_reco_type) { int ret = 0; struct inode *inode = NULL; @@ -2132,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, (unsigned long long)oi->ip_blkno); iter = oi->ip_next_orphan; + oi->ip_next_orphan = NULL; + + /* + * We need to take and drop the inode lock to + * force read inode from disk. + */ + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret) { + mlog_errno(ret); + goto next; + } + ocfs2_inode_unlock(inode, 0); + + if (inode->i_nlink == 0) { + spin_lock(&oi->ip_lock); + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&oi->ip_lock); + } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { + struct buffer_head *di_bh = NULL; + + ret = ocfs2_rw_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto next; + } + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + ocfs2_rw_unlock(inode, 1); + mlog_errno(ret); + goto next; + } + + ret = ocfs2_truncate_file(inode, di_bh, + i_size_read(inode)); + ocfs2_inode_unlock(inode, 1); + ocfs2_rw_unlock(inode, 1); + brelse(di_bh); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto next; + } + + ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); + if (ret) + mlog_errno(ret); - spin_lock(&oi->ip_lock); - /* Set the proper information to get us going into - * ocfs2_delete_inode. 
*/ - oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; - spin_unlock(&oi->ip_lock); + wake_up(&OCFS2_I(inode)->append_dio_wq); + } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ +next: iput(inode); inode = iter; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 7f8cde94abfe..f4cd3c3e9fb7 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb) * orphan dir index leaf */ #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4) +/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry + + * orphan dir index root + orphan dir index leaf */ +#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4) +#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS + /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming * directory + target unlink + 3 x dir index leaves */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 914c121ec890..b5c3a5ea3ee6 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup); + struct ocfs2_dir_lookup_result *lookup, + bool dio); static int ocfs2_orphan_add(struct ocfs2_super *osb, handle_t *handle, @@ -87,7 +88,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode); + struct inode *orphan_dir_inode, + bool dio); static int ocfs2_create_symlink_data(struct ocfs2_super *osb, handle_t *handle, @@ -104,6 +106,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2); /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) +#define OCFS2_DIO_ORPHAN_PREFIX "dio-" +#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -952,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir, if (ocfs2_inode_is_unlinkable(inode)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto leave; @@ -1004,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir, if (is_unlinkable) { status = ocfs2_orphan_add(osb, handle, inode, fe_bh, - orphan_name, &orphan_insert, orphan_dir); + orphan_name, &orphan_insert, orphan_dir, false); if (status < 0) mlog_errno(status); } @@ -1440,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, OCFS2_I(new_inode)->ip_blkno, - orphan_name, &orphan_insert); + orphan_name, &orphan_insert, + false); if (status < 0) { mlog_errno(status); goto bail; @@ -1507,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir, if (should_add_orphan) { status = ocfs2_orphan_add(osb, handle, new_inode, newfe_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto bail; @@ -2088,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, struct buffer_head *orphan_dir_bh, u64 blkno, char 
*name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { int ret; struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; + + if (dio) { + ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + ret = -EINVAL; + mlog_errno(ret); + return ret; + } - ret = ocfs2_blkno_stringify(blkno, name); + ret = ocfs2_blkno_stringify(blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + ret = ocfs2_blkno_stringify(blkno, name); if (ret < 0) { mlog_errno(ret); return ret; @@ -2101,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); + namelen, lookup); if (ret < 0) { mlog_errno(ret); return ret; @@ -2128,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, struct inode **ret_orphan_dir, u64 blkno, char *name, - struct ocfs2_dir_lookup_result *lookup) + struct ocfs2_dir_lookup_result *lookup, + bool dio) { struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; @@ -2142,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, } ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, - blkno, name, lookup); + blkno, name, lookup, dio); if (ret < 0) { mlog_errno(ret); goto out; @@ -2170,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, - struct inode *orphan_dir_inode) + struct inode *orphan_dir_inode, + bool dio) { struct buffer_head *orphan_dir_bh = NULL; int status = 0; struct ocfs2_dinode *orphan_fe; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + int namelen = dio ? + (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) : + OCFS2_ORPHAN_NAMELEN; trace_ocfs2_orphan_add_begin( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -2219,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, ocfs2_journal_dirty(handle, orphan_dir_bh); status = __ocfs2_add_entry(handle, orphan_dir_inode, name, - OCFS2_ORPHAN_NAMELEN, inode, + namelen, inode, OCFS2_I(inode)->ip_blkno, orphan_dir_bh, lookup); if (status < 0) { @@ -2227,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto rollback; } - fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); - OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; + if (dio) { + /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan + * slot. + */ + fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num); + } else { + fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; - /* Record which orphan dir our inode now resides - * in. delete_inode will use this to determine which orphan - * dir to lock. */ - fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. 
*/ + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + } ocfs2_journal_dirty(handle, fe_bh); @@ -2258,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh) + struct buffer_head *orphan_dir_bh, + bool dio) { - char name[OCFS2_ORPHAN_NAMELEN + 1]; + const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN; + char name[namelen + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (dio) { + status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s", + OCFS2_DIO_ORPHAN_PREFIX); + if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) { + status = -EINVAL; + mlog_errno(status); + return status; + } + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, + name + OCFS2_DIO_ORPHAN_PREFIX_LEN); + } else + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); if (status < 0) { mlog_errno(status); goto leave; @@ -2273,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, trace_ocfs2_orphan_del( (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, - name, OCFS2_ORPHAN_NAMELEN); + name, namelen); /* find it's spot in the orphan directory */ - status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, + status = ocfs2_find_entry(name, namelen, orphan_dir_inode, &lookup); if (status) { mlog_errno(status); @@ -2376,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, - di_blkno, orphan_name, orphan_insert); + di_blkno, orphan_name, orphan_insert, + false); if (ret < 0) { mlog_errno(ret); goto out; @@ -2482,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, di = (struct ocfs2_dinode *)new_di_bh->b_data; status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, - &orphan_insert, orphan_dir); + &orphan_insert, orphan_dir, false); if (status < 0) { mlog_errno(status); goto leave; @@ -2527,6 +2577,186 @@ leave: return status; } +static int ocfs2_dio_orphan_recovered(struct inode *inode) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + return 0; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + + return ret; +} + +#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode) +{ + char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; + struct inode *orphan_dir_inode = NULL; + struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + struct buffer_head *di_bh = NULL; + int status = 0; + handle_t *handle = NULL; + struct ocfs2_dinode *di = NULL; + +restart: + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + /* + * Another append dio crashed? + * If so, wait for recovery first. 
+ */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, + ocfs2_dio_orphan_recovered(inode), + msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); + goto restart; + } + + status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, + OCFS2_I(inode)->ip_blkno, + orphan_name, + &orphan_insert, + true); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_ADD_TO_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name, + &orphan_insert, orphan_dir_inode, true); + if (status) + mlog_errno(status); + + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + ocfs2_free_dir_lookup_result(&orphan_insert); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + handle_t *handle = NULL; + int status = 0; + + status = ocfs2_inode_lock(inode, &di_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *) di_bh->b_data; + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + le16_to_cpu(di->i_dio_orphaned_slot)); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail_unlock_inode; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + mlog_errno(status); + goto bail_unlock_inode; + } + + handle = ocfs2_start_trans(osb, + OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + goto bail_unlock_orphan; + } + + BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))); + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, + inode, orphan_dir_bh, true); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL); + di->i_dio_orphaned_slot = 0; + + if (update_isize) { + status = ocfs2_set_inode_size(handle, inode, di_bh, end); + if (status) + mlog_errno(status); + } else + ocfs2_journal_dirty(handle, di_bh); + +bail_commit: + ocfs2_commit_trans(osb, handle); + +bail_unlock_orphan: + ocfs2_inode_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + brelse(orphan_dir_bh); + iput(orphan_dir_inode); + +bail_unlock_inode: + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + +bail: + return status; +} + int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *inode, struct dentry *dentry) @@ -2615,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, } status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); + orphan_dir_bh, false); if (status < 0) { mlog_errno(status); goto out_commit; diff 
--git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index e5d059d4f115..5ddecce172fa 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, handle_t *handle, struct inode *orphan_dir_inode, struct inode *inode, - struct buffer_head *orphan_dir_bh); + struct buffer_head *orphan_dir_bh, + bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, struct inode **new_inode); +int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, + struct inode *inode); +int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, + struct inode *inode, int update_isize, + loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fdbcbfed529e..8490c64d34fe 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -209,6 +209,11 @@ struct ocfs2_lock_res { #endif }; +enum ocfs2_orphan_reco_type { + ORPHAN_NO_NEED_TRUNCATE = 0, + ORPHAN_NEED_TRUNCATE, +}; + enum ocfs2_orphan_scan_state { ORPHAN_SCAN_ACTIVE, ORPHAN_SCAN_INACTIVE @@ -495,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb) +{ + if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) + return 1; + return 0; +} + + static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) { if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) @@ -726,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, return clusters; } +static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = (unsigned int)(bytes >> cl_bits); + return clusters; +} + static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, u64 bytes) { diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 938387a10d5d..20e37a3ed26f 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -105,7 +105,8 @@ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) #define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ - | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \ + | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO) /* * Heartbeat-only devices are missing journals and other files. The @@ -199,6 +200,11 @@ #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 +/* + * Append Direct IO support + */ +#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008 + /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ @@ -229,6 +235,8 @@ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ #define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ +#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially + * for dio */ /* * Flags on ocfs2_dinode.i_dyn_features @@ -729,7 +737,9 @@ struct ocfs2_dinode { inode belongs to. 
Only valid if allocated from a discontiguous block group */ -/*A0*/ __le64 i_reserved2[3]; +/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */ + __le16 i_reserved1[3]; + __le64 i_reserved2[2]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 87a1f7679d9b..26675185b886 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1746,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); + init_waitqueue_head(&oi->append_dio_wq); + ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), &ocfs2_inode_caching_ops); diff --git a/fs/open.c b/fs/open.c index 813be037b412..33f9cbf2610b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ if (f->f_flags & O_DIRECT) { - if (!f->f_mapping->a_ops || - ((!f->f_mapping->a_ops->direct_IO) && - (!f->f_mapping->a_ops->get_xip_mem))) { + if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; - } } return 0; } @@ -971,8 +968,14 @@ struct file *file_open_name(struct filename *name, int flags, umode_t mode) */ struct file *filp_open(const char *filename, int flags, umode_t mode) { - struct filename name = {.name = filename}; - return file_open_name(&name, flags, mode); + struct filename *name = getname_kernel(filename); + struct file *file = ERR_CAST(name); + + if (!IS_ERR(name)) { + file = file_open_name(name, flags, mode); + putname(name); + } + return file; } EXPORT_SYMBOL(filp_open); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index de14e46fd807..3309f59d421b 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -350,29 +350,12 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp if (ret) return ret; - if (S_ISDIR(dp->mode)) { - dp->proc_fops = &proc_dir_operations; - dp->proc_iops = &proc_dir_inode_operations; - dir->nlink++; - } else if (S_ISLNK(dp->mode)) { - dp->proc_iops = &proc_link_inode_operations; - } else if (S_ISREG(dp->mode)) { - BUG_ON(dp->proc_fops == NULL); - dp->proc_iops = &proc_file_inode_operations; - } else { - WARN_ON(1); - proc_free_inum(dp->low_ino); - return -EINVAL; - } - spin_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); spin_unlock(&proc_subdir_lock); - if (S_ISDIR(dp->mode)) - dir->nlink--; proc_free_inum(dp->low_ino); return -EEXIST; } @@ -431,6 +414,7 @@ struct proc_dir_entry *proc_symlink(const char *name, ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); if (ent->data) { strcpy((char*)ent->data,dest); + ent->proc_iops = &proc_link_inode_operations; if (proc_register(parent, ent) < 0) { kfree(ent->data); kfree(ent); @@ -456,8 +440,12 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { ent->data = data; + ent->proc_fops = &proc_dir_operations; + ent->proc_iops = &proc_dir_inode_operations; + parent->nlink++; if (proc_register(parent, ent) < 0) { kfree(ent); + parent->nlink--; ent = NULL; } } @@ -493,6 +481,8 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, return NULL; } + BUG_ON(proc_fops == NULL); + if ((mode & S_IALLUGO) == 0) mode |= S_IRUGO; pde = __proc_create(&parent, name, mode, 1); @@ -500,6 +490,7 @@ struct 
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index de14e46fd807..3309f59d421b 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -350,29 +350,12 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 	if (ret)
 		return ret;
 
-	if (S_ISDIR(dp->mode)) {
-		dp->proc_fops = &proc_dir_operations;
-		dp->proc_iops = &proc_dir_inode_operations;
-		dir->nlink++;
-	} else if (S_ISLNK(dp->mode)) {
-		dp->proc_iops = &proc_link_inode_operations;
-	} else if (S_ISREG(dp->mode)) {
-		BUG_ON(dp->proc_fops == NULL);
-		dp->proc_iops = &proc_file_inode_operations;
-	} else {
-		WARN_ON(1);
-		proc_free_inum(dp->low_ino);
-		return -EINVAL;
-	}
-
 	spin_lock(&proc_subdir_lock);
 	dp->parent = dir;
 	if (pde_subdir_insert(dir, dp) == false) {
 		WARN(1, "proc_dir_entry '%s/%s' already registered\n",
 		     dir->name, dp->name);
 		spin_unlock(&proc_subdir_lock);
-		if (S_ISDIR(dp->mode))
-			dir->nlink--;
 		proc_free_inum(dp->low_ino);
 		return -EEXIST;
 	}
@@ -431,6 +414,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
 		ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
 		if (ent->data) {
 			strcpy((char*)ent->data,dest);
+			ent->proc_iops = &proc_link_inode_operations;
 			if (proc_register(parent, ent) < 0) {
 				kfree(ent->data);
 				kfree(ent);
@@ -456,8 +440,12 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
 	ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
 	if (ent) {
 		ent->data = data;
+		ent->proc_fops = &proc_dir_operations;
+		ent->proc_iops = &proc_dir_inode_operations;
+		parent->nlink++;
 		if (proc_register(parent, ent) < 0) {
 			kfree(ent);
+			parent->nlink--;
 			ent = NULL;
 		}
 	}
@@ -493,6 +481,8 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
 		return NULL;
 	}
 
+	BUG_ON(proc_fops == NULL);
+
 	if ((mode & S_IALLUGO) == 0)
 		mode |= S_IRUGO;
 	pde = __proc_create(&parent, name, mode, 1);
@@ -500,6 +490,7 @@ static struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
 		goto out;
 	pde->proc_fops = proc_fops;
 	pde->data = data;
+	pde->proc_iops = &proc_file_inode_operations;
 	if (proc_register(parent, pde) < 0)
 		goto out_free;
 	return pde;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a90d6d354199..4e61388ec03d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -546,8 +546,8 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
 		nhdr_ptr = notes_section;
 		while (nhdr_ptr->n_namesz != 0) {
 			sz = sizeof(Elf64_Nhdr) +
-				((nhdr_ptr->n_namesz + 3) & ~3) +
-				((nhdr_ptr->n_descsz + 3) & ~3);
+				(((u64)nhdr_ptr->n_namesz + 3) & ~3) +
+				(((u64)nhdr_ptr->n_descsz + 3) & ~3);
 			if ((real_sz + sz) > max_sz) {
 				pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
 					nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
@@ -732,8 +732,8 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
 		nhdr_ptr = notes_section;
 		while (nhdr_ptr->n_namesz != 0) {
 			sz = sizeof(Elf32_Nhdr) +
-				((nhdr_ptr->n_namesz + 3) & ~3) +
-				((nhdr_ptr->n_descsz + 3) & ~3);
+				(((u64)nhdr_ptr->n_namesz + 3) & ~3) +
+				(((u64)nhdr_ptr->n_descsz + 3) & ~3);
 			if ((real_sz + sz) > max_sz) {
 				pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
 					nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0f96f71ab32b..8db932da4009 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
 		{ MS_SYNCHRONOUS, ",sync" },
 		{ MS_DIRSYNC, ",dirsync" },
 		{ MS_MANDLOCK, ",mand" },
+		{ MS_LAZYTIME, ",lazytime" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
diff --git a/fs/read_write.c b/fs/read_write.c
index 4060691e78f7..8e1b68786d66 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -333,6 +333,52 @@ out_putf:
 }
 #endif
 
+ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!file->f_op->read_iter)
+		return -EINVAL;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+	kiocb.ki_nbytes = iov_iter_count(iter);
+
+	iter->type |= READ;
+	ret = file->f_op->read_iter(&kiocb, iter);
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+EXPORT_SYMBOL(vfs_iter_read);
+
+ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+	struct kiocb kiocb;
+	ssize_t ret;
+
+	if (!file->f_op->write_iter)
+		return -EINVAL;
+
+	init_sync_kiocb(&kiocb, file);
+	kiocb.ki_pos = *ppos;
+	kiocb.ki_nbytes = iov_iter_count(iter);
+
+	iter->type |= WRITE;
+	ret = file->f_op->write_iter(&kiocb, iter);
+	if (ret == -EIOCBQUEUED)
+		ret = wait_on_sync_kiocb(&kiocb);
+
+	if (ret > 0)
+		*ppos = kiocb.ki_pos;
+	return ret;
+}
+EXPORT_SYMBOL(vfs_iter_write);
+
 /*
  * rw_verify_area doesn't like huge counts.  We limit
  * them to something that fits in "int" so that others
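For illustration only, and not part of the patch: the sketch below shows roughly how an in-kernel caller can drive the new vfs_iter_write() helper over a single page by packaging it as a bio_vec-backed iov_iter, the same iov_iter_bvec() + vfs_iter_write() pairing the fs/splice.c conversion further down uses. The function name, the assumption that the caller already holds an open struct file and a page, and the header list are all hypothetical.

#include <linux/fs.h>
#include <linux/uio.h>
#include <linux/blk_types.h>

/* Hypothetical example: write 'len' bytes from 'page' at *pos through the
 * file's ->write_iter() path, advancing *pos on success. */
static ssize_t example_write_page(struct file *file, struct page *page,
				  size_t len, loff_t *pos)
{
	struct bio_vec bvec = {
		.bv_page	= page,
		.bv_len		= len,
		.bv_offset	= 0,
	};
	struct iov_iter from;

	iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len);
	return vfs_iter_write(file, &from, pos);
}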
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a7eec9888f10..e72401e1f995 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2766,7 +2766,7 @@ static int reiserfs_write_begin(struct file *file,
 	int old_ref = 0;
 
 	inode = mapping->host;
-	*fsdata = 0;
+	*fsdata = NULL;
 	if (flags & AOP_FLAG_CONT_EXPAND &&
 	    (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
 		pos ++;
diff --git a/fs/splice.c b/fs/splice.c
index 75c6058eabf2..7968da96bebb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -961,7 +961,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 	splice_from_pipe_begin(&sd);
 	while (sd.total_len) {
 		struct iov_iter from;
-		struct kiocb kiocb;
 		size_t left;
 		int n, idx;
 
@@ -1005,29 +1004,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 			left -= this_len;
 		}
 
-		/* ... iov_iter */
-		from.type = ITER_BVEC | WRITE;
-		from.bvec = array;
-		from.nr_segs = n;
-		from.count = sd.total_len - left;
-		from.iov_offset = 0;
-
-		/* ... and iocb */
-		init_sync_kiocb(&kiocb, out);
-		kiocb.ki_pos = sd.pos;
-		kiocb.ki_nbytes = sd.total_len - left;
-
-		/* now, send it */
-		ret = out->f_op->write_iter(&kiocb, &from);
-		if (-EIOCBQUEUED == ret)
-			ret = wait_on_sync_kiocb(&kiocb);
-
+		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
+			      sd.total_len - left);
+		ret = vfs_iter_write(out, &from, &sd.pos);
 		if (ret <= 0)
 			break;
 
 		sd.num_spliced += ret;
 		sd.total_len -= ret;
-		*ppos = sd.pos = kiocb.ki_pos;
+		*ppos = sd.pos;
 
 		/* dismiss the fully eaten buffers, adjust the partial one */
 		while (ret) {
diff --git a/fs/super.c b/fs/super.c
index 1facd2c282e5..65a53efc1cf4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -715,9 +715,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
 
 	if (remount_ro) {
-		if (sb->s_pins.first) {
+		if (!hlist_empty(&sb->s_pins)) {
 			up_write(&sb->s_umount);
-			sb_pin_kill(sb);
+			group_pin_kill(&sb->s_pins);
 			down_write(&sb->s_umount);
 			if (!sb->s_root)
 				return 0;
diff --git a/fs/sync.c b/fs/sync.c
index 01d9f18a70b5..fbc98ee62044 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
  */
 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
+	struct inode *inode = file->f_mapping->host;
+
 	if (!file->f_op->fsync)
 		return -EINVAL;
+	if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_DIRTY_TIME;
+		spin_unlock(&inode->i_lock);
+		mark_inode_dirty_sync(inode);
+	}
 	return file->f_op->fsync(file, start, end, datasync);
 }
 EXPORT_SYMBOL(vfs_fsync_range);
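For illustration only, and not part of the patch: the fs/sync.c hunk above makes a full (non-datasync) fsync promote an inode that is only timestamp-dirty (I_DIRTY_TIME, as left behind by -o lazytime) to normally dirty before ->fsync() runs. The sketch below, with a hypothetical helper name, shows the usual caller side, which is unchanged by the patch; note that vfs_fsync_range() takes an inclusive end offset.

#include <linux/fs.h>

/* Hypothetical example: flush a byte range that was just written.
 * Passing datasync == 0 also forces out timestamp-only dirtiness. */
static int example_flush_range(struct file *file, loff_t pos, size_t len)
{
	if (len == 0)
		return 0;
	return vfs_fsync_range(file, pos, pos + len - 1, 0 /* not datasync */);
}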
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index da73801301d5..8092d3759a5e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -95,22 +95,18 @@
 
 void lock_ufs(struct super_block *sb)
 {
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 
 	mutex_lock(&sbi->mutex);
 	sbi->mutex_owner = current;
-#endif
 }
 
 void unlock_ufs(struct super_block *sb)
 {
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 
 	sbi->mutex_owner = NULL;
 	mutex_unlock(&sbi->mutex);
-#endif
 }
 
 static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
@@ -1415,9 +1411,11 @@ static struct kmem_cache * ufs_inode_cachep;
 
 static struct inode *ufs_alloc_inode(struct super_block *sb)
 {
 	struct ufs_inode_info *ei;
-	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
+
+	ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+	ei->vfs_inode.i_version = 1;
 	return &ei->vfs_inode;
 }
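For illustration only, and not part of the patch: the ufs_alloc_inode() change above follows the standard per-filesystem inode cache pattern. The sketch below uses hypothetical names and an assumed private inode layout to show the shape of that pattern: allocate the containing object from a dedicated kmem_cache with GFP_NOFS, initialise per-inode fields such as i_version, and return the embedded struct inode.

#include <linux/fs.h>
#include <linux/slab.h>

/* Hypothetical filesystem-private inode, embedding the VFS inode. */
struct examplefs_inode_info {
	unsigned long	private_flags;
	struct inode	vfs_inode;
};

static struct kmem_cache *examplefs_inode_cachep;

/* Typical ->alloc_inode(): allocate from the private cache (GFP_NOFS to
 * avoid re-entering the filesystem under memory pressure), set up
 * per-inode state, and hand back the embedded struct inode. */
static struct inode *examplefs_alloc_inode(struct super_block *sb)
{
	struct examplefs_inode_info *ei;

	ei = kmem_cache_alloc(examplefs_inode_cachep, GFP_NOFS);
	if (!ei)
		return NULL;
	ei->private_flags = 0;
	ei->vfs_inode.i_version = 1;
	return &ei->vfs_inode;
}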