From ae05f269400533cbb32bfba131ab528d78dffd16 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 28 Oct 2008 15:21:40 -0400 Subject: NFS: Convert nfs_attr_generation_counter into an atomic_long The most important property we need from nfs_attr_generation_counter is monotonicity, which is not guaranteed by the current system of smp memory barriers. We should convert it to an atomic_long_t, and drop the memory barriers. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index dc52793ff8f8..d22eb383e1cf 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -908,21 +908,16 @@ static int nfs_size_need_update(const struct inode *inode, const struct nfs_fatt return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); } -static unsigned long nfs_attr_generation_counter; +static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) { - smp_rmb(); - return nfs_attr_generation_counter; + return atomic_long_read(&nfs_attr_generation_counter); } unsigned long nfs_inc_attr_generation_counter(void) { - unsigned long ret; - smp_rmb(); - ret = ++nfs_attr_generation_counter; - smp_wmb(); - return ret; + return atomic_long_inc_return(&nfs_attr_generation_counter); } void nfs_fattr_init(struct nfs_fattr *fattr) -- cgit v1.2.3 From 4e02ed4b4a2fae34aae766a5bb93ae235f60adb8 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 29 Oct 2008 14:00:55 -0700 Subject: fs: remove prepare_write/commit_write Nothing uses prepare_write or commit_write. Remove them from the tree completely. [akpm@linux-foundation.org: schedule simple_prepare_write() for unexporting] Signed-off-by: Nick Piggin Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 12 +- Documentation/filesystems/vfs.txt | 39 +----- drivers/block/loop.c | 5 +- fs/fat/inode.c | 2 +- fs/libfs.c | 2 +- fs/ocfs2/file.c | 3 +- fs/splice.c | 4 +- include/linux/fs.h | 7 -- mm/filemap.c | 242 +------------------------------------- 9 files changed, 23 insertions(+), 293 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 8362860e21a7..23d2f4460deb 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -161,8 +161,12 @@ prototypes: int (*set_page_dirty)(struct page *page); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); - int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); - int (*commit_write)(struct file *, struct page *, unsigned, unsigned); + int (*write_begin)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + int (*write_end)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); int (*invalidatepage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); @@ -180,8 +184,6 @@ sync_page: no maybe writepages: no set_page_dirty no no readpages: no -prepare_write: no yes yes -commit_write: no yes yes write_begin: no locks the page yes write_end: no yes, unlocks yes perform_write: no n/a yes @@ -191,7 +193,7 @@ releasepage: no yes direct_IO: no launder_page: no yes - ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage() + ->write_begin(), ->write_end(), ->sync_page() and ->readpage() may be called from the request handler (/dev/loop). ->readpage() unlocks the page, either synchronously or via I/O diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index c4d348dabe94..5579bda58a6d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -492,7 +492,7 @@ written-back to storage typically in whole pages, however the address_space has finer control of write sizes. The read process essentially only requires 'readpage'. The write -process is more complicated and uses prepare_write/commit_write or +process is more complicated and uses write_begin/write_end or set_page_dirty to write data into the address_space, and writepage, sync_page, and writepages to writeback data to storage. @@ -521,8 +521,6 @@ struct address_space_operations { int (*set_page_dirty)(struct page *page); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); - int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); - int (*commit_write)(struct file *, struct page *, unsigned, unsigned); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); @@ -598,37 +596,7 @@ struct address_space_operations { readpages is only used for read-ahead, so read errors are ignored. If anything goes wrong, feel free to give up. - prepare_write: called by the generic write path in VM to set up a write - request for a page. This indicates to the address space that - the given range of bytes is about to be written. The - address_space should check that the write will be able to - complete, by allocating space if necessary and doing any other - internal housekeeping. If the write will update parts of - any basic-blocks on storage, then those blocks should be - pre-read (if they haven't been read already) so that the - updated blocks can be written out properly. - The page will be locked. - - Note: the page _must not_ be marked uptodate in this function - (or anywhere else) unless it actually is uptodate right now. As - soon as a page is marked uptodate, it is possible for a concurrent - read(2) to copy it to userspace. - - commit_write: If prepare_write succeeds, new data will be copied - into the page and then commit_write will be called. It will - typically update the size of the file (if appropriate) and - mark the inode as dirty, and do any other related housekeeping - operations. It should avoid returning an error if possible - - errors should have been handled by prepare_write. - - write_begin: This is intended as a replacement for prepare_write. The - key differences being that: - - it returns a locked page (in *pagep) rather than being - given a pre locked page; - - it must be able to cope with short writes (where the - length passed to write_begin is greater than the number - of bytes copied into the page). - + write_begin: Called by the generic buffered write code to ask the filesystem to prepare to write len bytes at the given offset in the file. The address_space should check that the write will be able to complete, @@ -640,6 +608,9 @@ struct address_space_operations { The filesystem must return the locked pagecache page for the specified offset, in *pagep, for the caller to write into. + It must be able to cope with short writes (where the length passed to + write_begin is greater than the number of bytes copied into the page). + flags is a field for AOP_FLAG_xxx flags, described in include/linux/fs.h. diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3f09cd8bcc38..5c4ee70d5cf3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -40,8 +40,7 @@ * Heinz Mauelshagen , Feb 2002 * * Support for falling back on the write file operation when the address space - * operations prepare_write and/or commit_write are not available on the - * backing filesystem. + * operations write_begin is not available on the backing filesystem. * Anton Altaparmakov, 16 Feb 2005 * * Still To Fix: @@ -765,7 +764,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, */ if (!file->f_op->splice_read) goto out_putf; - if (aops->prepare_write || aops->write_begin) + if (aops->write_begin) lo_flags |= LO_FLAGS_USE_AOPS; if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write) lo_flags |= LO_FLAGS_READ_ONLY; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 19eafbe3c379..2b2eec1283bf 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -175,7 +175,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, if (rw == WRITE) { /* - * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(), + * FIXME: blockdev_direct_IO() doesn't use ->write_begin(), * so we need to update the ->mmu_private to block boundary. * * But we must fill the remaining area or hole by nul for diff --git a/fs/libfs.c b/fs/libfs.c index 74688598bcf7..e960a8321902 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -814,7 +814,7 @@ EXPORT_SYMBOL(simple_getattr); EXPORT_SYMBOL(simple_link); EXPORT_SYMBOL(simple_lookup); EXPORT_SYMBOL(simple_pin_fs); -EXPORT_SYMBOL(simple_prepare_write); +EXPORT_UNUSED_SYMBOL(simple_prepare_write); EXPORT_SYMBOL(simple_readpage); EXPORT_SYMBOL(simple_release_fs); EXPORT_SYMBOL(simple_rename); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8d3225a78073..7efe937a415f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -679,8 +679,7 @@ leave: /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to - * worry about recursive locking in ->prepare_write() and - * ->commit_write(). */ + * worry about recursive locking in ->write_begin() and ->write_end(). */ static int ocfs2_write_zero_page(struct inode *inode, u64 size) { diff --git a/fs/splice.c b/fs/splice.c index a1e701c27156..1abab5cee4ba 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -731,8 +731,8 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, }; /* - * The actor worker might be calling ->prepare_write and - * ->commit_write. Most of the time, these expect i_mutex to + * The actor worker might be calling ->write_begin and + * ->write_end. Most of the time, these expect i_mutex to * be held. Since this may result in an ABBA deadlock with * pipe->inode, we have to order lock acquiry here. */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 5b248d61430c..0dcdd9458f4b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -489,13 +489,6 @@ struct address_space_operations { int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); - /* - * ext3 requires that a successful prepare_write() call be followed - * by a commit_write() call - they must be balanced - */ - int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); - int (*commit_write)(struct file *, struct page *, unsigned, unsigned); - int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); diff --git a/mm/filemap.c b/mm/filemap.c index ab8553658af3..f3e5f8944d17 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2029,48 +2029,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping, { const struct address_space_operations *aops = mapping->a_ops; - if (aops->write_begin) { - return aops->write_begin(file, mapping, pos, len, flags, + return aops->write_begin(file, mapping, pos, len, flags, pagep, fsdata); - } else { - int ret; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - struct inode *inode = mapping->host; - struct page *page; -again: - page = __grab_cache_page(mapping, index); - *pagep = page; - if (!page) - return -ENOMEM; - - if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { - /* - * There is no way to resolve a short write situation - * for a !Uptodate page (except by double copying in - * the caller done by generic_perform_write_2copy). - * - * Instead, we have to bring it uptodate here. - */ - ret = aops->readpage(file, page); - page_cache_release(page); - if (ret) { - if (ret == AOP_TRUNCATED_PAGE) - goto again; - return ret; - } - goto again; - } - - ret = aops->prepare_write(file, page, offset, offset+len); - if (ret) { - unlock_page(page); - page_cache_release(page); - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); - } - return ret; - } } EXPORT_SYMBOL(pagecache_write_begin); @@ -2079,32 +2039,9 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { const struct address_space_operations *aops = mapping->a_ops; - int ret; - - if (aops->write_end) { - mark_page_accessed(page); - ret = aops->write_end(file, mapping, pos, len, copied, - page, fsdata); - } else { - unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - struct inode *inode = mapping->host; - - flush_dcache_page(page); - ret = aops->commit_write(file, page, offset, offset+len); - unlock_page(page); - mark_page_accessed(page); - page_cache_release(page); - - if (ret < 0) { - if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); - } else if (ret > 0) - ret = min_t(size_t, copied, ret); - else - ret = copied; - } - return ret; + mark_page_accessed(page); + return aops->write_end(file, mapping, pos, len, copied, page, fsdata); } EXPORT_SYMBOL(pagecache_write_end); @@ -2226,174 +2163,6 @@ repeat: } EXPORT_SYMBOL(__grab_cache_page); -static ssize_t generic_perform_write_2copy(struct file *file, - struct iov_iter *i, loff_t pos) -{ - struct address_space *mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - ssize_t written = 0; - - do { - struct page *src_page; - struct page *page; - pgoff_t index; /* Pagecache index for current page */ - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ - size_t copied; /* Bytes copied from user */ - - offset = (pos & (PAGE_CACHE_SIZE - 1)); - index = pos >> PAGE_CACHE_SHIFT; - bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, - iov_iter_count(i)); - - /* - * a non-NULL src_page indicates that we're doing the - * copy via get_user_pages and kmap. - */ - src_page = NULL; - - /* - * Bring in the user page that we will copy from _first_. - * Otherwise there's a nasty deadlock on copying from the - * same page as we're writing to, without it being marked - * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. - */ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { - status = -EFAULT; - break; - } - - page = __grab_cache_page(mapping, index); - if (!page) { - status = -ENOMEM; - break; - } - - /* - * non-uptodate pages cannot cope with short copies, and we - * cannot take a pagefault with the destination page locked. - * So pin the source page to copy it. - */ - if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { - unlock_page(page); - - src_page = alloc_page(GFP_KERNEL); - if (!src_page) { - page_cache_release(page); - status = -ENOMEM; - break; - } - - /* - * Cannot get_user_pages with a page locked for the - * same reason as we can't take a page fault with a - * page locked (as explained below). - */ - copied = iov_iter_copy_from_user(src_page, i, - offset, bytes); - if (unlikely(copied == 0)) { - status = -EFAULT; - page_cache_release(page); - page_cache_release(src_page); - break; - } - bytes = copied; - - lock_page(page); - /* - * Can't handle the page going uptodate here, because - * that means we would use non-atomic usercopies, which - * zero out the tail of the page, which can cause - * zeroes to become transiently visible. We could just - * use a non-zeroing copy, but the APIs aren't too - * consistent. - */ - if (unlikely(!page->mapping || PageUptodate(page))) { - unlock_page(page); - page_cache_release(page); - page_cache_release(src_page); - continue; - } - } - - status = a_ops->prepare_write(file, page, offset, offset+bytes); - if (unlikely(status)) - goto fs_write_aop_error; - - if (!src_page) { - /* - * Must not enter the pagefault handler here, because - * we hold the page lock, so we might recursively - * deadlock on the same lock, or get an ABBA deadlock - * against a different lock, or against the mmap_sem - * (which nests outside the page lock). So increment - * preempt count, and use _atomic usercopies. - * - * The page is uptodate so we are OK to encounter a - * short copy: if unmodified parts of the page are - * marked dirty and written out to disk, it doesn't - * really matter. - */ - pagefault_disable(); - copied = iov_iter_copy_from_user_atomic(page, i, - offset, bytes); - pagefault_enable(); - } else { - void *src, *dst; - src = kmap_atomic(src_page, KM_USER0); - dst = kmap_atomic(page, KM_USER1); - memcpy(dst + offset, src + offset, bytes); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); - copied = bytes; - } - flush_dcache_page(page); - - status = a_ops->commit_write(file, page, offset, offset+bytes); - if (unlikely(status < 0)) - goto fs_write_aop_error; - if (unlikely(status > 0)) /* filesystem did partial write */ - copied = min_t(size_t, copied, status); - - unlock_page(page); - mark_page_accessed(page); - page_cache_release(page); - if (src_page) - page_cache_release(src_page); - - iov_iter_advance(i, copied); - pos += copied; - written += copied; - - balance_dirty_pages_ratelimited(mapping); - cond_resched(); - continue; - -fs_write_aop_error: - unlock_page(page); - page_cache_release(page); - if (src_page) - page_cache_release(src_page); - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. - */ - if (pos + bytes > inode->i_size) - vmtruncate(inode, inode->i_size); - break; - } while (iov_iter_count(i)); - - return written ? written : status; -} - static ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { @@ -2494,10 +2263,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, struct iov_iter i; iov_iter_init(&i, iov, nr_segs, count, written); - if (a_ops->write_begin) - status = generic_perform_write(file, &i, pos); - else - status = generic_perform_write_2copy(file, &i, pos); + status = generic_perform_write(file, &i, pos); if (likely(status >= 0)) { written += status; -- cgit v1.2.3 From 87b811c3f96559e466403e22b1fa99d472571625 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 29 Oct 2008 14:01:08 -0700 Subject: ecryptfs: fix memory corruption when storing crypto info in xattrs When ecryptfs allocates space to write crypto headers into, before copying it out to file headers or to xattrs, it looks at the value of crypt_stat->num_header_bytes_at_front to determine how much space it needs. This is also used as the file offset to the actual encrypted data, so for xattr-stored crypto info, the value was zero. So, we kzalloc'd 0 bytes, and then ran off to write to that memory. (Which returned as ZERO_SIZE_PTR, so we explode quickly). The right answer is to always allocate a page to write into; the current code won't ever write more than that (this is enforced by the (PAGE_CACHE_SIZE - offset) length in the call to ecryptfs_generate_key_packet_set). To be explicit about this, we now send in a "max" parameter, rather than magically using PAGE_CACHE_SIZE there. Also, since the pointer we pass down the callchain eventually gets the virt_to_page() treatment, we should be using a alloc_page variant, not kzalloc (see also 7fcba054373d5dfc43d26e243a5c9b92069972ee) Signed-off-by: Eric Sandeen Acked-by: Michael Halcrow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 06db79d05c12..6046239465a1 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1251,6 +1251,7 @@ struct kmem_cache *ecryptfs_header_cache_2; /** * ecryptfs_write_headers_virt * @page_virt: The virtual address to write the headers to + * @max: The size of memory allocated at page_virt * @size: Set to the number of bytes written by this function * @crypt_stat: The cryptographic context * @ecryptfs_dentry: The eCryptfs dentry @@ -1278,7 +1279,8 @@ struct kmem_cache *ecryptfs_header_cache_2; * * Returns zero on success */ -static int ecryptfs_write_headers_virt(char *page_virt, size_t *size, +static int ecryptfs_write_headers_virt(char *page_virt, size_t max, + size_t *size, struct ecryptfs_crypt_stat *crypt_stat, struct dentry *ecryptfs_dentry) { @@ -1296,7 +1298,7 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t *size, offset += written; rc = ecryptfs_generate_key_packet_set((page_virt + offset), crypt_stat, ecryptfs_dentry, &written, - PAGE_CACHE_SIZE - offset); + max - offset); if (rc) ecryptfs_printk(KERN_WARNING, "Error generating key packet " "set; rc = [%d]\n", rc); @@ -1368,14 +1370,14 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) goto out; } /* Released in this function */ - virt = kzalloc(crypt_stat->num_header_bytes_at_front, GFP_KERNEL); + virt = (char *)get_zeroed_page(GFP_KERNEL); if (!virt) { printk(KERN_ERR "%s: Out of memory\n", __func__); rc = -ENOMEM; goto out; } - rc = ecryptfs_write_headers_virt(virt, &size, crypt_stat, - ecryptfs_dentry); + rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size, + crypt_stat, ecryptfs_dentry); if (unlikely(rc)) { printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", __func__, rc); @@ -1393,8 +1395,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) goto out_free; } out_free: - memset(virt, 0, crypt_stat->num_header_bytes_at_front); - kfree(virt); + free_page((unsigned long)virt); out: return rc; } -- cgit v1.2.3 From e74481e23283fb080d4591c258de20785cc3b6c3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 29 Oct 2008 14:01:10 -0700 Subject: fs: remove excess kernel-doc Delete excess kernel-doc notation in fs/ subdirectory: Warning(linux-2.6.27-git10//fs/jbd/transaction.c:886): Excess function parameter or struct member 'credits' description in 'journal_get_undo_access' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/transaction.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index d15cd6e7251e..60d4c32c8808 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -860,7 +860,6 @@ out: * int journal_get_undo_access() - Notify intent to modify metadata with non-rewindable consequences * @handle: transaction * @bh: buffer to undo - * @credits: store the number of taken credits here (if not NULL) * * Sometimes there is a need to distinguish between metadata which has * been committed to disk and that which has not. The ext3fs code uses -- cgit v1.2.3