From 3a307ffc2730bfa1a4dfa94537be9d412338aad2 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Tue, 8 May 2007 17:47:32 -0700
Subject: ocfs2: rework ocfs2_buffered_write_cluster()

Use some ideas from the new-aops patch series and turn
ocfs2_buffered_write_cluster() into a 2 stage operation with the caller
copying data in between. The code now understands multiple cluster writes as
a result of having to deal with a full page write for greater than 4k pages.

This sets us up to easily call into the write path during ->page_mkwrite().

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 812 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 478 insertions(+), 334 deletions(-)

(limited to 'fs/ocfs2/aops.c')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a480b09c79b9..3e5758ebd932 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 	     bh = bh->b_this_page, block_start += bsize) {
 		block_end = block_start + bsize;
 
+		clear_buffer_new(bh);
+
 		/*
 		 * Ignore blocks outside of our i/o range -
 		 * they may belong to unallocated clusters.
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 		 * For an allocating write with cluster size >= page
 		 * size, we always write the entire page.
 		 */
-
-		if (buffer_new(bh))
-			clear_buffer_new(bh);
+		if (new)
+			set_buffer_new(bh);
 
 		if (!buffer_mapped(bh)) {
 			map_bh(bh, inode->i_sb, *p_blkno);
@@ -761,217 +762,232 @@ next_bh:
 	return ret;
 }
 
+#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
+#define OCFS2_MAX_CTXT_PAGES	1
+#else
+#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
+#endif
+
+#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+
 /*
- * This will copy user data from the buffer page in the splice
- * context.
- *
- * For now, we ignore SPLICE_F_MOVE as that would require some extra
- * communication out all the way to ocfs2_write().
+ * Describe the state of a single cluster to be written to.
  */
-int ocfs2_map_and_write_splice_data(struct inode *inode,
-				  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
-				  unsigned int *ret_from, unsigned int *ret_to)
-{
-	int ret;
-	unsigned int to, from, cluster_start, cluster_end;
-	char *src, *dst;
-	struct ocfs2_splice_write_priv *sp = wc->w_private;
-	struct pipe_buffer *buf = sp->s_buf;
-	unsigned long bytes, src_from;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+struct ocfs2_write_cluster_desc {
+	u32		c_cpos;
+	u32		c_phys;
+	/*
+	 * Give this a unique field because c_phys eventually gets
+	 * filled.
+	 */
+	unsigned	c_new;
+};
 
-	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
-					&cluster_end);
+struct ocfs2_write_ctxt {
+	/* Logical cluster position / len of write */
+	u32				w_cpos;
+	u32				w_clen;
 
-	from = sp->s_offset;
-	src_from = sp->s_buf_offset;
-	bytes = wc->w_count;
+	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 
-	if (wc->w_large_pages) {
-		/*
-		 * For cluster size < page size, we have to
-		 * calculate pos within the cluster and obey
-		 * the rightmost boundary.
-		 */
-		bytes = min(bytes, (unsigned long)(osb->s_clustersize
-				   - (wc->w_pos & (osb->s_clustersize - 1))));
-	}
-	to = from + bytes;
+	/*
+	 * This is true if page_size > cluster_size.
+	 *
+	 * It triggers a set of special cases during write which might
+	 * have to deal with allocating writes to partial pages.
+	 */
+	unsigned int			w_large_pages;
 
-	BUG_ON(from > PAGE_CACHE_SIZE);
-	BUG_ON(to > PAGE_CACHE_SIZE);
-	BUG_ON(from < cluster_start);
-	BUG_ON(to > cluster_end);
+	/*
+	 * Pages involved in this write.
+	 *
+	 * w_target_page is the page being written to by the user.
+	 *
+	 * w_pages is an array of pages which always contains
+	 * w_target_page, and in the case of an allocating write with
+	 * page_size < cluster size, it will contain zero'd and mapped
+	 * pages adjacent to w_target_page which need to be written
+	 * out in so that future reads from that region will get
+	 * zero's.
+	 */
+	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
+	unsigned int			w_num_pages;
+	struct page			*w_target_page;
 
-	if (wc->w_this_page_new)
-		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-					    cluster_start, cluster_end, 1);
-	else
-		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-					    from, to, 0);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
+	/*
+	 * ocfs2_write_end() uses this to know what the real range to
+	 * write in the target should be.
+	 */
+	unsigned int			w_target_from;
+	unsigned int			w_target_to;
+
+	/*
+	 * We could use journal_current_handle() but this is cleaner,
+	 * IMHO -Mark
+	 */
+	handle_t			*w_handle;
+
+	struct buffer_head		*w_di_bh;
+};
+
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+	int i;
+
+	for(i = 0; i < wc->w_num_pages; i++) {
+		if (wc->w_pages[i] == NULL)
+			continue;
+
+		unlock_page(wc->w_pages[i]);
+		mark_page_accessed(wc->w_pages[i]);
+		page_cache_release(wc->w_pages[i]);
 	}
 
-	src = buf->ops->map(sp->s_pipe, buf, 1);
-	dst = kmap_atomic(wc->w_this_page, KM_USER1);
-	memcpy(dst + from, src + src_from, bytes);
-	kunmap_atomic(wc->w_this_page, KM_USER1);
-	buf->ops->unmap(sp->s_pipe, buf, src);
+	brelse(wc->w_di_bh);
+	kfree(wc);
+}
+
+static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
+				  struct ocfs2_super *osb, loff_t pos,
+				  unsigned len)
+{
+	struct ocfs2_write_ctxt *wc;
+
+	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
+	if (!wc)
+		return -ENOMEM;
 
-	wc->w_finished_copy = 1;
+	wc->w_cpos = pos >> osb->s_clustersize_bits;
+	wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
 
-	*ret_from = from;
-	*ret_to = to;
-out:
+	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+		wc->w_large_pages = 1;
+	else
+		wc->w_large_pages = 0;
+
+	*wcp = wc;
 
-	return bytes ? (unsigned int)bytes : ret;
+	return 0;
 }
 
 /*
- * This will copy user data from the iovec in the buffered write
- * context.
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
  */
-int ocfs2_map_and_write_user_data(struct inode *inode,
-				  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
-				  unsigned int *ret_from, unsigned int *ret_to)
+static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 {
-	int ret;
-	unsigned int to, from, cluster_start, cluster_end;
-	unsigned long bytes, src_from;
-	char *dst;
-	struct ocfs2_buffered_write_priv *bp = wc->w_private;
-	const struct iovec *cur_iov = bp->b_cur_iov;
-	char __user *buf;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
 
-	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
-					&cluster_end);
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
 
-	buf = cur_iov->iov_base + bp->b_cur_off;
-	src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
+	bh = head = page_buffers(page);
+	block_start = 0;
+	do {
+		block_end = block_start + bh->b_size;
+
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, end;
+					void *kaddr;
+
+					start = max(from, block_start);
+					end = min(to, block_end);
+
+					kaddr = kmap_atomic(page, KM_USER0);
+					memset(kaddr+start, 0, end - start);
+					flush_dcache_page(page);
+					kunmap_atomic(kaddr, KM_USER0);
+					set_buffer_uptodate(bh);
+				}
+
+				clear_buffer_new(bh);
+				mark_buffer_dirty(bh);
+			}
+		}
 
-	from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+
+/*
+ * Only called when we have a failure during allocating write to write
+ * zero's to the newly allocated region.
+ */
+static void ocfs2_write_failure(struct inode *inode,
+				struct ocfs2_write_ctxt *wc,
+				loff_t user_pos, unsigned user_len)
+{
+	int i;
+	unsigned from, to;
+	struct page *tmppage;
+
+	ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
 
-	/*
-	 * This is a lot of comparisons, but it reads quite
-	 * easily, which is important here.
-	 */
-	/* Stay within the src page */
-	bytes = PAGE_SIZE - src_from;
-	/* Stay within the vector */
-	bytes = min(bytes,
-		    (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
-	/* Stay within count */
-	bytes = min(bytes, (unsigned long)wc->w_count);
-	/*
-	 * For clustersize > page size, just stay within
-	 * target page, otherwise we have to calculate pos
-	 * within the cluster and obey the rightmost
-	 * boundary.
-	 */
 	if (wc->w_large_pages) {
-		/*
-		 * For cluster size < page size, we have to
-		 * calculate pos within the cluster and obey
-		 * the rightmost boundary.
-		 */
-		bytes = min(bytes, (unsigned long)(osb->s_clustersize
-				   - (wc->w_pos & (osb->s_clustersize - 1))));
+		from = wc->w_target_from;
+		to = wc->w_target_to;
 	} else {
-		/*
-		 * cluster size > page size is the most common
-		 * case - we just stay within the target page
-		 * boundary.
-		 */
-		bytes = min(bytes, PAGE_CACHE_SIZE - from);
+		from = 0;
+		to = PAGE_CACHE_SIZE;
 	}
 
-	to = from + bytes;
+	for(i = 0; i < wc->w_num_pages; i++) {
+		tmppage = wc->w_pages[i];
 
-	BUG_ON(from > PAGE_CACHE_SIZE);
-	BUG_ON(to > PAGE_CACHE_SIZE);
-	BUG_ON(from < cluster_start);
-	BUG_ON(to > cluster_end);
+		if (ocfs2_should_order_data(inode))
+			walk_page_buffers(wc->w_handle, page_buffers(tmppage),
+					  from, to, NULL,
+					  ocfs2_journal_dirty_data);
 
-	if (wc->w_this_page_new)
-		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-					    cluster_start, cluster_end, 1);
-	else
-		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-					    from, to, 0);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
+		block_commit_write(tmppage, from, to);
 	}
-
-	dst = kmap(wc->w_this_page);
-	memcpy(dst + from, bp->b_src_buf + src_from, bytes);
-	kunmap(wc->w_this_page);
-
-	/*
-	 * XXX: This is slow, but simple. The caller of
-	 * ocfs2_buffered_write_cluster() is responsible for
-	 * passing through the iovecs, so it's difficult to
-	 * predict what our next step is in here after our
-	 * initial write. A future version should be pushing
-	 * that iovec manipulation further down.
-	 *
-	 * By setting this, we indicate that a copy from user
-	 * data was done, and subsequent calls for this
-	 * cluster will skip copying more data.
-	 */
-	wc->w_finished_copy = 1;
-
-	*ret_from = from;
-	*ret_to = to;
-out:
-
-	return bytes ? (unsigned int)bytes : ret;
 }
 
-/*
- * Map, fill and write a page to disk.
- *
- * The work of copying data is done via callback.  Newly allocated
- * pages which don't take user data will be zero'd (set 'new' to
- * indicate an allocating write)
- *
- * Returns a negative error code or the number of bytes copied into
- * the page.
- */
-static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
-				 u64 *p_blkno, struct page *page,
-				 struct ocfs2_write_ctxt *wc, int new)
+static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
+					struct ocfs2_write_ctxt *wc,
+					struct page *page, u32 cpos,
+					loff_t user_pos, unsigned user_len,
+					int new)
 {
-	int ret, copied = 0;
-	unsigned int from = 0, to = 0;
+	int ret;
+	unsigned int map_from = 0, map_to = 0;
 	unsigned int cluster_start, cluster_end;
-	unsigned int zero_from = 0, zero_to = 0;
+	unsigned int user_data_from = 0, user_data_to = 0;
 
-	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
+	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
 					&cluster_start, &cluster_end);
 
-	if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
-	    && !wc->w_finished_copy) {
-
-		wc->w_this_page = page;
-		wc->w_this_page_new = new;
-		ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
-		if (ret < 0) {
+	if (page == wc->w_target_page) {
+		map_from = user_pos & (PAGE_CACHE_SIZE - 1);
+		map_to = map_from + user_len;
+
+		if (new)
+			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+						    cluster_start, cluster_end,
+						    new);
+		else
+			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+						    map_from, map_to, new);
+		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		copied = ret;
-
-		zero_from = from;
-		zero_to = to;
+		user_data_from = map_from;
+		user_data_to = map_to;
 		if (new) {
-			from = cluster_start;
-			to = cluster_end;
+			map_from = cluster_start;
+			map_to = cluster_end;
 		}
+
+		wc->w_target_from = map_from;
+		wc->w_target_to = map_to;
 	} else {
 		/*
 		 * If we haven't allocated the new page yet, we
@@ -980,11 +996,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
 		 */
 		BUG_ON(!new);
 
-		from = cluster_start;
-		to = cluster_end;
+		map_from = cluster_start;
+		map_to = cluster_end;
 
 		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
-					    cluster_start, cluster_end, 1);
+					    cluster_start, cluster_end, new);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1003,108 +1019,84 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
 	 */
 	if (new && !PageUptodate(page))
 		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
-					 wc->w_cpos, zero_from, zero_to);
+					 cpos, user_data_from, user_data_to);
 
 	flush_dcache_page(page);
 
-	if (ocfs2_should_order_data(inode)) {
-		ret = walk_page_buffers(handle,
-					page_buffers(page),
-					from, to, NULL,
-					ocfs2_journal_dirty_data);
-		if (ret < 0)
-			mlog_errno(ret);
-	}
-
-	/*
-	 * We don't use generic_commit_write() because we need to
-	 * handle our own i_size update.
-	 */
-	ret = block_commit_write(page, from, to);
-	if (ret)
-		mlog_errno(ret);
 out:
-
-	return copied ? copied : ret;
+	return ret;
 }
 
 /*
- * Do the actual write of some data into an inode. Optionally allocate
- * in order to fulfill the write.
- *
- * cpos is the logical cluster offset within the file to write at
- *
- * 'phys' is the physical mapping of that offset. a 'phys' value of
- * zero indicates that allocation is required. In this case, data_ac
- * and meta_ac should be valid (meta_ac can be null if metadata
- * allocation isn't required).
+ * This function will only grab one clusters worth of pages.
  */
-static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
-			   struct buffer_head *di_bh,
-			   struct ocfs2_alloc_context *data_ac,
-			   struct ocfs2_alloc_context *meta_ac,
-			   struct ocfs2_write_ctxt *wc)
+static int ocfs2_grab_pages_for_write(struct address_space *mapping,
+				      struct ocfs2_write_ctxt *wc,
+				      u32 cpos, loff_t user_pos, int new)
 {
-	int ret, i, numpages = 1, new;
-	unsigned int copied = 0;
-	u32 tmp_pos;
-	u64 v_blkno, p_blkno;
-	struct address_space *mapping = file->f_mapping;
+	int ret = 0, i;
+	unsigned long start, target_index, index;
 	struct inode *inode = mapping->host;
-	unsigned long index, start;
-	struct page **cpages;
 
-	new = phys == 0 ? 1 : 0;
+	target_index = user_pos >> PAGE_CACHE_SHIFT;
 
 	/*
 	 * Figure out how many pages we'll be manipulating here. For
 	 * non allocating write, we just change the one
 	 * page. Otherwise, we'll need a whole clusters worth.
 	 */
-	if (new)
-		numpages = ocfs2_pages_per_cluster(inode->i_sb);
-
-	cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
-	if (!cpages) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		return ret;
-	}
-
-	/*
-	 * Fill our page array first. That way we've grabbed enough so
-	 * that we can zero and flush if we error after adding the
-	 * extent.
-	 */
 	if (new) {
-		start = ocfs2_align_clusters_to_page_index(inode->i_sb,
-							   wc->w_cpos);
-		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
+		wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
+		start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
 	} else {
-		start = wc->w_pos >> PAGE_CACHE_SHIFT;
-		v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
+		wc->w_num_pages = 1;
+		start = target_index;
 	}
 
-	for(i = 0; i < numpages; i++) {
+	for(i = 0; i < wc->w_num_pages; i++) {
 		index = start + i;
 
-		cpages[i] = find_or_create_page(mapping, index, GFP_NOFS);
-		if (!cpages[i]) {
+		wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS);
+		if (!wc->w_pages[i]) {
 			ret = -ENOMEM;
 			mlog_errno(ret);
 			goto out;
 		}
+
+		if (index == target_index)
+			wc->w_target_page = wc->w_pages[i];
 	}
+out:
+	return ret;
+}
+
+/*
+ * Prepare a single cluster for write one cluster into the file.
+ */
+static int ocfs2_write_cluster(struct address_space *mapping,
+			       u32 phys, struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       struct ocfs2_write_ctxt *wc, u32 cpos,
+			       loff_t user_pos, unsigned user_len)
+{
+	int ret, i, new;
+	u64 v_blkno, p_blkno;
+	struct inode *inode = mapping->host;
+
+	new = phys == 0 ? 1 : 0;
 
 	if (new) {
+		u32 tmp_pos;
+
 		/*
 		 * This is safe to call with the page locks - it won't take
 		 * any additional semaphores or cluster locks.
 		 */
-		tmp_pos = wc->w_cpos;
+		tmp_pos = cpos;
 		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
-						 &tmp_pos, 1, di_bh, handle,
-						 data_ac, meta_ac, NULL);
+						 &tmp_pos, 1, wc->w_di_bh,
+						 wc->w_handle, data_ac,
+						 meta_ac, NULL);
 		/*
 		 * This shouldn't happen because we must have already
 		 * calculated the correct meta data allocation required. The
@@ -1121,103 +1113,132 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
 			mlog_errno(ret);
 			goto out;
 		}
+
+		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
+	} else {
+		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
 	}
 
+	/*
+	 * The only reason this should fail is due to an inability to
+	 * find the extent added.
+	 */
 	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
 					  NULL);
 	if (ret < 0) {
-
-		/*
-		 * XXX: Should we go readonly here?
-		 */
-
-		mlog_errno(ret);
+		ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
+			    "at logical block %llu",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)v_blkno);
 		goto out;
 	}
 
 	BUG_ON(p_blkno == 0);
 
-	for(i = 0; i < numpages; i++) {
-		ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
-					    wc, new);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
+	for(i = 0; i < wc->w_num_pages; i++) {
+		int tmpret;
 
-		copied += ret;
+		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
+						      wc->w_pages[i], cpos,
+						      user_pos, user_len, new);
+		if (tmpret) {
+			mlog_errno(tmpret);
+			if (ret == 0)
+				tmpret = ret;
+		}
 	}
 
+	/*
+	 * We only have cleanup to do in case of allocating write.
+	 */
+	if (ret && new)
+		ocfs2_write_failure(inode, wc, user_pos, user_len);
+
 out:
-	for(i = 0; i < numpages; i++) {
-		unlock_page(cpages[i]);
-		mark_page_accessed(cpages[i]);
-		page_cache_release(cpages[i]);
-	}
-	kfree(cpages);
 
-	return copied ? copied : ret;
+	return ret;
 }
 
-static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
-				  struct ocfs2_super *osb, loff_t pos,
-				  size_t count, ocfs2_page_writer *cb,
-				  void *cb_priv)
+/*
+ * ocfs2_write_end() wants to know which parts of the target page it
+ * should complete the write on. It's easiest to compute them ahead of
+ * time when a more complete view of the write is available.
+ */
+static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
+					struct ocfs2_write_ctxt *wc,
+					loff_t pos, unsigned len, int alloc)
 {
-	wc->w_count = count;
-	wc->w_pos = pos;
-	wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
-	wc->w_finished_copy = 0;
+	struct ocfs2_write_cluster_desc *desc;
 
-	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
-		wc->w_large_pages = 1;
-	else
-		wc->w_large_pages = 0;
+	wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+	wc->w_target_to = wc->w_target_from + len;
+
+	if (alloc == 0)
+		return;
+
+	/*
+	 * Allocating write - we may have different boundaries based
+	 * on page size and cluster size.
+	 *
+	 * NOTE: We can no longer compute one value from the other as
+	 * the actual write length and user provided length may be
+	 * different.
+	 */
 
-	wc->w_write_data_page = cb;
-	wc->w_private = cb_priv;
+	if (wc->w_large_pages) {
+		/*
+		 * We only care about the 1st and last cluster within
+		 * our range and whether they are holes or not. Either
+		 * value may be extended out to the start/end of a
+		 * newly allocated cluster.
+		 */
+		desc = &wc->w_desc[0];
+		if (desc->c_new)
+			ocfs2_figure_cluster_boundaries(osb,
+							desc->c_cpos,
+							&wc->w_target_from,
+							NULL);
+
+		desc = &wc->w_desc[wc->w_clen - 1];
+		if (desc->c_new)
+			ocfs2_figure_cluster_boundaries(osb,
+							desc->c_cpos,
+							NULL,
+							&wc->w_target_to);
+	} else {
+		wc->w_target_from = 0;
+		wc->w_target_to = PAGE_CACHE_SIZE;
+	}
 }
 
-/*
- * Write a cluster to an inode. The cluster may not be allocated yet,
- * in which case it will be. This only exists for buffered writes -
- * O_DIRECT takes a more "traditional" path through the kernel.
- *
- * The caller is responsible for incrementing pos, written counts, etc
- *
- * For file systems that don't support sparse files, pre-allocation
- * and page zeroing up until cpos should be done prior to this
- * function call.
- *
- * Callers should be holding i_sem, and the rw cluster lock.
- *
- * Returns the number of user bytes written, or less than zero for
- * error.
- */
-ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
-				     size_t count, ocfs2_page_writer *actor,
-				     void *priv)
+int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+		      loff_t pos, unsigned len, unsigned flags,
+		      struct page **pagep, void **fsdata)
 {
-	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
-	ssize_t written = 0;
-	u32 phys;
-	struct inode *inode = file->f_mapping->host;
+	int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS;
+	unsigned int num_clusters = 0, clusters_to_alloc = 0;
+	u32 phys = 0;
+	struct ocfs2_write_ctxt *wc;
+	struct inode *inode = mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle;
-	struct ocfs2_write_ctxt wc;
+	struct ocfs2_write_cluster_desc *desc;
 
-	ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
+	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
 
-	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	ret = ocfs2_meta_lock(inode, &wc->w_di_bh, 1);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
 	}
-	di = (struct ocfs2_dinode *)di_bh->b_data;
+	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
 
 	/*
 	 * Take alloc sem here to prevent concurrent lookups. That way
@@ -1228,23 +1249,60 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
 	 */
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_meta;
+	for (i = 0; i < wc->w_clen; i++) {
+		desc = &wc->w_desc[i];
+		desc->c_cpos = wc->w_cpos + i;
+
+		if (num_clusters == 0) {
+			ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
+						 &num_clusters, NULL);
+			if (ret) {
+				mlog_errno(ret);
+				goto out_meta;
+			}
+		} else if (phys) {
+			/*
+			 * Only increment phys if it doesn't describe
+			 * a hole.
+			 */
+			phys++;
+		}
+
+		desc->c_phys = phys;
+		if (phys == 0) {
+			desc->c_new = 1;
+			clusters_to_alloc++;
+		}
+
+		num_clusters--;
 	}
 
-	/* phys == 0 means that allocation is required. */
-	if (phys == 0) {
-		ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
+	/*
+	 * We set w_target_from, w_target_to here so that
+	 * ocfs2_write_end() knows which range in the target page to
+	 * write out. An allocation requires that we write the entire
+	 * cluster range.
+	 */
+	if (clusters_to_alloc > 0) {
+		/*
+		 * XXX: We are stretching the limits of
+		 * ocfs2_lock_allocators(). It greately over-estimates
+		 * the work to be done.
+		 */
+		ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
+					    &data_ac, &meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out_meta;
 		}
 
-		credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
+		credits = ocfs2_calc_extend_credits(inode->i_sb, di,
+						    clusters_to_alloc);
+
 	}
 
+	ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc);
+
 	ret = ocfs2_data_lock(inode, 1);
 	if (ret) {
 		mlog_errno(ret);
@@ -1258,36 +1316,50 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
 		goto out_data;
 	}
 
-	written = ocfs2_write(file, phys, handle, di_bh, data_ac,
-			      meta_ac, &wc);
-	if (written < 0) {
-		ret = written;
+	wc->w_handle = handle;
+
+	/*
+	 * We don't want this to fail in ocfs2_write_end(), so do it
+	 * here.
+	 */
+	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
 	}
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
+	/*
+	 * Fill our page array first. That way we've grabbed enough so
+	 * that we can zero and flush if we error after adding the
+	 * extent.
+	 */
+	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+					 clusters_to_alloc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
 	}
 
-	pos += written;
-	if (pos > inode->i_size) {
-		i_size_write(inode, pos);
-		mark_inode_dirty(inode);
+	for (i = 0; i < wc->w_clen; i++) {
+		desc = &wc->w_desc[i];
+
+		ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac,
+					  meta_ac, wc, desc->c_cpos, pos, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_commit;
+		}
 	}
-	inode->i_blocks = ocfs2_inode_sector_count(inode);
-	di->i_size = cpu_to_le64((u64)i_size_read(inode));
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret)
-		mlog_errno(ret);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
 
+	*pagep = wc->w_target_page;
+	*fsdata = wc;
+	return 0;
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 
@@ -1299,13 +1371,85 @@ out_meta:
 	ocfs2_meta_unlock(inode, 1);
 
 out:
-	brelse(di_bh);
+	ocfs2_free_write_ctxt(wc);
+
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+int ocfs2_write_end(struct file *file, struct address_space *mapping,
+		    loff_t pos, unsigned len, unsigned copied,
+		    struct page *page, void *fsdata)
+{
+	int i;
+	unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
+	struct inode *inode = mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_write_ctxt *wc = fsdata;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+	handle_t *handle = wc->w_handle;
+	struct page *tmppage;
+
+	if (unlikely(copied < len)) {
+		if (!PageUptodate(wc->w_target_page))
+			copied = 0;
+
+		ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
+				       start+len);
+	}
+	flush_dcache_page(wc->w_target_page);
+
+	for(i = 0; i < wc->w_num_pages; i++) {
+		tmppage = wc->w_pages[i];
+
+		if (tmppage == wc->w_target_page) {
+			from = wc->w_target_from;
+			to = wc->w_target_to;
+
+			BUG_ON(from > PAGE_CACHE_SIZE ||
+			       to > PAGE_CACHE_SIZE ||
+			       to < from);
+		} else {
+			/*
+			 * Pages adjacent to the target (if any) imply
+			 * a hole-filling write in which case we want
+			 * to flush their entire range.
+			 */
+			from = 0;
+			to = PAGE_CACHE_SIZE;
+		}
+
+		if (ocfs2_should_order_data(inode))
+			walk_page_buffers(wc->w_handle, page_buffers(tmppage),
+					  from, to, NULL,
+					  ocfs2_journal_dirty_data);
+
+		block_commit_write(tmppage, from, to);
+	}
+
+	pos += copied;
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	di->i_size = cpu_to_le64((u64)i_size_read(inode));
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	ocfs2_journal_dirty(handle, wc->w_di_bh);
+
+	ocfs2_commit_trans(osb, handle);
+	ocfs2_data_unlock(inode, 1);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 1);
+	ocfs2_free_write_ctxt(wc);
 
-	return written ? written : ret;
+	return copied;
 }
 
 const struct address_space_operations ocfs2_aops = {
-- 
cgit v1.2.3


From 607d44aa3fa6f40b0facaf1028886ed362b92682 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 9 May 2007 15:14:45 -0700
Subject: ocfs2: factor out write aops into nolock variants

ocfs2_mkwrite() will want this so that it can add some mmap specific checks
before asking for a write.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 120 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 80 insertions(+), 40 deletions(-)

(limited to 'fs/ocfs2/aops.c')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3e5758ebd932..fc723fb9c981 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -849,7 +849,7 @@ static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
 
 static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 				  struct ocfs2_super *osb, loff_t pos,
-				  unsigned len)
+				  unsigned len, struct buffer_head *di_bh)
 {
 	struct ocfs2_write_ctxt *wc;
 
@@ -859,6 +859,8 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 
 	wc->w_cpos = pos >> osb->s_clustersize_bits;
 	wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
+	get_bh(di_bh);
+	wc->w_di_bh = di_bh;
 
 	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
 		wc->w_large_pages = 1;
@@ -1211,9 +1213,10 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 	}
 }
 
-int ocfs2_write_begin(struct file *file, struct address_space *mapping,
-		      loff_t pos, unsigned len, unsigned flags,
-		      struct page **pagep, void **fsdata)
+static int ocfs2_write_begin_nolock(struct address_space *mapping,
+				    loff_t pos, unsigned len, unsigned flags,
+				    struct page **pagep, void **fsdata,
+				    struct buffer_head *di_bh)
 {
 	int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS;
 	unsigned int num_clusters = 0, clusters_to_alloc = 0;
@@ -1227,28 +1230,14 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 	handle_t *handle;
 	struct ocfs2_write_cluster_desc *desc;
 
-	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len);
+	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
 	if (ret) {
 		mlog_errno(ret);
 		return ret;
 	}
 
-	ret = ocfs2_meta_lock(inode, &wc->w_di_bh, 1);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
 	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
 
-	/*
-	 * Take alloc sem here to prevent concurrent lookups. That way
-	 * the mapping, zeroing and tree manipulation within
-	 * ocfs2_write() will be safe against ->readpage(). This
-	 * should also serve to lock out allocation from a shared
-	 * writeable region.
-	 */
-	down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
 		desc->c_cpos = wc->w_cpos + i;
@@ -1258,7 +1247,7 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 						 &num_clusters, NULL);
 			if (ret) {
 				mlog_errno(ret);
-				goto out_meta;
+				goto out;
 			}
 		} else if (phys) {
 			/*
@@ -1293,7 +1282,7 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 					    &data_ac, &meta_ac);
 		if (ret) {
 			mlog_errno(ret);
-			goto out_meta;
+			goto out;
 		}
 
 		credits = ocfs2_calc_extend_credits(inode->i_sb, di,
@@ -1303,17 +1292,11 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 
 	ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc);
 
-	ret = ocfs2_data_lock(inode, 1);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_meta;
-	}
-
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
-		goto out_data;
+		goto out;
 	}
 
 	wc->w_handle = handle;
@@ -1363,13 +1346,6 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 
-out_data:
-	ocfs2_data_unlock(inode, 1);
-
-out_meta:
-	up_write(&OCFS2_I(inode)->ip_alloc_sem);
-	ocfs2_meta_unlock(inode, 1);
-
 out:
 	ocfs2_free_write_ctxt(wc);
 
@@ -1380,9 +1356,60 @@ out:
 	return ret;
 }
 
-int ocfs2_write_end(struct file *file, struct address_space *mapping,
-		    loff_t pos, unsigned len, unsigned copied,
-		    struct page *page, void *fsdata)
+int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+		      loff_t pos, unsigned len, unsigned flags,
+		      struct page **pagep, void **fsdata)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct inode *inode = mapping->host;
+
+	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/*
+	 * Take alloc sem here to prevent concurrent lookups. That way
+	 * the mapping, zeroing and tree manipulation within
+	 * ocfs2_write() will be safe against ->readpage(). This
+	 * should also serve to lock out allocation from a shared
+	 * writeable region.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = ocfs2_data_lock(inode, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_fail;
+	}
+
+	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+				       fsdata, di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_fail_data;
+	}
+
+	brelse(di_bh);
+
+	return 0;
+
+out_fail_data:
+	ocfs2_data_unlock(inode, 1);
+out_fail:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	brelse(di_bh);
+	ocfs2_meta_unlock(inode, 1);
+
+	return ret;
+}
+
+static int ocfs2_write_end_nolock(struct address_space *mapping,
+				  loff_t pos, unsigned len, unsigned copied,
+				  struct page *page, void *fsdata)
 {
 	int i;
 	unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
@@ -1444,12 +1471,25 @@ int ocfs2_write_end(struct file *file, struct address_space *mapping,
 	ocfs2_journal_dirty(handle, wc->w_di_bh);
 
 	ocfs2_commit_trans(osb, handle);
+	ocfs2_free_write_ctxt(wc);
+
+	return copied;
+}
+
+int ocfs2_write_end(struct file *file, struct address_space *mapping,
+		    loff_t pos, unsigned len, unsigned copied,
+		    struct page *page, void *fsdata)
+{
+	int ret;
+	struct inode *inode = mapping->host;
+
+	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
+
 	ocfs2_data_unlock(inode, 1);
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 	ocfs2_meta_unlock(inode, 1);
-	ocfs2_free_write_ctxt(wc);
 
-	return copied;
+	return ret;
 }
 
 const struct address_space_operations ocfs2_aops = {
-- 
cgit v1.2.3


From 7307de80510a70e5e5aa98de1e80ccbb7d90a3a8 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Wed, 9 May 2007 15:16:19 -0700
Subject: ocfs2: shared writeable mmap

Implement cluster consistent shared writeable mappings using the
->page_mkwrite() callback.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c |  56 ++++++++++++++-----
 fs/ocfs2/aops.h |   9 +++
 fs/ocfs2/file.c |   7 +++
 fs/ocfs2/mmap.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++++--------
 4 files changed, 200 insertions(+), 39 deletions(-)

(limited to 'fs/ocfs2/aops.c')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index fc723fb9c981..b8869fd0884f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1034,7 +1034,8 @@ out:
  */
 static int ocfs2_grab_pages_for_write(struct address_space *mapping,
 				      struct ocfs2_write_ctxt *wc,
-				      u32 cpos, loff_t user_pos, int new)
+				      u32 cpos, loff_t user_pos, int new,
+				      struct page *mmap_page)
 {
 	int ret = 0, i;
 	unsigned long start, target_index, index;
@@ -1058,11 +1059,36 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
 	for(i = 0; i < wc->w_num_pages; i++) {
 		index = start + i;
 
-		wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS);
-		if (!wc->w_pages[i]) {
-			ret = -ENOMEM;
-			mlog_errno(ret);
-			goto out;
+		if (index == target_index && mmap_page) {
+			/*
+			 * ocfs2_pagemkwrite() is a little different
+			 * and wants us to directly use the page
+			 * passed in.
+			 */
+			lock_page(mmap_page);
+
+			if (mmap_page->mapping != mapping) {
+				unlock_page(mmap_page);
+				/*
+				 * Sanity check - the locking in
+				 * ocfs2_pagemkwrite() should ensure
+				 * that this code doesn't trigger.
+				 */
+				ret = -EINVAL;
+				mlog_errno(ret);
+				goto out;
+			}
+
+			page_cache_get(mmap_page);
+			wc->w_pages[i] = mmap_page;
+		} else {
+			wc->w_pages[i] = find_or_create_page(mapping, index,
+							     GFP_NOFS);
+			if (!wc->w_pages[i]) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
 		}
 
 		if (index == target_index)
@@ -1213,10 +1239,10 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 	}
 }
 
-static int ocfs2_write_begin_nolock(struct address_space *mapping,
-				    loff_t pos, unsigned len, unsigned flags,
-				    struct page **pagep, void **fsdata,
-				    struct buffer_head *di_bh)
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata,
+			     struct buffer_head *di_bh, struct page *mmap_page)
 {
 	int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS;
 	unsigned int num_clusters = 0, clusters_to_alloc = 0;
@@ -1318,7 +1344,7 @@ static int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * extent.
 	 */
 	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
-					 clusters_to_alloc);
+					 clusters_to_alloc, mmap_page);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1386,7 +1412,7 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping,
 	}
 
 	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
-				       fsdata, di_bh);
+				       fsdata, di_bh, NULL);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_fail_data;
@@ -1407,9 +1433,9 @@ out_fail:
 	return ret;
 }
 
-static int ocfs2_write_end_nolock(struct address_space *mapping,
-				  loff_t pos, unsigned len, unsigned copied,
-				  struct page *page, void *fsdata)
+int ocfs2_write_end_nolock(struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata)
 {
 	int i;
 	unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index bdcdd1ae63a9..389579bd64e3 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -50,6 +50,15 @@ int ocfs2_write_end(struct file *file, struct address_space *mapping,
 		    loff_t pos, unsigned len, unsigned copied,
 		    struct page *page, void *fsdata);
 
+int ocfs2_write_end_nolock(struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata);
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata,
+			     struct buffer_head *di_bh, struct page *mmap_page);
+
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
 	test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4c850d00c269..a80f31776d94 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1001,6 +1001,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		goto bail_unlock;
 	}
 
+	/*
+	 * This will intentionally not wind up calling vmtruncate(),
+	 * since all the work for a size change has been done above.
+	 * Otherwise, we could get into problems with truncate as
+	 * ip_alloc_sem is used there to protect against i_size
+	 * changes.
+	 */
 	status = inode_setattr(inode, attr);
 	if (status < 0) {
 		mlog_errno(status);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af01158b39f5..d79aa12137d2 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -37,11 +37,29 @@
 
 #include "ocfs2.h"
 
+#include "aops.h"
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
 #include "mmap.h"
 
+static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
+{
+	/* The best way to deal with signals in the vm path is
+	 * to block them upfront, rather than allowing the
+	 * locking paths to return -ERESTARTSYS. */
+	sigfillset(blocked);
+
+	/* We should technically never get a bad return value
+	 * from sigprocmask */
+	return sigprocmask(SIG_BLOCK, blocked, oldset);
+}
+
+static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
+{
+	return sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
 static struct page *ocfs2_nopage(struct vm_area_struct * area,
 				 unsigned long address,
 				 int *type)
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
 	mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
 		   type);
 
-	/* The best way to deal with signals in this path is
-	 * to block them upfront, rather than allowing the
-	 * locking paths to return -ERESTARTSYS. */
-	sigfillset(&blocked);
-
-	/* We should technically never get a bad ret return
-	 * from sigprocmask */
-	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
 
 	page = filemap_nopage(area, address, type);
 
-	ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	ret = ocfs2_vm_op_unblock_sigs(&oldset);
 	if (ret < 0)
 		mlog_errno(ret);
 out:
@@ -76,28 +87,136 @@ out:
 	return page;
 }
 
-static struct vm_operations_struct ocfs2_file_vm_ops = {
-	.nopage = ocfs2_nopage,
-};
+static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+				struct page *page)
+{
+	int ret;
+	struct address_space *mapping = inode->i_mapping;
+	loff_t pos = page->index << PAGE_CACHE_SHIFT;
+	unsigned int len = PAGE_CACHE_SIZE;
+	pgoff_t last_index;
+	struct page *locked_page = NULL;
+	void *fsdata;
+	loff_t size = i_size_read(inode);
 
-int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+	/*
+	 * Another node might have truncated while we were waiting on
+	 * cluster locks.
+	 */
+	last_index = size >> PAGE_CACHE_SHIFT;
+	if (page->index > last_index) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * The i_size check above doesn't catch the case where nodes
+	 * truncated and then re-extended the file. We'll re-check the
+	 * page mapping after taking the page lock inside of
+	 * ocfs2_write_begin_nolock().
+	 */
+	if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Call ocfs2_write_begin() and ocfs2_write_end() to take
+	 * advantage of the allocation code there. We pass a write
+	 * length of the whole page (chopped to i_size) to make sure
+	 * the whole thing is allocated.
+	 *
+	 * Since we know the page is up to date, we don't have to
+	 * worry about ocfs2_write_begin() skipping some buffer reads
+	 * because the "write" would invalidate their data.
+	 */
+	if (page->index == last_index)
+		len = size & ~PAGE_CACHE_MASK;
+
+	ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+				       &fsdata, di_bh, page);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+				     fsdata);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+	BUG_ON(ret != len);
+	ret = 0;
+out:
+	return ret;
+}
+
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
-	int ret = 0, lock_level = 0;
-	struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct buffer_head *di_bh = NULL;
+	sigset_t blocked, oldset;
+	int ret, ret2;
+
+	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/*
+	 * The cluster locks taken will block a truncate from another
+	 * node. Taking the data lock will also ensure that we don't
+	 * attempt page truncation as part of a downconvert.
+	 */
+	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
 
 	/*
-	 * Only support shared writeable mmap for local mounts which
-	 * don't know about holes.
+	 * The alloc sem should be enough to serialize with
+	 * ocfs2_truncate_file() changing i_size as well as any thread
+	 * modifying the inode btree.
 	 */
-	if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
-	    ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
-	    ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
-		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
-		/* This is -EINVAL because generic_file_readonly_mmap
-		 * returns it in a similar situation. */
-		return -EINVAL;
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = ocfs2_data_lock(inode, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_meta_unlock;
 	}
 
+	ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+
+	ocfs2_data_unlock(inode, 1);
+
+out_meta_unlock:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	brelse(di_bh);
+	ocfs2_meta_unlock(inode, 1);
+
+out:
+	ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
+	if (ret2 < 0)
+		mlog_errno(ret2);
+
+	return ret;
+}
+
+static struct vm_operations_struct ocfs2_file_vm_ops = {
+	.nopage		= ocfs2_nopage,
+	.page_mkwrite	= ocfs2_page_mkwrite,
+};
+
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int ret = 0, lock_level = 0;
+
 	ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
 				    file->f_vfsmnt, &lock_level);
 	if (ret < 0) {
-- 
cgit v1.2.3


From bce997682fe3121516f5a20cf7bad2e6029ba018 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 18 Jun 2007 11:12:36 -0700
Subject: ocfs2: harden buffer check during mapping of page blocks

We don't want to submit buffer_new blocks for read i/o. This actually won't
happen right now because those requests during an allocating write are all nicely
aligned. It's probably a good idea to provide an explicit check though.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/ocfs2/aops.c')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b8869fd0884f..e8d16ae12ef0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -712,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 			if (!buffer_uptodate(bh))
 				set_buffer_uptodate(bh);
 		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
-		     (block_start < from || block_end > to)) {
+			   !buffer_new(bh) &&
+			   (block_start < from || block_end > to)) {
 			ll_rw_block(READ, 1, &bh);
 			*wait_bh++=bh;
 		}
-- 
cgit v1.2.3


From 59a5e416d1ab543a5248a2b34d83202c4d55d132 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 22 Jun 2007 15:52:36 -0700
Subject: ocfs2: plug truncate into cached dealloc routines

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c    | 102 ++++++++++++++--------------------------------------
 fs/ocfs2/alloc.h    |   3 +-
 fs/ocfs2/aops.c     |   1 +
 fs/ocfs2/suballoc.c |  13 -------
 fs/ocfs2/suballoc.h |   4 ---
 5 files changed, 29 insertions(+), 94 deletions(-)

(limited to 'fs/ocfs2/aops.c')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 873bb99fc2ff..26e867087e95 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
 #include "buffer_head_io.h"
 
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+					 struct ocfs2_extent_block *eb);
 
 /*
  * Structures which describe a path through a btree, and functions to
@@ -3161,6 +3163,15 @@ out:
 	return ret;
 }
 
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+					 struct ocfs2_extent_block *eb)
+{
+	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
+					 le16_to_cpu(eb->h_suballoc_slot),
+					 le64_to_cpu(eb->h_blkno),
+					 le16_to_cpu(eb->h_suballoc_bit));
+}
+
 /* This function will figure out whether the currently last extent
  * block will be deleted, and if it will, what the new last extent
  * block will be so we can update his h_next_leaf_blk field, as well
@@ -3442,27 +3453,10 @@ delete:
 			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
 			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
 
-			if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
-				/*
-				 * This code only understands how to
-				 * lock the suballocator in slot 0,
-				 * which is fine because allocation is
-				 * only ever done out of that
-				 * suballocator too. A future version
-				 * might change that however, so avoid
-				 * a free if we don't know how to
-				 * handle it. This way an fs incompat
-				 * bit will not be necessary.
-				 */
-				ret = ocfs2_free_extent_block(handle,
-							      tc->tc_ext_alloc_inode,
-							      tc->tc_ext_alloc_bh,
-							      eb);
-
-				/* An error here is not fatal. */
-				if (ret < 0)
-					mlog_errno(ret);
-			}
+			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
+			/* An error here is not fatal. */
+			if (ret < 0)
+				mlog_errno(ret);
 		} else {
 			deleted_eb = 0;
 		}
@@ -3965,6 +3959,8 @@ bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
+	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+
 	ocfs2_free_path(path);
 
 	/* This will drop the ext_alloc cluster lock for us */
@@ -3975,23 +3971,18 @@ bail:
 }
 
 /*
- * Expects the inode to already be locked. This will figure out which
- * inodes need to be locked and will put them on the returned truncate
- * context.
+ * Expects the inode to already be locked.
  */
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *fe_bh,
 			   struct ocfs2_truncate_context **tc)
 {
-	int status, metadata_delete, i;
+	int status;
 	unsigned int new_i_clusters;
 	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list *el;
 	struct buffer_head *last_eb_bh = NULL;
-	struct inode *ext_alloc_inode = NULL;
-	struct buffer_head *ext_alloc_bh = NULL;
 
 	mlog_entry_void();
 
@@ -4011,12 +4002,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 		mlog_errno(status);
 		goto bail;
 	}
+	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
 
-	metadata_delete = 0;
 	if (fe->id2.i_list.l_tree_depth) {
-		/* If we have a tree, then the truncate may result in
-		 * metadata deletes. Figure this out from the
-		 * rightmost leaf block.*/
 		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
 					  &last_eb_bh, OCFS2_BH_CACHED, inode);
 		if (status < 0) {
@@ -4031,43 +4019,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			status = -EIO;
 			goto bail;
 		}
-		el = &(eb->h_list);
-
-		i = 0;
-		if (ocfs2_is_empty_extent(&el->l_recs[0]))
-			i = 1;
-		/*
-		 * XXX: Should we check that next_free_rec contains
-		 * the extent?
-		 */
-		if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
-			metadata_delete = 1;
 	}
 
 	(*tc)->tc_last_eb_bh = last_eb_bh;
 
-	if (metadata_delete) {
-		mlog(0, "Will have to delete metadata for this trunc. "
-		     "locking allocator.\n");
-		ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
-		if (!ext_alloc_inode) {
-			status = -ENOMEM;
-			mlog_errno(status);
-			goto bail;
-		}
-
-		mutex_lock(&ext_alloc_inode->i_mutex);
-		(*tc)->tc_ext_alloc_inode = ext_alloc_inode;
-
-		status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-		(*tc)->tc_ext_alloc_bh = ext_alloc_bh;
-		(*tc)->tc_ext_alloc_locked = 1;
-	}
-
 	status = 0;
 bail:
 	if (status < 0) {
@@ -4081,16 +4036,13 @@ bail:
 
 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
 {
-	if (tc->tc_ext_alloc_inode) {
-		if (tc->tc_ext_alloc_locked)
-			ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
-
-		mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
-		iput(tc->tc_ext_alloc_inode);
-	}
-
-	if (tc->tc_ext_alloc_bh)
-		brelse(tc->tc_ext_alloc_bh);
+	/*
+	 * The caller is responsible for completing deallocation
+	 * before freeing the context.
+	 */
+	if (tc->tc_dealloc.c_first_suballocator != NULL)
+		mlog(ML_NOTICE,
+		     "Truncate completion has non-empty dealloc context\n");
 
 	if (tc->tc_last_eb_bh)
 		brelse(tc->tc_last_eb_bh);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 01db0adc2150..cb02e53b593c 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -83,8 +83,7 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		       struct ocfs2_cached_dealloc_ctxt *ctxt);
 
 struct ocfs2_truncate_context {
-	struct inode *tc_ext_alloc_inode;
-	struct buffer_head *tc_ext_alloc_bh;
+	struct ocfs2_cached_dealloc_ctxt tc_dealloc;
 	int tc_ext_alloc_locked; /* is it cluster locked? */
 	/* these get destroyed once it's passed to ocfs2_commit_truncate. */
 	struct buffer_head *tc_last_eb_bh;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e8d16ae12ef0..510bf84c9cf5 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1498,6 +1498,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 	ocfs2_journal_dirty(handle, wc->w_di_bh);
 
 	ocfs2_commit_trans(osb, handle);
+
 	ocfs2_free_write_ctxt(wc);
 
 	return copied;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6788f2f1a667..82bf12f887a6 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1708,19 +1708,6 @@ int ocfs2_free_dinode(handle_t *handle,
 					inode_alloc_bh, bit, bg_blkno, 1);
 }
 
-int ocfs2_free_extent_block(handle_t *handle,
-			    struct inode *eb_alloc_inode,
-			    struct buffer_head *eb_alloc_bh,
-			    struct ocfs2_extent_block *eb)
-{
-	u64 blk = le64_to_cpu(eb->h_blkno);
-	u16 bit = le16_to_cpu(eb->h_suballoc_bit);
-	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
-
-	return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
-					bit, bg_blkno, 1);
-}
-
 int ocfs2_free_clusters(handle_t *handle,
 		       struct inode *bitmap_inode,
 		       struct buffer_head *bitmap_bh,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 7bc4819db4db..f212dc01a84b 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -96,10 +96,6 @@ int ocfs2_free_dinode(handle_t *handle,
 		      struct inode *inode_alloc_inode,
 		      struct buffer_head *inode_alloc_bh,
 		      struct ocfs2_dinode *di);
-int ocfs2_free_extent_block(handle_t *handle,
-			    struct inode *eb_alloc_inode,
-			    struct buffer_head *eb_alloc_bh,
-			    struct ocfs2_extent_block *eb);
 int ocfs2_free_clusters(handle_t *handle,
 			struct inode *bitmap_inode,
 			struct buffer_head *bitmap_bh,
-- 
cgit v1.2.3


From 0d172baa5586071ae0ae0c07356a378fdbedecdb Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 14 May 2007 18:09:54 -0700
Subject: ocfs2: small cleanup of ocfs2_write_begin_nolock()

We can easily seperate out the write descriptor setup and manipulation
into helper functions.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 108 +++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 76 insertions(+), 32 deletions(-)

(limited to 'fs/ocfs2/aops.c')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 510bf84c9cf5..077583b50391 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1188,6 +1188,31 @@ out:
 	return ret;
 }
 
+static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
+				       struct ocfs2_alloc_context *data_ac,
+				       struct ocfs2_alloc_context *meta_ac,
+				       struct ocfs2_write_ctxt *wc,
+				       loff_t pos, unsigned len)
+{
+	int ret, i;
+	struct ocfs2_write_cluster_desc *desc;
+
+	for (i = 0; i < wc->w_clen; i++) {
+		desc = &wc->w_desc[i];
+
+		ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac,
+					  meta_ac, wc, desc->c_cpos, pos, len);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
 /*
  * ocfs2_write_end() wants to know which parts of the target page it
  * should complete the write on. It's easiest to compute them ahead of
@@ -1240,30 +1265,19 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 	}
 }
 
-int ocfs2_write_begin_nolock(struct address_space *mapping,
-			     loff_t pos, unsigned len, unsigned flags,
-			     struct page **pagep, void **fsdata,
-			     struct buffer_head *di_bh, struct page *mmap_page)
+/*
+ * Populate each single-cluster write descriptor in the write context
+ * with information about the i/o to be done.
+ */
+static int ocfs2_populate_write_desc(struct inode *inode,
+				     struct ocfs2_write_ctxt *wc,
+				     unsigned int *clusters_to_alloc)
 {
-	int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS;
-	unsigned int num_clusters = 0, clusters_to_alloc = 0;
-	u32 phys = 0;
-	struct ocfs2_write_ctxt *wc;
-	struct inode *inode = mapping->host;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *di;
-	struct ocfs2_alloc_context *data_ac = NULL;
-	struct ocfs2_alloc_context *meta_ac = NULL;
-	handle_t *handle;
+	int ret;
 	struct ocfs2_write_cluster_desc *desc;
-
-	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
-	if (ret) {
-		mlog_errno(ret);
-		return ret;
-	}
-
-	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+	unsigned int num_clusters = 0;
+	u32 phys = 0;
+	int i;
 
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
@@ -1287,12 +1301,46 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		desc->c_phys = phys;
 		if (phys == 0) {
 			desc->c_new = 1;
-			clusters_to_alloc++;
+			*clusters_to_alloc = *clusters_to_alloc + 1;
 		}
 
 		num_clusters--;
 	}
 
+	ret = 0;
+out:
+	return ret;
+}
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata,
+			     struct buffer_head *di_bh, struct page *mmap_page)
+{
+	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+	unsigned int clusters_to_alloc = 0;
+	struct ocfs2_write_ctxt *wc;
+	struct inode *inode = mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle;
+
+	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+
 	/*
 	 * We set w_target_from, w_target_to here so that
 	 * ocfs2_write_end() knows which range in the target page to
@@ -1351,15 +1399,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		goto out_commit;
 	}
 
-	for (i = 0; i < wc->w_clen; i++) {
-		desc = &wc->w_desc[i];
-
-		ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac,
-					  meta_ac, wc, desc->c_cpos, pos, len);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_commit;
-		}
+	ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
+					  len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
 	}
 
 	if (data_ac)
-- 
cgit v1.2.3


From b27b7cbcf12a1bfff1ed68a73ddd7d11edc20daf Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Mon, 18 Jun 2007 11:22:56 -0700
Subject: ocfs2: support writing of unwritten extents

Update the write code to detect when the user is asking to write to an
unwritten extent. Like writing to a hole, we must zero the region between
the write and the cluster boundaries. Most of the existing cluster zeroing
logic can be re-used with some additional checks for the unwritten flag on
extent records.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 94 +++++++++++++++++++++++++++++++++++++++++++++------------
 fs/ocfs2/file.c | 14 ++++++---
 fs/ocfs2/file.h |  2 +-
 3 files changed, 84 insertions(+), 26 deletions(-)

(limited to 'fs/ocfs2/aops.c')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 077583b50391..8af923316d22 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -782,8 +782,14 @@ struct ocfs2_write_cluster_desc {
 	 * filled.
 	 */
 	unsigned	c_new;
+	unsigned	c_unwritten;
 };
 
+static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
+{
+	return d->c_new || d->c_unwritten;
+}
+
 struct ocfs2_write_ctxt {
 	/* Logical cluster position / len of write */
 	u32				w_cpos;
@@ -829,6 +835,8 @@ struct ocfs2_write_ctxt {
 	handle_t			*w_handle;
 
 	struct buffer_head		*w_di_bh;
+
+	struct ocfs2_cached_dealloc_ctxt w_dealloc;
 };
 
 static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
@@ -868,6 +876,8 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 	else
 		wc->w_large_pages = 0;
 
+	ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+
 	*wcp = wc;
 
 	return 0;
@@ -1103,16 +1113,19 @@ out:
  * Prepare a single cluster for write one cluster into the file.
  */
 static int ocfs2_write_cluster(struct address_space *mapping,
-			       u32 phys, struct ocfs2_alloc_context *data_ac,
+			       u32 phys, unsigned int unwritten,
+			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       struct ocfs2_write_ctxt *wc, u32 cpos,
 			       loff_t user_pos, unsigned user_len)
 {
-	int ret, i, new;
+	int ret, i, new, should_zero = 0;
 	u64 v_blkno, p_blkno;
 	struct inode *inode = mapping->host;
 
 	new = phys == 0 ? 1 : 0;
+	if (new || unwritten)
+		should_zero = 1;
 
 	if (new) {
 		u32 tmp_pos;
@@ -1142,11 +1155,20 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 			mlog_errno(ret);
 			goto out;
 		}
+	} else if (unwritten) {
+		ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
+						wc->w_handle, cpos, 1, phys,
+						meta_ac, &wc->w_dealloc);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
 
+	if (should_zero)
 		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
-	} else {
+	else
 		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
-	}
 
 	/*
 	 * The only reason this should fail is due to an inability to
@@ -1169,7 +1191,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 
 		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
 						      wc->w_pages[i], cpos,
-						      user_pos, user_len, new);
+						      user_pos, user_len,
+						      should_zero);
 		if (tmpret) {
 			mlog_errno(tmpret);
 			if (ret == 0)
@@ -1200,8 +1223,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
 
-		ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac,
-					  meta_ac, wc, desc->c_cpos, pos, len);
+		ret = ocfs2_write_cluster(mapping, desc->c_phys,
+					  desc->c_unwritten, data_ac, meta_ac,
+					  wc, desc->c_cpos, pos, len);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1242,19 +1266,19 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 	if (wc->w_large_pages) {
 		/*
 		 * We only care about the 1st and last cluster within
-		 * our range and whether they are holes or not. Either
+		 * our range and whether they should be zero'd or not. Either
 		 * value may be extended out to the start/end of a
 		 * newly allocated cluster.
 		 */
 		desc = &wc->w_desc[0];
-		if (desc->c_new)
+		if (ocfs2_should_zero_cluster(desc))
 			ocfs2_figure_cluster_boundaries(osb,
 							desc->c_cpos,
 							&wc->w_target_from,
 							NULL);
 
 		desc = &wc->w_desc[wc->w_clen - 1];
-		if (desc->c_new)
+		if (ocfs2_should_zero_cluster(desc))
 			ocfs2_figure_cluster_boundaries(osb,
 							desc->c_cpos,
 							NULL,
@@ -1268,28 +1292,52 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 /*
  * Populate each single-cluster write descriptor in the write context
  * with information about the i/o to be done.
+ *
+ * Returns the number of clusters that will have to be allocated, as
+ * well as a worst case estimate of the number of extent records that
+ * would have to be created during a write to an unwritten region.
  */
 static int ocfs2_populate_write_desc(struct inode *inode,
 				     struct ocfs2_write_ctxt *wc,
-				     unsigned int *clusters_to_alloc)
+				     unsigned int *clusters_to_alloc,
+				     unsigned int *extents_to_split)
 {
 	int ret;
 	struct ocfs2_write_cluster_desc *desc;
 	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
 	u32 phys = 0;
 	int i;
 
+	*clusters_to_alloc = 0;
+	*extents_to_split = 0;
+
 	for (i = 0; i < wc->w_clen; i++) {
 		desc = &wc->w_desc[i];
 		desc->c_cpos = wc->w_cpos + i;
 
 		if (num_clusters == 0) {
+			/*
+			 * Need to look up the next extent record.
+			 */
 			ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
-						 &num_clusters, NULL);
+						 &num_clusters, &ext_flags);
 			if (ret) {
 				mlog_errno(ret);
 				goto out;
 			}
+
+			/*
+			 * Assume worst case - that we're writing in
+			 * the middle of the extent.
+			 *
+			 * We can assume that the write proceeds from
+			 * left to right, in which case the extent
+			 * insert code is smart enough to coalesce the
+			 * next splits into the previous records created.
+			 */
+			if (ext_flags & OCFS2_EXT_UNWRITTEN)
+				*extents_to_split = *extents_to_split + 2;
 		} else if (phys) {
 			/*
 			 * Only increment phys if it doesn't describe
@@ -1303,6 +1351,8 @@ static int ocfs2_populate_write_desc(struct inode *inode,
 			desc->c_new = 1;
 			*clusters_to_alloc = *clusters_to_alloc + 1;
 		}
+		if (ext_flags & OCFS2_EXT_UNWRITTEN)
+			desc->c_unwritten = 1;
 
 		num_clusters--;
 	}
@@ -1318,7 +1368,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     struct buffer_head *di_bh, struct page *mmap_page)
 {
 	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
-	unsigned int clusters_to_alloc = 0;
+	unsigned int clusters_to_alloc, extents_to_split;
 	struct ocfs2_write_ctxt *wc;
 	struct inode *inode = mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -1333,7 +1383,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		return ret;
 	}
 
-	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc);
+	ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
+					&extents_to_split);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
@@ -1347,14 +1398,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * write out. An allocation requires that we write the entire
 	 * cluster range.
 	 */
-	if (clusters_to_alloc > 0) {
+	if (clusters_to_alloc || extents_to_split) {
 		/*
 		 * XXX: We are stretching the limits of
-		 * ocfs2_lock_allocators(). It greately over-estimates
+		 * ocfs2_lock_allocators(). It greatly over-estimates
 		 * the work to be done.
 		 */
 		ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
-					    &data_ac, &meta_ac);
+					    extents_to_split, &data_ac, &meta_ac);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1365,7 +1416,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 
 	}
 
-	ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc);
+	ocfs2_set_target_boundaries(osb, wc, pos, len,
+				    clusters_to_alloc + extents_to_split);
 
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@@ -1393,7 +1445,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * extent.
 	 */
 	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
-					 clusters_to_alloc, mmap_page);
+					 clusters_to_alloc + extents_to_split,
+					 mmap_page);
 	if (ret) {
 		mlog_errno(ret);
 		goto out_commit;
@@ -1538,11 +1591,12 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-
 	ocfs2_journal_dirty(handle, wc->w_di_bh);
 
 	ocfs2_commit_trans(osb, handle);
 
+	ocfs2_run_deallocs(osb, &wc->w_dealloc);
+
 	ocfs2_free_write_ctxt(wc);
 
 	return copied;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a80f31776d94..6745086da6fd 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -527,20 +527,21 @@ leave:
  * understand sparse inodes.
  */
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add,
+			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac)
 {
 	int ret, num_free_extents;
+	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	*meta_ac = NULL;
 	*data_ac = NULL;
 
 	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u\n",
+	     "clusters_to_add = %u, extents_to_split = %u\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
-	     le32_to_cpu(di->i_clusters), clusters_to_add);
+	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
 
 	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
 	if (num_free_extents < 0) {
@@ -558,9 +559,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 	 *
 	 * Most of the time we'll only be seeing this 1 cluster at a time
 	 * anyway.
+	 *
+	 * Always lock for any unwritten extents - we might want to
+	 * add blocks during a split.
 	 */
 	if (!num_free_extents ||
-	    (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
 		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
@@ -641,7 +645,7 @@ restart_all:
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 	drop_alloc_sem = 1;
 
-	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
 		mlog_errno(status);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index a4dd1fa1822b..54df3c4bd2fd 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -47,7 +47,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct ocfs2_alloc_context *meta_ac,
 			       enum ocfs2_alloc_restarted *reason);
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add,
+			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
-- 
cgit v1.2.3


From 2ae99a60374f360ba07037ebbf33d19b89ac43a6 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Fri, 9 Mar 2007 16:43:28 -0800
Subject: ocfs2: Support creation of unwritten extents

This can now be trivially supported with re-use of our existing extend code.

ocfs2_allocate_unwritten_extents() takes a start offset and a byte length
and iterates over the inode, adding extents (marked as unwritten) until len
is reached. Existing extents are skipped over.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/alloc.c |   2 +
 fs/ocfs2/alloc.h |   1 +
 fs/ocfs2/aops.c  |   2 +-
 fs/ocfs2/dir.c   |   2 +-
 fs/ocfs2/file.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++-----------
 fs/ocfs2/file.h  |   5 ++-
 fs/ocfs2/namei.c |   2 +-
 7 files changed, 105 insertions(+), 28 deletions(-)

(limited to 'fs/ocfs2/aops.c')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0db6a1f724e1..2e3898316468 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3726,6 +3726,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
+			u8 flags,
 			struct ocfs2_alloc_context *meta_ac)
 {
 	int status;
@@ -3749,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	rec.e_cpos = cpu_to_le32(cpos);
 	rec.e_blkno = cpu_to_le64(start_blk);
 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+	rec.e_flags = flags;
 
 	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
 					  &insert);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index d3acf45225c2..e3284f3eb6b9 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -34,6 +34,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			u32 cpos,
 			u64 start_blk,
 			u32 new_clusters,
+			u8 flags,
 			struct ocfs2_alloc_context *meta_ac);
 struct ocfs2_cached_dealloc_ctxt;
 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8af923316d22..ec8b606b30e1 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1136,7 +1136,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
 		 */
 		tmp_pos = cpos;
 		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
-						 &tmp_pos, 1, wc->w_di_bh,
+						 &tmp_pos, 1, 0, wc->w_di_bh,
 						 wc->w_handle, data_ac,
 						 meta_ac, NULL);
 		/*
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c441ef1f2bad..0d5fdde959c8 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 		u32 offset = OCFS2_I(dir)->ip_clusters;
 
 		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
-						    1, parent_fe_bh, handle,
+						    1, 0, parent_fe_bh, handle,
 						    data_ac, meta_ac, NULL);
 		BUG_ON(status == -EAGAIN);
 		if (status < 0) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6745086da6fd..3e21ad9a6dde 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -425,6 +425,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
 			       u32 *logical_offset,
 			       u32 clusters_to_add,
+			       int mark_unwritten,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
 			       struct ocfs2_alloc_context *data_ac,
@@ -437,9 +438,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
 	u64 block;
+	u8 flags = 0;
 
 	BUG_ON(!clusters_to_add);
 
+	if (mark_unwritten)
+		flags = OCFS2_EXT_UNWRITTEN;
+
 	free_extents = ocfs2_num_free_extents(osb, inode, fe);
 	if (free_extents < 0) {
 		status = free_extents;
@@ -489,7 +494,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
 				     *logical_offset, block, num_bits,
-				     meta_ac);
+				     flags, meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -522,9 +527,11 @@ leave:
  * For a given allocation, determine which allocators will need to be
  * accessed, and lock them, reserving the appropriate number of bits.
  *
- * Called from ocfs2_extend_allocation() for file systems which don't
- * support holes, and from ocfs2_write() for file systems which
- * understand sparse inodes.
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
  */
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 			  u32 clusters_to_add, u32 extents_to_split,
@@ -595,14 +602,13 @@ out:
 	return ret;
 }
 
-static int ocfs2_extend_allocation(struct inode *inode,
-				   u32 clusters_to_add)
+static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				     u32 clusters_to_add, int mark_unwritten)
 {
 	int status = 0;
 	int restart_func = 0;
-	int drop_alloc_sem = 0;
 	int credits;
-	u32 prev_clusters, logical_start;
+	u32 prev_clusters;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	handle_t *handle = NULL;
@@ -617,7 +623,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
 	 * This function only exists for file systems which don't
 	 * support holes.
 	 */
-	BUG_ON(ocfs2_sparse_alloc(osb));
+	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 				  OCFS2_BH_CACHED, inode);
@@ -633,18 +639,9 @@ static int ocfs2_extend_allocation(struct inode *inode,
 		goto leave;
 	}
 
-	logical_start = OCFS2_I(inode)->ip_clusters;
-
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	/* blocks peope in read/write from reading our allocation
-	 * until we're done changing it. We depend on i_mutex to block
-	 * other extend/truncate calls while we're here. Ordering wrt
-	 * start_trans is important here -- always do it before! */
-	down_write(&OCFS2_I(inode)->ip_alloc_sem);
-	drop_alloc_sem = 1;
-
 	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
@@ -678,6 +675,7 @@ restarted_transaction:
 					    inode,
 					    &logical_start,
 					    clusters_to_add,
+					    mark_unwritten,
 					    bh,
 					    handle,
 					    data_ac,
@@ -730,10 +728,6 @@ restarted_transaction:
 	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
 
 leave:
-	if (drop_alloc_sem) {
-		up_write(&OCFS2_I(inode)->ip_alloc_sem);
-		drop_alloc_sem = 0;
-	}
 	if (handle) {
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
@@ -759,6 +753,25 @@ leave:
 	return status;
 }
 
+static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				   u32 clusters_to_add, int mark_unwritten)
+{
+	int ret;
+
+	/*
+	 * The alloc sem blocks peope in read/write from reading our
+	 * allocation until we're done changing it. We depend on
+	 * i_mutex to block other extend/truncate calls while we're
+	 * here.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
+					mark_unwritten);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	return ret;
+}
+
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
  * worry about recursive locking in ->prepare_write() and
@@ -900,7 +913,9 @@ static int ocfs2_extend_file(struct inode *inode,
 	}
 
 	if (clusters_to_add) {
-		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		ret = ocfs2_extend_allocation(inode,
+					      OCFS2_I(inode)->ip_clusters,
+					      clusters_to_add, 0);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out_unlock;
@@ -1176,6 +1191,64 @@ out:
 	return ret;
 }
 
+/*
+ * Allocate enough extents to cover the region starting at byte offset
+ * start for len bytes. Existing extents are skipped, any extents
+ * added are marked as "unwritten".
+ */
+static int ocfs2_allocate_unwritten_extents(struct inode *inode,
+					    u64 start, u64 len)
+{
+	int ret;
+	u32 cpos, phys_cpos, clusters, alloc_size;
+
+	/*
+	 * We consider both start and len to be inclusive.
+	 */
+	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
+	clusters -= cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+					 &alloc_size, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Hole or existing extent len can be arbitrary, so
+		 * cap it to our own allocation request.
+		 */
+		if (alloc_size > clusters)
+			alloc_size = clusters;
+
+		if (phys_cpos) {
+			/*
+			 * We already have an allocation at this
+			 * region so we can safely skip it.
+			 */
+			goto next;
+		}
+
+		ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+next:
+		cpos += alloc_size;
+		clusters -= alloc_size;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 loff_t *ppos,
 					 size_t count,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 54df3c4bd2fd..79115c92dc30 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,13 +39,14 @@ enum ocfs2_alloc_restarted {
 };
 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
-			       u32 *cluster_start,
+			       u32 *logical_offset,
 			       u32 clusters_to_add,
+			       int mark_unwritten,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
-			       enum ocfs2_alloc_restarted *reason);
+			       enum ocfs2_alloc_restarted *reason_ret);
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 36289e6295ce..d430fdab16e9 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir,
 		u32 offset = 0;
 
 		inode->i_op = &ocfs2_symlink_inode_operations;
-		status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
+		status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
 						    new_fe_bh,
 						    handle, data_ac, NULL,
 						    NULL);
-- 
cgit v1.2.3


From 54c57dc3b6578356c0a428c767d4bf080254a2ee Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 20 Jun 2007 17:15:10 -0700
Subject: [PATCH] ocfs2: zero_user_page conversion

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
---
 fs/ocfs2/aops.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'fs/ocfs2/aops.c')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ec8b606b30e1..84bf6e79de23 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -740,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
 	bh = head;
 	block_start = 0;
 	do {
-		void *kaddr;
-
 		block_end = block_start + bsize;
 		if (block_end <= from)
 			goto next_bh;
 		if (block_start >= to)
 			break;
 
-		kaddr = kmap_atomic(page, KM_USER0);
-		memset(kaddr+block_start, 0, bh->b_size);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr, KM_USER0);
+		zero_user_page(page, block_start, bh->b_size, KM_USER0);
 		set_buffer_uptodate(bh);
 		mark_buffer_dirty(bh);
 
@@ -906,15 +901,11 @@ static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to
 			if (block_end > from && block_start < to) {
 				if (!PageUptodate(page)) {
 					unsigned start, end;
-					void *kaddr;
 
 					start = max(from, block_start);
 					end = min(to, block_end);
 
-					kaddr = kmap_atomic(page, KM_USER0);
-					memset(kaddr+start, 0, end - start);
-					flush_dcache_page(page);
-					kunmap_atomic(kaddr, KM_USER0);
+					zero_user_page(page, start, end - start, KM_USER0);
 					set_buffer_uptodate(bh);
 				}
 
-- 
cgit v1.2.3


From 54cb8821de07f2ffcd28c380ce9b93d5784b40d7 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 19 Jul 2007 01:46:59 -0700
Subject: mm: merge populate and nopage into fault (fixes nonlinear)

Nonlinear mappings are (AFAIKS) simply a virtual memory concept that encodes
the virtual address -> file offset differently from linear mappings.

->populate is a layering violation because the filesystem/pagecache code
should need to know anything about the virtual memory mapping.  The hitch here
is that the ->nopage handler didn't pass down enough information (ie.  pgoff).
 But it is more logical to pass pgoff rather than have the ->nopage function
calculate it itself anyway (because that's a similar layering violation).

Having the populate handler install the pte itself is likewise a nasty thing
to be doing.

This patch introduces a new fault handler that replaces ->nopage and
->populate and (later) ->nopfn.  Most of the old mechanism is still in place
so there is a lot of duplication and nice cleanups that can be removed if
everyone switches over.

The rationale for doing this in the first place is that nonlinear mappings are
subject to the pagefault vs invalidate/truncate race too, and it seemed stupid
to duplicate the synchronisation logic rather than just consolidate the two.

After this patch, MAP_NONBLOCK no longer sets up ptes for pages present in
pagecache.  Seems like a fringe functionality anyway.

NOPAGE_REFAULT is removed.  This should be implemented with ->fault, and no
users have hit mainline yet.

[akpm@linux-foundation.org: cleanup]
[randy.dunlap@oracle.com: doc. fixes for readahead]
[akpm@linux-foundation.org: build fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/feature-removal-schedule.txt |  27 ++++++
 Documentation/filesystems/Locking          |   2 +
 fs/gfs2/ops_address.c                      |   2 +-
 fs/gfs2/ops_file.c                         |   2 +-
 fs/gfs2/ops_vm.c                           |  36 ++++----
 fs/ncpfs/mmap.c                            |  23 ++---
 fs/ocfs2/aops.c                            |   2 +-
 fs/ocfs2/mmap.c                            |  17 ++--
 fs/xfs/linux-2.6/xfs_file.c                |  23 +++--
 include/linux/mm.h                         |  41 +++++++--
 ipc/shm.c                                  |   9 +-
 mm/filemap.c                               |  94 ++++++++++++--------
 mm/filemap_xip.c                           |  54 ++++++------
 mm/fremap.c                                | 103 +++++++++++++++-------
 mm/memory.c                                | 132 +++++++++++++++++++----------
 mm/mmap.c                                  |   8 +-
 mm/nommu.c                                 |   3 +-
 mm/rmap.c                                  |   4 +-
 mm/shmem.c                                 |  82 +++++-------------
 mm/truncate.c                              |   2 +-
 20 files changed, 394 insertions(+), 272 deletions(-)

(limited to 'fs/ocfs2/aops.c')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 66c8b4b165c1..716568afdff8 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -135,6 +135,33 @@ Who:	Greg Kroah-Hartman <gregkh@suse.de>
 
 ---------------------------
 
+What:	filemap_nopage, filemap_populate
+When:	April 2007
+Why:	These legacy interfaces no longer have any callers in the kernel and
+	any functionality provided can be provided with filemap_fault. The
+	removal schedule is short because they are a big maintainence burden
+	and have some bugs.
+Who:	Nick Piggin <npiggin@suse.de>
+
+---------------------------
+
+What:	vm_ops.populate, install_page
+When:	April 2007
+Why:	These legacy interfaces no longer have any callers in the kernel and
+	any functionality provided can be provided with vm_ops.fault.
+Who:	Nick Piggin <npiggin@suse.de>
+
+---------------------------
+
+What:	vm_ops.nopage
+When:	February 2008, provided in-kernel callers have been converted
+Why:	This interface is replaced by vm_ops.fault, but it has been around
+	forever, is used by a lot of drivers, and doesn't cost much to
+	maintain.
+Who:	Nick Piggin <npiggin@suse.de>
+
+---------------------------
+
 What:	Interrupt only SA_* flags
 When:	September 2007
 Why:	The interrupt related SA_* flags are replaced by IRQF_* to move them
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index d866551be037..970c8ec1a05b 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -510,12 +510,14 @@ More details about quota locking can be found in fs/dquot.c.
 prototypes:
 	void (*open)(struct vm_area_struct*);
 	void (*close)(struct vm_area_struct*);
+	struct page *(*fault)(struct vm_area_struct*, struct fault_data *);
 	struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *);
 
 locking rules:
 		BKL	mmap_sem
 open:		no	yes
 close:		no	yes
+fault:		no	yes
 nopage:		no	yes
 
 ================================================================================
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 26c888890c24..ce90032c010e 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -251,7 +251,7 @@ static int gfs2_readpage(struct file *file, struct page *page)
 		if (file) {
 			gf = file->private_data;
 			if (test_bit(GFF_EXLOCK, &gf->f_flags))
-				/* gfs2_sharewrite_nopage has grabbed the ip->i_gl already */
+				/* gfs2_sharewrite_fault has grabbed the ip->i_gl already */
 				goto skip_lock;
 		}
 		gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index bad0b24cb773..581ac11b2656 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -364,7 +364,7 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	else
 		vma->vm_ops = &gfs2_vm_ops_private;
 
-	vma->vm_flags |= VM_CAN_INVALIDATE;
+	vma->vm_flags |= VM_CAN_INVALIDATE|VM_CAN_NONLINEAR;
 
 	gfs2_glock_dq_uninit(&i_gh);
 
diff --git a/fs/gfs2/ops_vm.c b/fs/gfs2/ops_vm.c
index d5a98cbfebdc..e9fe6eb74e75 100644
--- a/fs/gfs2/ops_vm.c
+++ b/fs/gfs2/ops_vm.c
@@ -27,13 +27,13 @@
 #include "trans.h"
 #include "util.h"
 
-static struct page *gfs2_private_nopage(struct vm_area_struct *area,
-					unsigned long address, int *type)
+static struct page *gfs2_private_fault(struct vm_area_struct *vma,
+					struct fault_data *fdata)
 {
-	struct gfs2_inode *ip = GFS2_I(area->vm_file->f_mapping->host);
+	struct gfs2_inode *ip = GFS2_I(vma->vm_file->f_mapping->host);
 
 	set_bit(GIF_PAGED, &ip->i_flags);
-	return filemap_nopage(area, address, type);
+	return filemap_fault(vma, fdata);
 }
 
 static int alloc_page_backing(struct gfs2_inode *ip, struct page *page)
@@ -104,16 +104,14 @@ out:
 	return error;
 }
 
-static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
-					   unsigned long address, int *type)
+static struct page *gfs2_sharewrite_fault(struct vm_area_struct *vma,
+						struct fault_data *fdata)
 {
-	struct file *file = area->vm_file;
+	struct file *file = vma->vm_file;
 	struct gfs2_file *gf = file->private_data;
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
 	struct gfs2_holder i_gh;
 	struct page *result = NULL;
-	unsigned long index = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) +
-			      area->vm_pgoff;
 	int alloc_required;
 	int error;
 
@@ -124,23 +122,27 @@ static struct page *gfs2_sharewrite_nopage(struct vm_area_struct *area,
 	set_bit(GIF_PAGED, &ip->i_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
 
-	error = gfs2_write_alloc_required(ip, (u64)index << PAGE_CACHE_SHIFT,
-					  PAGE_CACHE_SIZE, &alloc_required);
-	if (error)
+	error = gfs2_write_alloc_required(ip,
+					(u64)fdata->pgoff << PAGE_CACHE_SHIFT,
+					PAGE_CACHE_SIZE, &alloc_required);
+	if (error) {
+		fdata->type = VM_FAULT_OOM; /* XXX: are these right? */
 		goto out;
+	}
 
 	set_bit(GFF_EXLOCK, &gf->f_flags);
-	result = filemap_nopage(area, address, type);
+	result = filemap_fault(vma, fdata);
 	clear_bit(GFF_EXLOCK, &gf->f_flags);
-	if (!result || result == NOPAGE_OOM)
+	if (!result)
 		goto out;
 
 	if (alloc_required) {
 		error = alloc_page_backing(ip, result);
 		if (error) {
-			if (area->vm_flags & VM_CAN_INVALIDATE)
+			if (vma->vm_flags & VM_CAN_INVALIDATE)
 				unlock_page(result);
 			page_cache_release(result);
+			fdata->type = VM_FAULT_OOM;
 			result = NULL;
 			goto out;
 		}
@@ -154,10 +156,10 @@ out:
 }
 
 struct vm_operations_struct gfs2_vm_ops_private = {
-	.nopage = gfs2_private_nopage,
+	.fault = gfs2_private_fault,
 };
 
 struct vm_operations_struct gfs2_vm_ops_sharewrite = {
-	.nopage = gfs2_sharewrite_nopage,
+	.fault = gfs2_sharewrite_fault,
 };
 
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 5416673418b8..af48b792ca04 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -25,8 +25,8 @@
 /*
  * Fill in the supplied page for mmap
  */
-static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
-				     unsigned long address, int *type)
+static struct page* ncp_file_mmap_fault(struct vm_area_struct *area,
+						struct fault_data *fdata)
 {
 	struct file *file = area->vm_file;
 	struct dentry *dentry = file->f_path.dentry;
@@ -40,15 +40,17 @@ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
 
 	page = alloc_page(GFP_HIGHUSER); /* ncpfs has nothing against high pages
 	           as long as recvmsg and memset works on it */
-	if (!page)
-		return page;
+	if (!page) {
+		fdata->type = VM_FAULT_OOM;
+		return NULL;
+	}
 	pg_addr = kmap(page);
-	address &= PAGE_MASK;
-	pos = address - area->vm_start + (area->vm_pgoff << PAGE_SHIFT);
+	pos = fdata->pgoff << PAGE_SHIFT;
 
 	count = PAGE_SIZE;
-	if (address + PAGE_SIZE > area->vm_end) {
-		count = area->vm_end - address;
+	if (fdata->address + PAGE_SIZE > area->vm_end) {
+		WARN_ON(1); /* shouldn't happen? */
+		count = area->vm_end - fdata->address;
 	}
 	/* what we can read in one go */
 	bufsize = NCP_SERVER(inode)->buffer_size;
@@ -91,15 +93,14 @@ static struct page* ncp_file_mmap_nopage(struct vm_area_struct *area,
 	 * fetches from the network, here the analogue of disk.
 	 * -- wli
 	 */
-	if (type)
-		*type = VM_FAULT_MAJOR;
+	fdata->type = VM_FAULT_MAJOR;
 	count_vm_event(PGMAJFAULT);
 	return page;
 }
 
 static struct vm_operations_struct ncp_file_mmap =
 {
-	.nopage	= ncp_file_mmap_nopage,
+	.fault = ncp_file_mmap_fault,
 };
 
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 84bf6e79de23..460d440310f2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -232,7 +232,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 	 * might now be discovering a truncate that hit on another node.
 	 * block_read_full_page->get_block freaks out if it is asked to read
 	 * beyond the end of a file, so we check here.  Callers
-	 * (generic_file_read, fault->nopage) are clever enough to check i_size
+	 * (generic_file_read, vm_ops->fault) are clever enough to check i_size
 	 * and notice that the page they just read isn't needed.
 	 *
 	 * XXX sys_readahead() seems to get that wrong?
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 904f39ff5340..cd75508b1c8a 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -60,24 +60,23 @@ static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
 	return sigprocmask(SIG_SETMASK, oldset, NULL);
 }
 
-static struct page *ocfs2_nopage(struct vm_area_struct * area,
-				 unsigned long address,
-				 int *type)
+static struct page *ocfs2_fault(struct vm_area_struct *area,
+						struct fault_data *fdata)
 {
-	struct page *page = NOPAGE_SIGBUS;
+	struct page *page = NULL;
 	sigset_t blocked, oldset;
 	int ret;
 
-	mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
-		   type);
+	mlog_entry("(area=%p, page offset=%lu)\n", area, fdata->pgoff);
 
 	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
 	if (ret < 0) {
+		fdata->type = VM_FAULT_SIGBUS;
 		mlog_errno(ret);
 		goto out;
 	}
 
-	page = filemap_nopage(area, address, type);
+	page = filemap_fault(area, fdata);
 
 	ret = ocfs2_vm_op_unblock_sigs(&oldset);
 	if (ret < 0)
@@ -209,7 +208,7 @@ out:
 }
 
 static struct vm_operations_struct ocfs2_file_vm_ops = {
-	.nopage		= ocfs2_nopage,
+	.fault		= ocfs2_fault,
 	.page_mkwrite	= ocfs2_page_mkwrite,
 };
 
@@ -226,7 +225,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	ocfs2_meta_unlock(file->f_dentry->d_inode, lock_level);
 out:
 	vma->vm_ops = &ocfs2_file_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
+	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
 	return 0;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 92b2f225712f..f12e80a69c68 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -213,18 +213,19 @@ xfs_file_fsync(
 
 #ifdef CONFIG_XFS_DMAPI
 STATIC struct page *
-xfs_vm_nopage(
-	struct vm_area_struct	*area,
-	unsigned long		address,
-	int			*type)
+xfs_vm_fault(
+	struct vm_area_struct	*vma,
+	struct fault_data	*fdata)
 {
-	struct inode	*inode = area->vm_file->f_path.dentry->d_inode;
+	struct inode	*inode = vma->vm_file->f_path.dentry->d_inode;
 	bhv_vnode_t	*vp = vn_from_inode(inode);
 
 	ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
-	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), area, 0))
+	if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), vma, 0)) {
+		fdata->type = VM_FAULT_SIGBUS;
 		return NULL;
-	return filemap_nopage(area, address, type);
+	}
+	return filemap_fault(vma, fdata);
 }
 #endif /* CONFIG_XFS_DMAPI */
 
@@ -310,7 +311,7 @@ xfs_file_mmap(
 	struct vm_area_struct *vma)
 {
 	vma->vm_ops = &xfs_file_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
+	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
 
 #ifdef CONFIG_XFS_DMAPI
 	if (vn_from_inode(filp->f_path.dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI)
@@ -465,14 +466,12 @@ const struct file_operations xfs_dir_file_operations = {
 };
 
 static struct vm_operations_struct xfs_file_vm_ops = {
-	.nopage		= filemap_nopage,
-	.populate	= filemap_populate,
+	.fault		= filemap_fault,
 };
 
 #ifdef CONFIG_XFS_DMAPI
 static struct vm_operations_struct xfs_dmapi_file_vm_ops = {
-	.nopage		= xfs_vm_nopage,
-	.populate	= filemap_populate,
+	.fault		= xfs_vm_fault,
 #ifdef HAVE_VMOP_MPROTECT
 	.mprotect	= xfs_vm_mprotect,
 #endif
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ca9536a348c8..f28a1b3e63a9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -173,6 +173,7 @@ extern unsigned int kobjsize(const void *objp);
 					 * In this case, do_no_page must
 					 * return with the page locked.
 					 */
+#define VM_CAN_NONLINEAR 0x10000000	/* Has ->fault & does nonlinear pages */
 
 #ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@ -196,6 +197,25 @@ extern unsigned int kobjsize(const void *objp);
  */
 extern pgprot_t protection_map[16];
 
+#define FAULT_FLAG_WRITE	0x01
+#define FAULT_FLAG_NONLINEAR	0x02
+
+/*
+ * fault_data is filled in the the pagefault handler and passed to the
+ * vma's ->fault function. That function is responsible for filling in
+ * 'type', which is the type of fault if a page is returned, or the type
+ * of error if NULL is returned.
+ *
+ * pgoff should be used in favour of address, if possible. If pgoff is
+ * used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get
+ * nonlinear mapping support.
+ */
+struct fault_data {
+	unsigned long address;
+	pgoff_t pgoff;
+	unsigned int flags;
+	int type;
+};
 
 /*
  * These are the virtual MM functions - opening of an area, closing and
@@ -205,9 +225,15 @@ extern pgprot_t protection_map[16];
 struct vm_operations_struct {
 	void (*open)(struct vm_area_struct * area);
 	void (*close)(struct vm_area_struct * area);
-	struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
-	unsigned long (*nopfn)(struct vm_area_struct * area, unsigned long address);
-	int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);
+	struct page *(*fault)(struct vm_area_struct *vma,
+			struct fault_data *fdata);
+	struct page *(*nopage)(struct vm_area_struct *area,
+			unsigned long address, int *type);
+	unsigned long (*nopfn)(struct vm_area_struct *area,
+			unsigned long address);
+	int (*populate)(struct vm_area_struct *area, unsigned long address,
+			unsigned long len, pgprot_t prot, unsigned long pgoff,
+			int nonblock);
 
 	/* notification that a previously read-only page is about to become
 	 * writable, if an error is returned it will cause a SIGBUS */
@@ -661,7 +687,6 @@ static inline int page_mapped(struct page *page)
  */
 #define NOPAGE_SIGBUS	(NULL)
 #define NOPAGE_OOM	((struct page *) (-1))
-#define NOPAGE_REFAULT	((struct page *) (-2))	/* Return to userspace, rerun */
 
 /*
  * Error return values for the *_nopfn functions
@@ -1110,9 +1135,11 @@ extern void truncate_inode_pages_range(struct address_space *,
 				       loff_t lstart, loff_t lend);
 
 /* generic vm_area_ops exported for stackable file systems */
-extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int *);
-extern int filemap_populate(struct vm_area_struct *, unsigned long,
-		unsigned long, pgprot_t, unsigned long, int);
+extern struct page *filemap_fault(struct vm_area_struct *, struct fault_data *);
+extern struct page * __deprecated_for_modules
+filemap_nopage(struct vm_area_struct *, unsigned long, int *);
+extern int __deprecated_for_modules filemap_populate(struct vm_area_struct *,
+		unsigned long, unsigned long, pgprot_t, unsigned long, int);
 
 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
diff --git a/ipc/shm.c b/ipc/shm.c
index 242c3f66493a..e2d090348b1e 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -224,13 +224,13 @@ static void shm_close(struct vm_area_struct *vma)
 	mutex_unlock(&shm_ids(ns).mutex);
 }
 
-static struct page *shm_nopage(struct vm_area_struct *vma,
-			       unsigned long address, int *type)
+static struct page *shm_fault(struct vm_area_struct *vma,
+					struct fault_data *fdata)
 {
 	struct file *file = vma->vm_file;
 	struct shm_file_data *sfd = shm_file_data(file);
 
-	return sfd->vm_ops->nopage(vma, address, type);
+	return sfd->vm_ops->fault(vma, fdata);
 }
 
 #ifdef CONFIG_NUMA
@@ -269,6 +269,7 @@ static int shm_mmap(struct file * file, struct vm_area_struct * vma)
 	if (ret != 0)
 		return ret;
 	sfd->vm_ops = vma->vm_ops;
+	BUG_ON(!sfd->vm_ops->fault);
 	vma->vm_ops = &shm_vm_ops;
 	shm_open(vma);
 
@@ -327,7 +328,7 @@ static const struct file_operations shm_file_operations = {
 static struct vm_operations_struct shm_vm_ops = {
 	.open	= shm_open,	/* callback for a new vm-area open */
 	.close	= shm_close,	/* callback for when the vm-area is released */
-	.nopage	= shm_nopage,
+	.fault	= shm_fault,
 #if defined(CONFIG_NUMA)
 	.set_policy = shm_set_policy,
 	.get_policy = shm_get_policy,
diff --git a/mm/filemap.c b/mm/filemap.c
index 462cda58a18e..26b992d169e5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1301,40 +1301,38 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
 #define MMAP_LOTSAMISS  (100)
 
 /**
- * filemap_nopage - read in file data for page fault handling
- * @area:	the applicable vm_area
- * @address:	target address to read in
- * @type:	returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
+ * filemap_fault - read in file data for page fault handling
+ * @vma:	user vma (not used)
+ * @fdata:	the applicable fault_data
  *
- * filemap_nopage() is invoked via the vma operations vector for a
+ * filemap_fault() is invoked via the vma operations vector for a
  * mapped memory region to read in file data during a page fault.
  *
  * The goto's are kind of ugly, but this streamlines the normal case of having
  * it in the page cache, and handles the special cases reasonably without
  * having a lot of duplicated code.
  */
-struct page *filemap_nopage(struct vm_area_struct *area,
-				unsigned long address, int *type)
+struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata)
 {
 	int error;
-	struct file *file = area->vm_file;
+	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct file_ra_state *ra = &file->f_ra;
 	struct inode *inode = mapping->host;
 	struct page *page;
-	unsigned long size, pgoff;
-	int did_readaround = 0, majmin = VM_FAULT_MINOR;
+	unsigned long size;
+	int did_readaround = 0;
 
-	BUG_ON(!(area->vm_flags & VM_CAN_INVALIDATE));
+	fdata->type = VM_FAULT_MINOR;
 
-	pgoff = ((address-area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
+	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (pgoff >= size)
+	if (fdata->pgoff >= size)
 		goto outside_data_content;
 
 	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(area))
+	if (VM_RandomReadHint(vma))
 		goto no_cached_page;
 
 	/*
@@ -1343,19 +1341,19 @@ struct page *filemap_nopage(struct vm_area_struct *area,
 	 *
 	 * For sequential accesses, we use the generic readahead logic.
 	 */
-	if (VM_SequentialReadHint(area))
-		page_cache_readahead(mapping, ra, file, pgoff, 1);
+	if (VM_SequentialReadHint(vma))
+		page_cache_readahead(mapping, ra, file, fdata->pgoff, 1);
 
 	/*
 	 * Do we have something in the page cache already?
 	 */
 retry_find:
-	page = find_lock_page(mapping, pgoff);
+	page = find_lock_page(mapping, fdata->pgoff);
 	if (!page) {
 		unsigned long ra_pages;
 
-		if (VM_SequentialReadHint(area)) {
-			handle_ra_miss(mapping, ra, pgoff);
+		if (VM_SequentialReadHint(vma)) {
+			handle_ra_miss(mapping, ra, fdata->pgoff);
 			goto no_cached_page;
 		}
 		ra->mmap_miss++;
@@ -1372,7 +1370,7 @@ retry_find:
 		 * check did_readaround, as this is an inner loop.
 		 */
 		if (!did_readaround) {
-			majmin = VM_FAULT_MAJOR;
+			fdata->type = VM_FAULT_MAJOR;
 			count_vm_event(PGMAJFAULT);
 		}
 		did_readaround = 1;
@@ -1380,11 +1378,11 @@ retry_find:
 		if (ra_pages) {
 			pgoff_t start = 0;
 
-			if (pgoff > ra_pages / 2)
-				start = pgoff - ra_pages / 2;
+			if (fdata->pgoff > ra_pages / 2)
+				start = fdata->pgoff - ra_pages / 2;
 			do_page_cache_readahead(mapping, file, start, ra_pages);
 		}
-		page = find_lock_page(mapping, pgoff);
+		page = find_lock_page(mapping, fdata->pgoff);
 		if (!page)
 			goto no_cached_page;
 	}
@@ -1401,7 +1399,7 @@ retry_find:
 
 	/* Must recheck i_size under page lock */
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (unlikely(pgoff >= size)) {
+	if (unlikely(fdata->pgoff >= size)) {
 		unlock_page(page);
 		goto outside_data_content;
 	}
@@ -1410,8 +1408,6 @@ retry_find:
 	 * Found the page and have a reference on it.
 	 */
 	mark_page_accessed(page);
-	if (type)
-		*type = majmin;
 	return page;
 
 outside_data_content:
@@ -1419,15 +1415,17 @@ outside_data_content:
 	 * An external ptracer can access pages that normally aren't
 	 * accessible..
 	 */
-	if (area->vm_mm == current->mm)
-		return NOPAGE_SIGBUS;
+	if (vma->vm_mm == current->mm) {
+		fdata->type = VM_FAULT_SIGBUS;
+		return NULL;
+	}
 	/* Fall through to the non-read-ahead case */
 no_cached_page:
 	/*
 	 * We're only likely to ever get here if MADV_RANDOM is in
 	 * effect.
 	 */
-	error = page_cache_read(file, pgoff);
+	error = page_cache_read(file, fdata->pgoff);
 
 	/*
 	 * The page we want has now been added to the page cache.
@@ -1443,13 +1441,15 @@ no_cached_page:
 	 * to schedule I/O.
 	 */
 	if (error == -ENOMEM)
-		return NOPAGE_OOM;
-	return NOPAGE_SIGBUS;
+		fdata->type = VM_FAULT_OOM;
+	else
+		fdata->type = VM_FAULT_SIGBUS;
+	return NULL;
 
 page_not_uptodate:
 	/* IO error path */
 	if (!did_readaround) {
-		majmin = VM_FAULT_MAJOR;
+		fdata->type = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
 	}
 
@@ -1468,7 +1468,30 @@ page_not_uptodate:
 
 	/* Things didn't work out. Return zero to tell the mm layer so. */
 	shrink_readahead_size_eio(file, ra);
-	return NOPAGE_SIGBUS;
+	fdata->type = VM_FAULT_SIGBUS;
+	return NULL;
+}
+EXPORT_SYMBOL(filemap_fault);
+
+/*
+ * filemap_nopage and filemap_populate are legacy exports that are not used
+ * in tree. Scheduled for removal.
+ */
+struct page *filemap_nopage(struct vm_area_struct *area,
+				unsigned long address, int *type)
+{
+	struct page *page;
+	struct fault_data fdata;
+	fdata.address = address;
+	fdata.pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
+			+ area->vm_pgoff;
+	fdata.flags = 0;
+
+	page = filemap_fault(area, &fdata);
+	if (type)
+		*type = fdata.type;
+
+	return page;
 }
 EXPORT_SYMBOL(filemap_nopage);
 
@@ -1646,8 +1669,7 @@ repeat:
 EXPORT_SYMBOL(filemap_populate);
 
 struct vm_operations_struct generic_file_vm_ops = {
-	.nopage		= filemap_nopage,
-	.populate	= filemap_populate,
+	.fault		= filemap_fault,
 };
 
 /* This is used for a general mmap of a disk file */
@@ -1660,7 +1682,7 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 		return -ENOEXEC;
 	file_accessed(file);
 	vma->vm_ops = &generic_file_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
+	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
 	return 0;
 }
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 65ffc321f0c0..82f4b8e9834e 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -205,62 +205,67 @@ __xip_unmap (struct address_space * mapping,
 }
 
 /*
- * xip_nopage() is invoked via the vma operations vector for a
+ * xip_fault() is invoked via the vma operations vector for a
  * mapped memory region to read in file data during a page fault.
  *
- * This function is derived from filemap_nopage, but used for execute in place
+ * This function is derived from filemap_fault, but used for execute in place
  */
-static struct page *
-xip_file_nopage(struct vm_area_struct * area,
-		   unsigned long address,
-		   int *type)
+static struct page *xip_file_fault(struct vm_area_struct *area,
+					struct fault_data *fdata)
 {
 	struct file *file = area->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	struct page *page;
-	unsigned long size, pgoff, endoff;
+	pgoff_t size;
 
-	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
-		+ area->vm_pgoff;
-	endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
-		+ area->vm_pgoff;
+	/* XXX: are VM_FAULT_ codes OK? */
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (pgoff >= size)
-		return NOPAGE_SIGBUS;
+	if (fdata->pgoff >= size) {
+		fdata->type = VM_FAULT_SIGBUS;
+		return NULL;
+	}
 
-	page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
+	page = mapping->a_ops->get_xip_page(mapping,
+					fdata->pgoff*(PAGE_SIZE/512), 0);
 	if (!IS_ERR(page))
 		goto out;
-	if (PTR_ERR(page) != -ENODATA)
-		return NOPAGE_SIGBUS;
+	if (PTR_ERR(page) != -ENODATA) {
+		fdata->type = VM_FAULT_OOM;
+		return NULL;
+	}
 
 	/* sparse block */
 	if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
 	    (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
 	    (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
 		/* maybe shared writable, allocate new block */
-		page = mapping->a_ops->get_xip_page (mapping,
-			pgoff*(PAGE_SIZE/512), 1);
-		if (IS_ERR(page))
-			return NOPAGE_SIGBUS;
+		page = mapping->a_ops->get_xip_page(mapping,
+					fdata->pgoff*(PAGE_SIZE/512), 1);
+		if (IS_ERR(page)) {
+			fdata->type = VM_FAULT_SIGBUS;
+			return NULL;
+		}
 		/* unmap page at pgoff from all other vmas */
-		__xip_unmap(mapping, pgoff);
+		__xip_unmap(mapping, fdata->pgoff);
 	} else {
 		/* not shared and writable, use xip_sparse_page() */
 		page = xip_sparse_page();
-		if (!page)
-			return NOPAGE_OOM;
+		if (!page) {
+			fdata->type = VM_FAULT_OOM;
+			return NULL;
+		}
 	}
 
 out:
+	fdata->type = VM_FAULT_MINOR;
 	page_cache_get(page);
 	return page;
 }
 
 static struct vm_operations_struct xip_file_vm_ops = {
-	.nopage         = xip_file_nopage,
+	.fault	= xip_file_fault,
 };
 
 int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -269,6 +274,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
 
 	file_accessed(file);
 	vma->vm_ops = &xip_file_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 4e3f53dd5fd4..01e51f01b84e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -126,6 +126,25 @@ out:
 	return err;
 }
 
+static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long addr, unsigned long size, pgoff_t pgoff)
+{
+	int err;
+
+	do {
+		err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
+		if (err)
+			return err;
+
+		size -= PAGE_SIZE;
+		addr += PAGE_SIZE;
+		pgoff++;
+	} while (size);
+
+        return 0;
+
+}
+
 /***
  * sys_remap_file_pages - remap arbitrary pages of a shared backing store
  *                        file within an existing vma.
@@ -183,41 +202,63 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 	 * the single existing vma.  vm_private_data is used as a
 	 * swapout cursor in a VM_NONLINEAR vma.
 	 */
-	if (vma && (vma->vm_flags & VM_SHARED) &&
-		(!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
-		vma->vm_ops && vma->vm_ops->populate &&
-			end > start && start >= vma->vm_start &&
-				end <= vma->vm_end) {
-
-		/* Must set VM_NONLINEAR before any pages are populated. */
-		if (pgoff != linear_page_index(vma, start) &&
-		    !(vma->vm_flags & VM_NONLINEAR)) {
-			if (!has_write_lock) {
-				up_read(&mm->mmap_sem);
-				down_write(&mm->mmap_sem);
-				has_write_lock = 1;
-				goto retry;
+	if (!vma || !(vma->vm_flags & VM_SHARED))
+		goto out;
+
+	if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
+		goto out;
+
+	if ((!vma->vm_ops || !vma->vm_ops->populate) &&
+					!(vma->vm_flags & VM_CAN_NONLINEAR))
+		goto out;
+
+	if (end <= start || start < vma->vm_start || end > vma->vm_end)
+		goto out;
+
+	/* Must set VM_NONLINEAR before any pages are populated. */
+	if (!(vma->vm_flags & VM_NONLINEAR)) {
+		/* Don't need a nonlinear mapping, exit success */
+		if (pgoff == linear_page_index(vma, start)) {
+			err = 0;
+			goto out;
+		}
+
+		if (!has_write_lock) {
+			up_read(&mm->mmap_sem);
+			down_write(&mm->mmap_sem);
+			has_write_lock = 1;
+			goto retry;
+		}
+		mapping = vma->vm_file->f_mapping;
+		spin_lock(&mapping->i_mmap_lock);
+		flush_dcache_mmap_lock(mapping);
+		vma->vm_flags |= VM_NONLINEAR;
+		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
+		flush_dcache_mmap_unlock(mapping);
+		spin_unlock(&mapping->i_mmap_lock);
+	}
+
+	if (vma->vm_flags & VM_CAN_NONLINEAR) {
+		err = populate_range(mm, vma, start, size, pgoff);
+		if (!err && !(flags & MAP_NONBLOCK)) {
+			if (unlikely(has_write_lock)) {
+				downgrade_write(&mm->mmap_sem);
+				has_write_lock = 0;
 			}
-			mapping = vma->vm_file->f_mapping;
-			spin_lock(&mapping->i_mmap_lock);
-			flush_dcache_mmap_lock(mapping);
-			vma->vm_flags |= VM_NONLINEAR;
-			vma_prio_tree_remove(vma, &mapping->i_mmap);
-			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
-			flush_dcache_mmap_unlock(mapping);
-			spin_unlock(&mapping->i_mmap_lock);
+			make_pages_present(start, start+size);
 		}
+	} else
+		err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot,
+					    	pgoff, flags & MAP_NONBLOCK);
 
-		err = vma->vm_ops->populate(vma, start, size,
-					    vma->vm_page_prot,
-					    pgoff, flags & MAP_NONBLOCK);
+	/*
+	 * We can't clear VM_NONLINEAR because we'd have to do
+	 * it after ->populate completes, and that would prevent
+	 * downgrading the lock.  (Locks can't be upgraded).
+	 */
 
-		/*
-		 * We can't clear VM_NONLINEAR because we'd have to do
-		 * it after ->populate completes, and that would prevent
-		 * downgrading the lock.  (Locks can't be upgraded).
-		 */
-	}
+out:
 	if (likely(!has_write_lock))
 		up_read(&mm->mmap_sem);
 	else
diff --git a/mm/memory.c b/mm/memory.c
index e6c99f6b5649..eee7fec3ab54 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1047,7 +1047,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		if (pages)
 			foll_flags |= FOLL_GET;
 		if (!write && !(vma->vm_flags & VM_LOCKED) &&
-		    (!vma->vm_ops || !vma->vm_ops->nopage))
+		    (!vma->vm_ops || (!vma->vm_ops->nopage &&
+					!vma->vm_ops->fault)))
 			foll_flags |= FOLL_ANON;
 
 		do {
@@ -2288,10 +2289,10 @@ oom:
 }
 
 /*
- * do_no_page() tries to create a new page mapping. It aggressively
+ * __do_fault() tries to create a new page mapping. It aggressively
  * tries to share with existing pages, but makes a separate copy if
- * the "write_access" parameter is true in order to avoid the next
- * page fault.
+ * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
+ * the next page fault.
  *
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
@@ -2300,64 +2301,82 @@ oom:
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		int write_access)
+		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
 	spinlock_t *ptl;
-	struct page *page, *nopage_page;
+	struct page *page, *faulted_page;
 	pte_t entry;
-	int ret = VM_FAULT_MINOR;
 	int anon = 0;
 	struct page *dirty_page = NULL;
+	struct fault_data fdata;
+
+	fdata.address = address & PAGE_MASK;
+	fdata.pgoff = pgoff;
+	fdata.flags = flags;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-	nopage_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
-	/* no page was available -- either SIGBUS, OOM or REFAULT */
-	if (unlikely(nopage_page == NOPAGE_SIGBUS))
-		return VM_FAULT_SIGBUS;
-	else if (unlikely(nopage_page == NOPAGE_OOM))
-		return VM_FAULT_OOM;
-	else if (unlikely(nopage_page == NOPAGE_REFAULT))
-		return VM_FAULT_MINOR;
+	if (likely(vma->vm_ops->fault)) {
+		fdata.type = -1;
+		faulted_page = vma->vm_ops->fault(vma, &fdata);
+		WARN_ON(fdata.type == -1);
+		if (unlikely(!faulted_page))
+			return fdata.type;
+	} else {
+		/* Legacy ->nopage path */
+		fdata.type = VM_FAULT_MINOR;
+		faulted_page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
+								&fdata.type);
+		/* no page was available -- either SIGBUS or OOM */
+		if (unlikely(faulted_page == NOPAGE_SIGBUS))
+			return VM_FAULT_SIGBUS;
+		else if (unlikely(faulted_page == NOPAGE_OOM))
+			return VM_FAULT_OOM;
+	}
 
-	BUG_ON(vma->vm_flags & VM_CAN_INVALIDATE && !PageLocked(nopage_page));
 	/*
-	 * For consistency in subsequent calls, make the nopage_page always
+	 * For consistency in subsequent calls, make the faulted_page always
 	 * locked.
 	 */
 	if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE)))
-		lock_page(nopage_page);
+		lock_page(faulted_page);
+	else
+		BUG_ON(!PageLocked(faulted_page));
 
 	/*
 	 * Should we do an early C-O-W break?
 	 */
-	page = nopage_page;
-	if (write_access) {
+	page = faulted_page;
+	if (flags & FAULT_FLAG_WRITE) {
 		if (!(vma->vm_flags & VM_SHARED)) {
+			anon = 1;
 			if (unlikely(anon_vma_prepare(vma))) {
-				ret = VM_FAULT_OOM;
-				goto out_error;
+				fdata.type = VM_FAULT_OOM;
+				goto out;
 			}
 			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 			if (!page) {
-				ret = VM_FAULT_OOM;
-				goto out_error;
+				fdata.type = VM_FAULT_OOM;
+				goto out;
 			}
-			copy_user_highpage(page, nopage_page, address, vma);
-			anon = 1;
+			copy_user_highpage(page, faulted_page, address, vma);
 		} else {
-			/* if the page will be shareable, see if the backing
+			/*
+			 * If the page will be shareable, see if the backing
 			 * address space wants to know that the page is about
-			 * to become writable */
+			 * to become writable
+			 */
 			if (vma->vm_ops->page_mkwrite &&
 			    vma->vm_ops->page_mkwrite(vma, page) < 0) {
-				ret = VM_FAULT_SIGBUS;
-				goto out_error;
+				fdata.type = VM_FAULT_SIGBUS;
+				anon = 1; /* no anon but release faulted_page */
+				goto out;
 			}
 		}
+
 	}
 
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2373,10 +2392,10 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * handle that later.
 	 */
 	/* Only go through if we didn't race with anybody else... */
-	if (likely(pte_none(*page_table))) {
+	if (likely(pte_same(*page_table, orig_pte))) {
 		flush_icache_page(vma, page);
 		entry = mk_pte(page, vma->vm_page_prot);
-		if (write_access)
+		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
@@ -2386,7 +2405,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(page);
-			if (write_access) {
+			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
 				get_page(dirty_page);
 			}
@@ -2399,25 +2418,42 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (anon)
 			page_cache_release(page);
 		else
-			anon = 1; /* not anon, but release nopage_page */
+			anon = 1; /* no anon but release faulted_page */
 	}
 
 	pte_unmap_unlock(page_table, ptl);
 
 out:
-	unlock_page(nopage_page);
+	unlock_page(faulted_page);
 	if (anon)
-		page_cache_release(nopage_page);
+		page_cache_release(faulted_page);
 	else if (dirty_page) {
 		set_page_dirty_balance(dirty_page);
 		put_page(dirty_page);
 	}
 
-	return ret;
+	return fdata.type;
+}
 
-out_error:
-	anon = 1; /* relase nopage_page */
-	goto out;
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pte_t orig_pte)
+{
+	pgoff_t pgoff = (((address & PAGE_MASK)
+			- vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+	unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
+}
+
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		int write_access, pgoff_t pgoff, pte_t orig_pte)
+{
+	unsigned int flags = FAULT_FLAG_NONLINEAR |
+				(write_access ? FAULT_FLAG_WRITE : 0);
+
+	return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
 }
 
 /*
@@ -2496,9 +2532,14 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		print_bad_pte(vma, orig_pte, address);
 		return VM_FAULT_OOM;
 	}
-	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
 
 	pgoff = pte_to_pgoff(orig_pte);
+
+	if (vma->vm_ops && vma->vm_ops->fault)
+		return do_nonlinear_fault(mm, vma, address, page_table, pmd,
+					write_access, pgoff, orig_pte);
+
+	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
 	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
 					vma->vm_page_prot, pgoff, 0);
 	if (err == -ENOMEM)
@@ -2532,10 +2573,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
-				if (vma->vm_ops->nopage)
-					return do_no_page(mm, vma, address,
-							  pte, pmd,
-							  write_access);
+				if (vma->vm_ops->fault || vma->vm_ops->nopage)
+					return do_linear_fault(mm, vma, address,
+						pte, pmd, write_access, entry);
 				if (unlikely(vma->vm_ops->nopfn))
 					return do_no_pfn(mm, vma, address, pte,
 							 pmd, write_access);
diff --git a/mm/mmap.c b/mm/mmap.c
index 144b4a290f2c..724f342bcf89 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1165,12 +1165,8 @@ out:
 		mm->locked_vm += len >> PAGE_SHIFT;
 		make_pages_present(addr, addr + len);
 	}
-	if (flags & MAP_POPULATE) {
-		up_write(&mm->mmap_sem);
-		sys_remap_file_pages(addr, len, 0,
-					pgoff, flags & MAP_NONBLOCK);
-		down_write(&mm->mmap_sem);
-	}
+	if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
+		make_pages_present(addr, addr + len);
 	return addr;
 
 unmap_and_free_vma:
diff --git a/mm/nommu.c b/mm/nommu.c
index 8bbbf147a794..aee0e1b0ebe7 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1341,8 +1341,7 @@ int in_gate_area_no_task(unsigned long addr)
 	return 0;
 }
 
-struct page *filemap_nopage(struct vm_area_struct *area,
-			unsigned long address, int *type)
+struct page *filemap_fault(struct vm_area_struct *vma, struct fault_data *fdata)
 {
 	BUG();
 	return NULL;
diff --git a/mm/rmap.c b/mm/rmap.c
index 61e492597a0b..fede5c7910be 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -621,8 +621,10 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
 			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
 			print_symbol (KERN_EMERG "  vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
-			if (vma->vm_ops)
+			if (vma->vm_ops) {
 				print_symbol (KERN_EMERG "  vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
+				print_symbol (KERN_EMERG "  vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
+			}
 			if (vma->vm_file && vma->vm_file->f_op)
 				print_symbol (KERN_EMERG "  vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
 			BUG();
diff --git a/mm/shmem.c b/mm/shmem.c
index 5808fadd3944..6b44440f1b24 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -83,7 +83,7 @@ enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
 	SGP_WRITE,	/* may exceed i_size, may allocate page */
-	SGP_NOPAGE,	/* same as SGP_CACHE, return with page locked */
+	SGP_FAULT,	/* same as SGP_CACHE, return with page locked */
 };
 
 static int shmem_getpage(struct inode *inode, unsigned long idx,
@@ -1101,6 +1101,10 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
 
 	if (idx >= SHMEM_MAX_INDEX)
 		return -EFBIG;
+
+	if (type)
+		*type = VM_FAULT_MINOR;
+
 	/*
 	 * Normally, filepage is NULL on entry, and either found
 	 * uptodate immediately, or allocated and zeroed, or read
@@ -1291,7 +1295,7 @@ repeat:
 done:
 	if (*pagep != filepage) {
 		*pagep = filepage;
-		if (sgp != SGP_NOPAGE)
+		if (sgp != SGP_FAULT)
 			unlock_page(filepage);
 
 	}
@@ -1305,76 +1309,31 @@ failed:
 	return error;
 }
 
-static struct page *shmem_nopage(struct vm_area_struct *vma,
-				 unsigned long address, int *type)
+static struct page *shmem_fault(struct vm_area_struct *vma,
+					struct fault_data *fdata)
 {
 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 	struct page *page = NULL;
-	unsigned long idx;
 	int error;
 
 	BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
 
-	idx = (address - vma->vm_start) >> PAGE_SHIFT;
-	idx += vma->vm_pgoff;
-	idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
-	if (((loff_t) idx << PAGE_CACHE_SHIFT) >= i_size_read(inode))
-		return NOPAGE_SIGBUS;
+	if (((loff_t)fdata->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+		fdata->type = VM_FAULT_SIGBUS;
+		return NULL;
+	}
 
-	error = shmem_getpage(inode, idx, &page, SGP_NOPAGE, type);
-	if (error)
-		return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
+	error = shmem_getpage(inode, fdata->pgoff, &page,
+						SGP_FAULT, &fdata->type);
+	if (error) {
+		fdata->type = ((error == -ENOMEM)?VM_FAULT_OOM:VM_FAULT_SIGBUS);
+		return NULL;
+	}
 
 	mark_page_accessed(page);
 	return page;
 }
 
-static int shmem_populate(struct vm_area_struct *vma,
-	unsigned long addr, unsigned long len,
-	pgprot_t prot, unsigned long pgoff, int nonblock)
-{
-	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
-	struct mm_struct *mm = vma->vm_mm;
-	enum sgp_type sgp = nonblock? SGP_QUICK: SGP_CACHE;
-	unsigned long size;
-
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (pgoff >= size || pgoff + (len >> PAGE_SHIFT) > size)
-		return -EINVAL;
-
-	while ((long) len > 0) {
-		struct page *page = NULL;
-		int err;
-		/*
-		 * Will need changing if PAGE_CACHE_SIZE != PAGE_SIZE
-		 */
-		err = shmem_getpage(inode, pgoff, &page, sgp, NULL);
-		if (err)
-			return err;
-		/* Page may still be null, but only if nonblock was set. */
-		if (page) {
-			mark_page_accessed(page);
-			err = install_page(mm, vma, addr, page, prot);
-			if (err) {
-				page_cache_release(page);
-				return err;
-			}
-		} else if (vma->vm_flags & VM_NONLINEAR) {
-			/* No page was found just because we can't read it in
-			 * now (being here implies nonblock != 0), but the page
-			 * may exist, so set the PTE to fault it in later. */
-    			err = install_file_pte(mm, vma, addr, pgoff, prot);
-			if (err)
-	    			return err;
-		}
-
-		len -= PAGE_SIZE;
-		addr += PAGE_SIZE;
-		pgoff++;
-	}
-	return 0;
-}
-
 #ifdef CONFIG_NUMA
 int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 {
@@ -1419,7 +1378,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
-	vma->vm_flags |= VM_CAN_INVALIDATE;
+	vma->vm_flags |= VM_CAN_INVALIDATE | VM_CAN_NONLINEAR;
 	return 0;
 }
 
@@ -2465,8 +2424,7 @@ static const struct super_operations shmem_ops = {
 };
 
 static struct vm_operations_struct shmem_vm_ops = {
-	.nopage		= shmem_nopage,
-	.populate	= shmem_populate,
+	.fault		= shmem_fault,
 #ifdef CONFIG_NUMA
 	.set_policy     = shmem_set_policy,
 	.get_policy     = shmem_get_policy,
diff --git a/mm/truncate.c b/mm/truncate.c
index aed85f0b707f..5cdfbc1a59fd 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,7 +82,7 @@ EXPORT_SYMBOL(cancel_dirty_page);
 /*
  * If truncate cannot remove the fs-private metadata from the page, the page
  * becomes anonymous.  It will be left on the LRU and may even be mapped into
- * user pagetables if we're racing with filemap_nopage().
+ * user pagetables if we're racing with filemap_fault().
  *
  * We need to bale out if page->mapping is no longer equal to the original
  * mapping.  This happens a) when the VM reclaimed the page while we waited on
-- 
cgit v1.2.3