From 07a4e2da7dd3c9345f84b2552872f9d38c257451 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Wed, 12 Jan 2005 13:08:26 +0000
Subject: NTFS: Use i_size_{read,write}() in fs/ntfs/{aops.c,mft.c} and protect
       access to the i_size and other size fields using the size_lock.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/mft.c | 157 ++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 113 insertions(+), 44 deletions(-)

(limited to 'fs/ntfs/mft.c')

diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index dfa85ac2f8ba..20011e02f5b6 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -45,6 +45,7 @@
  */
 static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
 {
+	loff_t i_size;
 	ntfs_volume *vol = ni->vol;
 	struct inode *mft_vi = vol->mft_ino;
 	struct page *page;
@@ -60,13 +61,14 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
 	index = ni->mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
 	ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
 
+	i_size = i_size_read(mft_vi);
 	/* The maximum valid index into the page cache for $MFT's data. */
-	end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
+	end_index = i_size >> PAGE_CACHE_SHIFT;
 
 	/* If the wanted index is out of bounds the mft record doesn't exist. */
 	if (unlikely(index >= end_index)) {
-		if (index > end_index || (mft_vi->i_size & ~PAGE_CACHE_MASK) <
-				ofs + vol->mft_record_size) {
+		if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs +
+				vol->mft_record_size) {
 			page = ERR_PTR(-ENOENT);
 			ntfs_error(vol->sb, "Attemt to read mft record 0x%lx, "
 					"which is beyond the end of the mft.  "
@@ -1121,6 +1123,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
 		ntfs_inode *base_ni)
 {
 	s64 pass_end, ll, data_pos, pass_start, ofs, bit;
+	unsigned long flags;
 	struct address_space *mftbmp_mapping;
 	u8 *buf, *byte;
 	struct page *page;
@@ -1134,9 +1137,13 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
 	 * Set the end of the pass making sure we do not overflow the mft
 	 * bitmap.
 	 */
+	read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
 	pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
 			vol->mft_record_size_bits;
+	read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
+	read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
 	ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
+	read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
 	if (pass_end > ll)
 		pass_end = ll;
 	pass = 1;
@@ -1263,6 +1270,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 {
 	LCN lcn;
 	s64 ll;
+	unsigned long flags;
 	struct page *page;
 	ntfs_inode *mft_ni, *mftbmp_ni;
 	runlist_element *rl, *rl2 = NULL;
@@ -1286,8 +1294,10 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 	 * mft bitmap cannot be zero so we are ok to do this.
 	 * ntfs_find_vcn() returns the runlist locked on success.
 	 */
-	rl = ntfs_find_vcn(mftbmp_ni, (mftbmp_ni->allocated_size - 1) >>
-			vol->cluster_size_bits, TRUE);
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	ll = mftbmp_ni->allocated_size;
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	rl = ntfs_find_vcn(mftbmp_ni, (ll - 1) >> vol->cluster_size_bits, TRUE);
 	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
 		ntfs_error(vol->sb, "Failed to determine last allocated "
 				"cluster of mft bitmap attribute.");
@@ -1458,9 +1468,11 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 		}
 		a = ctx->attr;
 	}
+	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
 	mftbmp_ni->allocated_size += vol->cluster_size;
 	a->data.non_resident.allocated_size =
 			cpu_to_sle64(mftbmp_ni->allocated_size);
+	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
 	/* Ensure the changes make it to disk. */
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	mark_mft_record_dirty(ctx->ntfs_ino);
@@ -1476,7 +1488,9 @@ restore_undo_alloc:
 			0, ctx)) {
 		ntfs_error(vol->sb, "Failed to find last attribute extent of "
 				"mft bitmap attribute.%s", es);
+		write_lock_irqsave(&mftbmp_ni->size_lock, flags);
 		mftbmp_ni->allocated_size += vol->cluster_size;
+		write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
 		ntfs_attr_put_search_ctx(ctx);
 		unmap_mft_record(mft_ni);
 		up_write(&mftbmp_ni->runlist.lock);
@@ -1550,6 +1564,7 @@ undo_alloc:
 static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
 {
 	s64 old_data_size, old_initialized_size;
+	unsigned long flags;
 	struct inode *mftbmp_vi;
 	ntfs_inode *mft_ni, *mftbmp_ni;
 	ntfs_attr_search_ctx *ctx;
@@ -1583,7 +1598,8 @@ static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
 		goto put_err_out;
 	}
 	a = ctx->attr;
-	old_data_size = mftbmp_vi->i_size;
+	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	old_data_size = i_size_read(mftbmp_vi);
 	old_initialized_size = mftbmp_ni->initialized_size;
 	/*
 	 * We can simply update the initialized_size before filling the space
@@ -1593,11 +1609,12 @@ static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
 	mftbmp_ni->initialized_size += 8;
 	a->data.non_resident.initialized_size =
 			cpu_to_sle64(mftbmp_ni->initialized_size);
-	if (mftbmp_ni->initialized_size > mftbmp_vi->i_size) {
-		mftbmp_vi->i_size = mftbmp_ni->initialized_size;
+	if (mftbmp_ni->initialized_size > old_data_size) {
+		i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
 		a->data.non_resident.data_size =
-				cpu_to_sle64(mftbmp_vi->i_size);
+				cpu_to_sle64(mftbmp_ni->initialized_size);
 	}
+	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
 	/* Ensure the changes make it to disk. */
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	mark_mft_record_dirty(ctx->ntfs_ino);
@@ -1636,22 +1653,28 @@ unm_err_out:
 		goto err_out;
 	}
 	a = ctx->attr;
+	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
 	mftbmp_ni->initialized_size = old_initialized_size;
 	a->data.non_resident.initialized_size =
 			cpu_to_sle64(old_initialized_size);
-	if (mftbmp_vi->i_size != old_data_size) {
-		mftbmp_vi->i_size = old_data_size;
+	if (i_size_read(mftbmp_vi) != old_data_size) {
+		i_size_write(mftbmp_vi, old_data_size);
 		a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
 	}
+	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	mark_mft_record_dirty(ctx->ntfs_ino);
 	ntfs_attr_put_search_ctx(ctx);
 	unmap_mft_record(mft_ni);
+#ifdef DEBUG
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
 	ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
 			"data_size 0x%llx, initialized_size 0x%llx.",
 			(long long)mftbmp_ni->allocated_size,
-			(long long)mftbmp_vi->i_size,
+			(long long)i_size_read(mftbmp_vi),
 			(long long)mftbmp_ni->initialized_size);
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
 err_out:
 	return ret;
 }
@@ -1679,7 +1702,8 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 {
 	LCN lcn;
 	VCN old_last_vcn;
-	s64 min_nr, nr, ll = 0;
+	s64 min_nr, nr, ll;
+	unsigned long flags;
 	ntfs_inode *mft_ni;
 	runlist_element *rl, *rl2;
 	ntfs_attr_search_ctx *ctx = NULL;
@@ -1697,8 +1721,10 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	 * attribute cannot be zero so we are ok to do this.
 	 * ntfs_find_vcn() returns the runlist locked on success.
 	 */
-	rl = ntfs_find_vcn(mft_ni, (mft_ni->allocated_size - 1) >>
-			vol->cluster_size_bits, TRUE);
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	ll = mft_ni->allocated_size;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	rl = ntfs_find_vcn(mft_ni, (ll - 1) >> vol->cluster_size_bits, TRUE);
 	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
 		ntfs_error(vol->sb, "Failed to determine last allocated "
 				"cluster of mft data attribute.");
@@ -1710,8 +1736,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 		return ret;
 	}
 	lcn = rl->lcn + rl->length;
-	ntfs_debug("Last lcn of mft data attribute is 0x%llx.",
-			(long long)lcn);
+	ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
 	/* Minimum allocation is one mft record worth of clusters. */
 	min_nr = vol->mft_record_size >> vol->cluster_size_bits;
 	if (!min_nr)
@@ -1721,12 +1746,13 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	if (!nr)
 		nr = min_nr;
 	/* Ensure we do not go above 2^32-1 mft records. */
-	if (unlikely((mft_ni->allocated_size +
-			(nr << vol->cluster_size_bits)) >>
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	ll = mft_ni->allocated_size;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
 			vol->mft_record_size_bits >= (1ll << 32))) {
 		nr = min_nr;
-		if (unlikely((mft_ni->allocated_size +
-				(nr << vol->cluster_size_bits)) >>
+		if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
 				vol->mft_record_size_bits >= (1ll << 32))) {
 			ntfs_warning(vol->sb, "Cannot allocate mft record "
 					"because the maximum number of inodes "
@@ -1875,9 +1901,11 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 		}
 		a = ctx->attr;
 	}
+	write_lock_irqsave(&mft_ni->size_lock, flags);
 	mft_ni->allocated_size += nr << vol->cluster_size_bits;
 	a->data.non_resident.allocated_size =
 			cpu_to_sle64(mft_ni->allocated_size);
+	write_unlock_irqrestore(&mft_ni->size_lock, flags);
 	/* Ensure the changes make it to disk. */
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	mark_mft_record_dirty(ctx->ntfs_ino);
@@ -1892,7 +1920,9 @@ restore_undo_alloc:
 			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
 		ntfs_error(vol->sb, "Failed to find last attribute extent of "
 				"mft data attribute.%s", es);
+		write_lock_irqsave(&mft_ni->size_lock, flags);
 		mft_ni->allocated_size += nr << vol->cluster_size_bits;
+		write_unlock_irqrestore(&mft_ni->size_lock, flags);
 		ntfs_attr_put_search_ctx(ctx);
 		unmap_mft_record(mft_ni);
 		up_write(&mft_ni->runlist.lock);
@@ -2036,6 +2066,7 @@ static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
  */
 static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
 {
+	loff_t i_size;
 	struct inode *mft_vi = vol->mft_ino;
 	struct page *page;
 	MFT_RECORD *m;
@@ -2051,10 +2082,11 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
 	index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
 	ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
 	/* The maximum valid index into the page cache for $MFT's data. */
-	end_index = mft_vi->i_size >> PAGE_CACHE_SHIFT;
+	i_size = i_size_read(mft_vi);
+	end_index = i_size >> PAGE_CACHE_SHIFT;
 	if (unlikely(index >= end_index)) {
 		if (unlikely(index > end_index || ofs + vol->mft_record_size >=
-				(mft_vi->i_size & ~PAGE_CACHE_MASK))) {
+				(i_size & ~PAGE_CACHE_MASK))) {
 			ntfs_error(vol->sb, "Tried to format non-existing mft "
 					"record 0x%llx.", (long long)mft_no);
 			return -ENOENT;
@@ -2188,6 +2220,7 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 		ntfs_inode *base_ni, MFT_RECORD **mrec)
 {
 	s64 ll, bit, old_data_initialized, old_data_size;
+	unsigned long flags;
 	struct inode *vi;
 	struct page *page;
 	ntfs_inode *mft_ni, *mftbmp_ni, *ni;
@@ -2237,9 +2270,13 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 	 * the first 24 mft records as they are special and whilst they may not
 	 * be in use, we do not allocate from them.
 	 */
+	read_lock_irqsave(&mft_ni->size_lock, flags);
 	ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
-	if (mftbmp_ni->initialized_size << 3 > ll &&
-			mftbmp_ni->initialized_size > 3) {
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	old_data_initialized = mftbmp_ni->initialized_size;
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
 		bit = ll;
 		if (bit < 24)
 			bit = 24;
@@ -2254,15 +2291,18 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 	 * mft record that we can allocate.
 	 * Note: The smallest mft record we allocate is mft record 24.
 	 */
-	bit = mftbmp_ni->initialized_size << 3;
+	bit = old_data_initialized << 3;
 	if (unlikely(bit >= (1ll << 32)))
 		goto max_err_out;
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+	old_data_size = mftbmp_ni->allocated_size;
 	ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
 			"data_size 0x%llx, initialized_size 0x%llx.",
-			(long long)mftbmp_ni->allocated_size,
-			(long long)vol->mftbmp_ino->i_size,
-			(long long)mftbmp_ni->initialized_size);
-	if (mftbmp_ni->initialized_size + 8 > mftbmp_ni->allocated_size) {
+			(long long)old_data_size,
+			(long long)i_size_read(vol->mftbmp_ino),
+			(long long)old_data_initialized);
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+	if (old_data_initialized + 8 > old_data_size) {
 		/* Need to extend bitmap by one more cluster. */
 		ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
 		err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
@@ -2270,12 +2310,16 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 			up_write(&vol->mftbmp_lock);
 			goto err_out;
 		}
+#ifdef DEBUG
+		read_lock_irqsave(&mftbmp_ni->size_lock, flags);
 		ntfs_debug("Status of mftbmp after allocation extension: "
 				"allocated_size 0x%llx, data_size 0x%llx, "
 				"initialized_size 0x%llx.",
 				(long long)mftbmp_ni->allocated_size,
-				(long long)vol->mftbmp_ino->i_size,
+				(long long)i_size_read(vol->mftbmp_ino),
 				(long long)mftbmp_ni->initialized_size);
+		read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
 	}
 	/*
 	 * We now have sufficient allocated space, extend the initialized_size
@@ -2287,12 +2331,16 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
 		up_write(&vol->mftbmp_lock);
 		goto err_out;
 	}
+#ifdef DEBUG
+	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
 	ntfs_debug("Status of mftbmp after initialized extention: "
 			"allocated_size 0x%llx, data_size 0x%llx, "
 			"initialized_size 0x%llx.",
 			(long long)mftbmp_ni->allocated_size,
-			(long long)vol->mftbmp_ino->i_size,
+			(long long)i_size_read(vol->mftbmp_ino),
 			(long long)mftbmp_ni->initialized_size);
+	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
 	ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
 found_free_rec:
 	/* @bit is the found free mft record, allocate it in the mft bitmap. */
@@ -2314,7 +2362,10 @@ have_alloc_rec:
 	 * parallel allocation could allocate the same mft record as this one.
 	 */
 	ll = (bit + 1) << vol->mft_record_size_bits;
-	if (ll <= mft_ni->initialized_size) {
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	old_data_initialized = mft_ni->initialized_size;
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	if (ll <= old_data_initialized) {
 		ntfs_debug("Allocated mft record already initialized.");
 		goto mft_rec_already_initialized;
 	}
@@ -2325,25 +2376,32 @@ have_alloc_rec:
 	 * actually traversed more than once when a freshly formatted volume is
 	 * first written to so it optimizes away nicely in the common case.
 	 */
+	read_lock_irqsave(&mft_ni->size_lock, flags);
+	old_data_size = mft_ni->allocated_size;
 	ntfs_debug("Status of mft data before extension: "
 			"allocated_size 0x%llx, data_size 0x%llx, "
 			"initialized_size 0x%llx.",
-			(long long)mft_ni->allocated_size,
-			(long long)vol->mft_ino->i_size,
+			(long long)old_data_size,
+			(long long)i_size_read(vol->mft_ino),
 			(long long)mft_ni->initialized_size);
-	while (ll > mft_ni->allocated_size) {
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
+	while (ll > old_data_size) {
 		err = ntfs_mft_data_extend_allocation_nolock(vol);
 		if (unlikely(err)) {
 			ntfs_error(vol->sb, "Failed to extend mft data "
 					"allocation.");
 			goto undo_mftbmp_alloc_nolock;
 		}
+#ifdef DEBUG
+		read_lock_irqsave(&mft_ni->size_lock, flags);
 		ntfs_debug("Status of mft data after allocation extension: "
 				"allocated_size 0x%llx, data_size 0x%llx, "
 				"initialized_size 0x%llx.",
 				(long long)mft_ni->allocated_size,
-				(long long)vol->mft_ino->i_size,
+				(long long)i_size_read(vol->mft_ino),
 				(long long)mft_ni->initialized_size);
+		read_unlock_irqrestore(&mft_ni->size_lock, flags);
+#endif /* DEBUG */
 	}
 	/*
 	 * Extend mft data initialized size (and data size of course) to reach
@@ -2352,6 +2410,7 @@ have_alloc_rec:
 	 * needed by ntfs_mft_record_format().  We will update the attribute
 	 * record itself in one fell swoop later on.
 	 */
+	write_lock_irqsave(&mft_ni->size_lock, flags);
 	old_data_initialized = mft_ni->initialized_size;
 	old_data_size = vol->mft_ino->i_size;
 	while (ll > mft_ni->initialized_size) {
@@ -2360,8 +2419,9 @@ have_alloc_rec:
 		new_initialized_size = mft_ni->initialized_size +
 				vol->mft_record_size;
 		mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
-		if (new_initialized_size > vol->mft_ino->i_size)
-			vol->mft_ino->i_size = new_initialized_size;
+		if (new_initialized_size > i_size_read(vol->mft_ino))
+			i_size_write(vol->mft_ino, new_initialized_size);
+		write_unlock_irqrestore(&mft_ni->size_lock, flags);
 		ntfs_debug("Initializing mft record 0x%llx.",
 				(long long)mft_no);
 		err = ntfs_mft_record_format(vol, mft_no);
@@ -2369,8 +2429,10 @@ have_alloc_rec:
 			ntfs_error(vol->sb, "Failed to format mft record.");
 			goto undo_data_init;
 		}
+		write_lock_irqsave(&mft_ni->size_lock, flags);
 		mft_ni->initialized_size = new_initialized_size;
 	}
+	write_unlock_irqrestore(&mft_ni->size_lock, flags);
 	record_formatted = TRUE;
 	/* Update the mft data attribute record to reflect the new sizes. */
 	m = map_mft_record(mft_ni);
@@ -2396,22 +2458,27 @@ have_alloc_rec:
 		goto undo_data_init;
 	}
 	a = ctx->attr;
+	read_lock_irqsave(&mft_ni->size_lock, flags);
 	a->data.non_resident.initialized_size =
 			cpu_to_sle64(mft_ni->initialized_size);
-	a->data.non_resident.data_size = cpu_to_sle64(vol->mft_ino->i_size);
+	a->data.non_resident.data_size =
+			cpu_to_sle64(i_size_read(vol->mft_ino));
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
 	/* Ensure the changes make it to disk. */
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
 	mark_mft_record_dirty(ctx->ntfs_ino);
 	ntfs_attr_put_search_ctx(ctx);
 	unmap_mft_record(mft_ni);
+	read_lock_irqsave(&mft_ni->size_lock, flags);
 	ntfs_debug("Status of mft data after mft record initialization: "
 			"allocated_size 0x%llx, data_size 0x%llx, "
 			"initialized_size 0x%llx.",
 			(long long)mft_ni->allocated_size,
-			(long long)vol->mft_ino->i_size,
+			(long long)i_size_read(vol->mft_ino),
 			(long long)mft_ni->initialized_size);
-	BUG_ON(vol->mft_ino->i_size > mft_ni->allocated_size);
-	BUG_ON(mft_ni->initialized_size > vol->mft_ino->i_size);
+	BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
+	BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
 mft_rec_already_initialized:
 	/*
 	 * We can finally drop the mft bitmap lock as the mft data attribute
@@ -2652,8 +2719,10 @@ mft_rec_already_initialized:
 	*mrec = m;
 	return ni;
 undo_data_init:
+	write_lock_irqsave(&mft_ni->size_lock, flags);
 	mft_ni->initialized_size = old_data_initialized;
-	vol->mft_ino->i_size = old_data_size;
+	i_size_write(vol->mft_ino, old_data_size);
+	write_unlock_irqrestore(&mft_ni->size_lock, flags);
 	goto undo_mftbmp_alloc_nolock;
 undo_mftbmp_alloc:
 	down_write(&vol->mftbmp_lock);
-- 
cgit v1.2.3


From 3834c3f227725e2395840aed82342bda4ee9d379 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Thu, 13 Jan 2005 11:04:39 +0000
Subject: NTFS: Fix stupid bug in fs/ntfs/mft.c introduced in last changeset.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/mft.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs/ntfs/mft.c')

diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 20011e02f5b6..4e0bf39426cf 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2377,22 +2377,20 @@ have_alloc_rec:
 	 * first written to so it optimizes away nicely in the common case.
 	 */
 	read_lock_irqsave(&mft_ni->size_lock, flags);
-	old_data_size = mft_ni->allocated_size;
 	ntfs_debug("Status of mft data before extension: "
 			"allocated_size 0x%llx, data_size 0x%llx, "
 			"initialized_size 0x%llx.",
-			(long long)old_data_size,
+			(long long)mft_ni->allocated_size,
 			(long long)i_size_read(vol->mft_ino),
 			(long long)mft_ni->initialized_size);
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	while (ll > old_data_size) {
+	while (ll > mft_ni->allocated_size) {
+		read_unlock_irqrestore(&mft_ni->size_lock, flags);
 		err = ntfs_mft_data_extend_allocation_nolock(vol);
 		if (unlikely(err)) {
 			ntfs_error(vol->sb, "Failed to extend mft data "
 					"allocation.");
 			goto undo_mftbmp_alloc_nolock;
 		}
-#ifdef DEBUG
 		read_lock_irqsave(&mft_ni->size_lock, flags);
 		ntfs_debug("Status of mft data after allocation extension: "
 				"allocated_size 0x%llx, data_size 0x%llx, "
@@ -2400,9 +2398,8 @@ have_alloc_rec:
 				(long long)mft_ni->allocated_size,
 				(long long)i_size_read(vol->mft_ino),
 				(long long)mft_ni->initialized_size);
-		read_unlock_irqrestore(&mft_ni->size_lock, flags);
-#endif /* DEBUG */
 	}
+	read_unlock_irqrestore(&mft_ni->size_lock, flags);
 	/*
 	 * Extend mft data initialized size (and data size of course) to reach
 	 * the allocated mft record, formatting the mft records allong the way.
-- 
cgit v1.2.3


From b6ad6c52fe36ab35d0fe28c064f59de2ba670c2a Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Tue, 15 Feb 2005 10:08:43 +0000
Subject: NTFS: - Split ntfs_map_runlist() into ntfs_map_runlist() and a
 non-locking 	helper ntfs_map_runlist_nolock() which is used by
 ntfs_map_runlist(). 	This allows us to map runlist fragments with the
 runlist lock already 	held without having to drop and reacquire it around
 the call.  Adapt 	all callers.       - Change ntfs_find_vcn() to
 ntfs_find_vcn_nolock() which takes a locked 	runlist.  This allows us to
 find runlist elements with the runlist 	lock already held without
 having to drop and reacquire it around the 	call.  Adapt all callers.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog  |   9 +++++
 fs/ntfs/aops.c     |  34 +++++------------
 fs/ntfs/attrib.c   | 107 +++++++++++++++++++++++++++++++----------------------
 fs/ntfs/attrib.h   |   5 ++-
 fs/ntfs/lcnalloc.c |  39 ++++++-------------
 fs/ntfs/mft.c      |  38 +++++++++++--------
 6 files changed, 119 insertions(+), 113 deletions(-)

(limited to 'fs/ntfs/mft.c')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index b40c334e616f..9d42393c16c0 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -63,6 +63,15 @@ ToDo/Notes:
 	- Fix a bug in fs/ntfs/runlist.c::ntfs_mapping_pairs_decompress() in
 	  the creation of the unmapped runlist element for the base attribute
 	  extent.
+	- Split ntfs_map_runlist() into ntfs_map_runlist() and a non-locking
+	  helper ntfs_map_runlist_nolock() which is used by ntfs_map_runlist().
+	  This allows us to map runlist fragments with the runlist lock already
+	  held without having to drop and reacquire it around the call.  Adapt
+	  all callers.
+	- Change ntfs_find_vcn() to ntfs_find_vcn_nolock() which takes a locked
+	  runlist.  This allows us to find runlist elements with the runlist
+	  lock already held without having to drop and reacquire it around the
+	  call.  Adapt all callers.
 
 2.1.22 - Many bug and race fixes and error handling improvements.
 
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 812d53e93354..2b4b8b9e8796 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -2,7 +2,7 @@
  * aops.c - NTFS kernel address space operations and page cache handling.
  *	    Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -135,7 +135,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 					i * rec_size), rec_size);
 		flush_dcache_page(page);
 		kunmap_atomic(addr, KM_BIO_SRC_IRQ);
-		if (likely(!PageError(page) && page_uptodate))
+		if (likely(page_uptodate && !PageError(page)))
 			SetPageUptodate(page);
 	}
 	unlock_page(page);
@@ -347,11 +347,11 @@ handle_zblock:
  */
 static int ntfs_readpage(struct file *file, struct page *page)
 {
-	loff_t i_size;
 	ntfs_inode *ni, *base_ni;
 	u8 *kaddr;
 	ntfs_attr_search_ctx *ctx;
 	MFT_RECORD *mrec;
+	unsigned long flags;
 	u32 attr_len;
 	int err = 0;
 
@@ -389,9 +389,9 @@ static int ntfs_readpage(struct file *file, struct page *page)
 	 * Attribute is resident, implying it is not compressed or encrypted.
 	 * This also means the attribute is smaller than an mft record and
 	 * hence smaller than a page, so can simply zero out any pages with
-	 * index above 0.  We can also do this if the file size is 0.
+	 * index above 0.
 	 */
-	if (unlikely(page->index > 0 || !i_size_read(VFS_I(ni)))) {
+	if (unlikely(page->index > 0)) {
 		kaddr = kmap_atomic(page, KM_USER0);
 		memset(kaddr, 0, PAGE_CACHE_SIZE);
 		flush_dcache_page(page);
@@ -418,9 +418,10 @@ static int ntfs_readpage(struct file *file, struct page *page)
 	if (unlikely(err))
 		goto put_unm_err_out;
 	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
-	i_size = i_size_read(VFS_I(ni));
-	if (unlikely(attr_len > i_size))
-		attr_len = i_size;
+	read_lock_irqsave(&ni->size_lock, flags);
+	if (unlikely(attr_len > ni->initialized_size))
+		attr_len = ni->initialized_size;
+	read_unlock_irqrestore(&ni->size_lock, flags);
 	kaddr = kmap_atomic(page, KM_USER0);
 	/* Copy the data to the page. */
 	memcpy(kaddr, (u8*)ctx->attr +
@@ -1247,20 +1248,6 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
 	int err;
 
 	BUG_ON(!PageLocked(page));
-	/*
-	 * If a previous ntfs_truncate() failed, repeat it and abort if it
-	 * fails again.
-	 */
-	if (unlikely(NInoTruncateFailed(ni))) {
-		down_write(&vi->i_alloc_sem);
-		err = ntfs_truncate(vi);
-		up_write(&vi->i_alloc_sem);
-		if (err || NInoTruncateFailed(ni)) {
-			if (!err)
-				err = -EIO;
-			goto err_out;
-		}
-	}
 	i_size = i_size_read(vi);
 	/* Is the page fully outside i_size? (truncate in progress) */
 	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
@@ -1490,13 +1477,12 @@ static int ntfs_prepare_nonresident_write(struct page *page,
 
 	read_lock_irqsave(&ni->size_lock, flags);
 	/*
-	 * The first out of bounds block for the allocated size. No need to
+	 * The first out of bounds block for the allocated size.  No need to
 	 * round up as allocated_size is in multiples of cluster size and the
 	 * minimum cluster size is 512 bytes, which is equal to the smallest
 	 * blocksize.
 	 */
 	ablock = ni->allocated_size >> blocksize_bits;
-
 	i_size = i_size_read(vi);
 	initialized_size = ni->initialized_size;
 	read_unlock_irqrestore(&ni->size_lock, flags);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 7d668466dcd7..7a16f7ca76d8 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1,7 +1,7 @@
 /**
  * attrib.c - NTFS attribute operations.  Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -30,7 +30,7 @@
 #include "types.h"
 
 /**
- * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
+ * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
  * @ni:		ntfs inode for which to map (part of) a runlist
  * @vcn:	map runlist part containing this vcn
  *
@@ -38,24 +38,23 @@
  *
  * Return 0 on success and -errno on error.
  *
- * Locking: - The runlist must be unlocked on entry and is unlocked on return.
- *	    - This function takes the lock for writing and modifies the runlist.
+ * Locking: - The runlist must be locked for writing.
+ *	    - This function modifies the runlist.
  */
-int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
+int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn)
 {
 	ntfs_inode *base_ni;
-	ntfs_attr_search_ctx *ctx;
 	MFT_RECORD *mrec;
+	ntfs_attr_search_ctx *ctx;
+	runlist_element *rl;
 	int err = 0;
 
 	ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
 			(unsigned long long)vcn);
-
 	if (!NInoAttr(ni))
 		base_ni = ni;
 	else
 		base_ni = ni->ext.base_ntfs_ino;
-
 	mrec = map_mft_record(base_ni);
 	if (IS_ERR(mrec))
 		return PTR_ERR(mrec);
@@ -66,15 +65,7 @@ int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
 	}
 	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
 			CASE_SENSITIVE, vcn, NULL, 0, ctx);
-	if (unlikely(err))
-		goto put_err_out;
-
-	down_write(&ni->runlist.lock);
-	/* Make sure someone else didn't do the work while we were sleeping. */
-	if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
-			LCN_RL_NOT_MAPPED)) {
-		runlist_element *rl;
-
+	if (likely(!err)) {
 		rl = ntfs_mapping_pairs_decompress(ni->vol, ctx->attr,
 				ni->runlist.rl);
 		if (IS_ERR(rl))
@@ -82,9 +73,6 @@ int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
 		else
 			ni->runlist.rl = rl;
 	}
-	up_write(&ni->runlist.lock);
-
-put_err_out:
 	ntfs_attr_put_search_ctx(ctx);
 err_out:
 	unmap_mft_record(base_ni);
@@ -92,17 +80,45 @@ err_out:
 }
 
 /**
- * ntfs_find_vcn - find a vcn in the runlist described by an ntfs inode
- * @ni:		ntfs inode describing the runlist to search
- * @vcn:	vcn to find
- * @need_write:	if false, lock for reading and if true, lock for writing
+ * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
+ * @ni:		ntfs inode for which to map (part of) a runlist
+ * @vcn:	map runlist part containing this vcn
+ *
+ * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - The runlist must be unlocked on entry and is unlocked on return.
+ *	    - This function takes the runlist lock for writing and modifies the
+ *	      runlist.
+ */
+int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
+{
+	int err = 0;
+
+	down_write(&ni->runlist.lock);
+	/* Make sure someone else didn't do the work while we were sleeping. */
+	if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
+			LCN_RL_NOT_MAPPED))
+		err = ntfs_map_runlist_nolock(ni, vcn);
+	up_write(&ni->runlist.lock);
+	return err;
+}
+
+/**
+ * ntfs_find_vcn_nolock - find a vcn in the runlist described by an ntfs inode
+ * @ni:			ntfs inode describing the runlist to search
+ * @vcn:		vcn to find
+ * @write_locked:	true if the runlist is locked for writing
  *
  * Find the virtual cluster number @vcn in the runlist described by the ntfs
  * inode @ni and return the address of the runlist element containing the @vcn.
- * The runlist is left locked and the caller has to unlock it.  If @need_write
- * is true, the runlist is locked for writing and if @need_write is false, the
- * runlist is locked for reading.  In the error case, the runlist is not left
- * locked.
+ * The runlist is left locked and the caller has to unlock it.  In the error
+ * case, the runlist is left in the same locking state as on entry.
+ *
+ * Note if @write_locked is FALSE the lock may be dropped inside the function
+ * so you cannot rely on the runlist still being the same when this function
+ * returns.
  *
  * Note you need to distinguish between the lcn of the returned runlist element
  * being >= 0 and LCN_HOLE.  In the later case you have to return zeroes on
@@ -124,28 +140,24 @@ err_out:
  *	      true, it is locked for writing.  Otherwise is is locked for
  *	      reading.
  */
-runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn,
-		const BOOL need_write)
+runlist_element *ntfs_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
+		const BOOL write_locked)
 {
 	runlist_element *rl;
 	int err = 0;
 	BOOL is_retry = FALSE;
 
-	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, lock for %sing.",
+	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
 			ni->mft_no, (unsigned long long)vcn,
-			!need_write ? "read" : "writ");
+			write_locked ? "write" : "read");
 	BUG_ON(!ni);
 	BUG_ON(!NInoNonResident(ni));
 	BUG_ON(vcn < 0);
-lock_retry_remap:
-	if (!need_write)
-		down_read(&ni->runlist.lock);
-	else
-		down_write(&ni->runlist.lock);
+retry_remap:
 	rl = ni->runlist.rl;
 	if (likely(rl && vcn >= rl[0].vcn)) {
 		while (likely(rl->length)) {
-			if (likely(vcn < rl[1].vcn)) {
+			if (unlikely(vcn < rl[1].vcn)) {
 				if (likely(rl->lcn >= LCN_HOLE)) {
 					ntfs_debug("Done.");
 					return rl;
@@ -161,19 +173,23 @@ lock_retry_remap:
 				err = -EIO;
 		}
 	}
-	if (!need_write)
-		up_read(&ni->runlist.lock);
-	else
-		up_write(&ni->runlist.lock);
 	if (!err && !is_retry) {
 		/*
 		 * The @vcn is in an unmapped region, map the runlist and
 		 * retry.
 		 */
-		err = ntfs_map_runlist(ni, vcn);
+		if (!write_locked) {
+			up_read(&ni->runlist.lock);
+			down_write(&ni->runlist.lock);
+		}
+		err = ntfs_map_runlist_nolock(ni, vcn);
+		if (!write_locked) {
+			up_write(&ni->runlist.lock);
+			down_read(&ni->runlist.lock);
+		}
 		if (likely(!err)) {
 			is_retry = TRUE;
-			goto lock_retry_remap;
+			goto retry_remap;
 		}
 		/*
 		 * -EINVAL and -ENOENT coming from a failed mapping attempt are
@@ -184,7 +200,8 @@ lock_retry_remap:
 			err = -EIO;
 	} else if (!err)
 		err = -EIO;
-	ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
+	if (err != -ENOENT)
+		ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
 	return ERR_PTR(err);
 }
 
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index e0c2c6c81bc0..3eb451657025 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -60,10 +60,11 @@ typedef struct {
 	ATTR_RECORD *base_attr;
 } ntfs_attr_search_ctx;
 
+extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn);
 extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
 
-extern runlist_element *ntfs_find_vcn(ntfs_inode *ni, const VCN vcn,
-		const BOOL need_write);
+extern runlist_element *ntfs_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
+		const BOOL write_locked);
 
 int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
 		const u32 name_len, const IGNORE_CASE_BOOL ic,
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 5346596fa871..8db4492b139c 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -849,7 +849,8 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 	total_freed = real_freed = 0;
 
 	/* This returns with ni->runlist locked for reading on success. */
-	rl = ntfs_find_vcn(ni, start_vcn, FALSE);
+	down_read(&ni->runlist.lock);
+	rl = ntfs_find_vcn_nolock(ni, start_vcn, FALSE);
 	if (IS_ERR(rl)) {
 		if (!is_rollback)
 			ntfs_error(vol->sb, "Failed to find first runlist "
@@ -863,7 +864,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 			ntfs_error(vol->sb, "First runlist element has "
 					"invalid lcn, aborting.");
 		err = -EIO;
-		goto unl_err_out;
+		goto err_out;
 	}
 	/* Find the starting cluster inside the run that needs freeing. */
 	delta = start_vcn - rl->vcn;
@@ -881,7 +882,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 			if (!is_rollback)
 				ntfs_error(vol->sb, "Failed to clear first run "
 						"(error %i), aborting.", err);
-			goto unl_err_out;
+			goto err_out;
 		}
 		/* We have freed @to_free real clusters. */
 		real_freed = to_free;
@@ -901,30 +902,15 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 		if (unlikely(rl->lcn < LCN_HOLE)) {
 			VCN vcn;
 
-			/*
-			 * Attempt to map runlist, dropping runlist lock for
-			 * the duration.
-			 */
+			/* Attempt to map runlist. */
 			vcn = rl->vcn;
-			up_read(&ni->runlist.lock);
-			err = ntfs_map_runlist(ni, vcn);
-			if (err) {
-				if (!is_rollback)
-					ntfs_error(vol->sb, "Failed to map "
-							"runlist fragment.");
-				if (err == -EINVAL || err == -ENOENT)
-					err = -EIO;
-				goto err_out;
-			}
-			/*
-			 * This returns with ni->runlist locked for reading on
-			 * success.
-			 */
-			rl = ntfs_find_vcn(ni, vcn, FALSE);
+			rl = ntfs_find_vcn_nolock(ni, vcn, FALSE);
 			if (IS_ERR(rl)) {
 				err = PTR_ERR(rl);
 				if (!is_rollback)
-					ntfs_error(vol->sb, "Failed to find "
+					ntfs_error(vol->sb, "Failed to map "
+							"runlist fragment or "
+							"failed to find "
 							"subsequent runlist "
 							"element.");
 				goto err_out;
@@ -937,7 +923,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 							(unsigned long long)
 							rl->lcn);
 				err = -EIO;
-				goto unl_err_out;
+				goto err_out;
 			}
 		}
 		/* The number of clusters in this run that need freeing. */
@@ -953,7 +939,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 				if (!is_rollback)
 					ntfs_error(vol->sb, "Failed to clear "
 							"subsequent run.");
-				goto unl_err_out;
+				goto err_out;
 			}
 			/* We have freed @to_free real clusters. */
 			real_freed += to_free;
@@ -974,9 +960,8 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 	/* We are done.  Return the number of actually freed clusters. */
 	ntfs_debug("Done.");
 	return real_freed;
-unl_err_out:
-	up_read(&ni->runlist.lock);
 err_out:
+	up_read(&ni->runlist.lock);
 	if (is_rollback)
 		return err;
 	/* If no real clusters were freed, no need to rollback. */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 4e0bf39426cf..0975d738834c 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1,7 +1,7 @@
 /**
  * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2001-2005 Anton Altaparmakov
  * Copyright (c) 2002 Richard Russon
  *
  * This program/include file is free software; you can redistribute it and/or
@@ -287,7 +287,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
 			}
 			unmap_mft_record(ni);
 			ntfs_error(base_ni->vol->sb, "Found stale extent mft "
-					"reference! Corrupt file system. "
+					"reference! Corrupt filesystem. "
 					"Run chkdsk.");
 			return ERR_PTR(-EIO);
 		}
@@ -318,7 +318,7 @@ map_err_out:
 	/* Verify the sequence number if it is present. */
 	if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
 		ntfs_error(base_ni->vol->sb, "Found stale extent mft "
-				"reference! Corrupt file system. Run chkdsk.");
+				"reference! Corrupt filesystem. Run chkdsk.");
 		destroy_ni = TRUE;
 		m = ERR_PTR(-EIO);
 		goto unm_err_out;
@@ -1292,19 +1292,20 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 	/*
 	 * Determine the last lcn of the mft bitmap.  The allocated size of the
 	 * mft bitmap cannot be zero so we are ok to do this.
-	 * ntfs_find_vcn() returns the runlist locked on success.
 	 */
+	down_write(&mftbmp_ni->runlist.lock);
 	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
 	ll = mftbmp_ni->allocated_size;
 	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	rl = ntfs_find_vcn(mftbmp_ni, (ll - 1) >> vol->cluster_size_bits, TRUE);
+	rl = ntfs_find_vcn_nolock(mftbmp_ni,
+			(ll - 1) >> vol->cluster_size_bits, TRUE);
 	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+		up_write(&mftbmp_ni->runlist.lock);
 		ntfs_error(vol->sb, "Failed to determine last allocated "
 				"cluster of mft bitmap attribute.");
-		if (!IS_ERR(rl)) {
-			up_write(&mftbmp_ni->runlist.lock);
+		if (!IS_ERR(rl))
 			ret = -EIO;
-		} else
+		else
 			ret = PTR_ERR(rl);
 		return ret;
 	}
@@ -1428,6 +1429,8 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 		// TODO: Deal with this by moving this extent to a new mft
 		// record or by starting a new extent in a new mft record or by
 		// moving other attributes out of this mft record.
+		// Note: It will need to be a special mft record and if none of
+		// those are available it gets rather complicated...
 		ntfs_error(vol->sb, "Not enough space in this mft record to "
 				"accomodate extended mft bitmap attribute "
 				"extent.  Cannot handle this yet.");
@@ -1719,19 +1722,20 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	 * Determine the preferred allocation location, i.e. the last lcn of
 	 * the mft data attribute.  The allocated size of the mft data
 	 * attribute cannot be zero so we are ok to do this.
-	 * ntfs_find_vcn() returns the runlist locked on success.
 	 */
+	down_write(&mft_ni->runlist.lock);
 	read_lock_irqsave(&mft_ni->size_lock, flags);
 	ll = mft_ni->allocated_size;
 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	rl = ntfs_find_vcn(mft_ni, (ll - 1) >> vol->cluster_size_bits, TRUE);
+	rl = ntfs_find_vcn_nolock(mft_ni, (ll - 1) >> vol->cluster_size_bits,
+			TRUE);
 	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
+		up_write(&mft_ni->runlist.lock);
 		ntfs_error(vol->sb, "Failed to determine last allocated "
 				"cluster of mft data attribute.");
-		if (!IS_ERR(rl)) {
-			up_write(&mft_ni->runlist.lock);
+		if (!IS_ERR(rl))
 			ret = -EIO;
-		} else
+		else
 			ret = PTR_ERR(rl);
 		return ret;
 	}
@@ -1858,7 +1862,11 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 		// moving other attributes out of this mft record.
 		// Note: Use the special reserved mft records and ensure that
 		// this extent is not required to find the mft record in
-		// question.
+		// question.  If no free special records left we would need to
+		// move an existing record away, insert ours in its place, and
+		// then place the moved record into the newly allocated space
+		// and we would then need to update all references to this mft
+		// record appropriately.  This is rather complicated...
 		ntfs_error(vol->sb, "Not enough space in this mft record to "
 				"accomodate extended mft data attribute "
 				"extent.  Cannot handle this yet.");
@@ -2021,7 +2029,7 @@ static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
 				"reports this as corruption, please email "
 				"linux-ntfs-dev@lists.sourceforge.net stating "
 				"that you saw this message and that the "
-				"modified file system created was corrupt.  "
+				"modified filesystem created was corrupt.  "
 				"Thank you.");
 	}
 	/* Set the update sequence number to 1. */
-- 
cgit v1.2.3


From 8907547d4b099e67762ea4891c127ea1f6dd1cb7 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rddunlap@osdl.org>
Date: Thu, 3 Mar 2005 11:19:53 +0000
Subject: NTFS: Fix printk format warnings on ia64. (Randy Dunlap)

Signed-off-by: Randy Dunlap <rddunlap@osdl.org>
Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog  |  1 +
 fs/ntfs/aops.c     |  3 ++-
 fs/ntfs/debug.c    | 15 +++++++++------
 fs/ntfs/inode.c    | 12 ++++++------
 fs/ntfs/lcnalloc.c |  2 +-
 fs/ntfs/mft.c      |  2 +-
 6 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'fs/ntfs/mft.c')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index ad2c4e88f053..bfd86c5cc480 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -29,6 +29,7 @@ ToDo/Notes:
 	  compiled without debug.  This avoids a possible denial of service
 	  attack.  Thanks to Carl-Daniel Hailfinger from SuSE for pointing this
 	  out.
+	- Fix compilation warnings on ia64.  (Randy Dunlap)
 	- Use i_size_read() in fs/ntfs/attrib.c::ntfs_attr_set().
 	- Use i_size_read() in fs/ntfs/logfile.c::ntfs_{check,empty}_logfile().
 	- Use i_size_read() once and then use the cached value in
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 2b4b8b9e8796..2a7cba258cca 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -965,7 +965,8 @@ lock_retry_remap:
 						"attribute type 0x%x) because "
 						"its location on disk could "
 						"not be determined (error "
-						"code %lli).", (s64)block <<
+						"code %lli).",
+						(long long)block <<
 						bh_size_bits >>
 						vol->mft_record_size_bits,
 						ni->mft_no, ni->type,
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index 6fb6bb5e3723..807150e2c2b9 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -164,14 +164,17 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
 			if (index > -LCN_ENOENT - 1)
 				index = 3;
 			printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n",
-					(rl + i)->vcn, lcn_str[index],
-					(rl + i)->length, (rl + i)->length ?
-					"" : " (runlist end)");
+					(long long)(rl + i)->vcn, lcn_str[index],
+					(long long)(rl + i)->length,
+					(rl + i)->length ? "" :
+						" (runlist end)");
 		} else
 			printk(KERN_DEBUG "%-16Lx %-16Lx  %-16Lx%s\n",
-					(rl + i)->vcn, (rl + i)->lcn,
-					(rl + i)->length, (rl + i)->length ?
-					"" : " (runlist end)");
+					(long long)(rl + i)->vcn,
+					(long long)(rl + i)->lcn,
+					(long long)(rl + i)->length,
+					(rl + i)->length ? "" :
+						" (runlist end)");
 		if (!(rl + i)->length)
 			break;
 	}
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index cfca17384115..7ae647c640bb 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2518,18 +2518,18 @@ int ntfs_write_inode(struct inode *vi, int sync)
 	nt = utc2ntfs(vi->i_mtime);
 	if (si->last_data_change_time != nt) {
 		ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
-				"new = 0x%llx", vi->i_ino,
+				"new = 0x%llx", vi->i_ino, (long long)
 				sle64_to_cpu(si->last_data_change_time),
-				sle64_to_cpu(nt));
+				(long long)sle64_to_cpu(nt));
 		si->last_data_change_time = nt;
 		modified = TRUE;
 	}
 	nt = utc2ntfs(vi->i_ctime);
 	if (si->last_mft_change_time != nt) {
 		ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
-				"new = 0x%llx", vi->i_ino,
+				"new = 0x%llx", vi->i_ino, (long long)
 				sle64_to_cpu(si->last_mft_change_time),
-				sle64_to_cpu(nt));
+				(long long)sle64_to_cpu(nt));
 		si->last_mft_change_time = nt;
 		modified = TRUE;
 	}
@@ -2537,8 +2537,8 @@ int ntfs_write_inode(struct inode *vi, int sync)
 	if (si->last_access_time != nt) {
 		ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
 				"new = 0x%llx", vi->i_ino,
-				sle64_to_cpu(si->last_access_time),
-				sle64_to_cpu(nt));
+				(long long)sle64_to_cpu(si->last_access_time),
+				(long long)sle64_to_cpu(nt));
 		si->last_access_time = nt;
 		modified = TRUE;
 	}
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 8db4492b139c..f2b7f8582c59 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -763,7 +763,7 @@ out:
 					"could allocate up to 0x%llx "
 					"clusters.",
 					(unsigned long long)rl[0].lcn,
-					(unsigned long long)count - clusters);
+					(unsigned long long)(count - clusters));
 		/* Deallocate all allocated clusters. */
 		ntfs_debug("Attempting rollback...");
 		err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 0975d738834c..66ef6e275a48 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1802,7 +1802,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 		return PTR_ERR(rl);
 	}
 	mft_ni->runlist.rl = rl;
-	ntfs_debug("Allocated %lli clusters.", nr);
+	ntfs_debug("Allocated %lli clusters.", (long long)nr);
 	/* Find the last run in the new runlist. */
 	for (; rl[1].length; rl++)
 		;
-- 
cgit v1.2.3


From c0c1cc0e46b36347f11b566f99087dc5e6fc1b89 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Mon, 7 Mar 2005 21:43:38 +0000
Subject: NTFS: - Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where
 after         dropping the read lock and taking the write lock we were not
 checking         whether someone else did not already do the work we wanted
 to do.       - Rename ntfs_find_vcn_nolock() to ntfs_attr_find_vcn_nolock(). 
      - Tidy up some comments in fs/ntfs/runlist.c.       - Add LCN_ENOMEM and
 LCN_EIO definitions to fs/ntfs/runlist.h.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog  |  5 +++++
 fs/ntfs/attrib.c   | 29 +++++++++++++++++------------
 fs/ntfs/attrib.h   |  4 ++--
 fs/ntfs/lcnalloc.c |  4 ++--
 fs/ntfs/mft.c      |  6 +++---
 fs/ntfs/runlist.c  | 15 ++++++++-------
 fs/ntfs/runlist.h  |  2 ++
 7 files changed, 39 insertions(+), 26 deletions(-)

(limited to 'fs/ntfs/mft.c')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 4af6ae6ff12b..7507a56a4921 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -92,6 +92,11 @@ ToDo/Notes:
 	  non-resident in fs/ntfs/attrib.c::ntfs_attr_can_be_non_resident().
 	- Add fs/ntfs/attrib.c::ntfs_attr_vcn_to_lcn_nolock() used by the new
 	  write code.
+	- Fix bug in fs/ntfs/attrib.c::ntfs_find_vcn_nolock() where after
+	  dropping the read lock and taking the write lock we were not checking
+	  whether someone else did not already do the work we wanted to do.
+	- Rename fs/ntfs/attrib.c::ntfs_find_vcn_nolock() to
+	  ntfs_attr_find_vcn_nolock() and update all callers.
 
 2.1.22 - Many bug and race fixes and error handling improvements.
 
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 1610f1cd2862..6de5e04e97a2 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -193,19 +193,19 @@ retry_remap:
 }
 
 /**
- * ntfs_find_vcn_nolock - find a vcn in the runlist described by an ntfs inode
+ * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
  * @ni:			ntfs inode describing the runlist to search
  * @vcn:		vcn to find
  * @write_locked:	true if the runlist is locked for writing
  *
  * Find the virtual cluster number @vcn in the runlist described by the ntfs
  * inode @ni and return the address of the runlist element containing the @vcn.
- * The runlist is left locked and the caller has to unlock it.  In the error
- * case, the runlist is left in the same locking state as on entry.
  *
- * Note if @write_locked is FALSE the lock may be dropped inside the function
- * so you cannot rely on the runlist still being the same when this function
- * returns.
+ * If the @vcn is not mapped yet, the attempt is made to map the attribute
+ * extent containing the @vcn and the vcn to lcn conversion is retried.
+ *
+ * If @write_locked is true the caller has locked the runlist for writing and
+ * if false for reading.
  *
  * Note you need to distinguish between the lcn of the returned runlist element
  * being >= 0 and LCN_HOLE.  In the later case you have to return zeroes on
@@ -221,13 +221,12 @@ retry_remap:
  *	-ENOMEM - Not enough memory to map runlist.
  *	-EIO	- Critical error (runlist/file is corrupt, i/o error, etc).
  *
- * Locking: - The runlist must be unlocked on entry.
- *	    - On failing return, the runlist is unlocked.
- *	    - On successful return, the runlist is locked.  If @need_write us
- *	      true, it is locked for writing.  Otherwise is is locked for
- *	      reading.
+ * Locking: - The runlist must be locked on entry and is left locked on return.
+ *	    - If @write_locked is FALSE, i.e. the runlist is locked for reading,
+ *	      the lock may be dropped inside the function so you cannot rely on
+ *	      the runlist still being the same when this function returns.
  */
-runlist_element *ntfs_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
+runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
 		const BOOL write_locked)
 {
 	runlist_element *rl;
@@ -268,6 +267,12 @@ retry_remap:
 		if (!write_locked) {
 			up_read(&ni->runlist.lock);
 			down_write(&ni->runlist.lock);
+			if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) !=
+					LCN_RL_NOT_MAPPED)) {
+				up_write(&ni->runlist.lock);
+				down_read(&ni->runlist.lock);
+				goto retry_remap;
+			}
 		}
 		err = ntfs_map_runlist_nolock(ni, vcn);
 		if (!write_locked) {
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
index 75041c89c738..e0a50f1ca76f 100644
--- a/fs/ntfs/attrib.h
+++ b/fs/ntfs/attrib.h
@@ -66,8 +66,8 @@ extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
 extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
 		const BOOL write_locked);
 
-extern runlist_element *ntfs_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
-		const BOOL write_locked);
+extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
+		const VCN vcn, const BOOL write_locked);
 
 int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
 		const u32 name_len, const IGNORE_CASE_BOOL ic,
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index f2b7f8582c59..6e584ab743c5 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -850,7 +850,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 
 	/* This returns with ni->runlist locked for reading on success. */
 	down_read(&ni->runlist.lock);
-	rl = ntfs_find_vcn_nolock(ni, start_vcn, FALSE);
+	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, FALSE);
 	if (IS_ERR(rl)) {
 		if (!is_rollback)
 			ntfs_error(vol->sb, "Failed to find first runlist "
@@ -904,7 +904,7 @@ s64 __ntfs_cluster_free(struct inode *vi, const VCN start_vcn, s64 count,
 
 			/* Attempt to map runlist. */
 			vcn = rl->vcn;
-			rl = ntfs_find_vcn_nolock(ni, vcn, FALSE);
+			rl = ntfs_attr_find_vcn_nolock(ni, vcn, FALSE);
 			if (IS_ERR(rl)) {
 				err = PTR_ERR(rl);
 				if (!is_rollback)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 66ef6e275a48..61ce09f1b652 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1297,7 +1297,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
 	ll = mftbmp_ni->allocated_size;
 	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	rl = ntfs_find_vcn_nolock(mftbmp_ni,
+	rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
 			(ll - 1) >> vol->cluster_size_bits, TRUE);
 	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
 		up_write(&mftbmp_ni->runlist.lock);
@@ -1727,8 +1727,8 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	read_lock_irqsave(&mft_ni->size_lock, flags);
 	ll = mft_ni->allocated_size;
 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	rl = ntfs_find_vcn_nolock(mft_ni, (ll - 1) >> vol->cluster_size_bits,
-			TRUE);
+	rl = ntfs_attr_find_vcn_nolock(mft_ni,
+			(ll - 1) >> vol->cluster_size_bits, TRUE);
 	if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) {
 		up_write(&mft_ni->runlist.lock);
 		ntfs_error(vol->sb, "Failed to determine last allocated "
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 1b344dd4d407..3f479f176610 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -933,17 +933,18 @@ err_out:
  *
  * It is up to the caller to serialize access to the runlist @rl.
  *
- * Since lcns must be >= 0, we use negative return values with special meaning:
+ * Since lcns must be >= 0, we use negative return codes with special meaning:
  *
- * Return value			Meaning / Description
+ * Return code		Meaning / Description
  * ==================================================
- *  -1 = LCN_HOLE		Hole / not allocated on disk.
- *  -2 = LCN_RL_NOT_MAPPED	This is part of the runlist which has not been
- *				inserted into the runlist yet.
- *  -3 = LCN_ENOENT		There is no such vcn in the attribute.
+ *  LCN_HOLE		Hole / not allocated on disk.
+ *  LCN_RL_NOT_MAPPED	This is part of the runlist which has not been
+ *			inserted into the runlist yet.
+ *  LCN_ENOENT		There is no such vcn in the attribute.
  *
  * Locking: - The caller must have locked the runlist (for reading or writing).
- *	    - This function does not touch the lock.
+ *	    - This function does not touch the lock, nor does it modify the
+ *	      runlist.
  */
 LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
 {
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
index 7107fde59df9..60c42f3a3fc4 100644
--- a/fs/ntfs/runlist.h
+++ b/fs/ntfs/runlist.h
@@ -66,6 +66,8 @@ typedef enum {
 	LCN_HOLE		= -1,	/* Keep this as highest value or die! */
 	LCN_RL_NOT_MAPPED	= -2,
 	LCN_ENOENT		= -3,
+	LCN_ENOMEM		= -4,
+	LCN_EIO			= -5,
 } LCN_SPECIAL_VALUES;
 
 extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
-- 
cgit v1.2.3


From fa3be92317c4ae34edcf5274e8bbeff181e20b7a Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Sat, 25 Jun 2005 17:15:36 +0100
Subject: NTFS: Add an extra parameter @last_vcn to
 ntfs_get_size_for_mapping_pairs()       and ntfs_mapping_pairs_build() to
 allow the runlist encoding to be       partial which is desirable when
 filling holes in sparse attributes.       Update all callers.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 fs/ntfs/ChangeLog |   4 ++
 fs/ntfs/attrib.c  |   4 +-
 fs/ntfs/mft.c     |  12 ++--
 fs/ntfs/runlist.c | 167 ++++++++++++++++++++++++++++++++++++++----------------
 fs/ntfs/runlist.h |   5 +-
 5 files changed, 132 insertions(+), 60 deletions(-)

(limited to 'fs/ntfs/mft.c')

diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index aff749db314c..67994c9c248f 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -135,6 +135,10 @@ ToDo/Notes:
 	- Change the runlist terminator of the newly allocated cluster(s) to
 	  LCN_ENOENT in ntfs_attr_make_non_resident().  Otherwise the runlist
 	  code gets confused.
+	- Add an extra parameter @last_vcn to ntfs_get_size_for_mapping_pairs()
+	  and ntfs_mapping_pairs_build() to allow the runlist encoding to be
+	  partial which is desirable when filling holes in sparse attributes.
+	  Update all callers.
 
 2.1.22 - Many bug and race fixes and error handling improvements.
 
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 34ea405b883d..c6b2bb64d651 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1317,7 +1317,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
 		page = NULL;
 	}
 	/* Determine the size of the mapping pairs array. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0);
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1);
 	if (unlikely(mp_size < 0)) {
 		err = mp_size;
 		ntfs_debug("Failed to get size for mapping pairs array, error "
@@ -1416,7 +1416,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni)
 			cpu_to_sle64(attr_size);
 	/* Generate the mapping pairs array into the attribute record. */
 	err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
-			arec_size - mp_ofs, rl, 0, NULL);
+			arec_size - mp_ofs, rl, 0, -1, NULL);
 	if (unlikely(err)) {
 		ntfs_debug("Failed to build mapping pairs, error code %i.",
 				err);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 61ce09f1b652..3d0ba8e60adc 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -1407,7 +1407,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 	BUG_ON(ll < rl2->vcn);
 	BUG_ON(ll >= rl2->vcn + rl2->length);
 	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll);
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
 	if (unlikely(mp_size <= 0)) {
 		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
 				"mft bitmap attribute extent.");
@@ -1441,7 +1441,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 	/* Generate the mapping pairs array directly into the attr record. */
 	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
 			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, NULL);
+			mp_size, rl2, ll, -1, NULL);
 	if (unlikely(ret)) {
 		ntfs_error(vol->sb, "Failed to build mapping pairs array for "
 				"mft bitmap attribute.");
@@ -1529,7 +1529,7 @@ undo_alloc:
 				a->data.non_resident.mapping_pairs_offset),
 				old_alen - le16_to_cpu(
 				a->data.non_resident.mapping_pairs_offset),
-				rl2, ll, NULL)) {
+				rl2, ll, -1, NULL)) {
 			ntfs_error(vol->sb, "Failed to restore mapping pairs "
 					"array.%s", es);
 			NVolSetErrors(vol);
@@ -1838,7 +1838,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	BUG_ON(ll < rl2->vcn);
 	BUG_ON(ll >= rl2->vcn + rl2->length);
 	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll);
+	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
 	if (unlikely(mp_size <= 0)) {
 		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
 				"mft data attribute extent.");
@@ -1877,7 +1877,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
 	/* Generate the mapping pairs array directly into the attr record. */
 	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
 			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, NULL);
+			mp_size, rl2, ll, -1, NULL);
 	if (unlikely(ret)) {
 		ntfs_error(vol->sb, "Failed to build mapping pairs array of "
 				"mft data attribute.");
@@ -1959,7 +1959,7 @@ undo_alloc:
 				a->data.non_resident.mapping_pairs_offset),
 				old_alen - le16_to_cpu(
 				a->data.non_resident.mapping_pairs_offset),
-				rl2, ll, NULL)) {
+				rl2, ll, -1, NULL)) {
 			ntfs_error(vol->sb, "Failed to restore mapping pairs "
 					"array.%s", es);
 			NVolSetErrors(vol);
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 396d767c2cab..758855b0414e 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -1048,10 +1048,17 @@ static inline int ntfs_get_nr_significant_bytes(const s64 n)
  * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
  * @vol:	ntfs volume (needed for the ntfs version)
  * @rl:		locked runlist to determine the size of the mapping pairs of
- * @start_vcn:	vcn at which to start the mapping pairs array
+ * @first_vcn:	first vcn which to include in the mapping pairs array
+ * @last_vcn:	last vcn which to include in the mapping pairs array
  *
  * Walk the locked runlist @rl and calculate the size in bytes of the mapping
- * pairs array corresponding to the runlist @rl, starting at vcn @start_vcn.
+ * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and
+ * finishing with vcn @last_vcn.
+ *
+ * A @last_vcn of -1 means end of runlist and in that case the size of the
+ * mapping pairs array corresponding to the runlist starting at vcn @first_vcn
+ * and finishing at the end of the runlist is determined.
+ *
  * This for example allows us to allocate a buffer of the right size when
  * building the mapping pairs array.
  *
@@ -1067,34 +1074,50 @@ static inline int ntfs_get_nr_significant_bytes(const s64 n)
  *	    remains locked throughout, and is left locked upon return.
  */
 int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
-		const runlist_element *rl, const VCN start_vcn)
+		const runlist_element *rl, const VCN first_vcn,
+		const VCN last_vcn)
 {
 	LCN prev_lcn;
 	int rls;
+	BOOL the_end = FALSE;
 
-	BUG_ON(start_vcn < 0);
+	BUG_ON(first_vcn < 0);
+	BUG_ON(last_vcn < -1);
+	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
 	if (!rl) {
-		BUG_ON(start_vcn);
+		BUG_ON(first_vcn);
+		BUG_ON(last_vcn > 0);
 		return 1;
 	}
-	/* Skip to runlist element containing @start_vcn. */
-	while (rl->length && start_vcn >= rl[1].vcn)
+	/* Skip to runlist element containing @first_vcn. */
+	while (rl->length && first_vcn >= rl[1].vcn)
 		rl++;
-	if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn)
+	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
+			first_vcn < rl->vcn))
 		return -EINVAL;
 	prev_lcn = 0;
 	/* Always need the termining zero byte. */
 	rls = 1;
 	/* Do the first partial run if present. */
-	if (start_vcn > rl->vcn) {
-		s64 delta;
+	if (first_vcn > rl->vcn) {
+		s64 delta, length = rl->length;
 
 		/* We know rl->length != 0 already. */
-		if (rl->length < 0 || rl->lcn < LCN_HOLE)
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
 			goto err_out;
-		delta = start_vcn - rl->vcn;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = TRUE;
+		}
+		delta = first_vcn - rl->vcn;
 		/* Header byte + length. */
-		rls += 1 + ntfs_get_nr_significant_bytes(rl->length - delta);
+		rls += 1 + ntfs_get_nr_significant_bytes(length - delta);
 		/*
 		 * If the logical cluster number (lcn) denotes a hole and we
 		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
@@ -1102,9 +1125,9 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
 		 * Note: this assumes that on NTFS 1.2-, holes are stored with
 		 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
 		 */
-		if (rl->lcn >= 0 || vol->major_ver < 3) {
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
 			prev_lcn = rl->lcn;
-			if (rl->lcn >= 0)
+			if (likely(rl->lcn >= 0))
 				prev_lcn += delta;
 			/* Change in lcn. */
 			rls += ntfs_get_nr_significant_bytes(prev_lcn);
@@ -1113,11 +1136,23 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
 		rl++;
 	}
 	/* Do the full runs. */
-	for (; rl->length; rl++) {
-		if (rl->length < 0 || rl->lcn < LCN_HOLE)
+	for (; rl->length && !the_end; rl++) {
+		s64 length = rl->length;
+
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
 			goto err_out;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = TRUE;
+		}
 		/* Header byte + length. */
-		rls += 1 + ntfs_get_nr_significant_bytes(rl->length);
+		rls += 1 + ntfs_get_nr_significant_bytes(length);
 		/*
 		 * If the logical cluster number (lcn) denotes a hole and we
 		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
@@ -1125,7 +1160,7 @@ int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
 		 * Note: this assumes that on NTFS 1.2-, holes are stored with
 		 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
 		 */
-		if (rl->lcn >= 0 || vol->major_ver < 3) {
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
 			/* Change in lcn. */
 			rls += ntfs_get_nr_significant_bytes(rl->lcn -
 					prev_lcn);
@@ -1168,7 +1203,7 @@ static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
 
 	i = 0;
 	do {
-		if (dst > dst_max)
+		if (unlikely(dst > dst_max))
 			goto err_out;
 		*dst++ = l & 0xffll;
 		l >>= 8;
@@ -1177,12 +1212,12 @@ static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
 	j = (n >> 8 * (i - 1)) & 0xff;
 	/* If the sign bit is wrong, we need an extra byte. */
 	if (n < 0 && j >= 0) {
-		if (dst > dst_max)
+		if (unlikely(dst > dst_max))
 			goto err_out;
 		i++;
 		*dst = (s8)-1;
 	} else if (n > 0 && j < 0) {
-		if (dst > dst_max)
+		if (unlikely(dst > dst_max))
 			goto err_out;
 		i++;
 		*dst = (s8)0;
@@ -1198,13 +1233,18 @@ err_out:
  * @dst:	destination buffer to which to write the mapping pairs array
  * @dst_len:	size of destination buffer @dst in bytes
  * @rl:		locked runlist for which to build the mapping pairs array
- * @start_vcn:	vcn at which to start the mapping pairs array
+ * @first_vcn:	first vcn which to include in the mapping pairs array
+ * @last_vcn:	last vcn which to include in the mapping pairs array
  * @stop_vcn:	first vcn outside destination buffer on success or -ENOSPC
  *
  * Create the mapping pairs array from the locked runlist @rl, starting at vcn
- * @start_vcn and save the array in @dst.  @dst_len is the size of @dst in
- * bytes and it should be at least equal to the value obtained by calling
- * ntfs_get_size_for_mapping_pairs().
+ * @first_vcn and finishing with vcn @last_vcn and save the array in @dst.
+ * @dst_len is the size of @dst in bytes and it should be at least equal to the
+ * value obtained by calling ntfs_get_size_for_mapping_pairs().
+ *
+ * A @last_vcn of -1 means end of runlist and in that case the mapping pairs
+ * array corresponding to the runlist starting at vcn @first_vcn and finishing
+ * at the end of the runlist is created.
  *
  * If @rl is NULL, just write a single terminator byte to @dst.
  *
@@ -1213,7 +1253,7 @@ err_out:
  * been filled with all the mapping pairs that will fit, thus it can be treated
  * as partial success, in that a new attribute extent needs to be created or
  * the next extent has to be used and the mapping pairs build has to be
- * continued with @start_vcn set to *@stop_vcn.
+ * continued with @first_vcn set to *@stop_vcn.
  *
  * Return 0 on success and -errno on error.  The following error codes are
  * defined:
@@ -1227,27 +1267,32 @@ err_out:
  */
 int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 		const int dst_len, const runlist_element *rl,
-		const VCN start_vcn, VCN *const stop_vcn)
+		const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn)
 {
 	LCN prev_lcn;
 	s8 *dst_max, *dst_next;
 	int err = -ENOSPC;
+	BOOL the_end = FALSE;
 	s8 len_len, lcn_len;
 
-	BUG_ON(start_vcn < 0);
+	BUG_ON(first_vcn < 0);
+	BUG_ON(last_vcn < -1);
+	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
 	BUG_ON(dst_len < 1);
 	if (!rl) {
-		BUG_ON(start_vcn);
+		BUG_ON(first_vcn);
+		BUG_ON(last_vcn > 0);
 		if (stop_vcn)
 			*stop_vcn = 0;
 		/* Terminator byte. */
 		*dst = 0;
 		return 0;
 	}
-	/* Skip to runlist element containing @start_vcn. */
-	while (rl->length && start_vcn >= rl[1].vcn)
+	/* Skip to runlist element containing @first_vcn. */
+	while (rl->length && first_vcn >= rl[1].vcn)
 		rl++;
-	if ((!rl->length && start_vcn > rl->vcn) || start_vcn < rl->vcn)
+	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
+			first_vcn < rl->vcn))
 		return -EINVAL;
 	/*
 	 * @dst_max is used for bounds checking in
@@ -1256,17 +1301,27 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 	dst_max = dst + dst_len - 1;
 	prev_lcn = 0;
 	/* Do the first partial run if present. */
-	if (start_vcn > rl->vcn) {
-		s64 delta;
+	if (first_vcn > rl->vcn) {
+		s64 delta, length = rl->length;
 
 		/* We know rl->length != 0 already. */
-		if (rl->length < 0 || rl->lcn < LCN_HOLE)
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
 			goto err_out;
-		delta = start_vcn - rl->vcn;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = TRUE;
+		}
+		delta = first_vcn - rl->vcn;
 		/* Write length. */
 		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
-				rl->length - delta);
-		if (len_len < 0)
+				length - delta);
+		if (unlikely(len_len < 0))
 			goto size_err;
 		/*
 		 * If the logical cluster number (lcn) denotes a hole and we
@@ -1277,19 +1332,19 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 		 * case on NT4. - We assume that we just need to write the lcn
 		 * change until someone tells us otherwise... (AIA)
 		 */
-		if (rl->lcn >= 0 || vol->major_ver < 3) {
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
 			prev_lcn = rl->lcn;
-			if (rl->lcn >= 0)
+			if (likely(rl->lcn >= 0))
 				prev_lcn += delta;
 			/* Write change in lcn. */
 			lcn_len = ntfs_write_significant_bytes(dst + 1 +
 					len_len, dst_max, prev_lcn);
-			if (lcn_len < 0)
+			if (unlikely(lcn_len < 0))
 				goto size_err;
 		} else
 			lcn_len = 0;
 		dst_next = dst + len_len + lcn_len + 1;
-		if (dst_next > dst_max)
+		if (unlikely(dst_next > dst_max))
 			goto size_err;
 		/* Update header byte. */
 		*dst = lcn_len << 4 | len_len;
@@ -1299,13 +1354,25 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 		rl++;
 	}
 	/* Do the full runs. */
-	for (; rl->length; rl++) {
-		if (rl->length < 0 || rl->lcn < LCN_HOLE)
+	for (; rl->length && !the_end; rl++) {
+		s64 length = rl->length;
+
+		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
 			goto err_out;
+		/*
+		 * If @stop_vcn is given and finishes inside this run, cap the
+		 * run length.
+		 */
+		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+			s64 s1 = last_vcn + 1;
+			if (unlikely(rl[1].vcn > s1))
+				length = s1 - rl->vcn;
+			the_end = TRUE;
+		}
 		/* Write length. */
 		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
-				rl->length);
-		if (len_len < 0)
+				length);
+		if (unlikely(len_len < 0))
 			goto size_err;
 		/*
 		 * If the logical cluster number (lcn) denotes a hole and we
@@ -1316,17 +1383,17 @@ int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 		 * case on NT4. - We assume that we just need to write the lcn
 		 * change until someone tells us otherwise... (AIA)
 		 */
-		if (rl->lcn >= 0 || vol->major_ver < 3) {
+		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
 			/* Write change in lcn. */
 			lcn_len = ntfs_write_significant_bytes(dst + 1 +
 					len_len, dst_max, rl->lcn - prev_lcn);
-			if (lcn_len < 0)
+			if (unlikely(lcn_len < 0))
 				goto size_err;
 			prev_lcn = rl->lcn;
 		} else
 			lcn_len = 0;
 		dst_next = dst + len_len + lcn_len + 1;
-		if (dst_next > dst_max)
+		if (unlikely(dst_next > dst_max))
 			goto size_err;
 		/* Update header byte. */
 		*dst = lcn_len << 4 | len_len;
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
index cf5c1b44bea8..aa0ee6540e7c 100644
--- a/fs/ntfs/runlist.h
+++ b/fs/ntfs/runlist.h
@@ -84,11 +84,12 @@ extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl,
 		const VCN vcn);
 
 extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
-		const runlist_element *rl, const VCN start_vcn);
+		const runlist_element *rl, const VCN first_vcn,
+		const VCN last_vcn);
 
 extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
 		const int dst_len, const runlist_element *rl,
-		const VCN start_vcn, VCN *const stop_vcn);
+		const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn);
 
 extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
 		runlist *const runlist, const s64 new_length);
-- 
cgit v1.2.3


From ba6d2377c85c9b8a793f455d8c9b6cf31985d70f Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cantab.net>
Date: Sun, 26 Jun 2005 22:12:02 +0100
Subject: NTFS: Fix a nasty deadlock that appeared in recent kernels.       The
 situation: VFS inode X on a mounted ntfs volume is dirty.  For       same
 inode X, the ntfs_inode is dirty and thus corresponding on-disk       inode,
 i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging       to the
 table of inodes, i.e. $MFT, inode 0.       What happens:       Process 1:
 sys_sync()/umount()/whatever...  calls       __sync_single_inode() for $MFT
 -> do_writepages() -> write_page for       the dirty page containing the
 on-disk inode X, the page is now locked       -> ntfs_write_mst_block() which
 clears PageUptodate() on the page to       prevent anyone else getting hold
 of it whilst it does the write out.       This is necessary as the on-disk
 inode needs "fixups" applied before       the write to disk which are removed
 again after the write and       PageUptodate is then set again.  It then
 analyses the page looking       for dirty on-disk inodes and when it finds
 one it calls       ntfs_may_write_mft_record() to see if it is safe to write
 this       on-disk inode.  This then calls ilookup5() to check if the      
 corresponding VFS inode is in icache().  This in turn calls ifind()      
 which waits on the inode lock via wait_on_inode whilst holding the      
 global inode_lock.       Process 2: pdflush results in a call to
 __sync_single_inode for the       same VFS inode X on the ntfs volume.  This
 locks the inode (I_LOCK)       then calls write-inode -> ntfs_write_inode ->
 map_mft_record() ->       read_cache_page() for the page (in page cache of
 table of inodes       $MFT, inode 0) containing the on-disk inode.  This page
 has       PageUptodate() clear because of Process 1 (see above) so      
 read_cache_page() blocks when it tries to take the page lock for the      
 page so it can call ntfs_read_page().       Thus Process 1 is holding the
 page lock on the page containing the       on-disk inode X and it is waiting
 on the inode X to be unlocked in       ifind() so it can write the page out
 and then unlock the page.       And Process 2 is holding the inode lock on
 inode X and is waiting for       the page to be unlocked so it can call
 ntfs_readpage() or discover       that Process 1 set PageUptodate() again and
 use the page.       Thus we have a deadlock due to ifind() waiting on the
 inode lock.       The solution: The fix is to use the newly introduced      
 ilookup5_nowait() which does not wait on the inode's lock and hence      
 avoids the deadlock.  This is safe as we do not care about the VFS      
 inode and only use the fact that it is in the VFS inode cache and the      
 fact that the vfs and ntfs inodes are one struct in memory to find       the
 ntfs inode in memory if present.  Also, the ntfs inode has its       own
 locking so it does not matter if the vfs inode is locked.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
---
 Documentation/filesystems/ntfs.txt |  5 ++++-
 fs/ntfs/ChangeLog                  | 42 ++++++++++++++++++++++++++++++++++++++
 fs/ntfs/mft.c                      | 29 +++++++++++++++++---------
 3 files changed, 65 insertions(+), 11 deletions(-)

(limited to 'fs/ntfs/mft.c')

diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index 1415b96ed491..eef4aca0c753 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -451,9 +451,12 @@ Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
 	- Implement extension of resident files using the normal file write
 	  code paths, i.e. most very small files can be extended to be a little
 	  bit bigger but not by much.
+	- Add new mount option "disable_sparse".  (See list of mount options
+	  above for details.)
 	- Improve handling of ntfs volumes with errors and strange boot sectors
 	  in particular.
-	- Fix various bugs.
+	- Fix various bugs including a nasty deadlock that appeared in recent
+	  kernels (around 2.6.11-2.6.12 timeframe).
 2.1.22:
 	- Improve handling of ntfs volumes with errors.
 	- Fix various bugs and race conditions.
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 3d2cac4061d6..9709fac6531d 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -132,6 +132,48 @@ ToDo/Notes:
 	  the already mapped runlist fragment which causes
 	  ntfs_mapping_pairs_decompress() to fail and return error.  Update
 	  ntfs_attr_find_vcn_nolock() accordingly.
+	- Fix a nasty deadlock that appeared in recent kernels.
+	  The situation: VFS inode X on a mounted ntfs volume is dirty.  For
+	  same inode X, the ntfs_inode is dirty and thus corresponding on-disk
+	  inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
+	  to the table of inodes, i.e. $MFT, inode 0.
+	  What happens:
+	  Process 1: sys_sync()/umount()/whatever...  calls
+	  __sync_single_inode() for $MFT -> do_writepages() -> write_page for
+	  the dirty page containing the on-disk inode X, the page is now locked
+	  -> ntfs_write_mst_block() which clears PageUptodate() on the page to
+	  prevent anyone else getting hold of it whilst it does the write out.
+	  This is necessary as the on-disk inode needs "fixups" applied before
+	  the write to disk which are removed again after the write and
+	  PageUptodate is then set again.  It then analyses the page looking
+	  for dirty on-disk inodes and when it finds one it calls
+	  ntfs_may_write_mft_record() to see if it is safe to write this
+	  on-disk inode.  This then calls ilookup5() to check if the
+	  corresponding VFS inode is in icache().  This in turn calls ifind()
+	  which waits on the inode lock via wait_on_inode whilst holding the
+	  global inode_lock.
+	  Process 2: pdflush results in a call to __sync_single_inode for the
+	  same VFS inode X on the ntfs volume.  This locks the inode (I_LOCK)
+	  then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
+	  read_cache_page() for the page (in page cache of table of inodes
+	  $MFT, inode 0) containing the on-disk inode.  This page has
+	  PageUptodate() clear because of Process 1 (see above) so
+	  read_cache_page() blocks when it tries to take the page lock for the
+	  page so it can call ntfs_read_page().
+	  Thus Process 1 is holding the page lock on the page containing the
+	  on-disk inode X and it is waiting on the inode X to be unlocked in
+	  ifind() so it can write the page out and then unlock the page.
+	  And Process 2 is holding the inode lock on inode X and is waiting for
+	  the page to be unlocked so it can call ntfs_readpage() or discover
+	  that Process 1 set PageUptodate() again and use the page.
+	  Thus we have a deadlock due to ifind() waiting on the inode lock.
+	  The solution: The fix is to use the newly introduced
+	  ilookup5_nowait() which does not wait on the inode's lock and hence
+	  avoids the deadlock.  This is safe as we do not care about the VFS
+	  inode and only use the fact that it is in the VFS inode cache and the
+	  fact that the vfs and ntfs inodes are one struct in memory to find
+	  the ntfs inode in memory if present.  Also, the ntfs inode has its
+	  own locking so it does not matter if the vfs inode is locked.
 
 2.1.22 - Many bug and race fixes and error handling improvements.
 
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 3d0ba8e60adc..ac9ff39aa834 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -948,20 +948,23 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 	na.name_len = 0;
 	na.type = AT_UNUSED;
 	/*
-	 * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from here or
-	 * we deadlock because the inode is already locked by the kernel
-	 * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits
-	 * until the inode is unlocked before returning it and it never gets
-	 * unlocked because ntfs_should_write_mft_record() never returns.  )-:
-	 * Fortunately, we have inode 0 pinned in icache for the duration of
-	 * the mount so we can access it directly.
+	 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
+	 * we get here for it rather often.
 	 */
 	if (!mft_no) {
 		/* Balance the below iput(). */
 		vi = igrab(mft_vi);
 		BUG_ON(vi != mft_vi);
-	} else
-		vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na);
+	} else {
+		/*
+		 * Have to use ilookup5_nowait() since ilookup5() waits for the
+		 * inode lock which causes ntfs to deadlock when a concurrent
+		 * inode write via the inode dirty code paths and the page
+		 * dirty code path of the inode dirty code path when writing
+		 * $MFT occurs.
+		 */
+		vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
+	}
 	if (vi) {
 		ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
 		/* The inode is in icache. */
@@ -1016,7 +1019,13 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
 	na.mft_no = MREF_LE(m->base_mft_record);
 	ntfs_debug("Mft record 0x%lx is an extent record.  Looking for base "
 			"inode 0x%lx in icache.", mft_no, na.mft_no);
-	vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, &na);
+	if (!na.mft_no) {
+		/* Balance the below iput(). */
+		vi = igrab(mft_vi);
+		BUG_ON(vi != mft_vi);
+	} else
+		vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode,
+				&na);
 	if (!vi) {
 		/*
 		 * The base inode is not in icache, write this extent mft
-- 
cgit v1.2.3