summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig5
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/9p/vfs_inode_dotl.c11
-rw-r--r--fs/Kconfig19
-rw-r--r--fs/affs/namei.c5
-rw-r--r--fs/afs/dir.c5
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/bfs/dir.c3
-rw-r--r--fs/block_dev.c17
-rw-r--r--fs/btrfs/extent_io.c9
-rw-r--r--fs/btrfs/super.c2
-rw-r--r--fs/buffer.c64
-rw-r--r--fs/ceph/addr.c5
-rw-r--r--fs/ceph/caps.c61
-rw-r--r--fs/ceph/dir.c7
-rw-r--r--fs/ceph/export.c25
-rw-r--r--fs/ceph/mds_client.c7
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/coda/dir.c5
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/dcache.c8
-rw-r--r--fs/drop_caches.c5
-rw-r--r--fs/ecryptfs/inode.c5
-rw-r--r--fs/exec.c12
-rw-r--r--fs/ext3/super.c2
-rw-r--r--fs/ext4/Makefile3
-rw-r--r--fs/ext4/balloc.c146
-rw-r--r--fs/ext4/ext4.h127
-rw-r--r--fs/ext4/ext4_jbd2.c14
-rw-r--r--fs/ext4/ext4_jbd2.h5
-rw-r--r--fs/ext4/extents.c1410
-rw-r--r--fs/ext4/file.c1
-rw-r--r--fs/ext4/fsync.c25
-rw-r--r--fs/ext4/inode.c114
-rw-r--r--fs/ext4/mballoc.c459
-rw-r--r--fs/ext4/mballoc.h6
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/mmp.c351
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c82
-rw-r--r--fs/ext4/page-io.c39
-rw-r--r--fs/ext4/super.c206
-rw-r--r--fs/ext4/xattr.c4
-rw-r--r--fs/fat/namei_msdos.c5
-rw-r--r--fs/fat/namei_vfat.c5
-rw-r--r--fs/fscache/operation.c10
-rw-r--r--fs/fscache/page.c13
-rw-r--r--fs/fuse/dir.c6
-rw-r--r--fs/gfs2/glock.c5
-rw-r--r--fs/gfs2/quota.c12
-rw-r--r--fs/gfs2/quota.h4
-rw-r--r--fs/hfs/dir.c6
-rw-r--r--fs/hfsplus/dir.c8
-rw-r--r--fs/hostfs/hostfs_kern.c5
-rw-r--r--fs/hpfs/namei.c9
-rw-r--r--fs/hugetlbfs/inode.c7
-rw-r--r--fs/inode.c9
-rw-r--r--fs/jbd2/commit.c22
-rw-r--r--fs/jbd2/journal.c58
-rw-r--r--fs/jbd2/transaction.c22
-rw-r--r--fs/jffs2/dir.c5
-rw-r--r--fs/jfs/namei.c5
-rw-r--r--fs/logfs/dir.c5
-rw-r--r--fs/mbcache.c10
-rw-r--r--fs/minix/namei.c5
-rw-r--r--fs/mpage.c7
-rw-r--r--fs/namei.c380
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/ncpfs/dir.c5
-rw-r--r--fs/ncpfs/inode.c4
-rw-r--r--fs/nfs/dir.c5
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nilfs2/namei.c5
-rw-r--r--fs/ocfs2/super.c2
-rw-r--r--fs/omfs/dir.c11
-rw-r--r--fs/partitions/check.c8
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/base.c20
-rw-r--r--fs/proc/generic.c1
-rw-r--r--fs/proc/inode.c7
-rw-r--r--fs/proc/internal.h26
-rw-r--r--fs/proc/namespaces.c198
-rw-r--r--fs/proc/task_mmu.c206
-rw-r--r--fs/quota/dquot.c5
-rw-r--r--fs/reiserfs/namei.c5
-rw-r--r--fs/reiserfs/xattr.c1
-rw-r--r--fs/splice.c33
-rw-r--r--fs/super.c3
-rw-r--r--fs/sysv/namei.c5
-rw-r--r--fs/ubifs/dir.c5
-rw-r--r--fs/udf/namei.c5
-rw-r--r--fs/ufs/namei.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.c29
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c18
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c5
-rw-r--r--fs/xfs/quota/xfs_qm.c6
-rw-r--r--fs/xfs/xfs_ag.h3
-rw-r--r--fs/xfs/xfs_alloc.c35
-rw-r--r--fs/xfs/xfs_alloc.h5
-rw-r--r--fs/xfs/xfs_alloc_btree.c3
-rw-r--r--fs/xfs/xfs_bmap.c549
-rw-r--r--fs/xfs/xfs_bmap.h2
-rw-r--r--fs/xfs/xfs_inode.c15
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_log_cil.c13
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_trans.c2
109 files changed, 3339 insertions, 1835 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 814ac4e213a8..0a93dc1cb4ac 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -1,6 +1,6 @@
config 9P_FS
- tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
- depends on INET && NET_9P && EXPERIMENTAL
+ tristate "Plan 9 Resource Sharing Support (9P2000)"
+ depends on INET && NET_9P
help
If you say Y here, you will get experimental support for
Plan 9 resource sharing via the 9P2000 protocol.
@@ -10,7 +10,6 @@ config 9P_FS
If unsure, say N.
if 9P_FS
-
config 9P_FSCACHE
bool "Enable 9P client caching support (EXPERIMENTAL)"
depends on EXPERIMENTAL
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7f6c67703195..8d7f3e69ae29 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,6 +814,7 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
{
+ dentry_unhash(d);
return v9fs_remove(i, d, 1);
}
@@ -839,6 +840,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct p9_fid *newdirfid;
struct p9_wstat wstat;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
P9_DPRINTK(P9_DEBUG_VFS, "\n");
retval = 0;
old_inode = old_dentry->d_inode;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 82a7c38ddad0..691c78f58bef 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -259,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
if (IS_ERR(inode_fid)) {
err = PTR_ERR(inode_fid);
mutex_unlock(&v9inode->v_mutex);
- goto error;
+ goto err_clunk_old_fid;
}
v9inode->writeback_fid = (void *) inode_fid;
}
@@ -267,8 +267,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
/* Since we are opening a file, assign the open fid to the file */
filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
if (IS_ERR(filp)) {
- p9_client_clunk(ofid);
- return PTR_ERR(filp);
+ err = PTR_ERR(filp);
+ goto err_clunk_old_fid;
}
filp->private_data = ofid;
#ifdef CONFIG_9P_FSCACHE
@@ -278,10 +278,11 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
return 0;
error:
- if (ofid)
- p9_client_clunk(ofid);
if (fid)
p9_client_clunk(fid);
+err_clunk_old_fid:
+ if (ofid)
+ p9_client_clunk(ofid);
return err;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index f3aa9b08b228..19891aab9c6e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
def_bool n
config EXPORTFS
- bool
+ tristate
config FILE_LOCKING
bool "Enable POSIX file locking API" if EXPERT
@@ -124,6 +124,7 @@ config TMPFS
config TMPFS_POSIX_ACL
bool "Tmpfs POSIX Access Control Lists"
depends on TMPFS
+ select TMPFS_XATTR
select GENERIC_ACL
help
POSIX Access Control Lists (ACLs) support permissions for users and
@@ -134,6 +135,22 @@ config TMPFS_POSIX_ACL
If you don't know what Access Control Lists are, say N.
+config TMPFS_XATTR
+ bool "Tmpfs extended attributes"
+ depends on TMPFS
+ default n
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ Currently this enables support for the trusted.* and
+ security.* namespaces.
+
+ You need this for POSIX ACL support on tmpfs.
+
+ If unsure, say N.
+
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc1fdd8..03330e2e390c 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,6 +320,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
dentry->d_inode->i_ino,
(int)dentry->d_name.len, dentry->d_name.name);
+ dentry_unhash(dentry);
+
return affs_remove_header(dentry);
}
@@ -417,6 +419,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh = NULL;
int retval;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
(u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
(u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 20c106f24927..2c4e05160042 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,6 +845,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
_enter("{%x:%u},{%s}",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
+ dentry_unhash(dentry);
+
ret = -ENAMETOOLONG;
if (dentry->d_name.len >= AFSNAMEMAX)
goto error;
@@ -1146,6 +1148,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct key *key;
int ret;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
vnode = AFS_FS_I(old_dentry->d_inode);
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f55ae23b137e..87d95a8cddbc 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,6 +583,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EACCES;
+ dentry_unhash(dentry);
+
if (atomic_dec_and_test(&ino->count)) {
p_ino = autofs4_dentry_ino(dentry->d_parent);
if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebfd9047..c7d1d06b0483 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,6 +224,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct bfs_sb_info *info;
int error = -ENOENT;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
old_bh = new_bh = NULL;
old_inode = old_dentry->d_inode;
if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bf9c7a720371..1f2b19978333 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1238,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
res = __blkdev_get(bdev, mode, 0);
if (whole) {
+ struct gendisk *disk = whole->bd_disk;
+
/* finish claiming */
mutex_lock(&bdev->bd_mutex);
spin_lock(&bdev_lock);
@@ -1264,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
spin_unlock(&bdev_lock);
/*
- * Block event polling for write claims. Any write
- * holder makes the write_holder state stick until all
- * are released. This is good enough and tracking
- * individual writeable reference is too fragile given
- * the way @mode is used in blkdev_get/put().
+ * Block event polling for write claims if requested. Any
+ * write holder makes the write_holder state stick until
+ * all are released. This is good enough and tracking
+ * individual writeable reference is too fragile given the
+ * way @mode is used in blkdev_get/put().
*/
- if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+ if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
+ !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
bdev->bd_write_holder = true;
- disk_block_events(bdev->bd_disk);
+ disk_block_events(disk);
}
mutex_unlock(&bdev->bd_mutex);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96fcfa522dab..4f9893243dae 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -11,6 +11,7 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
+#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
#include "compat.h"
@@ -2016,6 +2017,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
set_page_extent_mapped(page);
+ if (!PageUptodate(page)) {
+ if (cleancache_get_page(page) == 0) {
+ BUG_ON(blocksize != PAGE_SIZE);
+ goto out;
+ }
+ }
+
end = page_end;
while (1) {
lock_extent(tree, start, end, GFP_NOFS);
@@ -2149,6 +2157,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
cur = cur + iosize;
page_offset += iosize;
}
+out:
if (!nr) {
if (!PageError(page))
SetPageUptodate(page);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0ac712efcdf2..be4ffa12f3ef 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -39,6 +39,7 @@
#include <linux/miscdevice.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include <linux/cleancache.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
@@ -624,6 +625,7 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_root = root_dentry;
save_mount_options(sb, data);
+ cleancache_init_fs(sb);
return 0;
fail_close:
diff --git a/fs/buffer.c b/fs/buffer.c
index a08bb8e61c6f..698c6b2cc462 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -41,6 +41,7 @@
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
+#include <linux/cleancache.h>
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
@@ -269,6 +270,10 @@ void invalidate_bdev(struct block_device *bdev)
invalidate_bh_lrus();
lru_add_drain_all(); /* make sure all lru add caches are flushed */
invalidate_mapping_pages(mapping, 0, -1);
+ /* 99% of the time, we don't need to flush the cleancache on the bdev.
+ * But, for the strange corners, lets be cautious
+ */
+ cleancache_flush_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);
@@ -2331,24 +2336,26 @@ EXPORT_SYMBOL(block_commit_write);
* page lock we can determine safely if the page is beyond EOF. If it is not
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
+ *
+ * Direct callers of this function should call vfs_check_frozen() so that page
+ * fault does not busyloop until the fs is thawed.
*/
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
{
struct page *page = vmf->page;
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
unsigned long end;
loff_t size;
- int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+ int ret;
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
(page_offset(page) > size)) {
- /* page got truncated out from underneath us */
- unlock_page(page);
- goto out;
+ /* We overload EFAULT to mean page got truncated */
+ ret = -EFAULT;
+ goto out_unlock;
}
/* page is wholly or partially inside EOF */
@@ -2361,18 +2368,41 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
if (!ret)
ret = block_commit_write(page, 0, end);
- if (unlikely(ret)) {
- unlock_page(page);
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else /* -ENOSPC, -EIO, etc */
- ret = VM_FAULT_SIGBUS;
- } else
- ret = VM_FAULT_LOCKED;
-
-out:
+ if (unlikely(ret < 0))
+ goto out_unlock;
+ /*
+ * Freezing in progress? We check after the page is marked dirty and
+ * with page lock held so if the test here fails, we are sure freezing
+ * code will wait during syncing until the page fault is done - at that
+ * point page will be dirty and unlocked so freezing code will write it
+ * and writeprotect it again.
+ */
+ set_page_dirty(page);
+ if (inode->i_sb->s_frozen != SB_UNFROZEN) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ return 0;
+out_unlock:
+ unlock_page(page);
return ret;
}
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
+{
+ int ret;
+ struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+
+ /*
+ * This check is racy but catches the common case. The check in
+ * __block_page_mkwrite() is reliable.
+ */
+ vfs_check_frozen(sb, SB_FREEZE_WRITE);
+ ret = __block_page_mkwrite(vma, vmf, get_block);
+ return block_page_mkwrite_return(ret);
+}
EXPORT_SYMBOL(block_page_mkwrite);
/*
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 38b8ab554924..33da49dc3cc6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -848,7 +848,8 @@ get_more_pages:
op->payload_len = cpu_to_le32(len);
req->r_request->hdr.data_len = cpu_to_le32(len);
- ceph_osdc_start_request(&fsc->client->osdc, req, true);
+ rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
+ BUG_ON(rc);
req = NULL;
/* continue? */
@@ -880,8 +881,6 @@ release_pvec_pages:
out:
if (req)
ceph_osdc_put_request(req);
- if (rc > 0)
- rc = 0; /* vfs expects us to return 0 */
ceph_put_snap_context(snapc);
dout("writepages done, rc = %d\n", rc);
return rc;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2a5404c1c42f..1f72b00447c4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -569,7 +569,8 @@ retry:
list_add_tail(&cap->session_caps, &session->s_caps);
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
- }
+ } else if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
if (!ci->i_snap_realm) {
/*
@@ -2634,6 +2635,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
struct ceph_mds_session *session,
int *open_target_sessions)
{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
unsigned mseq = le32_to_cpu(ex->migrate_seq);
@@ -2670,6 +2672,19 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
* export targets, so that we get the matching IMPORT
*/
*open_target_sessions = 1;
+
+ /*
+ * we can't flush dirty caps that we've seen the
+ * EXPORT but no IMPORT for
+ */
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&ci->i_dirty_item)) {
+ dout(" moving %p to cap_dirty_migrating\n",
+ inode);
+ list_move(&ci->i_dirty_item,
+ &mdsc->cap_dirty_migrating);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
}
__ceph_remove_cap(cap);
}
@@ -2707,6 +2722,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
ci->i_cap_exporting_issued = 0;
ci->i_cap_exporting_mseq = 0;
ci->i_cap_exporting_mds = -1;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&ci->i_dirty_item)) {
+ dout(" moving %p back to cap_dirty\n", inode);
+ list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
} else {
dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
inode, ci, mds, mseq);
@@ -2910,38 +2932,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
*/
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
- struct ceph_inode_info *ci, *nci = NULL;
- struct inode *inode, *ninode = NULL;
- struct list_head *p, *n;
+ struct ceph_inode_info *ci;
+ struct inode *inode;
dout("flush_dirty_caps\n");
spin_lock(&mdsc->cap_dirty_lock);
- list_for_each_safe(p, n, &mdsc->cap_dirty) {
- if (nci) {
- ci = nci;
- inode = ninode;
- ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
- dout("flush_dirty_caps inode %p (was next inode)\n",
- inode);
- } else {
- ci = list_entry(p, struct ceph_inode_info,
- i_dirty_item);
- inode = igrab(&ci->vfs_inode);
- BUG_ON(!inode);
- dout("flush_dirty_caps inode %p\n", inode);
- }
- if (n != &mdsc->cap_dirty) {
- nci = list_entry(n, struct ceph_inode_info,
- i_dirty_item);
- ninode = igrab(&nci->vfs_inode);
- BUG_ON(!ninode);
- nci->i_ceph_flags |= CEPH_I_NOFLUSH;
- dout("flush_dirty_caps next inode %p, noflush\n",
- ninode);
- } else {
- nci = NULL;
- ninode = NULL;
- }
+ while (!list_empty(&mdsc->cap_dirty)) {
+ ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
+ i_dirty_item);
+ inode = igrab(&ci->vfs_inode);
+ dout("flush_dirty_caps %p\n", inode);
spin_unlock(&mdsc->cap_dirty_lock);
if (inode) {
ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
@@ -2951,6 +2951,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
spin_lock(&mdsc->cap_dirty_lock);
}
spin_unlock(&mdsc->cap_dirty_lock);
+ dout("flush_dirty_caps done\n");
}
/*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1a867a3601ae..33729e822bb9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -360,7 +360,7 @@ more:
rinfo = &fi->last_readdir->r_reply_info;
dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
rinfo->dir_nr, off, fi->offset);
- while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
+ while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
u64 pos = ceph_make_fpos(frag, off);
struct ceph_mds_reply_inode *in =
rinfo->dir_in[off - fi->offset].in;
@@ -1066,16 +1066,17 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
int left;
+ const int bufsize = 1024;
if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
return -EISDIR;
if (!cf->dir_info) {
- cf->dir_info = kmalloc(1024, GFP_NOFS);
+ cf->dir_info = kmalloc(bufsize, GFP_NOFS);
if (!cf->dir_info)
return -ENOMEM;
cf->dir_info_len =
- sprintf(cf->dir_info,
+ snprintf(cf->dir_info, bufsize,
"entries: %20lld\n"
" files: %20lld\n"
" subdirs: %20lld\n"
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e41056174bf8..a610d3d67488 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -86,6 +86,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
static struct dentry *__fh_to_dentry(struct super_block *sb,
struct ceph_nfs_fh *fh)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
struct dentry *dentry;
struct ceph_vino vino;
@@ -95,8 +96,24 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
vino.ino = fh->ino;
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
- if (!inode)
- return ERR_PTR(-ESTALE);
+ if (!inode) {
+ struct ceph_mds_request *req;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+
+ req->r_ino1 = vino;
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ igrab(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ESTALE);
+ }
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
@@ -148,8 +165,10 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ igrab(inode);
ceph_mdsc_put_request(req);
- inode = ceph_find_inode(sb, vino);
if (!inode)
return ERR_PTR(err ? err : -ESTALE);
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d0fae4ce9ba5..79743d146be6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -578,6 +578,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
if (dir) {
struct ceph_inode_info *ci = ceph_inode(dir);
+ ihold(dir);
spin_lock(&ci->i_unsafe_lock);
req->r_unsafe_dir = dir;
list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
@@ -598,6 +599,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
+
+ iput(req->r_unsafe_dir);
+ req->r_unsafe_dir = NULL;
}
ceph_mdsc_put_request(req);
@@ -2691,7 +2695,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
{
struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
- struct ceph_inode_info *ci;
struct dentry *parent, *dentry;
struct ceph_dentry_info *di;
int mds = session->s_mds;
@@ -2728,7 +2731,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
dout("handle_lease no inode %llx\n", vino.ino);
goto release;
}
- ci = ceph_inode(inode);
/* dentry */
parent = d_find_alias(inode);
@@ -3002,6 +3004,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->cap_flush_seq = 0;
INIT_LIST_HEAD(&mdsc->cap_dirty);
+ INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
mdsc->num_cap_flushing = 0;
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4e3a9cc0bba6..7d8a0d662d56 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -278,6 +278,7 @@ struct ceph_mds_client {
u64 cap_flush_seq;
struct list_head cap_dirty; /* inodes with dirty caps */
+ struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
spinlock_t cap_dirty_lock; /* protects above items */
wait_queue_head_t cap_flushing_wq;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2b8dae4d121e..a46126fd5735 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,6 +336,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
int len = de->d_name.len;
int error;
+ dentry_unhash(de);
+
error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
if (!error) {
/* VFS may delete the child */
@@ -359,6 +361,9 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
int new_length = new_dentry->d_name.len;
int error;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
coda_i2f(new_dir), old_length, new_length,
(const char *) old_name, (const char *)new_name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9a37a9b6de3a..9d17d350abc5 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,6 +1359,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
struct module *subsys_owner = NULL, *dead_item_owner = NULL;
int ret;
+ dentry_unhash(dentry);
+
if (dentry->d_parent == configfs_sb->s_root)
return -EPERM;
diff --git a/fs/dcache.c b/fs/dcache.c
index 18b2a1f10ed8..37f72ee5bf7c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1220,7 +1220,7 @@ void shrink_dcache_parent(struct dentry * parent)
EXPORT_SYMBOL(shrink_dcache_parent);
/*
- * Scan `nr' dentries and return the number which remain.
+ * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain.
*
* We need to avoid reentering the filesystem if the caller is performing a
* GFP_NOFS allocation attempt. One example deadlock is:
@@ -1231,8 +1231,12 @@ EXPORT_SYMBOL(shrink_dcache_parent);
*
* In this case we return -1 to tell the caller that we baled.
*/
-static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink,
+ struct shrink_control *sc)
{
+ int nr = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
+
if (nr) {
if (!(gfp_mask & __GFP_FS))
return -1;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 98b77c89494c..c00e055b6282 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -40,9 +40,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
static void drop_slab(void)
{
int nr_objects;
+ struct shrink_control shrink = {
+ .gfp_mask = GFP_KERNEL,
+ };
do {
- nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+ nr_objects = shrink_slab(&shrink, 1000, 1000);
} while (nr_objects > 10);
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4d4cc6a90cd5..227b409b8406 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -521,6 +521,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
struct dentry *lower_dir_dentry;
int rc;
+ dentry_unhash(dentry);
+
lower_dentry = ecryptfs_dentry_to_lower(dentry);
dget(dentry);
lower_dir_dentry = lock_parent(lower_dentry);
@@ -571,6 +573,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *lower_new_dir_dentry;
struct dentry *trap = NULL;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
dget(lower_old_dentry);
diff --git a/fs/exec.c b/fs/exec.c
index c1cf372f17a7..936f5776655c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -200,7 +200,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
#ifdef CONFIG_STACK_GROWSUP
if (write) {
- ret = expand_stack_downwards(bprm->vma, pos);
+ ret = expand_downwards(bprm->vma, pos);
if (ret < 0)
return NULL;
}
@@ -600,7 +600,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
BUG_ON(new_start > new_end);
@@ -626,12 +626,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
return -ENOMEM;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
- free_pgd_range(tlb, new_end, old_end, new_end,
+ free_pgd_range(&tlb, new_end, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
} else {
/*
@@ -640,10 +640,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* have constraints on va-space that make this illegal (IA64) -
* for the others its just a little faster.
*/
- free_pgd_range(tlb, old_start, old_end, new_end,
+ free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
}
- tlb_finish_mmu(tlb, new_end, old_end);
+ tlb_finish_mmu(&tlb, new_end, old_end);
/*
* Shrink the vma to just the new range. Always succeeds.
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c6a9e0eadc1..aad153ef6b78 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -36,6 +36,7 @@
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/log2.h>
+#include <linux/cleancache.h>
#include <asm/uaccess.h>
@@ -1367,6 +1368,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
} else {
ext3_msg(sb, KERN_INFO, "using internal journal");
}
+ cleancache_init_fs(sb);
return res;
}
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36eda6c..04109460ba9e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+ mmp.o
ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c67139ad4b4..264f6949511e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
}
/**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle: handle to this transaction
- * @sb: super block
- * @block: start physcial block to add to the block group
- * @count: number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count)
-{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
- ext4_group_t block_group;
- ext4_grpblk_t bit;
- unsigned int i;
- struct ext4_group_desc *desc;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err = 0, ret, blk_free_count;
- ext4_grpblk_t blocks_freed;
- struct ext4_group_info *grp;
-
- ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
- ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- grp = ext4_get_group_info(sb, block_group);
- /*
- * Check to see if we are freeing blocks across a group
- * boundary.
- */
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- goto error_return;
- }
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh)
- goto error_return;
- desc = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!desc)
- goto error_return;
-
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
- ext4_error(sb, "Adding blocks in system zones - "
- "Block = %llu, count = %lu",
- block, count);
- goto error_return;
- }
-
- /*
- * We are about to add blocks to the bitmap,
- * so we need undo access.
- */
- BUFFER_TRACE(bitmap_bh, "getting undo access");
- err = ext4_journal_get_undo_access(handle, bitmap_bh);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
- if (err)
- goto error_return;
- /*
- * make sure we don't allow a parallel init on other groups in the
- * same buddy cache
- */
- down_write(&grp->alloc_sem);
- for (i = 0, blocks_freed = 0; i < count; i++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
- bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- blocks_freed++;
- }
- }
- ext4_lock_group(sb, block_group);
- blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
- ext4_free_blks_set(sb, desc, blk_free_count);
- desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
- ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(blocks_freed,
- &sbi->s_flex_groups[flex_group].free_blocks);
- }
- /*
- * request to reload the buddy with the
- * new bitmap information
- */
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- grp->bb_free += blocks_freed;
- up_write(&grp->alloc_sem);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
-error_return:
- brelse(bitmap_bh);
- ext4_std_error(sb, err);
- return;
-}
-
-/**
* ext4_has_free_blocks()
* @sbi: in-core super block structure.
* @nblocks: number of needed blocks
@@ -493,7 +369,8 @@ error_return:
* Check if filesystem has nblocks free & available for allocation.
* On success return 1, return 0 on failure.
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ s64 nblocks, unsigned int flags)
{
s64 free_blocks, dirty_blocks, root_blocks;
struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
EXT4_FREEBLOCKS_WATERMARK) {
free_blocks = percpu_counter_sum_positive(fbc);
dirty_blocks = percpu_counter_sum_positive(dbc);
- if (dirty_blocks < 0) {
- printk(KERN_CRIT "Dirty block accounting "
- "went wrong %lld\n",
- (long long)dirty_blocks);
- }
}
/* Check whether we have space after
* accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
/* Hm, nope. Are (enough) root reserved blocks available? */
if (sbi->s_resuid == current_fsuid() ||
((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
- capable(CAP_SYS_RESOURCE)) {
+ capable(CAP_SYS_RESOURCE) ||
+ (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
if (free_blocks >= (nblocks + dirty_blocks))
return 1;
}
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
}
int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
- s64 nblocks)
+ s64 nblocks, unsigned int flags)
{
- if (ext4_has_free_blocks(sbi, nblocks)) {
+ if (ext4_has_free_blocks(sbi, nblocks, flags)) {
percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
return 0;
} else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
(*retries)++ > 3 ||
!EXT4_SB(sb)->s_journal)
return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* error stores in errp pointer
*/
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp)
+ ext4_fsblk_t goal, unsigned int flags,
+ unsigned long *count, int *errp)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ar.inode = inode;
ar.goal = goal;
ar.len = count ? *count : 1;
+ ar.flags = flags;
ret = ext4_mb_new_blocks(handle, &ar, errp);
if (count)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b753f4..a74b89c09f90 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
#define EXT4_MB_DELALLOC_RESERVED 0x0400
/* We are doing stream allocation */
#define EXT4_MB_STREAM_ALLOC 0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
*/
#define EXT4_BAD_INO 1 /* Bad blocks inode */
#define EXT4_ROOT_INO 2 /* Root inode */
+#define EXT4_USR_QUOTA_INO 3 /* User quota inode */
+#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */
#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */
#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */
#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
+ /* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
/*
* Flags used by ext4_free_blocks
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
__le16 s_want_extra_isize; /* New inodes should reserve # bytes */
__le32 s_flags; /* Miscellaneous flags */
__le16 s_raid_stride; /* RAID stride */
- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
+ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
__u8 s_log_groups_per_flex; /* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
unsigned long s_ext_blocks;
unsigned long s_ext_extents;
#endif
+ /* ext4 extent cache stats */
+ unsigned long extent_cache_hits;
+ unsigned long extent_cache_misses;
/* for buddy allocator */
struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
struct ext4_li_request *s_li_request;
/* Wait multiplier for lazy initialization thread */
unsigned int s_li_wait_mult;
+
+ /* Kernel thread for multiple mount protection */
+ struct task_struct *s_mmp_tsk;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
+#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_RECOVER| \
+ EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG| \
EXT4_FEATURE_INCOMPAT_EXTENTS| \
EXT4_FEATURE_INCOMPAT_64BIT| \
- EXT4_FEATURE_INCOMPAT_FLEX_BG)
+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+ EXT4_FEATURE_INCOMPAT_MMP)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
*/
struct ext4_lazy_init {
unsigned long li_state;
-
- wait_queue_head_t li_wait_daemon;
- wait_queue_head_t li_wait_task;
- struct timer_list li_timer;
- struct task_struct *li_task;
-
struct list_head li_request_list;
struct mutex li_list_mtx;
};
@@ -1615,6 +1639,67 @@ struct ext4_features {
};
/*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+ __le32 mmp_magic; /* Magic number for MMP */
+ __le32 mmp_seq; /* Sequence no. updated periodically */
+
+ /*
+ * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+ * purposes and do not affect the correctness of the algorithm
+ */
+ __le64 mmp_time; /* Time last updated */
+ char mmp_nodename[64]; /* Node which last updated MMP block */
+ char mmp_bdevname[32]; /* Bdev which last updated MMP block */
+
+ /*
+ * mmp_check_interval is used to verify if the MMP block has been
+ * updated on the block device. The value is updated based on the
+ * maximum time to write the MMP block during an update cycle.
+ */
+ __le16 mmp_check_interval;
+
+ __le16 mmp_pad1;
+ __le32 mmp_pad2[227];
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+ struct buffer_head *bh; /* bh from initial read_mmp_block() */
+ struct super_block *sb; /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT 2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
+
+/*
* Function prototypes
*/
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count);
+ ext4_fsblk_t goal,
+ unsigned int flags,
+ unsigned long *count,
+ int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+ s64 nblocks, unsigned int flags);
extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
unsigned long count, int flags);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
/* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
__LINE__, ## message)
extern void ext4_msg(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+ const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
+ __LINE__, msg)
extern void __ext4_grp_locked_error(const char *, unsigned int, \
struct super_block *, ext4_group_t, \
unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+ loff_t length);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
int len,
struct writeback_control *wbc);
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
enum ext4_state_bits {
BH_Uninit /* blocks are allocated but uninitialized on disk */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96..f5240aa15601 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@
#include <trace/events/ext4.h>
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh)
-{
- int err = 0;
-
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_get_undo_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, line, __func__, bh,
- handle, err);
- }
- return err;
-}
-
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d0f53538a57f..bb85757689b6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn,
struct buffer_head *bh, handle_t *handle, int err);
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh);
-
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh);
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
int __ext4_handle_dirty_super(const char *where, unsigned int line,
handle_t *handle, struct super_block *sb);
-#define ext4_journal_get_undo_access(handle, bh) \
- __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4890d6f3ad15..5199bac7fc62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -46,6 +46,13 @@
#include <trace/events/ext4.h>
+static int ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag,
+ int flags);
+
static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode,
int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *ex, int *err)
+ struct ext4_extent *ex, int *err, unsigned int flags)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+ newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+ NULL, err);
return newblock;
}
@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
}
ext_debug("\n");
}
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+ ext4_fsblk_t newblock, int level)
+{
+ int depth = ext_depth(inode);
+ struct ext4_extent *ex;
+
+ if (depth != level) {
+ struct ext4_extent_idx *idx;
+ idx = path[level].p_idx;
+ while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+ ext_debug("%d: move %d:%llu in new index %llu\n", level,
+ le32_to_cpu(idx->ei_block),
+ ext4_idx_pblock(idx),
+ newblock);
+ idx++;
+ }
+
+ return;
+ }
+
+ ex = path[depth].p_ext;
+ while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+ ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_pblock(ex),
+ ext4_ext_is_uninitialized(ex),
+ ext4_ext_get_actual_len(ex),
+ newblock);
+ ex++;
+ }
+}
+
#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
#endif
void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
* - initializes subtree
*/
static int ext4_ext_split(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext, int at)
+ unsigned int flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int at)
{
struct buffer_head *bh = NULL;
int depth = ext_depth(inode);
struct ext4_extent_header *neh;
struct ext4_extent_idx *fidx;
- struct ext4_extent *ex;
int i = at, k, m, a;
ext4_fsblk_t newblock, oldblock;
__le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
newblock = ext4_ext_new_meta_block(handle, inode, path,
- newext, &err);
+ newext, &err, flags);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
- ex = EXT_FIRST_EXTENT(neh);
/* move remainder of path[depth] to the new leaf */
if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
goto cleanup;
}
/* start copy from next extent */
- /* TODO: we could do it by single memmove */
- m = 0;
- path[depth].p_ext++;
- while (path[depth].p_ext <=
- EXT_MAX_EXTENT(path[depth].p_hdr)) {
- ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
- le32_to_cpu(path[depth].p_ext->ee_block),
- ext4_ext_pblock(path[depth].p_ext),
- ext4_ext_is_uninitialized(path[depth].p_ext),
- ext4_ext_get_actual_len(path[depth].p_ext),
- newblock);
- /*memmove(ex++, path[depth].p_ext++,
- sizeof(struct ext4_extent));
- neh->eh_entries++;*/
- path[depth].p_ext++;
- m++;
- }
+ m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+ ext4_ext_show_move(inode, path, newblock, depth);
if (m) {
- memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+ struct ext4_extent *ex;
+ ex = EXT_FIRST_EXTENT(neh);
+ memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
le16_add_cpu(&neh->eh_entries, m);
}
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ext_debug("int.index at %d (block %llu): %u -> %llu\n",
i, newblock, le32_to_cpu(border), oldblock);
- /* copy indexes */
- m = 0;
- path[i].p_idx++;
- ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
- EXT_MAX_INDEX(path[i].p_hdr));
+ /* move remainder of path[i] to the new index block */
if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
EXT_LAST_INDEX(path[i].p_hdr))) {
EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
err = -EIO;
goto cleanup;
}
- while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
- ext_debug("%d: move %d:%llu in new index %llu\n", i,
- le32_to_cpu(path[i].p_idx->ei_block),
- ext4_idx_pblock(path[i].p_idx),
- newblock);
- /*memmove(++fidx, path[i].p_idx++,
- sizeof(struct ext4_extent_idx));
- neh->eh_entries++;
- BUG_ON(neh->eh_entries > neh->eh_max);*/
- path[i].p_idx++;
- m++;
- }
+ /* start copy indexes */
+ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+ ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+ EXT_MAX_INDEX(path[i].p_hdr));
+ ext4_ext_show_move(inode, path, newblock, i);
if (m) {
- memmove(++fidx, path[i].p_idx - m,
+ memmove(++fidx, path[i].p_idx,
sizeof(struct ext4_extent_idx) * m);
le16_add_cpu(&neh->eh_entries, m);
}
@@ -1056,8 +1073,9 @@ cleanup:
* just created block
*/
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ unsigned int flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
{
struct ext4_ext_path *curp = path;
struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err, flags);
if (newblock == 0)
return err;
@@ -1140,8 +1159,9 @@ out:
* if no free index is found, then it requests in-depth growing.
*/
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ unsigned int flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
{
struct ext4_ext_path *curp;
int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ repeat:
if (EXT_HAS_FREE_INDEX(curp)) {
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
- err = ext4_ext_split(handle, inode, path, newext, i);
+ err = ext4_ext_split(handle, inode, flags, path, newext, i);
if (err)
goto out;
@@ -1174,7 +1194,8 @@ repeat:
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, path, newext);
+ err = ext4_ext_grow_indepth(handle, inode, flags,
+ path, newext);
if (err)
goto out;
@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
* Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
* 1 if they got merged.
*/
-static int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge_right(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex)
{
@@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
}
/*
+ * This function tries to merge the @ex extent to neighbours in the tree.
+ * return 1 if merge left else 0.
+ */
+static int ext4_ext_try_to_merge(struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_extent *ex) {
+ struct ext4_extent_header *eh;
+ unsigned int depth;
+ int merge_done = 0;
+ int ret = 0;
+
+ depth = ext_depth(inode);
+ BUG_ON(path[depth].p_hdr == NULL);
+ eh = path[depth].p_hdr;
+
+ if (ex > EXT_FIRST_EXTENT(eh))
+ merge_done = ext4_ext_try_to_merge_right(inode, path, ex - 1);
+
+ if (!merge_done)
+ ret = ext4_ext_try_to_merge_right(inode, path, ex);
+
+ return ret;
+}
+
+/*
* check if a portion of the "newext" extent overlaps with an
* existing extent.
*
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
int depth, len, err;
ext4_lblk_t next;
unsigned uninitialized = 0;
+ int flags = 0;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ repeat:
* There is no free space in the found leaf.
* We're gonna add a new leaf in the tree.
*/
- err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+ if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
+ flags = EXT4_MB_USE_ROOT_BLOCKS;
+ err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
if (err)
goto cleanup;
depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
}
/*
+ * ext4_ext_in_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * cache extent pointer. If the cached extent is a hole,
+ * this routine should be used instead of
+ * ext4_ext_in_cache if the calling function needs to
+ * know the size of the hole.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex: Pointer where the cached extent will be stored
+ * if it contains block
+ *
* Return 0 if cache is invalid; 1 if the cache is valid
*/
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_extent *ex)
-{
+static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
+ struct ext4_ext_cache *ex){
struct ext4_ext_cache *cex;
+ struct ext4_sb_info *sbi;
int ret = 0;
/*
@@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
*/
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
cex = &EXT4_I(inode)->i_cached_extent;
+ sbi = EXT4_SB(inode->i_sb);
/* has cache valid data? */
if (cex->ec_len == 0)
goto errout;
if (in_range(block, cex->ec_block, cex->ec_len)) {
- ex->ee_block = cpu_to_le32(cex->ec_block);
- ext4_ext_store_pblock(ex, cex->ec_start);
- ex->ee_len = cpu_to_le16(cex->ec_len);
+ memcpy(ex, cex, sizeof(struct ext4_ext_cache));
ext_debug("%u cached by %u:%u:%llu\n",
block,
cex->ec_block, cex->ec_len, cex->ec_start);
ret = 1;
}
errout:
+ if (!ret)
+ sbi->extent_cache_misses++;
+ else
+ sbi->extent_cache_hits++;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return ret;
}
/*
+ * ext4_ext_in_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * extent pointer.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex: Pointer where the cached extent will be stored
+ * if it contains block
+ *
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+ struct ext4_extent *ex)
+{
+ struct ext4_ext_cache cex;
+ int ret = 0;
+
+ if (ext4_ext_check_cache(inode, block, &cex)) {
+ ex->ee_block = cpu_to_le32(cex.ec_block);
+ ext4_ext_store_pblock(ex, cex.ec_start);
+ ex->ee_len = cpu_to_le16(cex.ec_len);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+
+/*
* ext4_ext_rm_idx:
* removes index from the index block.
* It's used in truncate case only, thus all requests are for
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
ext4_free_blocks(handle, inode, NULL, start, num, flags);
} else if (from == le32_to_cpu(ex->ee_block)
&& to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
- printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
+ /* head removal */
+ ext4_lblk_t num;
+ ext4_fsblk_t start;
+
+ num = to - from;
+ start = ext4_ext_pblock(ex);
+
+ ext_debug("free first %u blocks starting %llu\n", num, start);
+ ext4_free_blocks(handle, inode, 0, start, num, flags);
+
} else {
printk(KERN_INFO "strange request: removal(2) "
"%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
return 0;
}
+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end", and splits the extents
+ * if "start" and "end" appear in the same extent
+ *
+ * @handle: The journal handle
+ * @inode: The files inode
+ * @path: The path to the leaf
+ * @start: The first block to remove
+ * @end: The last block to remove
+ */
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path, ext4_lblk_t start)
+ struct ext4_ext_path *path, ext4_lblk_t start,
+ ext4_lblk_t end)
{
int err = 0, correct_index = 0;
int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unsigned short ex_ee_len;
unsigned uninitialized = 0;
struct ext4_extent *ex;
+ struct ext4_map_blocks map;
/* the header must be checked already in ext4_ext_remove_space() */
ext_debug("truncate since %u in leaf\n", start);
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
path[depth].p_ext = ex;
a = ex_ee_block > start ? ex_ee_block : start;
- b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
- ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+ b = ex_ee_block+ex_ee_len - 1 < end ?
+ ex_ee_block+ex_ee_len - 1 : end;
ext_debug(" border %u:%u\n", a, b);
- if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
- block = 0;
- num = 0;
- BUG();
+ /* If this extent is beyond the end of the hole, skip it */
+ if (end <= ex_ee_block) {
+ ex--;
+ ex_ee_block = le32_to_cpu(ex->ee_block);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
+ continue;
+ } else if (a != ex_ee_block &&
+ b != ex_ee_block + ex_ee_len - 1) {
+ /*
+ * If this is a truncate, then this condition should
+ * never happen because at least one of the end points
+ * needs to be on the edge of the extent.
+ */
+ if (end == EXT_MAX_BLOCK) {
+ ext_debug(" bad truncate %u:%u\n",
+ start, end);
+ block = 0;
+ num = 0;
+ err = -EIO;
+ goto out;
+ }
+ /*
+ * else this is a hole punch, so the extent needs to
+ * be split since neither edge of the hole is on the
+ * extent edge
+ */
+ else{
+ map.m_pblk = ext4_ext_pblock(ex);
+ map.m_lblk = ex_ee_block;
+ map.m_len = b - ex_ee_block;
+
+ err = ext4_split_extent(handle,
+ inode, path, &map, 0,
+ EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+ EXT4_GET_BLOCKS_PRE_IO);
+
+ if (err < 0)
+ goto out;
+
+ ex_ee_len = ext4_ext_get_actual_len(ex);
+
+ b = ex_ee_block+ex_ee_len - 1 < end ?
+ ex_ee_block+ex_ee_len - 1 : end;
+
+ /* Then remove tail of this extent */
+ block = ex_ee_block;
+ num = a - block;
+ }
} else if (a != ex_ee_block) {
/* remove tail of the extent */
block = ex_ee_block;
num = a - block;
} else if (b != ex_ee_block + ex_ee_len - 1) {
/* remove head of the extent */
- block = a;
- num = b - a;
- /* there is no "make a hole" API yet */
- BUG();
+ block = b;
+ num = ex_ee_block + ex_ee_len - b;
+
+ /*
+ * If this is a truncate, this condition
+ * should never happen
+ */
+ if (end == EXT_MAX_BLOCK) {
+ ext_debug(" bad truncate %u:%u\n",
+ start, end);
+ err = -EIO;
+ goto out;
+ }
} else {
/* remove whole extent: excellent! */
block = ex_ee_block;
num = 0;
- BUG_ON(a != ex_ee_block);
- BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+ if (a != ex_ee_block) {
+ ext_debug(" bad truncate %u:%u\n",
+ start, end);
+ err = -EIO;
+ goto out;
+ }
+
+ if (b != ex_ee_block + ex_ee_len - 1) {
+ ext_debug(" bad truncate %u:%u\n",
+ start, end);
+ err = -EIO;
+ goto out;
+ }
}
/*
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (num == 0) {
/* this extent is removed; mark slot entirely unused */
ext4_ext_store_pblock(ex, 0);
- le16_add_cpu(&eh->eh_entries, -1);
+ } else if (block != ex_ee_block) {
+ /*
+ * If this was a head removal, then we need to update
+ * the physical block since it is now at a different
+ * location
+ */
+ ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
}
ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (err)
goto out;
+ /*
+ * If the extent was completely released,
+ * we need to remove it from the leaf
+ */
+ if (num == 0) {
+ if (end != EXT_MAX_BLOCK) {
+ /*
+ * For hole punching, we need to scoot all the
+ * extents up when an extent is removed so that
+ * we dont have blank extents in the middle
+ */
+ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+ sizeof(struct ext4_extent));
+
+ /* Now get rid of the one at the end */
+ memset(EXT_LAST_EXTENT(eh), 0,
+ sizeof(struct ext4_extent));
+ }
+ le16_add_cpu(&eh->eh_entries, -1);
+ }
+
ext_debug("new extent: %u:%u:%llu\n", block, num,
ext4_ext_pblock(ex));
ex--;
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
return 1;
}
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
{
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ again:
while (i >= 0 && err == 0) {
if (i == depth) {
/* this is leaf block */
- err = ext4_ext_rm_leaf(handle, inode, path, start);
+ err = ext4_ext_rm_leaf(handle, inode, path,
+ start, end);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
return ret;
}
+/*
+ * used by extent splitting.
+ */
+#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
+ due to ENOSPC */
+#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
+#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
+
+/*
+ * ext4_split_extent_at() splits an extent at given block.
+ *
+ * @handle: the journal handle
+ * @inode: the file inode
+ * @path: the path to the extent
+ * @split: the logical block where the extent is splitted.
+ * @split_flags: indicates if the extent could be zeroout if split fails, and
+ * the states(init or uninit) of new extents.
+ * @flags: flags used to insert new extent to extent tree.
+ *
+ *
+ * Splits extent [a, b] into two extents [a, @split) and [@split, b], states
+ * of which are deterimined by split_flag.
+ *
+ * There are two cases:
+ * a> the extent are splitted into two extent.
+ * b> split is not needed, and just mark the extent.
+ *
+ * return 0 on success.
+ */
+static int ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag,
+ int flags)
+{
+ ext4_fsblk_t newblock;
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex, newex, orig_ex;
+ struct ext4_extent *ex2 = NULL;
+ unsigned int ee_len, depth;
+ int err = 0;
+
+ ext_debug("ext4_split_extents_at: inode %lu, logical"
+ "block %llu\n", inode->i_ino, (unsigned long long)split);
+
+ ext4_ext_show_leaf(inode, path);
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ newblock = split - ee_block + ext4_ext_pblock(ex);
+
+ BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (split == ee_block) {
+ /*
+ * case b: block @split is the block that the extent begins with
+ * then we just change the state of the extent, and splitting
+ * is not needed.
+ */
+ if (split_flag & EXT4_EXT_MARK_UNINIT2)
+ ext4_ext_mark_uninitialized(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+
+ if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+ ext4_ext_try_to_merge(inode, path, ex);
+
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ goto out;
+ }
+
+ /* case a */
+ memcpy(&orig_ex, ex, sizeof(orig_ex));
+ ex->ee_len = cpu_to_le16(split - ee_block);
+ if (split_flag & EXT4_EXT_MARK_UNINIT1)
+ ext4_ext_mark_uninitialized(ex);
+
+ /*
+ * path may lead to new leaf, not to original leaf any more
+ * after ext4_ext_insert_extent() returns,
+ */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto fix_extent_len;
+
+ ex2 = &newex;
+ ex2->ee_block = cpu_to_le32(split);
+ ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
+ ext4_ext_store_pblock(ex2, newblock);
+ if (split_flag & EXT4_EXT_MARK_UNINIT2)
+ ext4_ext_mark_uninitialized(ex2);
+
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /* update the extent length and mark as initialized */
+ ex->ee_len = cpu_to_le32(ee_len);
+ ext4_ext_try_to_merge(inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ goto out;
+ } else if (err)
+ goto fix_extent_len;
+
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+
+fix_extent_len:
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_dirty(handle, inode, path + depth);
+ return err;
+}
+
+/*
+ * ext4_split_extents() splits an extent and mark extent which is covered
+ * by @map as split_flags indicates
+ *
+ * It may result in splitting the extent into multiple extents (upto three)
+ * There are three possibilities:
+ * a> There is no split required
+ * b> Splits in two extents: Split is happening at either end of the extent
+ * c> Splits in three extents: Somone is splitting in middle of the extent
+ *
+ */
+static int ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag,
+ int flags)
+{
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex;
+ unsigned int ee_len, depth;
+ int err = 0;
+ int uninitialized;
+ int split_flag1, flags1;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ uninitialized = ext4_ext_is_uninitialized(ex);
+
+ if (map->m_lblk + map->m_len < ee_block + ee_len) {
+ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+ if (uninitialized)
+ split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
+ EXT4_EXT_MARK_UNINIT2;
+ err = ext4_split_extent_at(handle, inode, path,
+ map->m_lblk + map->m_len, split_flag1, flags1);
+ if (err)
+ goto out;
+ }
+
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ if (map->m_lblk >= ee_block) {
+ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ if (uninitialized)
+ split_flag1 |= EXT4_EXT_MARK_UNINIT1;
+ if (split_flag & EXT4_EXT_MARK_UNINIT2)
+ split_flag1 |= EXT4_EXT_MARK_UNINIT2;
+ err = ext4_split_extent_at(handle, inode, path,
+ map->m_lblk, split_flag1, flags);
+ if (err)
+ goto out;
+ }
+
+ ext4_ext_show_leaf(inode, path);
+out:
+ return err ? err : map->m_len;
+}
+
#define EXT4_EXT_ZERO_LEN 7
/*
* This function is called by ext4_ext_map_blocks() if someone tries to write
@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_map_blocks *map,
struct ext4_ext_path *path)
{
- struct ext4_extent *ex, newex, orig_ex;
- struct ext4_extent *ex1 = NULL;
- struct ext4_extent *ex2 = NULL;
- struct ext4_extent *ex3 = NULL;
- struct ext4_extent_header *eh;
+ struct ext4_map_blocks split_map;
+ struct ext4_extent zero_ex;
+ struct ext4_extent *ex;
ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth;
- ext4_fsblk_t newblock;
int err = 0;
- int ret = 0;
- int may_zeroout;
+ int split_flag = 0;
ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino,
@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
eof_block = map->m_lblk + map->m_len;
depth = ext_depth(inode);
- eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (map->m_lblk - ee_block);
- newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
-
- ex2 = ex;
- orig_ex.ee_block = ex->ee_block;
- orig_ex.ee_len = cpu_to_le16(ee_len);
- ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
+ WARN_ON(map->m_lblk < ee_block);
/*
* It is safe to convert extent to initialized via explicit
* zeroout only if extent is fully insde i_size or new_size.
*/
- may_zeroout = ee_block + ee_len <= eof_block;
+ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
- if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
+ if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
+ (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ err = ext4_ext_zeroout(inode, ex);
if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zeroed the full extent */
- return allocated;
- }
-
- /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
- if (map->m_lblk > ee_block) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /*
- * for sanity, update the length of the ex2 extent before
- * we insert ex3, if ex1 is NULL. This is to avoid temporary
- * overlap of blocks.
- */
- if (!ex1 && allocated > map->m_len)
- ex2->ee_len = cpu_to_le16(map->m_len);
- /* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > map->m_len) {
- unsigned int newdepth;
- /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
- if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
- /*
- * map->m_lblk == ee_block is handled by the zerouout
- * at the beginning.
- * Mark first half uninitialized.
- * Mark second half initialized and zero out the
- * initialized extent
- */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = cpu_to_le16(ee_len - allocated);
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
-
- ex3 = &newex;
- ex3->ee_block = cpu_to_le32(map->m_lblk);
- ext4_ext_store_pblock(ex3, newblock);
- ex3->ee_len = cpu_to_le16(allocated);
- err = ext4_ext_insert_extent(handle, inode, path,
- ex3, 0);
- if (err == -ENOSPC) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex,
- ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* blocks available from map->m_lblk */
- return allocated;
-
- } else if (err)
- goto fix_extent_len;
-
- /*
- * We need to zero out the second half because
- * an fallocate request can update file size and
- * converting the second half to initialized extent
- * implies that we can leak some junk data to user
- * space.
- */
- err = ext4_ext_zeroout(inode, ex3);
- if (err) {
- /*
- * We should actually mark the
- * second half as uninit and return error
- * Insert would have changed the extent
- */
- depth = ext_depth(inode);
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk,
- path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- return err;
- }
- /* get the second half extent details */
- ex = path[depth].p_ext;
- err = ext4_ext_get_access(handle, inode,
- path + depth);
- if (err)
- return err;
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_dirty(handle, inode, path + depth);
- return err;
- }
-
- /* zeroed the second half */
- return allocated;
- }
- ex3 = &newex;
- ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
- ext4_ext_store_pblock(ex3, newblock + map->m_len);
- ex3->ee_len = cpu_to_le16(allocated - map->m_len);
- ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zeroed the full extent */
- /* blocks available from map->m_lblk */
- return allocated;
-
- } else if (err)
- goto fix_extent_len;
- /*
- * The depth, and hence eh & ex might change
- * as part of the insert above.
- */
- newdepth = ext_depth(inode);
- /*
- * update the extent length after successful insert of the
- * split extent
- */
- ee_len -= ext4_ext_get_actual_len(ex3);
- orig_ex.ee_len = cpu_to_le16(ee_len);
- may_zeroout = ee_block + ee_len <= eof_block;
-
- depth = newdepth;
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
goto out;
- }
- eh = path[depth].p_hdr;
- ex = path[depth].p_ext;
- if (ex2 != &newex)
- ex2 = ex;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
-
- allocated = map->m_len;
-
- /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
- * to insert a extent in the middle zerout directly
- * otherwise give the extent a chance to merge to left
- */
- if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
- map->m_lblk != ee_block && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zero out the first half */
- /* blocks available from map->m_lblk */
- return allocated;
- }
- }
- /*
- * If there was a change of depth as part of the
- * insertion of ex3 above, we need to update the length
- * of the ex1 extent again here
- */
- if (ex1 && ex1 != ex) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
- ex2->ee_block = cpu_to_le32(map->m_lblk);
- ext4_ext_store_pblock(ex2, newblock);
- ex2->ee_len = cpu_to_le16(allocated);
- if (ex2 != ex)
- goto insert;
- /*
- * New (initialized) extent starts from the first block
- * in the current extent. i.e., ex2 == ex
- * We have to see if it can be merged with the extent
- * on the left.
- */
- if (ex2 > EXT_FIRST_EXTENT(eh)) {
- /*
- * To merge left, pass "ex2 - 1" to try_to_merge(),
- * since it merges towards right _only_.
- */
- ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
- if (err)
- goto out;
- depth = ext_depth(inode);
- ex2--;
- }
+ ext4_ext_mark_initialized(ex);
+ ext4_ext_try_to_merge(inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ goto out;
}
+
/*
- * Try to Merge towards right. This might be required
- * only when the whole extent is being written to.
- * i.e. ex2 == ex and ex3 == NULL.
+ * four cases:
+ * 1. split the extent into three extents.
+ * 2. split the extent into two extents, zeroout the first half.
+ * 3. split the extent into two extents, zeroout the second half.
+ * 4. split the extent into two extents with out zeroout.
*/
- if (!ex3) {
- ret = ext4_ext_try_to_merge(inode, path, ex2);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
+ split_map.m_lblk = map->m_lblk;
+ split_map.m_len = map->m_len;
+
+ if (allocated > map->m_len) {
+ if (allocated <= EXT4_EXT_ZERO_LEN &&
+ (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ /* case 3 */
+ zero_ex.ee_block =
+ cpu_to_le32(map->m_lblk);
+ zero_ex.ee_len = cpu_to_le16(allocated);
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex) + map->m_lblk - ee_block);
+ err = ext4_ext_zeroout(inode, &zero_ex);
if (err)
goto out;
+ split_map.m_lblk = map->m_lblk;
+ split_map.m_len = allocated;
+ } else if ((map->m_lblk - ee_block + map->m_len <
+ EXT4_EXT_ZERO_LEN) &&
+ (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ /* case 2 */
+ if (map->m_lblk != ee_block) {
+ zero_ex.ee_block = ex->ee_block;
+ zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+ ee_block);
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex));
+ err = ext4_ext_zeroout(inode, &zero_ex);
+ if (err)
+ goto out;
+ }
+
+ split_map.m_lblk = ee_block;
+ split_map.m_len = map->m_lblk - ee_block + map->m_len;
+ allocated = map->m_len;
}
}
- /* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
- goto out;
-insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zero out the first half */
- return allocated;
- } else if (err)
- goto fix_extent_len;
+
+ allocated = ext4_split_extent(handle, inode, path,
+ &split_map, split_flag, 0);
+ if (allocated < 0)
+ err = allocated;
+
out:
- ext4_ext_show_leaf(inode, path);
return err ? err : allocated;
-
-fix_extent_len:
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_dirty(handle, inode, path + depth);
- return err;
}
/*
@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
struct ext4_ext_path *path,
int flags)
{
- struct ext4_extent *ex, newex, orig_ex;
- struct ext4_extent *ex1 = NULL;
- struct ext4_extent *ex2 = NULL;
- struct ext4_extent *ex3 = NULL;
- ext4_lblk_t ee_block, eof_block;
- unsigned int allocated, ee_len, depth;
- ext4_fsblk_t newblock;
- int err = 0;
- int may_zeroout;
+ ext4_lblk_t eof_block;
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex;
+ unsigned int ee_len;
+ int split_flag = 0, depth;
ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino,
@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
eof_block = map->m_lblk + map->m_len;
-
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
- ee_block = le32_to_cpu(ex->ee_block);
- ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (map->m_lblk - ee_block);
- newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
-
- ex2 = ex;
- orig_ex.ee_block = ex->ee_block;
- orig_ex.ee_len = cpu_to_le16(ee_len);
- ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
-
/*
* It is safe to convert extent to initialized via explicit
* zeroout only if extent is fully insde i_size or new_size.
*/
- may_zeroout = ee_block + ee_len <= eof_block;
-
- /*
- * If the uninitialized extent begins at the same logical
- * block where the write begins, and the write completely
- * covers the extent, then we don't need to split it.
- */
- if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
- return allocated;
-
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
- /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
- if (map->m_lblk > ee_block) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /*
- * for sanity, update the length of the ex2 extent before
- * we insert ex3, if ex1 is NULL. This is to avoid temporary
- * overlap of blocks.
- */
- if (!ex1 && allocated > map->m_len)
- ex2->ee_len = cpu_to_le16(map->m_len);
- /* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > map->m_len) {
- unsigned int newdepth;
- ex3 = &newex;
- ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
- ext4_ext_store_pblock(ex3, newblock + map->m_len);
- ex3->ee_len = cpu_to_le16(allocated - map->m_len);
- ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zeroed the full extent */
- /* blocks available from map->m_lblk */
- return allocated;
-
- } else if (err)
- goto fix_extent_len;
- /*
- * The depth, and hence eh & ex might change
- * as part of the insert above.
- */
- newdepth = ext_depth(inode);
- /*
- * update the extent length after successful insert of the
- * split extent
- */
- ee_len -= ext4_ext_get_actual_len(ex3);
- orig_ex.ee_len = cpu_to_le16(ee_len);
- may_zeroout = ee_block + ee_len <= eof_block;
-
- depth = newdepth;
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out;
- }
- ex = path[depth].p_ext;
- if (ex2 != &newex)
- ex2 = ex;
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
+ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag |= EXT4_EXT_MARK_UNINIT2;
- allocated = map->m_len;
- }
- /*
- * If there was a change of depth as part of the
- * insertion of ex3 above, we need to update the length
- * of the ex1 extent again here
- */
- if (ex1 && ex1 != ex) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /*
- * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
- * using direct I/O, uninitialised still.
- */
- ex2->ee_block = cpu_to_le32(map->m_lblk);
- ext4_ext_store_pblock(ex2, newblock);
- ex2->ee_len = cpu_to_le16(allocated);
- ext4_ext_mark_uninitialized(ex2);
- if (ex2 != ex)
- goto insert;
- /* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
- ext_debug("out here\n");
- goto out;
-insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zero out the first half */
- return allocated;
- } else if (err)
- goto fix_extent_len;
-out:
- ext4_ext_show_leaf(inode, path);
- return err ? err : allocated;
-
-fix_extent_len:
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_dirty(handle, inode, path + depth);
- return err;
+ flags |= EXT4_GET_BLOCKS_PRE_IO;
+ return ext4_split_extent(handle, inode, path, map, split_flag, flags);
}
+
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path)
@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct ext4_extent_header *eh;
int depth;
int err = 0;
- int ret = 0;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
+ ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)le32_to_cpu(ex->ee_block),
+ ext4_ext_get_actual_len(ex));
+
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
/* first mark the extent as initialized */
ext4_ext_mark_initialized(ex);
- /*
- * We have to see if it can be merged with the extent
- * on the left.
- */
- if (ex > EXT_FIRST_EXTENT(eh)) {
- /*
- * To merge left, pass "ex - 1" to try_to_merge(),
- * since it merges towards right _only_.
- */
- ret = ext4_ext_try_to_merge(inode, path, ex - 1);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
- if (err)
- goto out;
- depth = ext_depth(inode);
- ex--;
- }
- }
- /*
- * Try to Merge towards right.
+ /* note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
*/
- ret = ext4_ext_try_to_merge(inode, path, ex);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
- if (err)
- goto out;
- depth = ext_depth(inode);
- }
+ ext4_ext_try_to_merge(inode, path, ex);
+
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + depth);
out:
@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock = 0;
int err = 0, depth, ret;
unsigned int allocated = 0;
+ unsigned int punched_out = 0;
+ unsigned int result = 0;
struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ struct ext4_map_blocks punch_map;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* check in cache */
- if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+ if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
+ ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
if (!newex.ee_start_lo && !newex.ee_start_hi) {
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
/*
@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock);
- /* Do not put uninitialized extent in the cache */
- if (!ext4_ext_is_uninitialized(ex)) {
- ext4_ext_put_in_cache(inode, ee_block,
- ee_len, ee_start);
- goto out;
+ if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
+ /*
+ * Do not put uninitialized extent
+ * in the cache
+ */
+ if (!ext4_ext_is_uninitialized(ex)) {
+ ext4_ext_put_in_cache(inode, ee_block,
+ ee_len, ee_start);
+ goto out;
+ }
+ ret = ext4_ext_handle_uninitialized_extents(
+ handle, inode, map, path, flags,
+ allocated, newblock);
+ return ret;
}
- ret = ext4_ext_handle_uninitialized_extents(handle,
- inode, map, path, flags, allocated,
- newblock);
- return ret;
+
+ /*
+ * Punch out the map length, but only to the
+ * end of the extent
+ */
+ punched_out = allocated < map->m_len ?
+ allocated : map->m_len;
+
+ /*
+ * Sense extents need to be converted to
+ * uninitialized, they must fit in an
+ * uninitialized extent
+ */
+ if (punched_out > EXT_UNINIT_MAX_LEN)
+ punched_out = EXT_UNINIT_MAX_LEN;
+
+ punch_map.m_lblk = map->m_lblk;
+ punch_map.m_pblk = newblock;
+ punch_map.m_len = punched_out;
+ punch_map.m_flags = 0;
+
+ /* Check to see if the extent needs to be split */
+ if (punch_map.m_len != ee_len ||
+ punch_map.m_lblk != ee_block) {
+
+ ret = ext4_split_extent(handle, inode,
+ path, &punch_map, 0,
+ EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+ EXT4_GET_BLOCKS_PRE_IO);
+
+ if (ret < 0) {
+ err = ret;
+ goto out2;
+ }
+ /*
+ * find extent for the block at
+ * the start of the hole
+ */
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ path = ext4_ext_find_extent(inode,
+ map->m_lblk, NULL);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+ goto out2;
+ }
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_len = ext4_ext_get_actual_len(ex);
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_start = ext4_ext_pblock(ex);
+
+ }
+
+ ext4_ext_mark_uninitialized(ex);
+
+ err = ext4_ext_remove_space(inode, map->m_lblk,
+ map->m_lblk + punched_out);
+
+ goto out2;
}
}
@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
else
/* disable in-core preallocation for non-regular files */
ar.flags = 0;
+ if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+ ar.flags |= EXT4_MB_HINT_NOPREALLOC;
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
@@ -3529,7 +3647,11 @@ out2:
}
trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
newblock, map->m_len, err ? err : allocated);
- return err ? err : allocated;
+
+ result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
+ punched_out : allocated;
+
+ return err ? err : result;
}
void ext4_ext_truncate(struct inode *inode)
@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
- err = ext4_ext_remove_space(inode, last_block);
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
/* In a multi-transaction truncate, we only make the final
* transaction synchronous.
@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
-out_stop:
up_write(&EXT4_I(inode)->i_data_sem);
+
+out_stop:
/*
* If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
- /* We only support the FALLOC_FL_KEEP_SIZE mode */
- if (mode & ~FALLOC_FL_KEEP_SIZE)
- return -EOPNOTSUPP;
-
/*
* currently supporting (pre)allocate mode for extent-based
* files _only_
@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
+ /* Return error if mode is not supported */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return ext4_punch_hole(file, offset, len);
+
trace_ext4_fallocate_enter(inode, offset, len, mode);
map.m_lblk = offset >> blkbits;
/*
@@ -3691,7 +3817,8 @@ retry:
break;
}
ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+ EXT4_GET_BLOCKS_NO_NORMALIZE);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
pgoff_t last_offset;
pgoff_t offset;
pgoff_t index;
+ pgoff_t start_index = 0;
struct page **pages = NULL;
struct buffer_head *bh = NULL;
struct buffer_head *head = NULL;
@@ -3848,39 +3976,57 @@ out:
kfree(pages);
return EXT_CONTINUE;
}
+ index = 0;
+next_page:
/* Try to find the 1st mapped buffer. */
- end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
+ end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
blksize_bits;
- if (!page_has_buffers(pages[0]))
+ if (!page_has_buffers(pages[index]))
goto out;
- head = page_buffers(pages[0]);
+ head = page_buffers(pages[index]);
if (!head)
goto out;
+ index++;
bh = head;
do {
- if (buffer_mapped(bh)) {
+ if (end >= newex->ec_block +
+ newex->ec_len)
+ /* The buffer is out of
+ * the request range.
+ */
+ goto out;
+
+ if (buffer_mapped(bh) &&
+ end >= newex->ec_block) {
+ start_index = index - 1;
/* get the 1st mapped buffer. */
- if (end > newex->ec_block +
- newex->ec_len)
- /* The buffer is out of
- * the request range.
- */
- goto out;
goto found_mapped_buffer;
}
+
bh = bh->b_this_page;
end++;
} while (bh != head);
- /* No mapped buffer found. */
- goto out;
+ /* No mapped buffer in the range found in this page,
+ * We need to look up next page.
+ */
+ if (index >= ret) {
+ /* There is no page left, but we need to limit
+ * newex->ec_len.
+ */
+ newex->ec_len = end - newex->ec_block;
+ goto out;
+ }
+ goto next_page;
} else {
/*Find contiguous delayed buffers. */
if (ret > 0 && pages[0]->index == last_offset)
head = page_buffers(pages[0]);
bh = head;
+ index = 1;
+ start_index = 0;
}
found_mapped_buffer:
@@ -3903,7 +4049,7 @@ found_mapped_buffer:
end++;
} while (bh != head);
- for (index = 1; index < ret; index++) {
+ for (; index < ret; index++) {
if (!page_has_buffers(pages[index])) {
bh = NULL;
break;
@@ -3913,8 +4059,10 @@ found_mapped_buffer:
bh = NULL;
break;
}
+
if (pages[index]->index !=
- pages[0]->index + index) {
+ pages[start_index]->index + index
+ - start_index) {
/* Blocks are not contiguous. */
bh = NULL;
break;
@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
return (error < 0 ? error : 0);
}
+/*
+ * ext4_ext_punch_hole
+ *
+ * Punches a hole of "length" bytes in a file starting
+ * at byte "offset"
+ *
+ * @inode: The inode of the file to punch a hole in
+ * @offset: The starting byte offset of the hole
+ * @length: The length of the hole
+ *
+ * Returns the number of blocks removed or negative on err
+ */
+int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_ext_cache cache_ex;
+ ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
+ struct address_space *mapping = inode->i_mapping;
+ struct ext4_map_blocks map;
+ handle_t *handle;
+ loff_t first_block_offset, last_block_offset, block_len;
+ loff_t first_page, last_page, first_page_offset, last_page_offset;
+ int ret, credits, blocks_released, err = 0;
+
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
+ last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
+
+ first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+ first_page_offset = first_page << PAGE_CACHE_SHIFT;
+ last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ first_page_offset == 0 ? 0 : first_page_offset-1,
+ last_page_offset);
+
+ if (err)
+ return err;
+ }
+
+ /* Now release the pages */
+ if (last_page_offset > first_page_offset) {
+ truncate_inode_pages_range(mapping, first_page_offset,
+ last_page_offset-1);
+ }
+
+ /* finish any pending end_io work */
+ ext4_flush_completed_IO(inode);
+
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_orphan_add(handle, inode);
+ if (err)
+ goto out;
+
+ /*
+ * Now we need to zero out the un block aligned data.
+ * If the file is smaller than a block, just
+ * zero out the middle
+ */
+ if (first_block > last_block)
+ ext4_block_zero_page_range(handle, mapping, offset, length);
+ else {
+ /* zero out the head of the hole before the first block */
+ block_len = first_block_offset - offset;
+ if (block_len > 0)
+ ext4_block_zero_page_range(handle, mapping,
+ offset, block_len);
+
+ /* zero out the tail of the hole after the last block */
+ block_len = offset + length - last_block_offset;
+ if (block_len > 0) {
+ ext4_block_zero_page_range(handle, mapping,
+ last_block_offset, block_len);
+ }
+ }
+
+ /* If there are no blocks to remove, return now */
+ if (first_block >= last_block)
+ goto out;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_ext_invalidate_cache(inode);
+ ext4_discard_preallocations(inode);
+
+ /*
+ * Loop over all the blocks and identify blocks
+ * that need to be punched out
+ */
+ iblock = first_block;
+ blocks_released = 0;
+ while (iblock < last_block) {
+ max_blocks = last_block - iblock;
+ num_blocks = 1;
+ memset(&map, 0, sizeof(map));
+ map.m_lblk = iblock;
+ map.m_len = max_blocks;
+ ret = ext4_ext_map_blocks(handle, inode, &map,
+ EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+
+ if (ret > 0) {
+ blocks_released += ret;
+ num_blocks = ret;
+ } else if (ret == 0) {
+ /*
+ * If map blocks could not find the block,
+ * then it is in a hole. If the hole was
+ * not already cached, then map blocks should
+ * put it in the cache. So we can get the hole
+ * out of the cache
+ */
+ memset(&cache_ex, 0, sizeof(cache_ex));
+ if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
+ !cache_ex.ec_start) {
+
+ /* The hole is cached */
+ num_blocks = cache_ex.ec_block +
+ cache_ex.ec_len - iblock;
+
+ } else {
+ /* The block could not be identified */
+ err = -EIO;
+ break;
+ }
+ } else {
+ /* Map blocks error */
+ err = ret;
+ break;
+ }
+
+ if (num_blocks == 0) {
+ /* This condition should never happen */
+ ext_debug("Block lookup failed");
+ err = -EIO;
+ break;
+ }
+
+ iblock += num_blocks;
+ }
+
+ if (blocks_released > 0) {
+ ext4_ext_invalidate_cache(inode);
+ ext4_discard_preallocations(inode);
+ }
+
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+out:
+ ext4_orphan_del(handle, inode);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+ return err;
+}
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return error;
}
-
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7b80d543b89e..2c0972322009 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
};
const struct inode_operations ext4_file_inode_operations = {
- .truncate = ext4_truncate,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
#ifdef CONFIG_EXT4_FS_XATTR
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e9473cbe80df..ce66d2fe826c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -36,7 +36,7 @@
static void dump_completed_IO(struct inode * inode)
{
-#ifdef EXT4_DEBUG
+#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
unsigned long flags;
@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret;
tid_t commit_tid;
+ bool needs_barrier = false;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
}
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
- if (jbd2_log_start_commit(journal, commit_tid)) {
- /*
- * When the journal is on a different device than the
- * fs data disk, we need to issue the barrier in
- * writeback mode. (In ordered mode, the jbd2 layer
- * will take care of issuing the barrier. In
- * data=journal, all of the data blocks are written to
- * the journal device.)
- */
- if (ext4_should_writeback_data(inode) &&
- (journal->j_fs_dev != journal->j_dev) &&
- (journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
- NULL);
- ret = jbd2_log_wait_commit(journal, commit_tid);
- } else if (journal->j_flags & JBD2_BARRIER)
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = true;
+ jbd2_log_start_commit(journal, commit_tid);
+ ret = jbd2_log_wait_commit(journal, commit_tid);
+ if (needs_barrier)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
out:
trace_ext4_sync_file_exit(inode, ret);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f2fa5e8a582c..50d0e9c64584 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_meta_blocks(handle, inode,
- goal, &count, err);
+ current_block = ext4_new_meta_blocks(handle, inode, goal,
+ 0, &count, err);
if (*err)
goto failed_out;
@@ -1930,7 +1930,7 @@ repeat:
* We do still charge estimated metadata to the sb though;
* we cannot afford to run out of free blocks.
*/
- if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+ if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
dquot_release_reservation_block(inode, 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
continue;
}
- if (PageWriteback(page))
- wait_on_page_writeback(page);
-
+ wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
if (mpd->next_page != page->index)
@@ -3513,7 +3511,7 @@ retry:
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
- vmtruncate(inode, isize);
+ ext4_truncate_failed_write(inode);
}
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
+ struct inode *inode = mapping->host;
+
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
+
+ return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+/*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained with in one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
+ */
+int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
+{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, length, pos;
+ unsigned blocksize, max, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
return -EINVAL;
blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
+ max = blocksize - (offset & (blocksize - 1));
+
+ /*
+ * correct length if it does not fall between
+ * 'from' and the end of the block
+ */
+ if (length > max || length < 0)
+ length = max;
+
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
int ext4_can_truncate(struct inode *inode)
{
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return 0;
if (S_ISREG(inode->i_mode))
return 1;
if (S_ISDIR(inode->i_mode))
@@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
}
/*
+ * ext4_punch_hole: punches a hole in a file by releaseing the blocks
+ * associated with the given offset and length
+ *
+ * @inode: File inode
+ * @offset: The offset where the hole will begin
+ * @len: The length of the hole
+ *
+ * Returns: 0 on sucess or negative on failure
+ */
+
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ if (!S_ISREG(inode->i_mode))
+ return -ENOTSUPP;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ /* TODO: Add support for non extent hole punching */
+ return -ENOTSUPP;
+ }
+
+ return ext4_ext_punch_hole(file, offset, length);
+}
+
+/*
* ext4_truncate()
*
* We block out ext4_get_block() block instantiations across the entire
@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
/*
* Figure out the offset within the block group inode table
*/
- inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((inode->i_ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE &&
- (attr->ia_size < inode->i_size ||
- (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
+ (attr->ia_size < inode->i_size)) {
handle_t *handle;
handle = ext4_journal_start(inode, 3);
@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
goto err_out;
}
}
- /* ext4_truncate will clear the flag */
- if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
- ext4_truncate(inode);
}
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode))
- rc = vmtruncate(inode, attr->ia_size);
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (attr->ia_size != i_size_read(inode)) {
+ truncate_setsize(inode, attr->ia_size);
+ ext4_truncate(inode);
+ } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+ ext4_truncate(inode);
+ }
if (!rc) {
setattr_copy(inode, attr);
@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out_unlock;
}
ret = 0;
- if (PageMappedToDisk(page))
- goto out_unlock;
+
+ lock_page(page);
+ wait_on_page_writeback(page);
+ if (PageMappedToDisk(page)) {
+ up_read(&inode->i_alloc_sem);
+ return VM_FAULT_LOCKED;
+ }
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
else
len = PAGE_CACHE_SIZE;
- lock_page(page);
/*
* return if we have all the buffers mapped. This avoid
* the need to call write_begin/write_end which does a
@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (page_has_buffers(page)) {
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
ext4_bh_unmapped)) {
- unlock_page(page);
- goto out_unlock;
+ up_read(&inode->i_alloc_sem);
+ return VM_FAULT_LOCKED;
}
}
unlock_page(page);
@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret < 0)
goto out_unlock;
ret = 0;
+
+ /*
+ * write_begin/end might have created a dirty page and someone
+ * could wander in and start the IO. Make sure that hasn't
+ * happened.
+ */
+ lock_page(page);
+ wait_on_page_writeback(page);
+ up_read(&inode->i_alloc_sem);
+ return VM_FAULT_LOCKED;
out_unlock:
if (ret)
ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8a16eecf1d5..859f2ae8864e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
struct inode *inode;
char *data;
char *bitmap;
+ struct ext4_group_info *grinfo;
mb_debug(1, "init page %lu\n", page->index);
@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (first_group + i >= ngroups)
break;
+ grinfo = ext4_get_group_info(sb, first_group + i);
+ /*
+ * If page is uptodate then we came here after online resize
+ * which added some new uninitialized group info structs, so
+ * we must skip all initialized uptodate buddies on the page,
+ * which may be currently in use by an allocating task.
+ */
+ if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+ bh[i] = NULL;
+ continue;
+ }
+
err = -EIO;
desc = ext4_get_group_desc(sb, first_group + i, NULL);
if (desc == NULL)
@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
}
/* wait for I/O completion */
- for (i = 0; i < groups_per_page && bh[i]; i++)
- wait_on_buffer(bh[i]);
+ for (i = 0; i < groups_per_page; i++)
+ if (bh[i])
+ wait_on_buffer(bh[i]);
err = -EIO;
- for (i = 0; i < groups_per_page && bh[i]; i++)
- if (!buffer_uptodate(bh[i]))
+ for (i = 0; i < groups_per_page; i++)
+ if (bh[i] && !buffer_uptodate(bh[i]))
goto out;
err = 0;
first_block = page->index * blocks_per_page;
- /* init the page */
- memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
for (i = 0; i < blocks_per_page; i++) {
int group;
- struct ext4_group_info *grinfo;
group = (first_block + i) >> 1;
if (group >= ngroups)
break;
+ if (!bh[group - first_group])
+ /* skip initialized uptodate buddy */
+ continue;
+
/*
* data carry information regarding this
* particular group in the format specified
@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
* incore got set to the group block bitmap below
*/
ext4_lock_group(sb, group);
+ /* init the buddy */
+ memset(data, 0xff, blocksize);
ext4_mb_generate_buddy(sb, data, incore, group);
ext4_unlock_group(sb, group);
incore = NULL;
@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
out:
if (bh) {
- for (i = 0; i < groups_per_page && bh[i]; i++)
+ for (i = 0; i < groups_per_page; i++)
brelse(bh[i]);
if (bh != &bhs)
kfree(bh);
@@ -957,22 +974,21 @@ out:
}
/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen whild holding the buddy cache
- * lock
+ * Lock the buddy and bitmap pages. This make sure other parallel init_group
+ * on the same buddy page doesn't happen whild holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
*/
-static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
- ext4_group_t group)
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+ ext4_group_t group, struct ext4_buddy *e4b)
{
- int i;
- int block, pnum;
+ struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+ int block, pnum, poff;
int blocks_per_page;
- int groups_per_page;
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- ext4_group_t first_group;
- struct ext4_group_info *grp;
+ struct page *page;
+
+ e4b->bd_buddy_page = NULL;
+ e4b->bd_bitmap_page = NULL;
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
/*
@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
*/
block = group * 2;
pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
-
- groups_per_page = blocks_per_page >> 1;
- if (groups_per_page == 0)
- groups_per_page = 1;
- /* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (!page)
+ return -EIO;
+ BUG_ON(page->mapping != inode->i_mapping);
+ e4b->bd_bitmap_page = page;
+ e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- if ((first_group + i) >= ngroups)
- break;
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- down_write_nested(&grp->alloc_sem, i);
+ if (blocks_per_page >= 2) {
+ /* buddy and bitmap are on the same page */
+ return 0;
}
- return i;
+
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (!page)
+ return -EIO;
+ BUG_ON(page->mapping != inode->i_mapping);
+ e4b->bd_buddy_page = page;
+ return 0;
}
-static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
- ext4_group_t group, int locked_group)
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
- int i;
- int block, pnum;
- int blocks_per_page;
- ext4_group_t first_group;
- struct ext4_group_info *grp;
-
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
- /* release locks on all the groups */
- for (i = 0; i < locked_group; i++) {
-
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- up_write(&grp->alloc_sem);
+ if (e4b->bd_bitmap_page) {
+ unlock_page(e4b->bd_bitmap_page);
+ page_cache_release(e4b->bd_bitmap_page);
+ }
+ if (e4b->bd_buddy_page) {
+ unlock_page(e4b->bd_buddy_page);
+ page_cache_release(e4b->bd_buddy_page);
}
-
}
/*
@@ -1044,93 +1043,60 @@ static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{
- int ret = 0;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
+ struct ext4_buddy e4b;
+ struct page *page;
+ int ret = 0;
mb_debug(1, "init group %u\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
this_grp = ext4_get_group_info(sb, group);
/*
* This ensures that we don't reinit the buddy cache
* page which map to the group from which we are already
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
- * would have taken the alloc_sem lock.
+ * would have pinned buddy page to page cache.
*/
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
/*
* somebody initialized the group
* return without doing anything
*/
- ret = 0;
goto err;
}
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
+
+ page = e4b.bd_bitmap_page;
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret)
+ goto err;
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
+ if (e4b.bd_buddy_page == NULL) {
/*
* If both the bitmap and buddy are in
* the same page we don't need to force
* init the buddy
*/
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
+ ret = 0;
+ goto err;
}
- if (page == NULL || !PageUptodate(page)) {
+ /* init buddy cache */
+ page = e4b.bd_buddy_page;
+ ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+ if (ret)
+ goto err;
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
mark_page_accessed(page);
err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
+ ext4_mb_put_buddy_page_lock(&e4b);
return ret;
}
@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
e4b->bd_group = group;
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
- e4b->alloc_semp = &grp->alloc_sem;
-
- /* Take the read lock on the group alloc
- * sem. This would make sure a parallel
- * ext4_mb_init_group happening on other
- * groups mapped by the page is blocked
- * till we are done with allocation
- */
-repeat_load_buddy:
- down_read(e4b->alloc_semp);
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- /* we need to check for group need init flag
- * with alloc_semp held so that we can be sure
- * that new blocks didn't get added to the group
- * when we are loading the buddy cache
- */
- up_read(e4b->alloc_semp);
/*
* we need full data about the group
* to make a good selection
@@ -1189,7 +1139,6 @@ repeat_load_buddy:
ret = ext4_mb_init_group(sb, group);
if (ret)
return ret;
- goto repeat_load_buddy;
}
/*
@@ -1273,15 +1222,14 @@ repeat_load_buddy:
return 0;
err:
+ if (page)
+ page_cache_release(page);
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
-
- /* Done with the buddy cache */
- up_read(e4b->alloc_semp);
return ret;
}
@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
- /* Done with the buddy cache */
- if (e4b->alloc_semp)
- up_read(e4b->alloc_semp);
}
@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page;
get_page(ac->ac_buddy_page);
- /* on allocation we use ac to track the held semaphore */
- ac->alloc_semp = e4b->alloc_semp;
- e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
struct super_block *sb = journal->j_private;
struct ext4_buddy e4b;
struct ext4_group_info *db;
- int err, ret, count = 0, count2 = 0;
+ int err, count = 0, count2 = 0;
struct ext4_free_data *entry;
struct list_head *l, *ltmp;
@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->count, entry->group, entry);
- if (test_opt(sb, DISCARD)) {
- ret = ext4_issue_discard(sb, entry->group,
- entry->start_blk, entry->count);
- if (unlikely(ret == -EOPNOTSUPP)) {
- ext4_warning(sb, "discard not supported, "
- "disabling");
- clear_opt(sb, DISCARD);
- }
- }
+ if (test_opt(sb, DISCARD))
+ ext4_issue_discard(sb, entry->group,
+ entry->start_blk, entry->count);
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
spin_unlock(&pa->pa_lock);
}
}
- if (ac->alloc_semp)
- up_read(ac->alloc_semp);
if (pa) {
/*
* We want to add the pa to the right bucket.
* Remove it from the list and while adding
* make sure the list to which we are adding
- * doesn't grow big. We need to release
- * alloc_semp before calling ext4_mb_add_n_trim()
+ * doesn't grow big.
*/
if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
spin_lock(pa->pa_obj_lock);
@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
* there is enough free blocks to do block allocation
* and verify allocation doesn't exceed the quota limits.
*/
- while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+ while (ar->len &&
+ ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
+
/* let others to free the space */
yield();
ar->len = ar->len >> 1;
@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
return 0;
}
reserv_blks = ar->len;
- while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
- ar->flags |= EXT4_MB_HINT_NOPREALLOC;
- ar->len--;
+ if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
+ dquot_alloc_block_nofail(ar->inode, ar->len);
+ } else {
+ while (ar->len &&
+ dquot_alloc_block(ar->inode, ar->len)) {
+
+ ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+ ar->len--;
+ }
}
inquota = ar->len;
if (ar->len == 0) {
@@ -4704,6 +4645,127 @@ error_return:
}
/**
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
+ * @handle: handle to this transaction
+ * @sb: super block
+ * @block: start physcial block to add to the block group
+ * @count: number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *gd_bh;
+ ext4_group_t block_group;
+ ext4_grpblk_t bit;
+ unsigned int i;
+ struct ext4_group_desc *desc;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_buddy e4b;
+ int err = 0, ret, blk_free_count;
+ ext4_grpblk_t blocks_freed;
+ struct ext4_group_info *grp;
+
+ ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+ ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ grp = ext4_get_group_info(sb, block_group);
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ */
+ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
+ goto error_return;
+
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+ if (!bitmap_bh)
+ goto error_return;
+ desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+ if (!desc)
+ goto error_return;
+
+ if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+ in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+ in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+ in_range(block + count - 1, ext4_inode_table(sb, desc),
+ sbi->s_itb_per_group)) {
+ ext4_error(sb, "Adding blocks in system zones - "
+ "Block = %llu, count = %lu",
+ block, count);
+ goto error_return;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (err)
+ goto error_return;
+
+ /*
+ * We are about to modify some metadata. Call the journal APIs
+ * to unshare ->b_data if a currently-committing transaction is
+ * using it
+ */
+ BUFFER_TRACE(gd_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gd_bh);
+ if (err)
+ goto error_return;
+
+ for (i = 0, blocks_freed = 0; i < count; i++) {
+ BUFFER_TRACE(bitmap_bh, "clear bit");
+ if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+ ext4_error(sb, "bit already cleared for block %llu",
+ (ext4_fsblk_t)(block + i));
+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ } else {
+ blocks_freed++;
+ }
+ }
+
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
+
+ /*
+ * need to update group_info->bb_free and bitmap
+ * with group lock held. generate_buddy look at
+ * them with group lock_held
+ */
+ ext4_lock_group(sb, block_group);
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_free_blocks(NULL, &e4b, bit, count);
+ blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+ ext4_free_blks_set(sb, desc, blk_free_count);
+ desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+ ext4_unlock_group(sb, block_group);
+ percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ atomic_add(blocks_freed,
+ &sbi->s_flex_groups[flex_group].free_blocks);
+ }
+
+ ext4_mb_unload_buddy(&e4b);
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+ /* And the group descriptor block */
+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+ ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+ if (!err)
+ err = ret;
+
+error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, err);
+ return;
+}
+
+/**
* ext4_trim_extent -- function to TRIM one single free extent in the group
* @sb: super block for the file system
* @start: starting block of the free extent in the alloc. group
@@ -4715,11 +4777,10 @@ error_return:
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+ ext4_group_t group, struct ext4_buddy *e4b)
{
struct ext4_free_extent ex;
- int ret = 0;
assert_spin_locked(ext4_group_lock_ptr(sb, group));
@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
-
- ret = ext4_issue_discard(sb, group, start, count);
-
+ ext4_issue_discard(sb, group, start, count);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
- return ret;
}
/**
@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
* the group buddy bitmap. This is done until whole group is scanned.
*/
static ext4_grpblk_t
-ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
- ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+ ext4_grpblk_t start, ext4_grpblk_t max,
+ ext4_grpblk_t minblocks)
{
void *bitmap;
ext4_grpblk_t next, count = 0;
- ext4_group_t group;
- int ret = 0;
+ struct ext4_buddy e4b;
+ int ret;
- BUG_ON(e4b == NULL);
+ ret = ext4_mb_load_buddy(sb, group, &e4b);
+ if (ret) {
+ ext4_error(sb, "Error in loading buddy "
+ "information for %u", group);
+ return ret;
+ }
+ bitmap = e4b.bd_bitmap;
- bitmap = e4b->bd_bitmap;
- group = e4b->bd_group;
- start = (e4b->bd_info->bb_first_free > start) ?
- e4b->bd_info->bb_first_free : start;
ext4_lock_group(sb, group);
+ start = (e4b.bd_info->bb_first_free > start) ?
+ e4b.bd_info->bb_first_free : start;
while (start < max) {
start = mb_find_next_zero_bit(bitmap, max, start);
@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
next = mb_find_next_bit(bitmap, max, start);
if ((next - start) >= minblocks) {
- ret = ext4_trim_extent(sb, start,
- next - start, group, e4b);
- if (ret < 0)
- break;
+ ext4_trim_extent(sb, start,
+ next - start, group, &e4b);
count += next - start;
}
start = next + 1;
@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
ext4_lock_group(sb, group);
}
- if ((e4b->bd_info->bb_free - count) < minblocks)
+ if ((e4b.bd_info->bb_free - count) < minblocks)
break;
}
ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
ext4_debug("trimmed %d blocks in the group %d\n",
count, group);
- if (ret < 0)
- count = ret;
-
return count;
}
@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
- struct ext4_buddy e4b;
+ struct ext4_group_info *grp;
ext4_group_t first_group, last_group;
ext4_group_t group, ngroups = ext4_get_groups_count(sb);
ext4_grpblk_t cnt = 0, first_block, last_block;
- uint64_t start, len, minlen, trimmed;
+ uint64_t start, len, minlen, trimmed = 0;
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
int ret = 0;
@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
start = range->start >> sb->s_blocksize_bits;
len = range->len >> sb->s_blocksize_bits;
minlen = range->minlen >> sb->s_blocksize_bits;
- trimmed = 0;
if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
return -EINVAL;
@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
return -EINVAL;
for (group = first_group; group <= last_group; group++) {
- ret = ext4_mb_load_buddy(sb, group, &e4b);
- if (ret) {
- ext4_error(sb, "Error in loading buddy "
- "information for %u", group);
- break;
+ grp = ext4_get_group_info(sb, group);
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ break;
}
/*
@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
last_block = first_block + len;
len -= last_block - first_block;
- if (e4b.bd_info->bb_free >= minlen) {
- cnt = ext4_trim_all_free(sb, &e4b, first_block,
+ if (grp->bb_free >= minlen) {
+ cnt = ext4_trim_all_free(sb, group, first_block,
last_block, minlen);
if (cnt < 0) {
ret = cnt;
- ext4_mb_unload_buddy(&e4b);
break;
}
}
- ext4_mb_unload_buddy(&e4b);
trimmed += cnt;
first_block = 0;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 22bd4d7f289b..20b5e7bfebd1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
__u8 ac_op; /* operation, for history only */
struct page *ac_bitmap_page;
struct page *ac_buddy_page;
- /*
- * pointer to the held semaphore upon successful
- * block allocation
- */
- struct rw_semaphore *alloc_semp;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
};
@@ -215,7 +210,6 @@ struct ext4_buddy {
struct super_block *bd_sb;
__u16 bd_blkbits;
ext4_group_t bd_group;
- struct rw_semaphore *alloc_semp;
};
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 92816b4e0f16..b57b98fb44d1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* We have the extent map build with the tmp inode.
* Now copy the i_data across
*/
- ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
/*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 000000000000..9bdef3f537c5
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/utsname.h>
+#include <linux/kthread.h>
+
+#include "ext4.h"
+
+/*
+ * Write the MMP block using WRITE_SYNC to try to get the block on-disk
+ * faster.
+ */
+static int write_mmp_block(struct buffer_head *bh)
+{
+ mark_buffer_dirty(bh);
+ lock_buffer(bh);
+ bh->b_end_io = end_buffer_write_sync;
+ get_bh(bh);
+ submit_bh(WRITE_SYNC, bh);
+ wait_on_buffer(bh);
+ if (unlikely(!buffer_uptodate(bh)))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Read the MMP block. It _must_ be read from disk and hence we clear the
+ * uptodate flag on the buffer.
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+ ext4_fsblk_t mmp_block)
+{
+ struct mmp_struct *mmp;
+
+ if (*bh)
+ clear_buffer_uptodate(*bh);
+
+ /* This would be sb_bread(sb, mmp_block), except we need to be sure
+ * that the MD RAID device cache has been bypassed, and that the read
+ * is not blocked in the elevator. */
+ if (!*bh)
+ *bh = sb_getblk(sb, mmp_block);
+ if (*bh) {
+ get_bh(*bh);
+ lock_buffer(*bh);
+ (*bh)->b_end_io = end_buffer_read_sync;
+ submit_bh(READ_SYNC, *bh);
+ wait_on_buffer(*bh);
+ if (!buffer_uptodate(*bh)) {
+ brelse(*bh);
+ *bh = NULL;
+ }
+ }
+ if (!*bh) {
+ ext4_warning(sb, "Error while reading MMP block %llu",
+ mmp_block);
+ return -EIO;
+ }
+
+ mmp = (struct mmp_struct *)((*bh)->b_data);
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+ const char *function, unsigned int line, const char *msg)
+{
+ __ext4_warning(sb, function, line, msg);
+ __ext4_warning(sb, function, line,
+ "MMP failure info: last update time: %llu, last update "
+ "node: %s, last update device: %s\n",
+ (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+ mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ */
+static int kmmpd(void *data)
+{
+ struct super_block *sb = ((struct mmpd_data *) data)->sb;
+ struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct mmp_struct *mmp;
+ ext4_fsblk_t mmp_block;
+ u32 seq = 0;
+ unsigned long failed_writes = 0;
+ int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+ unsigned mmp_check_interval;
+ unsigned long last_update_time;
+ unsigned long diff;
+ int retval;
+
+ mmp_block = le64_to_cpu(es->s_mmp_block);
+ mmp = (struct mmp_struct *)(bh->b_data);
+ mmp->mmp_time = cpu_to_le64(get_seconds());
+ /*
+ * Start with the higher mmp_check_interval and reduce it if
+ * the MMP block is being updated on time.
+ */
+ mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+ EXT4_MMP_MIN_CHECK_INTERVAL);
+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+ memcpy(mmp->mmp_nodename, init_utsname()->sysname,
+ sizeof(mmp->mmp_nodename));
+
+ while (!kthread_should_stop()) {
+ if (++seq > EXT4_MMP_SEQ_MAX)
+ seq = 1;
+
+ mmp->mmp_seq = cpu_to_le32(seq);
+ mmp->mmp_time = cpu_to_le64(get_seconds());
+ last_update_time = jiffies;
+
+ retval = write_mmp_block(bh);
+ /*
+ * Don't spew too many error messages. Print one every
+ * (s_mmp_update_interval * 60) seconds.
+ */
+ if (retval && (failed_writes % 60) == 0) {
+ ext4_error(sb, "Error writing to MMP block");
+ failed_writes++;
+ }
+
+ if (!(le32_to_cpu(es->s_feature_incompat) &
+ EXT4_FEATURE_INCOMPAT_MMP)) {
+ ext4_warning(sb, "kmmpd being stopped since MMP feature"
+ " has been disabled.");
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ goto failed;
+ }
+
+ if (sb->s_flags & MS_RDONLY) {
+ ext4_warning(sb, "kmmpd being stopped since filesystem "
+ "has been remounted as readonly.");
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ goto failed;
+ }
+
+ diff = jiffies - last_update_time;
+ if (diff < mmp_update_interval * HZ)
+ schedule_timeout_interruptible(mmp_update_interval *
+ HZ - diff);
+
+ /*
+ * We need to make sure that more than mmp_check_interval
+ * seconds have not passed since writing. If that has happened
+ * we need to check if the MMP block is as we left it.
+ */
+ diff = jiffies - last_update_time;
+ if (diff > mmp_check_interval * HZ) {
+ struct buffer_head *bh_check = NULL;
+ struct mmp_struct *mmp_check;
+
+ retval = read_mmp_block(sb, &bh_check, mmp_block);
+ if (retval) {
+ ext4_error(sb, "error reading MMP data: %d",
+ retval);
+
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ goto failed;
+ }
+
+ mmp_check = (struct mmp_struct *)(bh_check->b_data);
+ if (mmp->mmp_seq != mmp_check->mmp_seq ||
+ memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+ sizeof(mmp->mmp_nodename))) {
+ dump_mmp_msg(sb, mmp_check,
+ "Error while updating MMP info. "
+ "The filesystem seems to have been"
+ " multiply mounted.");
+ ext4_error(sb, "abort");
+ goto failed;
+ }
+ put_bh(bh_check);
+ }
+
+ /*
+ * Adjust the mmp_check_interval depending on how much time
+ * it took for the MMP block to be written.
+ */
+ mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
+ EXT4_MMP_MAX_CHECK_INTERVAL),
+ EXT4_MMP_MIN_CHECK_INTERVAL);
+ mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ }
+
+ /*
+ * Unmount seems to be clean.
+ */
+ mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+ mmp->mmp_time = cpu_to_le64(get_seconds());
+
+ retval = write_mmp_block(bh);
+
+failed:
+ kfree(data);
+ brelse(bh);
+ return retval;
+}
+
+/*
+ * Get a random new sequence number but make sure it is not greater than
+ * EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+ u32 new_seq;
+
+ do {
+ get_random_bytes(&new_seq, sizeof(u32));
+ } while (new_seq > EXT4_MMP_SEQ_MAX);
+
+ return new_seq;
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ */
+int ext4_multi_mount_protect(struct super_block *sb,
+ ext4_fsblk_t mmp_block)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct buffer_head *bh = NULL;
+ struct mmp_struct *mmp = NULL;
+ struct mmpd_data *mmpd_data;
+ u32 seq;
+ unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+ unsigned int wait_time = 0;
+ int retval;
+
+ if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+ mmp_block >= ext4_blocks_count(es)) {
+ ext4_warning(sb, "Invalid MMP block in superblock");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+
+ mmp = (struct mmp_struct *)(bh->b_data);
+
+ if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+ mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+ /*
+ * If check_interval in MMP block is larger, use that instead of
+ * update_interval from the superblock.
+ */
+ if (mmp->mmp_check_interval > mmp_check_interval)
+ mmp_check_interval = mmp->mmp_check_interval;
+
+ seq = le32_to_cpu(mmp->mmp_seq);
+ if (seq == EXT4_MMP_SEQ_CLEAN)
+ goto skip;
+
+ if (seq == EXT4_MMP_SEQ_FSCK) {
+ dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+ goto failed;
+ }
+
+ wait_time = min(mmp_check_interval * 2 + 1,
+ mmp_check_interval + 60);
+
+ /* Print MMP interval if more than 20 secs. */
+ if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+ ext4_warning(sb, "MMP interval %u higher than expected, please"
+ " wait.\n", wait_time * 2);
+
+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+ mmp = (struct mmp_struct *)(bh->b_data);
+ if (seq != le32_to_cpu(mmp->mmp_seq)) {
+ dump_mmp_msg(sb, mmp,
+ "Device is already active on another node.");
+ goto failed;
+ }
+
+skip:
+ /*
+ * write a new random sequence number.
+ */
+ mmp->mmp_seq = seq = cpu_to_le32(mmp_new_seq());
+
+ retval = write_mmp_block(bh);
+ if (retval)
+ goto failed;
+
+ /*
+ * wait for MMP interval and check mmp_seq.
+ */
+ if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+ ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+ goto failed;
+ }
+
+ retval = read_mmp_block(sb, &bh, mmp_block);
+ if (retval)
+ goto failed;
+ mmp = (struct mmp_struct *)(bh->b_data);
+ if (seq != le32_to_cpu(mmp->mmp_seq)) {
+ dump_mmp_msg(sb, mmp,
+ "Device is already active on another node.");
+ goto failed;
+ }
+
+ mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+ if (!mmpd_data) {
+ ext4_warning(sb, "not enough memory for mmpd_data");
+ goto failed;
+ }
+ mmpd_data->sb = sb;
+ mmpd_data->bh = bh;
+
+ /*
+ * Start a kernel thread to update the MMP block periodically.
+ */
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+ bdevname(bh->b_bdev,
+ mmp->mmp_bdevname));
+ if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+ EXT4_SB(sb)->s_mmp_tsk = NULL;
+ kfree(mmpd_data);
+ ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+ sb->s_id);
+ goto failed;
+ }
+
+ return 0;
+
+failed:
+ brelse(bh);
+ return 1;
+}
+
+
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b9f3e7862f13..2b8304bf3c50 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* It needs to call wait_on_page_writeback() to wait for the
* writeback of the page.
*/
- if (PageWriteback(page))
- wait_on_page_writeback(page);
+ wait_on_page_writeback(page);
/* Release old bh and drop refs */
try_to_release_page(page, 0);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 67fd0b025858..b754b7721f51 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->at = entries;
frame->bh = bh;
bh = bh2;
+
+ ext4_handle_dirty_metadata(handle, dir, frame->bh);
+ ext4_handle_dirty_metadata(handle, dir, bh);
+
de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
- dx_release (frames);
- if (!(de))
+ if (!de) {
+ /*
+ * Even if the block split failed, we have to properly write
+ * out all the changes we did so far. Otherwise we can end up
+ * with corrupted filesystem.
+ */
+ ext4_mark_inode_dirty(handle, dir);
+ dx_release(frames);
return retval;
+ }
+ dx_release(frames);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
handle_t *handle;
struct inode *inode;
int l, err, retries = 0;
+ int credits;
l = strlen(symname)+1;
if (l > dir->i_sb->s_blocksize)
@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
dquot_initialize(dir);
+ if (l > EXT4_N_BLOCKS * 4) {
+ /*
+ * For non-fast symlinks, we just allocate inode and put it on
+ * orphan list in the first transaction => we need bitmap,
+ * group descriptor, sb, inode block, quota blocks.
+ */
+ credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ } else {
+ /*
+ * Fast symlink. We have to add entry to directory
+ * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
+ * allocate new inode (bitmap, group descriptor, inode block,
+ * quota blocks, sb is already counted in previous macros).
+ */
+ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ }
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ handle = ext4_journal_start(dir, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2263,21 +2292,44 @@ retry:
if (IS_ERR(inode))
goto out_stop;
- if (l > sizeof(EXT4_I(inode)->i_data)) {
+ if (l > EXT4_N_BLOCKS * 4) {
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
/*
- * page_symlink() calls into ext4_prepare/commit_write.
- * We have a transaction open. All is sweetness. It also sets
- * i_size in generic_commit_write().
+ * We cannot call page_symlink() with transaction started
+ * because it calls into ext4_write_begin() which can wait
+ * for transaction commit if we are running out of space
+ * and thus we deadlock. So we have to stop transaction now
+ * and restart it when symlink contents is written.
+ *
+ * To keep fs consistent in case of crash, we have to put inode
+ * to orphan list in the mean time.
*/
+ drop_nlink(inode);
+ err = ext4_orphan_add(handle, inode);
+ ext4_journal_stop(handle);
+ if (err)
+ goto err_drop_inode;
err = __page_symlink(inode, symname, l, 1);
+ if (err)
+ goto err_drop_inode;
+ /*
+ * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
+ * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+ */
+ handle = ext4_journal_start(dir,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto err_drop_inode;
+ }
+ inc_nlink(inode);
+ err = ext4_orphan_del(handle, inode);
if (err) {
+ ext4_journal_stop(handle);
clear_nlink(inode);
- unlock_new_inode(inode);
- ext4_mark_inode_dirty(handle, inode);
- iput(inode);
- goto out_stop;
+ goto err_drop_inode;
}
} else {
/* clear the extent format for fast symlink */
@@ -2293,6 +2345,10 @@ out_stop:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
+err_drop_inode:
+ unlock_new_inode(inode);
+ iput(inode);
+ return err;
}
static int ext4_link(struct dentry *old_dentry,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b6dbd056fcb1..7bb8f76d470a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
for (i = 0; i < io_end->num_io_pages; i++) {
struct page *page = io_end->pages[i]->p_page;
struct buffer_head *bh, *head;
- int partial_write = 0;
+ loff_t offset;
+ loff_t io_end_offset;
- head = page_buffers(page);
- if (error)
+ if (error) {
SetPageError(page);
- BUG_ON(!head);
- if (head->b_size != PAGE_CACHE_SIZE) {
- loff_t offset;
- loff_t io_end_offset = io_end->offset + io_end->size;
+ set_bit(AS_EIO, &page->mapping->flags);
+ head = page_buffers(page);
+ BUG_ON(!head);
+
+ io_end_offset = io_end->offset + io_end->size;
offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
bh = head;
do {
if ((offset >= io_end->offset) &&
- (offset+bh->b_size <= io_end_offset)) {
- if (error)
- buffer_io_error(bh);
-
- }
- if (buffer_delay(bh))
- partial_write = 1;
- else if (!buffer_mapped(bh))
- clear_buffer_dirty(bh);
- else if (buffer_dirty(bh))
- partial_write = 1;
+ (offset+bh->b_size <= io_end_offset))
+ buffer_io_error(bh);
+
offset += bh->b_size;
bh = bh->b_this_page;
} while (bh != head);
}
- /*
- * If this is a partial write which happened to make
- * all buffers uptodate then we can optimize away a
- * bogus readpage() for the next read(). Here we
- * 'discover' whether the page went uptodate as a
- * result of this (potentially partial) write.
- */
- if (!partial_write)
- SetPageUptodate(page);
-
put_io_page(io_end->pages[i]);
}
io_end->num_io_pages = 0;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8553dfb310af..cc5c157aa11d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -38,6 +38,7 @@
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
+#include <linux/cleancache.h>
#include <asm/uaccess.h>
#include <linux/kthread.h>
@@ -75,11 +76,27 @@ static void ext4_write_super(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data);
+static inline int ext2_feature_set_ok(struct super_block *sb);
+static inline int ext3_feature_set_ok(struct super_block *sb);
static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext2",
+ .mount = ext4_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#else
+#define IS_EXT2_SB(sb) (0)
+#endif
+
+
#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext3_fs_type = {
.owner = THIS_MODULE,
@@ -806,6 +823,8 @@ static void ext4_put_super(struct super_block *sb)
invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi);
}
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
sb->s_fs_info = NULL;
/*
* Now that we are completely done shutting down the
@@ -1096,7 +1115,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (!test_opt(sb, INIT_INODE_TABLE))
seq_puts(seq, ",noinit_inode_table");
- else if (sbi->s_li_wait_mult)
+ else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
seq_printf(seq, ",init_inode_table=%u",
(unsigned) sbi->s_li_wait_mult);
@@ -1187,9 +1206,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
static const struct dquot_operations ext4_quota_operations = {
-#ifdef CONFIG_QUOTA
.get_reserved_space = ext4_get_reserved_space,
-#endif
.write_dquot = ext4_write_dquot,
.acquire_dquot = ext4_acquire_dquot,
.release_dquot = ext4_release_dquot,
@@ -1900,7 +1917,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
ext4_msg(sb, KERN_WARNING,
"warning: mounting fs with errors, "
"running e2fsck is recommended");
- else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
le16_to_cpu(es->s_mnt_count) >=
(unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
ext4_msg(sb, KERN_WARNING,
@@ -1932,6 +1949,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
EXT4_INODES_PER_GROUP(sb),
sbi->s_mount_opt, sbi->s_mount_opt2);
+ cleancache_init_fs(sb);
return res;
}
@@ -2425,6 +2443,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
EXT4_SB(sb)->s_sectors_written_start) >> 1)));
}
+static ssize_t extent_cache_hits_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
+}
+
+static ssize_t extent_cache_misses_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
+}
+
static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
@@ -2482,6 +2512,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RO_ATTR(extent_cache_hits);
+EXT4_RO_ATTR(extent_cache_misses);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2497,6 +2529,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(extent_cache_hits),
+ ATTR_LIST(extent_cache_misses),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
@@ -2659,12 +2693,6 @@ static void print_daily_error_info(unsigned long arg)
mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
}
-static void ext4_lazyinode_timeout(unsigned long data)
-{
- struct task_struct *p = (struct task_struct *)data;
- wake_up_process(p);
-}
-
/* Find next suitable group and run ext4_init_inode_table */
static int ext4_run_li_request(struct ext4_li_request *elr)
{
@@ -2696,11 +2724,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
if (elr->lr_timeout == 0) {
- timeout = jiffies - timeout;
- if (elr->lr_sbi->s_li_wait_mult)
- timeout *= elr->lr_sbi->s_li_wait_mult;
- else
- timeout *= 20;
+ timeout = (jiffies - timeout) *
+ elr->lr_sbi->s_li_wait_mult;
elr->lr_timeout = timeout;
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -2712,7 +2737,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
/*
* Remove lr_request from the list_request and free the
- * request tructure. Should be called with li_list_mtx held
+ * request structure. Should be called with li_list_mtx held
*/
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
@@ -2730,14 +2755,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
static void ext4_unregister_li_request(struct super_block *sb)
{
- struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
-
- if (!ext4_li_info)
+ mutex_lock(&ext4_li_mtx);
+ if (!ext4_li_info) {
+ mutex_unlock(&ext4_li_mtx);
return;
+ }
mutex_lock(&ext4_li_info->li_list_mtx);
- ext4_remove_li_request(elr);
+ ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
mutex_unlock(&ext4_li_info->li_list_mtx);
+ mutex_unlock(&ext4_li_mtx);
}
static struct task_struct *ext4_lazyinit_task;
@@ -2756,17 +2783,10 @@ static int ext4_lazyinit_thread(void *arg)
struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
struct list_head *pos, *n;
struct ext4_li_request *elr;
- unsigned long next_wakeup;
- DEFINE_WAIT(wait);
+ unsigned long next_wakeup, cur;
BUG_ON(NULL == eli);
- eli->li_timer.data = (unsigned long)current;
- eli->li_timer.function = ext4_lazyinode_timeout;
-
- eli->li_task = current;
- wake_up(&eli->li_wait_task);
-
cont_thread:
while (true) {
next_wakeup = MAX_JIFFY_OFFSET;
@@ -2797,19 +2817,15 @@ cont_thread:
if (freezing(current))
refrigerator();
- if ((time_after_eq(jiffies, next_wakeup)) ||
+ cur = jiffies;
+ if ((time_after_eq(cur, next_wakeup)) ||
(MAX_JIFFY_OFFSET == next_wakeup)) {
cond_resched();
continue;
}
- eli->li_timer.expires = next_wakeup;
- add_timer(&eli->li_timer);
- prepare_to_wait(&eli->li_wait_daemon, &wait,
- TASK_INTERRUPTIBLE);
- if (time_before(jiffies, next_wakeup))
- schedule();
- finish_wait(&eli->li_wait_daemon, &wait);
+ schedule_timeout_interruptible(next_wakeup - cur);
+
if (kthread_should_stop()) {
ext4_clear_request_list();
goto exit_thread;
@@ -2833,12 +2849,7 @@ exit_thread:
goto cont_thread;
}
mutex_unlock(&eli->li_list_mtx);
- del_timer_sync(&ext4_li_info->li_timer);
- eli->li_task = NULL;
- wake_up(&eli->li_wait_task);
-
kfree(ext4_li_info);
- ext4_lazyinit_task = NULL;
ext4_li_info = NULL;
mutex_unlock(&ext4_li_mtx);
@@ -2866,7 +2877,6 @@ static int ext4_run_lazyinit_thread(void)
if (IS_ERR(ext4_lazyinit_task)) {
int err = PTR_ERR(ext4_lazyinit_task);
ext4_clear_request_list();
- del_timer_sync(&ext4_li_info->li_timer);
kfree(ext4_li_info);
ext4_li_info = NULL;
printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2875,8 +2885,6 @@ static int ext4_run_lazyinit_thread(void)
return err;
}
ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
-
- wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
return 0;
}
@@ -2911,13 +2919,9 @@ static int ext4_li_info_new(void)
if (!eli)
return -ENOMEM;
- eli->li_task = NULL;
INIT_LIST_HEAD(&eli->li_request_list);
mutex_init(&eli->li_list_mtx);
- init_waitqueue_head(&eli->li_wait_daemon);
- init_waitqueue_head(&eli->li_wait_task);
- init_timer(&eli->li_timer);
eli->li_state |= EXT4_LAZYINIT_QUIT;
ext4_li_info = eli;
@@ -2960,20 +2964,19 @@ static int ext4_register_li_request(struct super_block *sb,
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
int ret = 0;
- if (sbi->s_li_request != NULL)
+ if (sbi->s_li_request != NULL) {
+ /*
+ * Reset timeout so it can be computed again, because
+ * s_li_wait_mult might have changed.
+ */
+ sbi->s_li_request->lr_timeout = 0;
return 0;
+ }
if (first_not_zeroed == ngroups ||
(sb->s_flags & MS_RDONLY) ||
- !test_opt(sb, INIT_INODE_TABLE)) {
- sbi->s_li_request = NULL;
+ !test_opt(sb, INIT_INODE_TABLE))
return 0;
- }
-
- if (first_not_zeroed == ngroups) {
- sbi->s_li_request = NULL;
- return 0;
- }
elr = ext4_li_request_new(sb, first_not_zeroed);
if (!elr)
@@ -3166,6 +3169,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
+ /*
+ * set default s_li_wait_mult for lazyinit, for the case there is
+ * no mount option specified.
+ */
+ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
&journal_devnum, &journal_ioprio, NULL, 0)) {
ext4_msg(sb, KERN_WARNING,
@@ -3187,6 +3196,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"feature flags set on rev 0 fs, "
"running e2fsck is recommended");
+ if (IS_EXT2_SB(sb)) {
+ if (ext2_feature_set_ok(sb))
+ ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
+ "using the ext4 subsystem");
+ else {
+ ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
+ "to feature incompatibilities");
+ goto failed_mount;
+ }
+ }
+
+ if (IS_EXT3_SB(sb)) {
+ if (ext3_feature_set_ok(sb))
+ ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
+ "using the ext4 subsystem");
+ else {
+ ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
+ "to feature incompatibilities");
+ goto failed_mount;
+ }
+ }
+
/*
* Check feature flags regardless of the revision level, since we
* previously didn't change the revision level when setting the flags,
@@ -3459,6 +3490,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_RECOVER));
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+ !(sb->s_flags & MS_RDONLY))
+ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+ goto failed_mount3;
+
/*
* The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal!
@@ -3474,7 +3510,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount_wq;
} else {
clear_opt(sb, DATA_FLAGS);
- set_opt(sb, WRITEBACK_DATA);
sbi->s_journal = NULL;
needs_recovery = 0;
goto no_journal;
@@ -3707,6 +3742,8 @@ failed_mount3:
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -4242,7 +4279,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
int enable_quota = 0;
ext4_group_t g;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- int err;
+ int err = 0;
#ifdef CONFIG_QUOTA
int i;
#endif
@@ -4368,6 +4405,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
if (!ext4_setup_super(sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_MMP))
+ if (ext4_multi_mount_protect(sb,
+ le64_to_cpu(es->s_mmp_block))) {
+ err = -EROFS;
+ goto restore_opts;
+ }
enable_quota = 1;
}
}
@@ -4432,6 +4476,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
u64 fsid;
+ s64 bfree;
if (test_opt(sb, MINIX_DF)) {
sbi->s_overhead_last = 0;
@@ -4475,8 +4520,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
- buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+ bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+ /* prevent underflow in case that few free space is available */
+ buf->f_bfree = max_t(s64, bfree, 0);
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
buf->f_bavail = 0;
@@ -4652,6 +4699,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
if (test_opt(sb, DELALLOC))
sync_filesystem(sb);
+ if (!inode)
+ goto out;
+
/* Update modification times of quota files when userspace can
* start looking at them */
handle = ext4_journal_start(inode, 1);
@@ -4772,14 +4822,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
}
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext2_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext2",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-
static inline void register_as_ext2(void)
{
int err = register_filesystem(&ext2_fs_type);
@@ -4792,10 +4834,22 @@ static inline void unregister_as_ext2(void)
{
unregister_filesystem(&ext2_fs_type);
}
+
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+ return 0;
+ if (sb->s_flags & MS_RDONLY)
+ return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+ return 0;
+ return 1;
+}
MODULE_ALIAS("ext2");
#else
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
#endif
#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4811,10 +4865,24 @@ static inline void unregister_as_ext3(void)
{
unregister_filesystem(&ext3_fs_type);
}
+
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+ return 0;
+ if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ return 0;
+ if (sb->s_flags & MS_RDONLY)
+ return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+ return 0;
+ return 1;
+}
MODULE_ALIAS("ext3");
#else
static inline void register_as_ext3(void) { }
static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
#endif
static struct file_system_type ext4_fs_type = {
@@ -4898,8 +4966,8 @@ static int __init ext4_init_fs(void)
err = init_inodecache();
if (err)
goto out1;
- register_as_ext2();
register_as_ext3();
+ register_as_ext2();
err = register_filesystem(&ext4_fs_type);
if (err)
goto out;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b545ca1c459c..c757adc97250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,8 @@ inserted:
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
- block = ext4_new_meta_blocks(handle, inode,
- goal, NULL, &error);
+ block = ext4_new_meta_blocks(handle, inode, goal, 0,
+ NULL, &error);
if (error)
goto cleanup;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 3b222dafd15b..be15437c272e 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,6 +326,8 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
struct fat_slot_info sinfo;
int err;
+ dentry_unhash(dentry);
+
lock_super(sb);
/*
* Check whether the directory is not in use, then check
@@ -457,6 +459,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
err = fat_scan(old_dir, old_name, &old_sinfo);
if (err) {
err = -EIO;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 20b4ea53fdc4..c61a6789f36c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,6 +824,8 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
struct fat_slot_info sinfo;
int err;
+ dentry_unhash(dentry);
+
lock_super(sb);
err = fat_dir_empty(inode);
@@ -931,6 +933,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
int err, is_dir, update_dotdot, corrupt = 0;
struct super_block *sb = old_dir->i_sb;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 48a18f184d50..30afdfa7aec7 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -33,8 +33,6 @@ void fscache_enqueue_operation(struct fscache_operation *op)
_enter("{OBJ%x OP%x,%u}",
op->object->debug_id, op->debug_id, atomic_read(&op->usage));
- fscache_set_op_state(op, "EnQ");
-
ASSERT(list_empty(&op->pend_link));
ASSERT(op->processor != NULL);
ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
@@ -66,8 +64,6 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
static void fscache_run_op(struct fscache_object *object,
struct fscache_operation *op)
{
- fscache_set_op_state(op, "Run");
-
object->n_in_progress++;
if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -88,8 +84,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
- fscache_set_op_state(op, "SubmitX");
-
spin_lock(&object->lock);
ASSERTCMP(object->n_ops, >=, object->n_in_progress);
ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -194,8 +188,6 @@ int fscache_submit_op(struct fscache_object *object,
ASSERTCMP(atomic_read(&op->usage), >, 0);
- fscache_set_op_state(op, "Submit");
-
spin_lock(&object->lock);
ASSERTCMP(object->n_ops, >=, object->n_in_progress);
ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -335,8 +327,6 @@ void fscache_put_operation(struct fscache_operation *op)
if (!atomic_dec_and_test(&op->usage))
return;
- fscache_set_op_state(op, "Put");
-
_debug("PUT OP");
if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
BUG();
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 41c441c2058d..a2a5d19ece6a 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -155,11 +155,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_stat(&fscache_n_attr_changed_calls);
if (fscache_object_is_active(object)) {
- fscache_set_op_state(op, "CallFS");
fscache_stat(&fscache_n_cop_attr_changed);
ret = object->cache->ops->attr_changed(object);
fscache_stat_d(&fscache_n_cop_attr_changed);
- fscache_set_op_state(op, "Done");
if (ret < 0)
fscache_abort_object(object);
}
@@ -190,7 +188,6 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
fscache_operation_init(op, fscache_attr_changed_op, NULL);
op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
- fscache_set_op_name(op, "Attr");
spin_lock(&cookie->lock);
@@ -257,7 +254,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
op->context = context;
op->start_time = jiffies;
INIT_LIST_HEAD(&op->to_do);
- fscache_set_op_name(&op->op, "Retr");
return op;
}
@@ -368,7 +364,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
_leave(" = -ENOMEM");
return -ENOMEM;
}
- fscache_set_op_name(&op->op, "RetrRA1");
spin_lock(&cookie->lock);
@@ -487,7 +482,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
op = fscache_alloc_retrieval(mapping, end_io_func, context);
if (!op)
return -ENOMEM;
- fscache_set_op_name(&op->op, "RetrRAN");
spin_lock(&cookie->lock);
@@ -589,7 +583,6 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
if (!op)
return -ENOMEM;
- fscache_set_op_name(&op->op, "RetrAL1");
spin_lock(&cookie->lock);
@@ -662,8 +655,6 @@ static void fscache_write_op(struct fscache_operation *_op)
_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
- fscache_set_op_state(&op->op, "GetPage");
-
spin_lock(&object->lock);
cookie = object->cookie;
@@ -698,15 +689,12 @@ static void fscache_write_op(struct fscache_operation *_op)
spin_unlock(&cookie->stores_lock);
spin_unlock(&object->lock);
- fscache_set_op_state(&op->op, "Store");
fscache_stat(&fscache_n_store_pages);
fscache_stat(&fscache_n_cop_write_page);
ret = object->cache->ops->write_page(op, page);
fscache_stat_d(&fscache_n_cop_write_page);
- fscache_set_op_state(&op->op, "EndWrite");
fscache_end_page_write(object, page);
if (ret < 0) {
- fscache_set_op_state(&op->op, "Abort");
fscache_abort_object(object);
} else {
fscache_enqueue_operation(&op->op);
@@ -778,7 +766,6 @@ int __fscache_write_page(struct fscache_cookie *cookie,
fscache_operation_init(&op->op, fscache_write_op,
fscache_release_write_op);
op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
- fscache_set_op_name(&op->op, "Write1");
ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
if (ret < 0)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b32eb29a4e6f..0d0e3faddcfa 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,6 +667,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
if (IS_ERR(req))
return PTR_ERR(req);
+ dentry_unhash(entry);
+
req->in.h.opcode = FUSE_RMDIR;
req->in.h.nodeid = get_node_id(dir);
req->in.numargs = 1;
@@ -691,6 +693,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
struct fuse_rename_in inarg;
struct fuse_conn *fc = get_fuse_conn(olddir);
struct fuse_req *req = fuse_get_req(fc);
+
+ if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
+ dentry_unhash(newent);
+
if (IS_ERR(req))
return PTR_ERR(req);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a2a6abbccc07..2792a790e50b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1346,11 +1346,14 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
}
-static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int gfs2_shrink_glock_memory(struct shrinker *shrink,
+ struct shrink_control *sc)
{
struct gfs2_glock *gl;
int may_demote;
int nr_skipped = 0;
+ int nr = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
LIST_HEAD(skipped);
if (nr == 0)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e23d9864c418..42e8d23bc047 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -38,6 +38,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
@@ -77,19 +78,20 @@ static LIST_HEAD(qd_lru_list);
static atomic_t qd_lru_count = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(qd_lru_lock);
-int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
{
struct gfs2_quota_data *qd;
struct gfs2_sbd *sdp;
+ int nr_to_scan = sc->nr_to_scan;
- if (nr == 0)
+ if (nr_to_scan == 0)
goto out;
- if (!(gfp_mask & __GFP_FS))
+ if (!(sc->gfp_mask & __GFP_FS))
return -1;
spin_lock(&qd_lru_lock);
- while (nr && !list_empty(&qd_lru_list)) {
+ while (nr_to_scan && !list_empty(&qd_lru_list)) {
qd = list_entry(qd_lru_list.next,
struct gfs2_quota_data, qd_reclaim);
sdp = qd->qd_gl->gl_sbd;
@@ -110,7 +112,7 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
spin_unlock(&qd_lru_lock);
kmem_cache_free(gfs2_quotad_cachep, qd);
spin_lock(&qd_lru_lock);
- nr--;
+ nr_to_scan--;
}
spin_unlock(&qd_lru_lock);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e7d236ca48bd..90bf1c302a98 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -12,6 +12,7 @@
struct gfs2_inode;
struct gfs2_sbd;
+struct shrink_control;
#define NO_QUOTA_CHANGE ((u32)-1)
@@ -51,7 +52,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
return ret;
}
-extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink,
+ struct shrink_control *sc);
extern const struct quotactl_ops gfs2_quotactl_ops;
#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b13be92..1cb70cdba2c1 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,6 +253,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int res;
+ if (S_ISDIR(inode->i_mode))
+ dentry_unhash(dentry);
+
if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
return -ENOTEMPTY;
res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -283,6 +286,9 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* Unlink destination if it already exists */
if (new_dentry->d_inode) {
+ if (S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
res = hfs_remove(new_dir, new_dentry);
if (res)
return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4df5059c25da..b28835091dd0 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,6 +370,8 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int res;
+ dentry_unhash(dentry);
+
if (inode->i_size != 2)
return -ENOTEMPTY;
@@ -467,10 +469,12 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
/* Unlink destination if it already exists */
if (new_dentry->d_inode) {
- if (S_ISDIR(new_dentry->d_inode->i_mode))
+ if (S_ISDIR(new_dentry->d_inode->i_mode)) {
+ dentry_unhash(new_dentry);
res = hfsplus_rmdir(new_dir, new_dentry);
- else
+ } else {
res = hfsplus_unlink(new_dir, new_dentry);
+ }
if (res)
return res;
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2638c834ed28..e6816b9e6903 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,6 +683,8 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
char *file;
int err;
+ dentry_unhash(dentry);
+
if ((file = dentry_name(dentry)) == NULL)
return -ENOMEM;
err = do_rmdir(file);
@@ -736,6 +738,9 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
char *from_name, *to_name;
int err;
+ if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
+ dentry_unhash(to);
+
if ((from_name = dentry_name(from)) == NULL)
return -ENOMEM;
if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 1f05839c27a7..ff0ce21c0867 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -395,7 +395,6 @@ again:
dentry_unhash(dentry);
if (!d_unhashed(dentry)) {
- dput(dentry);
hpfs_unlock(dir->i_sb);
return -ENOSPC;
}
@@ -403,7 +402,6 @@ again:
!S_ISREG(inode->i_mode) ||
get_write_access(inode)) {
d_rehash(dentry);
- dput(dentry);
} else {
struct iattr newattrs;
/*printk("HPFS: truncating file before delete.\n");*/
@@ -411,7 +409,6 @@ again:
newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
err = notify_change(dentry, &newattrs);
put_write_access(inode);
- dput(dentry);
if (!err)
goto again;
}
@@ -442,6 +439,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
int err;
int r;
+ dentry_unhash(dentry);
+
hpfs_adjust_length(name, &len);
hpfs_lock(dir->i_sb);
err = -ENOENT;
@@ -535,6 +534,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh;
struct fnode *fnode;
int err;
+
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
if ((err = hpfs_chk_name(new_name, &new_len))) return err;
err = 0;
hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9eeb1cd03ff..7aafeb8fa300 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
pgoff = offset >> PAGE_SHIFT;
i_size_write(inode, offset);
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
if (!prio_tree_empty(&mapping->i_mmap))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
truncate_hugepages(inode, offset);
return 0;
}
@@ -921,7 +921,8 @@ static int can_do_hugetlb_shm(void)
return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
}
-struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
+struct file *hugetlb_file_setup(const char *name, size_t size,
+ vm_flags_t acctflag,
struct user_struct **user, int creat_flags)
{
int error = -ENOMEM;
diff --git a/fs/inode.c b/fs/inode.c
index 05f4fa521325..990d284877a1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -326,12 +326,11 @@ void address_space_init_once(struct address_space *mapping)
memset(mapping, 0, sizeof(*mapping));
INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
spin_lock_init(&mapping->tree_lock);
- spin_lock_init(&mapping->i_mmap_lock);
+ mutex_init(&mapping->i_mmap_mutex);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
- mutex_init(&mapping->unmap_mutex);
}
EXPORT_SYMBOL(address_space_init_once);
@@ -752,8 +751,12 @@ static void prune_icache(int nr_to_scan)
* This function is passed the number of inodes to scan, and it returns the
* total number of remaining possibly-reclaimable inodes.
*/
-static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink,
+ struct shrink_control *sc)
{
+ int nr = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
+
if (nr) {
/*
* Nasty deadlock avoidance. We may hold various FS locks,
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 29148a81c783..7f21cf3aaf92 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
ret = err;
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
- commit_transaction->t_flushed_data_blocks = 1;
clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
smp_mb__after_clear_bit();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -672,12 +671,16 @@ start_journal_io:
err = 0;
}
+ write_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_COMMIT);
+ commit_transaction->t_state = T_COMMIT_DFLUSH;
+ write_unlock(&journal->j_state_lock);
/*
* If the journal is not located on the file system device,
* then we must flush the file system device before we issue
* the commit record
*/
- if (commit_transaction->t_flushed_data_blocks &&
+ if (commit_transaction->t_need_data_flush &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
@@ -754,8 +757,13 @@ wait_for_iobuf:
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
- /* Wake up any transactions which were waiting for this
- IO to complete */
+ /*
+ * Wake up any transactions which were waiting for this IO to
+ * complete. The barrier must be here so that changes by
+ * jbd2_journal_file_buffer() take effect before wake_up_bit()
+ * does the waitqueue check.
+ */
+ smp_mb();
wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
@@ -794,6 +802,10 @@ wait_for_iobuf:
jbd2_journal_abort(journal, err);
jbd_debug(3, "JBD: commit phase 5\n");
+ write_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
+ commit_transaction->t_state = T_COMMIT_JFLUSH;
+ write_unlock(&journal->j_state_lock);
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -949,7 +961,7 @@ restart_loop:
jbd_debug(3, "JBD: commit phase 7\n");
- J_ASSERT(commit_transaction->t_state == T_COMMIT);
+ J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
commit_transaction->t_start = jiffies;
stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e0ec3db1c395..9a7826990304 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
/*
- * Are we already doing a recent enough commit?
+ * The only transaction we can possibly wait upon is the
+ * currently running transaction (if it exists). Otherwise,
+ * the target tid must be an old one.
*/
- if (!tid_geq(journal->j_commit_request, target)) {
+ if (journal->j_running_transaction &&
+ journal->j_running_transaction->t_tid == target) {
/*
* We want a new commit: OK, mark the request and wakeup the
* commit thread. We do _not_ do the commit ourselves.
@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
journal->j_commit_sequence);
wake_up(&journal->j_wait_commit);
return 1;
- }
+ } else if (!tid_geq(journal->j_commit_request, target))
+ /* This should never happen, but if it does, preserve
+ the evidence before kjournald goes into a loop and
+ increments j_commit_sequence beyond all recognition. */
+ WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+ journal->j_commit_request,
+ journal->j_commit_sequence,
+ target, journal->j_running_transaction ?
+ journal->j_running_transaction->t_tid : 0);
return 0;
}
@@ -577,6 +588,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
}
/*
+ * Return 1 if a given transaction has not yet sent barrier request
+ * connected with a transaction commit. If 0 is returned, transaction
+ * may or may not have sent the barrier. Used to avoid sending barrier
+ * twice in common cases.
+ */
+int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
+{
+ int ret = 0;
+ transaction_t *commit_trans;
+
+ if (!(journal->j_flags & JBD2_BARRIER))
+ return 0;
+ read_lock(&journal->j_state_lock);
+ /* Transaction already committed? */
+ if (tid_geq(journal->j_commit_sequence, tid))
+ goto out;
+ commit_trans = journal->j_committing_transaction;
+ if (!commit_trans || commit_trans->t_tid != tid) {
+ ret = 1;
+ goto out;
+ }
+ /*
+ * Transaction is being committed and we already proceeded to
+ * submitting a flush to fs partition?
+ */
+ if (journal->j_fs_dev != journal->j_dev) {
+ if (!commit_trans->t_need_data_flush ||
+ commit_trans->t_state >= T_COMMIT_DFLUSH)
+ goto out;
+ } else {
+ if (commit_trans->t_state >= T_COMMIT_JFLUSH)
+ goto out;
+ }
+ ret = 1;
+out:
+ read_unlock(&journal->j_state_lock);
+ return ret;
+}
+EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
+
+/*
* Wait for a specified commit to complete.
* The caller may not hold the journal lock.
*/
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 05fa77a23711..3eec82d32fd4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
*/
/*
- * Update transiaction's maximum wait time, if debugging is enabled.
+ * Update transaction's maximum wait time, if debugging is enabled.
*
* In order for t_max_wait to be reliable, it must be protected by a
* lock. But doing so will mean that start_this_handle() can not be
@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
* means that maximum wait time reported by the jbd2_run_stats
* tracepoint will always be zero.
*/
-static inline void update_t_max_wait(transaction_t *transaction)
+static inline void update_t_max_wait(transaction_t *transaction,
+ unsigned long ts)
{
#ifdef CONFIG_JBD2_DEBUG
- unsigned long ts = jiffies;
-
if (jbd2_journal_enable_debug &&
time_after(transaction->t_start, ts)) {
ts = jbd2_time_diff(ts, transaction->t_start);
@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
tid_t tid;
int needed, need_to_start;
int nblocks = handle->h_buffer_credits;
+ unsigned long ts = jiffies;
if (nblocks > journal->j_max_transaction_buffers) {
printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -271,7 +271,7 @@ repeat:
/* OK, account for the buffers that this operation expects to
* use and add the handle to the running transaction.
*/
- update_t_max_wait(transaction);
+ update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
* This function is visible to journal users (like ext3fs), so is not
* called with the journal already locked.
*
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
*/
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
{
@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
*/
JBUFFER_TRACE(jh, "cancelling revoke");
jbd2_journal_cancel_revoke(handle, jh);
- jbd2_journal_put_journal_head(jh);
out:
+ jbd2_journal_put_journal_head(jh);
return err;
}
@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
jinode->i_next_transaction == transaction)
goto done;
+ /*
+ * We only ever set this variable to 1 so the test is safe. Since
+ * t_need_data_flush is likely to be set, we do the test to save some
+ * cacheline bouncing
+ */
+ if (!transaction->t_need_data_flush)
+ transaction->t_need_data_flush = 1;
/* On some different transaction's list - should be
* the committing one */
if (jinode->i_transaction) {
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 82faddd1f321..05f73328b28b 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -609,6 +609,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
int ret;
uint32_t now = get_seconds();
+ dentry_unhash(dentry);
+
for (fd = f->dents ; fd; fd = fd->next) {
if (fd->ino)
return -ENOTEMPTY;
@@ -784,6 +786,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
uint8_t type;
uint32_t now;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
/* The VFS will check for us and prevent trying to rename a
* file over a directory and vice versa, but if it's a directory,
* the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index eaaf2b511e89..865df16a6cf3 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,6 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
+ dentry_unhash(dentry);
+
/* Init inode for quota operations. */
dquot_initialize(dip);
dquot_initialize(ip);
@@ -1095,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
new_dentry->d_name.name);
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
dquot_initialize(old_dir);
dquot_initialize(new_dir);
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9ed89d1663f8..f34c9cde9e94 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,6 +273,8 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ dentry_unhash(dentry);
+
if (!logfs_empty_dir(inode))
return -ENOTEMPTY;
@@ -622,6 +624,9 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
loff_t pos;
int err;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
/* 1. locate source dd */
err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
if (err)
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2f174be06555..8c32ef3ba88e 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,7 +90,8 @@ static DEFINE_SPINLOCK(mb_cache_spinlock);
* What the mbcache registers as to get shrunk dynamically.
*/
-static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink,
+ struct shrink_control *sc);
static struct shrinker mb_cache_shrinker = {
.shrink = mb_cache_shrink_fn,
@@ -156,18 +157,19 @@ forget:
* gets low.
*
* @shrink: (ignored)
- * @nr_to_scan: Number of objects to scan
- * @gfp_mask: (ignored)
+ * @sc: shrink_control passed from reclaim
*
* Returns the number of objects which are present in the cache.
*/
static int
-mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
{
LIST_HEAD(free_list);
struct mb_cache *cache;
struct mb_cache_entry *entry, *tmp;
int count = 0;
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
mb_debug("trying to free %d entries", nr_to_scan);
spin_lock(&mb_cache_spinlock);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 6e6777f1b4b2..f60aed8db9c4 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,6 +168,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
struct inode * inode = dentry->d_inode;
int err = -ENOTEMPTY;
+ dentry_unhash(dentry);
+
if (minix_empty_dir(inode)) {
err = minix_unlink(dir, dentry);
if (!err) {
@@ -190,6 +192,9 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
struct minix_dir_entry * old_de;
int err = -ENOENT;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
old_de = minix_find_entry(old_dentry, &old_page);
if (!old_de)
goto out;
diff --git a/fs/mpage.c b/fs/mpage.c
index 0afc809e46e0..fdfae9fa98cd 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -27,6 +27,7 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
+#include <linux/cleancache.h>
/*
* I/O completion handler for multipage BIOs.
@@ -271,6 +272,12 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
SetPageMappedToDisk(page);
}
+ if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
+ cleancache_get_page(page) == 0) {
+ SetPageUptodate(page);
+ goto confused;
+ }
+
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
diff --git a/fs/namei.c b/fs/namei.c
index 6ff858c049c0..2358b326b221 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -391,79 +391,28 @@ void path_put(struct path *path)
}
EXPORT_SYMBOL(path_put);
-/**
- * nameidata_drop_rcu - drop this nameidata out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
+/*
* Path walking has 2 modes, rcu-walk and ref-walk (see
- * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
- * to drop out of rcu-walk mode and take normal reference counts on dentries
- * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
- * refcounts at the last known good point before rcu-walk got stuck, so
- * ref-walk may continue from there. If this is not successful (eg. a seqcount
- * has changed), then failure is returned and path walk restarts from the
- * beginning in ref-walk mode.
- *
- * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
- * ref-walk. Must be called from rcu-walk context.
+ * Documentation/filesystems/path-lookup.txt). In situations when we can't
+ * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
+ * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * mode. Refcounts are grabbed at the last known good point before rcu-walk
+ * got stuck, so ref-walk may continue from there. If this is not successful
+ * (eg. a seqcount has changed), then failure is returned and it's up to caller
+ * to restart the path walk from the beginning in ref-walk mode.
*/
-static int nameidata_drop_rcu(struct nameidata *nd)
-{
- struct fs_struct *fs = current->fs;
- struct dentry *dentry = nd->path.dentry;
- int want_root = 0;
-
- BUG_ON(!(nd->flags & LOOKUP_RCU));
- if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
- want_root = 1;
- spin_lock(&fs->lock);
- if (nd->root.mnt != fs->root.mnt ||
- nd->root.dentry != fs->root.dentry)
- goto err_root;
- }
- spin_lock(&dentry->d_lock);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err;
- BUG_ON(nd->inode != dentry->d_inode);
- spin_unlock(&dentry->d_lock);
- if (want_root) {
- path_get(&nd->root);
- spin_unlock(&fs->lock);
- }
- mntget(nd->path.mnt);
-
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
- nd->flags &= ~LOOKUP_RCU;
- return 0;
-err:
- spin_unlock(&dentry->d_lock);
-err_root:
- if (want_root)
- spin_unlock(&fs->lock);
- return -ECHILD;
-}
-
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
-static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
-{
- if (nd->flags & LOOKUP_RCU)
- return nameidata_drop_rcu(nd);
- return 0;
-}
/**
- * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * @dentry: dentry to drop
+ * unlazy_walk - try to switch to ref-walk mode.
+ * @nd: nameidata pathwalk data
+ * @dentry: child of nd->path.dentry or NULL
* Returns: 0 on success, -ECHILD on failure
*
- * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
- * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
- * @nd. Must be called from rcu-walk context.
+ * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
+ * for ref-walk mode. @dentry must be a path found by a do_lookup call on
+ * @nd or NULL. Must be called from rcu-walk context.
*/
-static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
{
struct fs_struct *fs = current->fs;
struct dentry *parent = nd->path.dentry;
@@ -478,18 +427,25 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
goto err_root;
}
spin_lock(&parent->d_lock);
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err;
- /*
- * If the sequence check on the child dentry passed, then the child has
- * not been removed from its parent. This means the parent dentry must
- * be valid and able to take a reference at this point.
- */
- BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
- BUG_ON(!parent->d_count);
- parent->d_count++;
- spin_unlock(&dentry->d_lock);
+ if (!dentry) {
+ if (!__d_rcu_to_refcount(parent, nd->seq))
+ goto err_parent;
+ BUG_ON(nd->inode != parent->d_inode);
+ } else {
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ if (!__d_rcu_to_refcount(dentry, nd->seq))
+ goto err_child;
+ /*
+ * If the sequence check on the child dentry passed, then
+ * the child has not been removed from its parent. This
+ * means the parent dentry must be valid and able to take
+ * a reference at this point.
+ */
+ BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+ BUG_ON(!parent->d_count);
+ parent->d_count++;
+ spin_unlock(&dentry->d_lock);
+ }
spin_unlock(&parent->d_lock);
if (want_root) {
path_get(&nd->root);
@@ -501,8 +457,10 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
br_read_unlock(vfsmount_lock);
nd->flags &= ~LOOKUP_RCU;
return 0;
-err:
+
+err_child:
spin_unlock(&dentry->d_lock);
+err_parent:
spin_unlock(&parent->d_lock);
err_root:
if (want_root)
@@ -510,59 +468,6 @@ err_root:
return -ECHILD;
}
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
-static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
-{
- if (nd->flags & LOOKUP_RCU) {
- if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
- nd->flags &= ~LOOKUP_RCU;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
- return -ECHILD;
- }
- }
- return 0;
-}
-
-/**
- * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
- * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
- * nd->path should be the final element of the lookup, so nd->root is discarded.
- * Must be called from rcu-walk context.
- */
-static int nameidata_drop_rcu_last(struct nameidata *nd)
-{
- struct dentry *dentry = nd->path.dentry;
-
- BUG_ON(!(nd->flags & LOOKUP_RCU));
- nd->flags &= ~LOOKUP_RCU;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- spin_lock(&dentry->d_lock);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err_unlock;
- BUG_ON(nd->inode != dentry->d_inode);
- spin_unlock(&dentry->d_lock);
-
- mntget(nd->path.mnt);
-
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
-
- return 0;
-
-err_unlock:
- spin_unlock(&dentry->d_lock);
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
- return -ECHILD;
-}
-
/**
* release_open_intent - free up open intent resources
* @nd: pointer to nameidata
@@ -606,26 +511,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
return dentry;
}
-/*
- * handle_reval_path - force revalidation of a dentry
- *
- * In some situations the path walking code will trust dentries without
- * revalidating them. This causes problems for filesystems that depend on
- * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
- * (which indicates that it's possible for the dentry to go stale), force
- * a d_revalidate call before proceeding.
+/**
+ * complete_walk - successful completion of path walk
+ * @nd: pointer nameidata
*
- * Returns 0 if the revalidation was successful. If the revalidation fails,
- * either return the error returned by d_revalidate or -ESTALE if the
- * revalidation it just returned 0. If d_revalidate returns 0, we attempt to
- * invalidate the dentry. It's up to the caller to handle putting references
- * to the path if necessary.
+ * If we had been in RCU mode, drop out of it and legitimize nd->path.
+ * Revalidate the final result, unless we'd already done that during
+ * the path walk or the filesystem doesn't ask for it. Return 0 on
+ * success, -error on failure. In case of failure caller does not
+ * need to drop nd->path.
*/
-static inline int handle_reval_path(struct nameidata *nd)
+static int complete_walk(struct nameidata *nd)
{
struct dentry *dentry = nd->path.dentry;
int status;
+ if (nd->flags & LOOKUP_RCU) {
+ nd->flags &= ~LOOKUP_RCU;
+ if (!(nd->flags & LOOKUP_ROOT))
+ nd->root.mnt = NULL;
+ spin_lock(&dentry->d_lock);
+ if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ br_read_unlock(vfsmount_lock);
+ return -ECHILD;
+ }
+ BUG_ON(nd->inode != dentry->d_inode);
+ spin_unlock(&dentry->d_lock);
+ mntget(nd->path.mnt);
+ rcu_read_unlock();
+ br_read_unlock(vfsmount_lock);
+ }
+
if (likely(!(nd->flags & LOOKUP_JUMPED)))
return 0;
@@ -643,6 +561,7 @@ static inline int handle_reval_path(struct nameidata *nd)
if (!status)
status = -ESTALE;
+ path_put(&nd->path);
return status;
}
@@ -1241,13 +1160,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
if (likely(__follow_mount_rcu(nd, path, inode, false)))
return 0;
unlazy:
- if (dentry) {
- if (nameidata_dentry_drop_rcu(nd, dentry))
- return -ECHILD;
- } else {
- if (nameidata_drop_rcu(nd))
- return -ECHILD;
- }
+ if (unlazy_walk(nd, dentry))
+ return -ECHILD;
} else {
dentry = __d_lookup(parent, name);
}
@@ -1303,7 +1217,7 @@ static inline int may_lookup(struct nameidata *nd)
int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
if (err != -ECHILD)
return err;
- if (nameidata_drop_rcu(nd))
+ if (unlazy_walk(nd, NULL))
return -ECHILD;
}
return exec_permission(nd->inode, 0);
@@ -1357,8 +1271,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
return -ENOENT;
}
if (unlikely(inode->i_op->follow_link) && follow) {
- if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
- return -ECHILD;
+ if (nd->flags & LOOKUP_RCU) {
+ if (unlikely(unlazy_walk(nd, path->dentry))) {
+ terminate_walk(nd);
+ return -ECHILD;
+ }
+ }
BUG_ON(inode != path->dentry->d_inode);
return 1;
}
@@ -1657,18 +1575,8 @@ static int path_lookupat(int dfd, const char *name,
}
}
- if (nd->flags & LOOKUP_RCU) {
- /* went all way through without dropping RCU */
- BUG_ON(err);
- if (nameidata_drop_rcu_last(nd))
- err = -ECHILD;
- }
-
- if (!err) {
- err = handle_reval_path(nd);
- if (err)
- path_put(&nd->path);
- }
+ if (!err)
+ err = complete_walk(nd);
if (!err && nd->flags & LOOKUP_DIRECTORY) {
if (!nd->inode->i_op->lookup) {
@@ -2134,13 +2042,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
return ERR_PTR(error);
/* fallthrough */
case LAST_ROOT:
- if (nd->flags & LOOKUP_RCU) {
- if (nameidata_drop_rcu_last(nd))
- return ERR_PTR(-ECHILD);
- }
- error = handle_reval_path(nd);
+ error = complete_walk(nd);
if (error)
- goto exit;
+ return ERR_PTR(error);
audit_inode(pathname, nd->path.dentry);
if (open_flag & O_CREAT) {
error = -EISDIR;
@@ -2148,10 +2052,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
}
goto ok;
case LAST_BIND:
- /* can't be RCU mode here */
- error = handle_reval_path(nd);
+ error = complete_walk(nd);
if (error)
- goto exit;
+ return ERR_PTR(error);
audit_inode(pathname, dir);
goto ok;
}
@@ -2170,10 +2073,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
if (error) /* symlink */
return NULL;
/* sayonara */
- if (nd->flags & LOOKUP_RCU) {
- if (nameidata_drop_rcu_last(nd))
- return ERR_PTR(-ECHILD);
- }
+ error = complete_walk(nd);
+ if (error)
+ return ERR_PTR(-ECHILD);
error = -ENOTDIR;
if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2185,11 +2087,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
}
/* create side of things */
-
- if (nd->flags & LOOKUP_RCU) {
- if (nameidata_drop_rcu_last(nd))
- return ERR_PTR(-ECHILD);
- }
+ error = complete_walk(nd);
+ if (error)
+ return ERR_PTR(error);
audit_inode(pathname, dir);
error = -EISDIR;
@@ -2629,10 +2529,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
}
/*
- * We try to drop the dentry early: we should have
- * a usage count of 2 if we're the only user of this
- * dentry, and if that is true (possibly after pruning
- * the dcache), then we drop the dentry now.
+ * The dentry_unhash() helper will try to drop the dentry early: we
+ * should have a usage count of 2 if we're the only user of this
+ * dentry, and if that is true (possibly after pruning the dcache),
+ * then we drop the dentry now.
*
* A low-level filesystem can, if it choses, legally
* do a
@@ -2645,10 +2545,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
*/
void dentry_unhash(struct dentry *dentry)
{
- dget(dentry);
shrink_dcache_parent(dentry);
spin_lock(&dentry->d_lock);
- if (dentry->d_count == 2)
+ if (dentry->d_count == 1)
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
}
@@ -2664,25 +2563,26 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
return -EPERM;
mutex_lock(&dentry->d_inode->i_mutex);
- dentry_unhash(dentry);
+
+ error = -EBUSY;
if (d_mountpoint(dentry))
- error = -EBUSY;
- else {
- error = security_inode_rmdir(dir, dentry);
- if (!error) {
- error = dir->i_op->rmdir(dir, dentry);
- if (!error) {
- dentry->d_inode->i_flags |= S_DEAD;
- dont_mount(dentry);
- }
- }
- }
+ goto out;
+
+ error = security_inode_rmdir(dir, dentry);
+ if (error)
+ goto out;
+
+ error = dir->i_op->rmdir(dir, dentry);
+ if (error)
+ goto out;
+
+ dentry->d_inode->i_flags |= S_DEAD;
+ dont_mount(dentry);
+
+out:
mutex_unlock(&dentry->d_inode->i_mutex);
- if (!error) {
+ if (!error)
d_delete(dentry);
- }
- dput(dentry);
-
return error;
}
@@ -3053,12 +2953,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* HOWEVER, it relies on the assumption that any object with ->lookup()
* has no more than 1 dentry. If "hybrid" objects will ever appear,
* we'd better make sure that there's no link(2) for them.
- * d) some filesystems don't support opened-but-unlinked directories,
- * either because of layout or because they are not ready to deal with
- * all cases correctly. The latter will be fixed (taking this sort of
- * stuff into VFS), but the former is not going away. Solution: the same
- * trick as in rmdir().
- * e) conversion from fhandle to dentry may come in the wrong moment - when
+ * d) conversion from fhandle to dentry may come in the wrong moment - when
* we are removing the target. Solution: we will have to grab ->i_mutex
* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
* ->i_mutex on parents, which works but leads to some truly excessive
@@ -3068,7 +2963,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
int error = 0;
- struct inode *target;
+ struct inode *target = new_dentry->d_inode;
/*
* If we are going to change the parent - check write permissions,
@@ -3084,26 +2979,24 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
if (error)
return error;
- target = new_dentry->d_inode;
if (target)
mutex_lock(&target->i_mutex);
- if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
- error = -EBUSY;
- else {
- if (target)
- dentry_unhash(new_dentry);
- error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
- }
+
+ error = -EBUSY;
+ if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
+ goto out;
+
+ error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (error)
+ goto out;
+
if (target) {
- if (!error) {
- target->i_flags |= S_DEAD;
- dont_mount(new_dentry);
- }
- mutex_unlock(&target->i_mutex);
- if (d_unhashed(new_dentry))
- d_rehash(new_dentry);
- dput(new_dentry);
+ target->i_flags |= S_DEAD;
+ dont_mount(new_dentry);
}
+out:
+ if (target)
+ mutex_unlock(&target->i_mutex);
if (!error)
if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
d_move(old_dentry,new_dentry);
@@ -3113,7 +3006,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct inode *target;
+ struct inode *target = new_dentry->d_inode;
int error;
error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3121,19 +3014,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
return error;
dget(new_dentry);
- target = new_dentry->d_inode;
if (target)
mutex_lock(&target->i_mutex);
+
+ error = -EBUSY;
if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
- error = -EBUSY;
- else
- error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
- if (!error) {
- if (target)
- dont_mount(new_dentry);
- if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
- d_move(old_dentry, new_dentry);
- }
+ goto out;
+
+ error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (error)
+ goto out;
+
+ if (target)
+ dont_mount(new_dentry);
+ if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
+ d_move(old_dentry, new_dentry);
+out:
if (target)
mutex_unlock(&target->i_mutex);
dput(new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index d99bcf59e4c2..fe59bd145d21 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1695,7 +1695,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
static int flags_to_propagation_type(int flags)
{
- int type = flags & ~MS_REC;
+ int type = flags & ~(MS_REC | MS_SILENT);
/* Fail if any non-propagation flags are set */
if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f6946bb5cb55..e3e646b06404 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,6 +1033,8 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
DPRINTK("ncp_rmdir: removing %s/%s\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
+ dentry_unhash(dentry);
+
error = -EBUSY;
if (!d_unhashed(dentry))
goto out;
@@ -1139,6 +1141,9 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
ncp_age_dentry(server, old_dentry);
ncp_age_dentry(server, new_dentry);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0250e4ce4893..202f370526a7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -461,7 +461,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
#endif
struct ncp_entry_info finfo;
- data.wdog_pid = NULL;
+ memset(&data, 0, sizeof(data));
server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL);
if (!server)
return -ENOMEM;
@@ -496,7 +496,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;
data.flags = md->flags;
- data.int_flags = 0;
data.mounted_uid = md->mounted_uid;
data.wdog_pid = find_get_pid(md->wdog_pid);
data.ncp_fd = md->ncp_fd;
@@ -507,7 +506,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
data.file_mode = md->file_mode;
data.dir_mode = md->dir_mode;
data.info_fd = -1;
- data.mounted_vol[0] = 0;
}
break;
default:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7237672216c8..424e47773a84 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2042,11 +2042,14 @@ static void nfs_access_free_list(struct list_head *head)
}
}
-int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+int nfs_access_cache_shrinker(struct shrinker *shrink,
+ struct shrink_control *sc)
{
LIST_HEAD(head);
struct nfs_inode *nfsi, *next;
struct nfs_access_entry *cache;
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
return (nr_to_scan == 0) ? 0 : -1;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ce118ce885dd..2df6ca7b5898 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -234,7 +234,7 @@ extern int nfs_init_client(struct nfs_client *clp,
/* dir.c */
extern int nfs_access_cache_shrinker(struct shrinker *shrink,
- int nr_to_scan, gfp_t gfp_mask);
+ struct shrink_control *sc);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 546849b3e88f..1102a5fbb744 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,6 +334,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
struct nilfs_transaction_info ti;
int err;
+ dentry_unhash(dentry);
+
err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
if (err)
return err;
@@ -369,6 +371,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct nilfs_transaction_info ti;
int err;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
if (unlikely(err))
return err;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 823bc35334e0..cdbaf5e97308 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/quotaops.h>
+#include <linux/cleancache.h>
#define CREATE_TRACE_POINTS
#include "ocfs2_trace.h"
@@ -2352,6 +2353,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
mlog_errno(status);
goto bail;
}
+ cleancache_init_shared_fs((char *)&uuid_net_key, sb);
bail:
return status;
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index de4ff29f1e05..c368360c35a1 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -240,8 +240,12 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int ret;
- if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode))
- return -ENOTEMPTY;
+
+ if (S_ISDIR(inode->i_mode)) {
+ dentry_unhash(dentry);
+ if (!omfs_dir_is_empty(inode))
+ return -ENOTEMPTY;
+ }
ret = omfs_delete_entry(dentry);
if (ret)
@@ -378,6 +382,9 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int err;
if (new_inode) {
+ if (S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
/* overwriting existing file/dir */
err = omfs_remove(new_dir, new_dentry);
if (err)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index d545e97d99c3..8ed4d3433199 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%u\n", p->discard_alignment);
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%u\n",
+ queue_limit_discard_alignment(&disk->queue->limits,
+ p->start_sect));
}
ssize_t part_stat_show(struct device *dev,
@@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
p->start_sect = start;
p->alignment_offset =
queue_limit_alignment_offset(&disk->queue->limits, start);
- p->discard_alignment =
- queue_limit_discard_alignment(&disk->queue->limits, start);
p->nr_sects = len;
p->partno = partno;
p->policy = get_disk_ro(disk);
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index df434c5f28fb..c1c729335924 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -20,6 +20,7 @@ proc-y += stat.o
proc-y += uptime.o
proc-y += version.o
proc-y += softirqs.o
+proc-y += namespaces.o
proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
proc-$(CONFIG_NET) += proc_net.o
proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa532730e55..dc8bca72b002 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode)
return allowed;
}
-static int proc_setattr(struct dentry *dentry, struct iattr *attr)
+int proc_setattr(struct dentry *dentry, struct iattr *attr)
{
int error;
struct inode *inode = dentry->d_inode;
@@ -1736,8 +1736,7 @@ static int task_dumpable(struct task_struct *task)
return 0;
}
-
-static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
+struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
{
struct inode * inode;
struct proc_inode *ei;
@@ -1779,7 +1778,7 @@ out_unlock:
return NULL;
}
-static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
struct task_struct *task;
@@ -1820,7 +1819,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
* made this apply to all per process world readable and executable
* directories.
*/
-static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
+int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode;
struct task_struct *task;
@@ -1862,7 +1861,7 @@ static int pid_delete_dentry(const struct dentry * dentry)
return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
}
-static const struct dentry_operations pid_dentry_operations =
+const struct dentry_operations pid_dentry_operations =
{
.d_revalidate = pid_revalidate,
.d_delete = pid_delete_dentry,
@@ -1870,9 +1869,6 @@ static const struct dentry_operations pid_dentry_operations =
/* Lookups */
-typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
- struct task_struct *, const void *);
-
/*
* Fill a directory entry.
*
@@ -1885,8 +1881,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
* reported by readdir in sync with the inode numbers reported
* by stat.
*/
-static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
- char *name, int len,
+int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+ const char *name, int len,
instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
struct dentry *child, *dir = filp->f_path.dentry;
@@ -2820,6 +2816,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
@@ -3168,6 +3165,7 @@ out_no_task:
static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
REG("environ", S_IRUSR, proc_environ_operations),
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f1281339b6fa..f1637f17c37c 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -674,6 +674,7 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
}
return ent;
}
+EXPORT_SYMBOL(proc_mkdir_mode);
struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
struct proc_dir_entry *parent)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d15aa1b1cc8f..74b48cfa1bb2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -28,6 +28,7 @@ static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
+ const struct proc_ns_operations *ns_ops;
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
@@ -44,6 +45,10 @@ static void proc_evict_inode(struct inode *inode)
rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
sysctl_head_put(head);
}
+ /* Release any associated namespace */
+ ns_ops = PROC_I(inode)->ns_ops;
+ if (ns_ops && ns_ops->put)
+ ns_ops->put(PROC_I(inode)->ns);
}
static struct kmem_cache * proc_inode_cachep;
@@ -62,6 +67,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
+ ei->ns = NULL;
+ ei->ns_ops = NULL;
inode = &ei->vfs_inode;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
return inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c03e8d3a3a5b..7838e5cfec14 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,14 @@ extern const struct file_operations proc_pagemap_operations;
extern const struct file_operations proc_net_operations;
extern const struct inode_operations proc_net_inode_operations;
+struct proc_maps_private {
+ struct pid *pid;
+ struct task_struct *task;
+#ifdef CONFIG_MMU
+ struct vm_area_struct *tail_vma;
+#endif
+};
+
void proc_init_inodecache(void);
static inline struct pid *proc_pid(struct inode *inode)
@@ -119,3 +127,21 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
*/
int proc_readdir(struct file *, void *, filldir_t);
struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
+
+
+
+/* Lookups */
+typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
+ struct task_struct *, const void *);
+int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+ const char *name, int len,
+ instantiate_t instantiate, struct task_struct *task, const void *ptr);
+int pid_revalidate(struct dentry *dentry, struct nameidata *nd);
+struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task);
+extern const struct dentry_operations pid_dentry_operations;
+int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
+int proc_setattr(struct dentry *dentry, struct iattr *attr);
+
+extern const struct inode_operations proc_ns_dir_inode_operations;
+extern const struct file_operations proc_ns_dir_operations;
+
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
new file mode 100644
index 000000000000..781dec5bd682
--- /dev/null
+++ b/fs/proc/namespaces.c
@@ -0,0 +1,198 @@
+#include <linux/proc_fs.h>
+#include <linux/nsproxy.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/fs_struct.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <net/net_namespace.h>
+#include <linux/mnt_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
+
+
+static const struct proc_ns_operations *ns_entries[] = {
+#ifdef CONFIG_NET_NS
+ &netns_operations,
+#endif
+#ifdef CONFIG_UTS_NS
+ &utsns_operations,
+#endif
+#ifdef CONFIG_IPC_NS
+ &ipcns_operations,
+#endif
+};
+
+static const struct file_operations ns_file_operations = {
+ .llseek = no_llseek,
+};
+
+static struct dentry *proc_ns_instantiate(struct inode *dir,
+ struct dentry *dentry, struct task_struct *task, const void *ptr)
+{
+ const struct proc_ns_operations *ns_ops = ptr;
+ struct inode *inode;
+ struct proc_inode *ei;
+ struct dentry *error = ERR_PTR(-ENOENT);
+
+ inode = proc_pid_make_inode(dir->i_sb, task);
+ if (!inode)
+ goto out;
+
+ ei = PROC_I(inode);
+ inode->i_mode = S_IFREG|S_IRUSR;
+ inode->i_fop = &ns_file_operations;
+ ei->ns_ops = ns_ops;
+ ei->ns = ns_ops->get(task);
+ if (!ei->ns)
+ goto out_iput;
+
+ dentry->d_op = &pid_dentry_operations;
+ d_add(dentry, inode);
+ /* Close the race of the process dying before we return the dentry */
+ if (pid_revalidate(dentry, NULL))
+ error = NULL;
+out:
+ return error;
+out_iput:
+ iput(inode);
+ goto out;
+}
+
+static int proc_ns_fill_cache(struct file *filp, void *dirent,
+ filldir_t filldir, struct task_struct *task,
+ const struct proc_ns_operations *ops)
+{
+ return proc_fill_cache(filp, dirent, filldir,
+ ops->name, strlen(ops->name),
+ proc_ns_instantiate, task, ops);
+}
+
+static int proc_ns_dir_readdir(struct file *filp, void *dirent,
+ filldir_t filldir)
+{
+ int i;
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ struct task_struct *task = get_proc_task(inode);
+ const struct proc_ns_operations **entry, **last;
+ ino_t ino;
+ int ret;
+
+ ret = -ENOENT;
+ if (!task)
+ goto out_no_task;
+
+ ret = -EPERM;
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out;
+
+ ret = 0;
+ i = filp->f_pos;
+ switch (i) {
+ case 0:
+ ino = inode->i_ino;
+ if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+ goto out;
+ i++;
+ filp->f_pos++;
+ /* fall through */
+ case 1:
+ ino = parent_ino(dentry);
+ if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
+ goto out;
+ i++;
+ filp->f_pos++;
+ /* fall through */
+ default:
+ i -= 2;
+ if (i >= ARRAY_SIZE(ns_entries)) {
+ ret = 1;
+ goto out;
+ }
+ entry = ns_entries + i;
+ last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
+ while (entry <= last) {
+ if (proc_ns_fill_cache(filp, dirent, filldir,
+ task, *entry) < 0)
+ goto out;
+ filp->f_pos++;
+ entry++;
+ }
+ }
+
+ ret = 1;
+out:
+ put_task_struct(task);
+out_no_task:
+ return ret;
+}
+
+const struct file_operations proc_ns_dir_operations = {
+ .read = generic_read_dir,
+ .readdir = proc_ns_dir_readdir,
+};
+
+static struct dentry *proc_ns_dir_lookup(struct inode *dir,
+ struct dentry *dentry, struct nameidata *nd)
+{
+ struct dentry *error;
+ struct task_struct *task = get_proc_task(dir);
+ const struct proc_ns_operations **entry, **last;
+ unsigned int len = dentry->d_name.len;
+
+ error = ERR_PTR(-ENOENT);
+
+ if (!task)
+ goto out_no_task;
+
+ error = ERR_PTR(-EPERM);
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out;
+
+ last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
+ for (entry = ns_entries; entry <= last; entry++) {
+ if (strlen((*entry)->name) != len)
+ continue;
+ if (!memcmp(dentry->d_name.name, (*entry)->name, len))
+ break;
+ }
+ error = ERR_PTR(-ENOENT);
+ if (entry > last)
+ goto out;
+
+ error = proc_ns_instantiate(dir, dentry, task, *entry);
+out:
+ put_task_struct(task);
+out_no_task:
+ return error;
+}
+
+const struct inode_operations proc_ns_dir_inode_operations = {
+ .lookup = proc_ns_dir_lookup,
+ .getattr = pid_getattr,
+ .setattr = proc_setattr,
+};
+
+struct file *proc_ns_fget(int fd)
+{
+ struct file *file;
+
+ file = fget(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
+
+ if (file->f_op != &ns_file_operations)
+ goto out_invalid;
+
+ return file;
+
+out_invalid:
+ fput(file);
+ return ERR_PTR(-EINVAL);
+}
+
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 318d8654989b..db15935fa757 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
- int flags = vma->vm_flags;
+ vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
@@ -858,7 +858,192 @@ const struct file_operations proc_pagemap_operations = {
#endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA
-extern int show_numa_map(struct seq_file *m, void *v);
+
+struct numa_maps {
+ struct vm_area_struct *vma;
+ unsigned long pages;
+ unsigned long anon;
+ unsigned long active;
+ unsigned long writeback;
+ unsigned long mapcount_max;
+ unsigned long dirty;
+ unsigned long swapcache;
+ unsigned long node[MAX_NUMNODES];
+};
+
+struct numa_maps_private {
+ struct proc_maps_private proc_maps;
+ struct numa_maps md;
+};
+
+static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty)
+{
+ int count = page_mapcount(page);
+
+ md->pages++;
+ if (pte_dirty || PageDirty(page))
+ md->dirty++;
+
+ if (PageSwapCache(page))
+ md->swapcache++;
+
+ if (PageActive(page) || PageUnevictable(page))
+ md->active++;
+
+ if (PageWriteback(page))
+ md->writeback++;
+
+ if (PageAnon(page))
+ md->anon++;
+
+ if (count > md->mapcount_max)
+ md->mapcount_max = count;
+
+ md->node[page_to_nid(page)]++;
+}
+
+static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+{
+ struct numa_maps *md;
+ spinlock_t *ptl;
+ pte_t *orig_pte;
+ pte_t *pte;
+
+ md = walk->private;
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ do {
+ struct page *page;
+ int nid;
+
+ if (!pte_present(*pte))
+ continue;
+
+ page = vm_normal_page(md->vma, addr, *pte);
+ if (!page)
+ continue;
+
+ if (PageReserved(page))
+ continue;
+
+ nid = page_to_nid(page);
+ if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+ continue;
+
+ gather_stats(page, md, pte_dirty(*pte));
+
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ pte_unmap_unlock(orig_pte, ptl);
+ return 0;
+}
+#ifdef CONFIG_HUGETLB_PAGE
+static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+ struct numa_maps *md;
+ struct page *page;
+
+ if (pte_none(*pte))
+ return 0;
+
+ page = pte_page(*pte);
+ if (!page)
+ return 0;
+
+ md = walk->private;
+ gather_stats(page, md, pte_dirty(*pte));
+ return 0;
+}
+
+#else
+static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Display pages allocated per node and memory policy via /proc.
+ */
+static int show_numa_map(struct seq_file *m, void *v)
+{
+ struct numa_maps_private *numa_priv = m->private;
+ struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
+ struct vm_area_struct *vma = v;
+ struct numa_maps *md = &numa_priv->md;
+ struct file *file = vma->vm_file;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mm_walk walk = {};
+ struct mempolicy *pol;
+ int n;
+ char buffer[50];
+
+ if (!mm)
+ return 0;
+
+ /* Ensure we start with an empty set of numa_maps statistics. */
+ memset(md, 0, sizeof(*md));
+
+ md->vma = vma;
+
+ walk.hugetlb_entry = gather_hugetbl_stats;
+ walk.pmd_entry = gather_pte_stats;
+ walk.private = md;
+ walk.mm = mm;
+
+ pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
+ mpol_to_str(buffer, sizeof(buffer), pol, 0);
+ mpol_cond_put(pol);
+
+ seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+
+ if (file) {
+ seq_printf(m, " file=");
+ seq_path(m, &file->f_path, "\n\t= ");
+ } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ seq_printf(m, " heap");
+ } else if (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack) {
+ seq_printf(m, " stack");
+ }
+
+ walk_page_range(vma->vm_start, vma->vm_end, &walk);
+
+ if (!md->pages)
+ goto out;
+
+ if (md->anon)
+ seq_printf(m, " anon=%lu", md->anon);
+
+ if (md->dirty)
+ seq_printf(m, " dirty=%lu", md->dirty);
+
+ if (md->pages != md->anon && md->pages != md->dirty)
+ seq_printf(m, " mapped=%lu", md->pages);
+
+ if (md->mapcount_max > 1)
+ seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+ if (md->swapcache)
+ seq_printf(m, " swapcache=%lu", md->swapcache);
+
+ if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+ seq_printf(m, " active=%lu", md->active);
+
+ if (md->writeback)
+ seq_printf(m, " writeback=%lu", md->writeback);
+
+ for_each_node_state(n, N_HIGH_MEMORY)
+ if (md->node[n])
+ seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+ seq_putc(m, '\n');
+
+ if (m->count < m->size)
+ m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
+ return 0;
+}
static const struct seq_operations proc_pid_numa_maps_op = {
.start = m_start,
@@ -869,7 +1054,20 @@ static const struct seq_operations proc_pid_numa_maps_op = {
static int numa_maps_open(struct inode *inode, struct file *file)
{
- return do_maps_open(inode, file, &proc_pid_numa_maps_op);
+ struct numa_maps_private *priv;
+ int ret = -ENOMEM;
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv) {
+ priv->proc_maps.pid = proc_pid(inode);
+ ret = seq_open(file, &proc_pid_numa_maps_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = priv;
+ } else {
+ kfree(priv);
+ }
+ }
+ return ret;
}
const struct file_operations proc_numa_maps_operations = {
@@ -878,4 +1076,4 @@ const struct file_operations proc_numa_maps_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif
+#endif /* CONFIG_NUMA */
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index d3c032f5fa0a..5b572c89e6c4 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -691,8 +691,11 @@ static void prune_dqcache(int count)
* This is called from kswapd when we think we need some
* more memory
*/
-static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink,
+ struct shrink_control *sc)
{
+ int nr = sc->nr_to_scan;
+
if (nr) {
spin_lock(&dq_list_lock);
prune_dqcache(nr);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 118662690cdf..76c8164d5651 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,6 +831,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
INITIALIZE_PATH(path);
struct reiserfs_dir_entry de;
+ dentry_unhash(dentry);
+
/* we will be doing 2 balancings and update 2 stat data, we change quotas
* of the owner of the directory and of the owner of the parent directory.
* The quota structure is possibly deleted only on last iput => outside
@@ -1225,6 +1227,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
unsigned long savelink = 1;
struct timespec ctime;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
/* three balancings: (1) old name removal, (2) new name insertion
and (3) maybe "save" link insertion
stat data updates: (1) old directory,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 47d2a4498b03..50f1abccd1cd 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -105,7 +105,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
mutex_unlock(&dentry->d_inode->i_mutex);
if (!error)
d_delete(dentry);
- dput(dentry);
return error;
}
diff --git a/fs/splice.c b/fs/splice.c
index 50a5d978da16..aa866d309695 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -162,6 +162,14 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = {
.get = generic_pipe_buf_get,
};
+static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
+{
+ smp_mb();
+ if (waitqueue_active(&pipe->wait))
+ wake_up_interruptible(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+}
+
/**
* splice_to_pipe - fill passed data into a pipe
* @pipe: pipe to fill
@@ -247,12 +255,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
pipe_unlock(pipe);
- if (do_wakeup) {
- smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible(&pipe->wait);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- }
+ if (do_wakeup)
+ wakeup_pipe_readers(pipe);
while (page_nr < spd_pages)
spd->spd_release(spd, page_nr++);
@@ -1892,12 +1896,9 @@ retry:
/*
* If we put data in the output pipe, wakeup any potential readers.
*/
- if (ret > 0) {
- smp_mb();
- if (waitqueue_active(&opipe->wait))
- wake_up_interruptible(&opipe->wait);
- kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
- }
+ if (ret > 0)
+ wakeup_pipe_readers(opipe);
+
if (input_wakeup)
wakeup_pipe_writers(ipipe);
@@ -1976,12 +1977,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
/*
* If we put data in the output pipe, wakeup any potential readers.
*/
- if (ret > 0) {
- smp_mb();
- if (waitqueue_active(&opipe->wait))
- wake_up_interruptible(&opipe->wait);
- kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
- }
+ if (ret > 0)
+ wakeup_pipe_readers(opipe);
return ret;
}
diff --git a/fs/super.c b/fs/super.c
index c04f7e0b7ed2..c75593953c52 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -31,6 +31,7 @@
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
+#include <linux/cleancache.h>
#include "internal.h"
@@ -112,6 +113,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
s->s_maxbytes = MAX_NON_LFS;
s->s_op = &default_op;
s->s_time_gran = 1000000000;
+ s->cleancache_poolid = -1;
}
out:
return s;
@@ -177,6 +179,7 @@ void deactivate_locked_super(struct super_block *s)
{
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
+ cleancache_flush_fs(s);
fs->kill_sb(s);
/*
* We need to call rcu_barrier so all the delayed rcu free
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e474fbcf8bde..e2cc6756f3b1 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,6 +196,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
struct inode *inode = dentry->d_inode;
int err = -ENOTEMPTY;
+ dentry_unhash(dentry);
+
if (sysv_empty_dir(inode)) {
err = sysv_unlink(dir, dentry);
if (!err) {
@@ -222,6 +224,9 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
struct sysv_dir_entry * old_de;
int err = -ENOENT;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
old_de = sysv_find_entry(old_dentry, &old_page);
if (!old_de)
goto out;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ef5abd38f0bf..c2b80943560d 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,6 +656,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+ dentry_unhash(dentry);
+
/*
* Budget request settings: deletion direntry, deletion inode and
* changing the parent inode. If budgeting fails, go ahead anyway
@@ -976,6 +978,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct timespec time;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
/*
* Budget request settings: deletion direntry, new direntry, removing
* the old inode, and changing old and new parent directory inodes.
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f1dce848ef96..4d76594c2a8f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,6 +783,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
struct fileIdentDesc *fi, cfi;
struct kernel_lb_addr tloc;
+ dentry_unhash(dentry);
+
retval = -ENOENT;
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
if (!fi)
@@ -1081,6 +1083,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct kernel_lb_addr tloc;
struct udf_inode_info *old_iinfo = UDF_I(old_inode);
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
if (ofi) {
if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 29309e25417f..953ebdfc5bf7 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,6 +258,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
struct inode * inode = dentry->d_inode;
int err= -ENOTEMPTY;
+ dentry_unhash(dentry);
+
lock_ufs(dir->i_sb);
if (ufs_empty_dir (inode)) {
err = ufs_unlink(dir, dentry);
@@ -282,6 +284,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ufs_dir_entry *old_de;
int err = -ENOENT;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 52b2b5da566e..5e68099db2a5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1422,12 +1422,12 @@ restart:
int
xfs_buftarg_shrink(
struct shrinker *shrink,
- int nr_to_scan,
- gfp_t mask)
+ struct shrink_control *sc)
{
struct xfs_buftarg *btp = container_of(shrink,
struct xfs_buftarg, bt_shrinker);
struct xfs_buf *bp;
+ int nr_to_scan = sc->nr_to_scan;
LIST_HEAD(dispose);
if (!nr_to_scan)
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index d61611c88012..244e797dae32 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -191,3 +191,32 @@ xfs_ioc_trim(
return -XFS_ERROR(EFAULT);
return 0;
}
+
+int
+xfs_discard_extents(
+ struct xfs_mount *mp,
+ struct list_head *list)
+{
+ struct xfs_busy_extent *busyp;
+ int error = 0;
+
+ list_for_each_entry(busyp, list, list) {
+ trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+ busyp->length);
+
+ error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+ XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+ XFS_FSB_TO_BB(mp, busyp->length),
+ GFP_NOFS, 0);
+ if (error && error != EOPNOTSUPP) {
+ xfs_info(mp,
+ "discard failed for extent [0x%llu,%u], error %d",
+ (unsigned long long)busyp->bno,
+ busyp->length,
+ error);
+ return error;
+ }
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
index e82b6dd3e127..344879aea646 100644
--- a/fs/xfs/linux-2.6/xfs_discard.h
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -2,7 +2,9 @@
#define XFS_DISCARD_H 1
struct fstrim_range;
+struct list_head;
extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b0aa59e51fd0..98b9c91fcdf1 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -110,8 +110,10 @@ mempool_t *xfs_ioend_pool;
#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
-#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */
-#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */
+#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
+#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
+#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
+#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
/*
* Table driven mount option parser.
@@ -355,6 +357,10 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DELAYLOG;
} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
+ } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
+ mp->m_flags |= XFS_MOUNT_DISCARD;
+ } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
+ mp->m_flags &= ~XFS_MOUNT_DISCARD;
} else if (!strcmp(this_char, "ihashsize")) {
xfs_warn(mp,
"ihashsize no longer used, option is deprecated.");
@@ -388,6 +394,13 @@ xfs_parseargs(
return EINVAL;
}
+ if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
+ !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
+ xfs_warn(mp,
+ "the discard option is incompatible with the nodelaylog option");
+ return EINVAL;
+ }
+
#ifndef CONFIG_XFS_QUOTA
if (XFS_IS_QUOTA_RUNNING(mp)) {
xfs_warn(mp, "quota support not available in this kernel.");
@@ -488,6 +501,7 @@ xfs_showargs(
{ XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
{ XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
+ { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index cb1bb2080e44..8ecad5ff9f9b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -1032,13 +1032,14 @@ xfs_reclaim_inodes(
static int
xfs_reclaim_inode_shrink(
struct shrinker *shrink,
- int nr_to_scan,
- gfp_t gfp_mask)
+ struct shrink_control *sc)
{
struct xfs_mount *mp;
struct xfs_perag *pag;
xfs_agnumber_t ag;
int reclaimable;
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
if (nr_to_scan) {
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 69228aa8605a..b94dace4e785 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -60,7 +60,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t);
+STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
static struct shrinker xfs_qm_shaker = {
.shrink = xfs_qm_shake,
@@ -2009,10 +2009,10 @@ xfs_qm_shake_freelist(
STATIC int
xfs_qm_shake(
struct shrinker *shrink,
- int nr_to_scan,
- gfp_t gfp_mask)
+ struct shrink_control *sc)
{
int ndqused, nfree, n;
+ gfp_t gfp_mask = sc->gfp_mask;
if (!kmem_shake_allow(gfp_mask))
return 0;
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index da0a561ffba2..6530769a999b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,6 +187,9 @@ struct xfs_busy_extent {
xfs_agnumber_t agno;
xfs_agblock_t bno;
xfs_extlen_t length;
+ unsigned int flags;
+#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
+#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
};
/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index acdced86413c..95862bbff56b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2469,7 +2469,7 @@ xfs_free_extent(
error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
if (!error)
- xfs_alloc_busy_insert(tp, args.agno, args.agbno, len);
+ xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
error0:
xfs_perag_put(args.pag);
return error;
@@ -2480,7 +2480,8 @@ xfs_alloc_busy_insert(
struct xfs_trans *tp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
- xfs_extlen_t len)
+ xfs_extlen_t len,
+ unsigned int flags)
{
struct xfs_busy_extent *new;
struct xfs_busy_extent *busyp;
@@ -2504,6 +2505,7 @@ xfs_alloc_busy_insert(
new->bno = bno;
new->length = len;
INIT_LIST_HEAD(&new->list);
+ new->flags = flags;
/* trace before insert to be able to see failed inserts */
trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
@@ -2609,6 +2611,18 @@ xfs_alloc_busy_update_extent(
xfs_agblock_t bend = bbno + busyp->length;
/*
+ * This extent is currently being discarded. Give the thread
+ * performing the discard a chance to mark the extent unbusy
+ * and retry.
+ */
+ if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
+ spin_unlock(&pag->pagb_lock);
+ delay(1);
+ spin_lock(&pag->pagb_lock);
+ return false;
+ }
+
+ /*
* If there is a busy extent overlapping a user allocation, we have
* no choice but to force the log and retry the search.
*
@@ -2813,7 +2827,8 @@ restart:
* If this is a metadata allocation, try to reuse the busy
* extent instead of trimming the allocation.
*/
- if (!args->userdata) {
+ if (!args->userdata &&
+ !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
busyp, fbno, flen,
false))
@@ -2979,10 +2994,16 @@ xfs_alloc_busy_clear_one(
kmem_free(busyp);
}
+/*
+ * Remove all extents on the passed in list from the busy extents tree.
+ * If do_discard is set skip extents that need to be discarded, and mark
+ * these as undergoing a discard operation instead.
+ */
void
xfs_alloc_busy_clear(
struct xfs_mount *mp,
- struct list_head *list)
+ struct list_head *list,
+ bool do_discard)
{
struct xfs_busy_extent *busyp, *n;
struct xfs_perag *pag = NULL;
@@ -2999,7 +3020,11 @@ xfs_alloc_busy_clear(
agno = busyp->agno;
}
- xfs_alloc_busy_clear_one(mp, pag, busyp);
+ if (do_discard && busyp->length &&
+ !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
+ busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
+ else
+ xfs_alloc_busy_clear_one(mp, pag, busyp);
}
if (pag) {
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 240ad288f2f9..2f52b924be79 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -137,10 +137,11 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
#ifdef __KERNEL__
void
xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len);
+ xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
void
-xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
+ bool do_discard);
int
xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 8b469d53599f..2b3518826a69 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -120,7 +120,8 @@ xfs_allocbt_free_block(
if (error)
return error;
- xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+ xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ XFS_ALLOC_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
return 0;
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index fa00788de2f5..e546a33214c9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -89,36 +89,19 @@ xfs_bmap_add_attrfork_local(
int *flags); /* inode logging flags */
/*
- * Called by xfs_bmapi to update file extent records and the btree
- * after allocating space (or doing a delayed allocation).
- */
-STATIC int /* error */
-xfs_bmap_add_extent(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd); /* OK to allocate reserved blocks */
-
-/*
* Called by xfs_bmap_add_extent to handle cases converting a delayed
* allocation to a real allocation.
*/
STATIC int /* error */
xfs_bmap_add_extent_delay_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
xfs_fsblock_t *first, /* pointer to firstblock variable */
xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int rsvd); /* OK to allocate reserved blocks */
+ int *logflagsp); /* inode logging flags */
/*
* Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -127,10 +110,9 @@ xfs_bmap_add_extent_delay_real(
STATIC int /* error */
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp,/* inode logging flags */
- int rsvd); /* OK to allocate reserved blocks */
+ int *logflagsp); /* inode logging flags */
/*
* Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -139,7 +121,7 @@ xfs_bmap_add_extent_hole_delay(
STATIC int /* error */
xfs_bmap_add_extent_hole_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp, /* inode logging flags */
@@ -152,7 +134,7 @@ xfs_bmap_add_extent_hole_real(
STATIC int /* error */
xfs_bmap_add_extent_unwritten_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp); /* inode logging flags */
@@ -180,22 +162,6 @@ xfs_bmap_btree_to_extents(
int whichfork); /* data or attr fork */
/*
- * Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
- */
-STATIC int /* error */
-xfs_bmap_del_extent(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_trans_t *tp, /* current trans pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp,/* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd); /* OK to allocate reserved blocks */
-
-/*
* Remove the entry "free" from the free item list. Prev points to the
* previous entry, unless "free" is the head of the list.
*/
@@ -474,14 +440,13 @@ xfs_bmap_add_attrfork_local(
STATIC int /* error */
xfs_bmap_add_extent(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
xfs_fsblock_t *first, /* pointer to firstblock variable */
xfs_bmap_free_t *flist, /* list of extents to be freed */
int *logflagsp, /* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd) /* OK to use reserved data blocks */
+ int whichfork) /* data or attr fork */
{
xfs_btree_cur_t *cur; /* btree cursor or null */
xfs_filblks_t da_new; /* new count del alloc blocks used */
@@ -492,23 +457,27 @@ xfs_bmap_add_extent(
xfs_extnum_t nextents; /* number of extents in file now */
XFS_STATS_INC(xs_add_exlist);
+
cur = *curp;
ifp = XFS_IFORK_PTR(ip, whichfork);
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(idx <= nextents);
da_old = da_new = 0;
error = 0;
+
+ ASSERT(*idx >= 0);
+ ASSERT(*idx <= nextents);
+
/*
* This is the first extent added to a new/empty file.
* Special case this one, so other routines get to assume there are
* already extents in the list.
*/
if (nextents == 0) {
- xfs_iext_insert(ip, 0, 1, new,
+ xfs_iext_insert(ip, *idx, 1, new,
whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
ASSERT(cur == NULL);
- ifp->if_lastex = 0;
+
if (!isnullstartblock(new->br_startblock)) {
XFS_IFORK_NEXT_SET(ip, whichfork, 1);
logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -522,27 +491,25 @@ xfs_bmap_add_extent(
if (cur)
ASSERT((cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL) == 0);
- if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
- &logflags, rsvd)))
- goto done;
+ error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
+ &logflags);
}
/*
* Real allocation off the end of the file.
*/
- else if (idx == nextents) {
+ else if (*idx == nextents) {
if (cur)
ASSERT((cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL) == 0);
- if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
- &logflags, whichfork)))
- goto done;
+ error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
+ &logflags, whichfork);
} else {
xfs_bmbt_irec_t prev; /* old extent at offset idx */
/*
* Get the record referred to by idx.
*/
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
/*
* If it's a real allocation record, and the new allocation ends
* after the start of the referred to record, then we're filling
@@ -557,22 +524,18 @@ xfs_bmap_add_extent(
if (cur)
ASSERT(cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL);
- if ((error = xfs_bmap_add_extent_delay_real(ip,
- idx, &cur, new, &da_new, first, flist,
- &logflags, rsvd)))
- goto done;
- } else if (new->br_state == XFS_EXT_NORM) {
- ASSERT(new->br_state == XFS_EXT_NORM);
- if ((error = xfs_bmap_add_extent_unwritten_real(
- ip, idx, &cur, new, &logflags)))
- goto done;
+ error = xfs_bmap_add_extent_delay_real(ip,
+ idx, &cur, new, &da_new,
+ first, flist, &logflags);
} else {
- ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
- if ((error = xfs_bmap_add_extent_unwritten_real(
- ip, idx, &cur, new, &logflags)))
+ ASSERT(new->br_state == XFS_EXT_NORM ||
+ new->br_state == XFS_EXT_UNWRITTEN);
+
+ error = xfs_bmap_add_extent_unwritten_real(ip,
+ idx, &cur, new, &logflags);
+ if (error)
goto done;
}
- ASSERT(*curp == cur || *curp == NULL);
}
/*
* Otherwise we're filling in a hole with an allocation.
@@ -581,13 +544,15 @@ xfs_bmap_add_extent(
if (cur)
ASSERT((cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL) == 0);
- if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
- new, &logflags, whichfork)))
- goto done;
+ error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
+ new, &logflags, whichfork);
}
}
+ if (error)
+ goto done;
ASSERT(*curp == cur || *curp == NULL);
+
/*
* Convert to a btree if necessary.
*/
@@ -615,7 +580,7 @@ xfs_bmap_add_extent(
ASSERT(nblks <= da_old);
if (nblks < da_old)
xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
- (int64_t)(da_old - nblks), rsvd);
+ (int64_t)(da_old - nblks), 0);
}
/*
* Clear out the allocated field, done with it now in any case.
@@ -640,14 +605,13 @@ done:
STATIC int /* error */
xfs_bmap_add_extent_delay_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
xfs_fsblock_t *first, /* pointer to firstblock variable */
xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int rsvd) /* OK to use reserved data block allocation */
+ int *logflagsp) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* btree cursor */
int diff; /* temp value */
@@ -673,7 +637,7 @@ xfs_bmap_add_extent_delay_real(
*/
cur = *curp;
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- ep = xfs_iext_get_ext(ifp, idx);
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &PREV);
new_endoff = new->br_startoff + new->br_blockcount;
ASSERT(PREV.br_startoff <= new->br_startoff);
@@ -692,9 +656,9 @@ xfs_bmap_add_extent_delay_real(
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
if (isnullstartblock(LEFT.br_startblock))
state |= BMAP_LEFT_DELAY;
@@ -712,9 +676,9 @@ xfs_bmap_add_extent_delay_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
if (isnullstartblock(RIGHT.br_startblock))
state |= BMAP_RIGHT_DELAY;
@@ -745,14 +709,14 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The left and right neighbors are both contiguous with new.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
LEFT.br_blockcount + PREV.br_blockcount +
RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(ip, idx, 2, state);
- ip->i_df.if_lastex = idx - 1;
+ xfs_iext_remove(ip, *idx + 1, 2, state);
ip->i_d.di_nextents--;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -784,13 +748,14 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The left neighbor is contiguous, the right is not.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
LEFT.br_blockcount + PREV.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx - 1;
- xfs_iext_remove(ip, idx, 1, state);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -814,14 +779,13 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The right neighbor is contiguous, the left is not.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep, new->br_startblock);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount + RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx;
- xfs_iext_remove(ip, idx + 1, 1, state);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -837,6 +801,7 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_blockcount, PREV.br_state)))
goto done;
}
+
*dnew = 0;
break;
@@ -846,11 +811,10 @@ xfs_bmap_add_extent_delay_real(
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep, new->br_startblock);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx;
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -866,6 +830,7 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
+
*dnew = 0;
break;
@@ -874,17 +839,16 @@ xfs_bmap_add_extent_delay_real(
* Filling in the first part of a previous delayed allocation.
* The left neighbor is contiguous.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
LEFT.br_blockcount + new->br_blockcount);
xfs_bmbt_set_startoff(ep,
PREV.br_startoff + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- ip->i_df.if_lastex = idx - 1;
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -904,7 +868,9 @@ xfs_bmap_add_extent_delay_real(
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
startblockval(PREV.br_startblock));
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ --*idx;
*dnew = temp;
break;
@@ -913,12 +879,11 @@ xfs_bmap_add_extent_delay_real(
* Filling in the first part of a previous delayed allocation.
* The left neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startoff(ep, new_endoff);
temp = PREV.br_blockcount - new->br_blockcount;
xfs_bmbt_set_blockcount(ep, temp);
- xfs_iext_insert(ip, idx, 1, new, state);
- ip->i_df.if_lastex = idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -946,9 +911,10 @@ xfs_bmap_add_extent_delay_real(
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
startblockval(PREV.br_startblock) -
(cur ? cur->bc_private.b.allocated : 0));
- ep = xfs_iext_get_ext(ifp, idx + 1);
+ ep = xfs_iext_get_ext(ifp, *idx + 1);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
+
*dnew = temp;
break;
@@ -958,15 +924,13 @@ xfs_bmap_add_extent_delay_real(
* The right neighbor is contiguous with the new allocation.
*/
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
new->br_startoff, new->br_startblock,
new->br_blockcount + RIGHT.br_blockcount,
RIGHT.br_state);
- trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
- ip->i_df.if_lastex = idx + 1;
+ trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -983,10 +947,14 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_state)))
goto done;
}
+
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
startblockval(PREV.br_startblock));
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
*dnew = temp;
break;
@@ -996,10 +964,9 @@ xfs_bmap_add_extent_delay_real(
* The right neighbor is not contiguous.
*/
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- xfs_iext_insert(ip, idx + 1, 1, new, state);
- ip->i_df.if_lastex = idx + 1;
+ xfs_iext_insert(ip, *idx + 1, 1, new, state);
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1027,9 +994,11 @@ xfs_bmap_add_extent_delay_real(
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
startblockval(PREV.br_startblock) -
(cur ? cur->bc_private.b.allocated : 0));
- ep = xfs_iext_get_ext(ifp, idx);
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
*dnew = temp;
break;
@@ -1056,7 +1025,7 @@ xfs_bmap_add_extent_delay_real(
*/
temp = new->br_startoff - PREV.br_startoff;
temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
- trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
LEFT = *new;
RIGHT.br_state = PREV.br_state;
@@ -1065,8 +1034,7 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_startoff = new_endoff;
RIGHT.br_blockcount = temp2;
/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
- xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
- ip->i_df.if_lastex = idx + 1;
+ xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1097,7 +1065,7 @@ xfs_bmap_add_extent_delay_real(
(cur ? cur->bc_private.b.allocated : 0));
if (diff > 0 &&
xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
- -((int64_t)diff), rsvd)) {
+ -((int64_t)diff), 0)) {
/*
* Ick gross gag me with a spoon.
*/
@@ -1109,7 +1077,7 @@ xfs_bmap_add_extent_delay_real(
if (!diff ||
!xfs_icsb_modify_counters(ip->i_mount,
XFS_SBS_FDBLOCKS,
- -((int64_t)diff), rsvd))
+ -((int64_t)diff), 0))
break;
}
if (temp2) {
@@ -1118,18 +1086,20 @@ xfs_bmap_add_extent_delay_real(
if (!diff ||
!xfs_icsb_modify_counters(ip->i_mount,
XFS_SBS_FDBLOCKS,
- -((int64_t)diff), rsvd))
+ -((int64_t)diff), 0))
break;
}
}
}
- ep = xfs_iext_get_ext(ifp, idx);
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
nullstartblock((int)temp2));
- trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
+
+ ++*idx;
*dnew = temp + temp2;
break;
@@ -1161,7 +1131,7 @@ done:
STATIC int /* error */
xfs_bmap_add_extent_unwritten_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp) /* inode logging flags */
@@ -1188,7 +1158,7 @@ xfs_bmap_add_extent_unwritten_real(
error = 0;
cur = *curp;
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- ep = xfs_iext_get_ext(ifp, idx);
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &PREV);
newext = new->br_state;
oldext = (newext == XFS_EXT_UNWRITTEN) ?
@@ -1211,9 +1181,9 @@ xfs_bmap_add_extent_unwritten_real(
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
if (isnullstartblock(LEFT.br_startblock))
state |= BMAP_LEFT_DELAY;
@@ -1231,9 +1201,9 @@ xfs_bmap_add_extent_unwritten_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
if (isnullstartblock(RIGHT.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -1262,14 +1232,15 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The left and right neighbors are both contiguous with new.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
LEFT.br_blockcount + PREV.br_blockcount +
RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(ip, idx, 2, state);
- ip->i_df.if_lastex = idx - 1;
+ xfs_iext_remove(ip, *idx + 1, 2, state);
ip->i_d.di_nextents -= 2;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1305,13 +1276,14 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The left neighbor is contiguous, the right is not.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
LEFT.br_blockcount + PREV.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx - 1;
- xfs_iext_remove(ip, idx, 1, state);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
ip->i_d.di_nextents--;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1341,13 +1313,12 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The right neighbor is contiguous, the left is not.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount + RIGHT.br_blockcount);
xfs_bmbt_set_state(ep, newext);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx;
- xfs_iext_remove(ip, idx + 1, 1, state);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
ip->i_d.di_nextents--;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1378,11 +1349,10 @@ xfs_bmap_add_extent_unwritten_real(
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_state(ep, newext);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx;
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -1404,21 +1374,22 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the first part of a previous oldext extent to newext.
* The left neighbor is contiguous.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
LEFT.br_blockcount + new->br_blockcount);
xfs_bmbt_set_startoff(ep,
PREV.br_startoff + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep,
new->br_startblock + new->br_blockcount);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ --*idx;
- ip->i_df.if_lastex = idx - 1;
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -1449,17 +1420,16 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the first part of a previous oldext extent to newext.
* The left neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
xfs_bmbt_set_startoff(ep, new_endoff);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
xfs_bmbt_set_startblock(ep,
new->br_startblock + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_insert(ip, idx, 1, new, state);
- ip->i_df.if_lastex = idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1488,17 +1458,19 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the last part of a previous oldext extent to newext.
* The right neighbor is contiguous with the new allocation.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
new->br_startoff, new->br_startblock,
new->br_blockcount + RIGHT.br_blockcount, newext);
- trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx + 1;
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -1528,13 +1500,14 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the last part of a previous oldext extent to newext.
* The right neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
- xfs_iext_insert(ip, idx + 1, 1, new, state);
- ip->i_df.if_lastex = idx + 1;
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1568,10 +1541,10 @@ xfs_bmap_add_extent_unwritten_real(
* newext. Contiguity is impossible here.
* One extent becomes three extents.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
new->br_startoff - PREV.br_startoff);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
r[0] = *new;
r[1].br_startoff = new_endoff;
@@ -1579,8 +1552,10 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startoff + PREV.br_blockcount - new_endoff;
r[1].br_startblock = new->br_startblock + new->br_blockcount;
r[1].br_state = oldext;
- xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
- ip->i_df.if_lastex = idx + 1;
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 2, &r[0], state);
+
ip->i_d.di_nextents += 2;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1650,12 +1625,10 @@ done:
STATIC int /* error */
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp, /* inode logging flags */
- int rsvd) /* OK to allocate reserved blocks */
+ int *logflagsp) /* inode logging flags */
{
- xfs_bmbt_rec_host_t *ep; /* extent record for idx */
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_filblks_t newlen=0; /* new indirect size */
@@ -1665,16 +1638,15 @@ xfs_bmap_add_extent_hole_delay(
xfs_filblks_t temp=0; /* temp for indirect calculations */
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- ep = xfs_iext_get_ext(ifp, idx);
state = 0;
ASSERT(isnullstartblock(new->br_startblock));
/*
* Check and set flags if this segment has a left neighbor
*/
- if (idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
@@ -1684,9 +1656,9 @@ xfs_bmap_add_extent_hole_delay(
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
- if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(ep, &right);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
@@ -1719,21 +1691,21 @@ xfs_bmap_add_extent_hole_delay(
* on the left and on the right.
* Merge all three into a single extent record.
*/
+ --*idx;
temp = left.br_blockcount + new->br_blockcount +
right.br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
nullstartblock((int)newlen));
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(ip, idx, 1, state);
- ip->i_df.if_lastex = idx - 1;
+ xfs_iext_remove(ip, *idx + 1, 1, state);
break;
case BMAP_LEFT_CONTIG:
@@ -1742,17 +1714,17 @@ xfs_bmap_add_extent_hole_delay(
* on the left.
* Merge the new allocation with the left neighbor.
*/
+ --*idx;
temp = left.br_blockcount + new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock);
newlen = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
nullstartblock((int)newlen));
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
-
- ip->i_df.if_lastex = idx - 1;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
break;
case BMAP_RIGHT_CONTIG:
@@ -1761,16 +1733,15 @@ xfs_bmap_add_extent_hole_delay(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
temp = new->br_blockcount + right.br_blockcount;
oldlen = startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_allf(ep, new->br_startoff,
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ new->br_startoff,
nullstartblock((int)newlen), temp, right.br_state);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-
- ip->i_df.if_lastex = idx;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
break;
case 0:
@@ -1780,14 +1751,13 @@ xfs_bmap_add_extent_hole_delay(
* Insert a new entry.
*/
oldlen = newlen = 0;
- xfs_iext_insert(ip, idx, 1, new, state);
- ip->i_df.if_lastex = idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
break;
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
- (int64_t)(oldlen - newlen), rsvd);
+ (int64_t)(oldlen - newlen), 0);
/*
* Nothing to do for disk quota accounting here.
*/
@@ -1803,13 +1773,12 @@ xfs_bmap_add_extent_hole_delay(
STATIC int /* error */
xfs_bmap_add_extent_hole_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */
int error; /* error return value */
int i; /* temp state */
xfs_ifork_t *ifp; /* inode fork pointer */
@@ -1819,8 +1788,7 @@ xfs_bmap_add_extent_hole_real(
int state; /* state bits, accessed thru macros */
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
- ep = xfs_iext_get_ext(ifp, idx);
+ ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
state = 0;
if (whichfork == XFS_ATTR_FORK)
@@ -1829,9 +1797,9 @@ xfs_bmap_add_extent_hole_real(
/*
* Check and set flags if this segment has a left neighbor.
*/
- if (idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -1840,9 +1808,9 @@ xfs_bmap_add_extent_hole_real(
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(ep, &right);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -1879,14 +1847,15 @@ xfs_bmap_add_extent_hole_real(
* left and on the right.
* Merge all three into a single extent record.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
left.br_blockcount + new->br_blockcount +
right.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 1, state);
- xfs_iext_remove(ip, idx, 1, state);
- ifp->if_lastex = idx - 1;
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
if (cur == NULL) {
@@ -1921,12 +1890,12 @@ xfs_bmap_add_extent_hole_real(
* on the left.
* Merge the new allocation with the left neighbor.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
left.br_blockcount + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ifp->if_lastex = idx - 1;
if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
@@ -1952,13 +1921,13 @@ xfs_bmap_add_extent_hole_real(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ new->br_startoff, new->br_startblock,
new->br_blockcount + right.br_blockcount,
right.br_state);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ifp->if_lastex = idx;
if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
@@ -1984,8 +1953,7 @@ xfs_bmap_add_extent_hole_real(
* real allocation.
* Insert a new entry.
*/
- xfs_iext_insert(ip, idx, 1, new, state);
- ifp->if_lastex = idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
if (cur == NULL) {
@@ -2833,13 +2801,12 @@ STATIC int /* error */
xfs_bmap_del_extent(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
- xfs_extnum_t idx, /* extent number to update/delete */
+ xfs_extnum_t *idx, /* extent number to update/delete */
xfs_bmap_free_t *flist, /* list of extents to be freed */
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd) /* OK to allocate reserved blocks */
+ int whichfork) /* data or attr fork */
{
xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
@@ -2870,10 +2837,10 @@ xfs_bmap_del_extent(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT((idx >= 0) && (idx < ifp->if_bytes /
+ ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
(uint)sizeof(xfs_bmbt_rec_t)));
ASSERT(del->br_blockcount > 0);
- ep = xfs_iext_get_ext(ifp, idx);
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &got);
ASSERT(got.br_startoff <= del->br_startoff);
del_endoff = del->br_startoff + del->br_blockcount;
@@ -2947,11 +2914,12 @@ xfs_bmap_del_extent(
/*
* Matches the whole extent. Delete the entry.
*/
- xfs_iext_remove(ip, idx, 1,
+ xfs_iext_remove(ip, *idx, 1,
whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
- ifp->if_lastex = idx;
+ --*idx;
if (delay)
break;
+
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
flags |= XFS_ILOG_CORE;
@@ -2968,21 +2936,20 @@ xfs_bmap_del_extent(
/*
* Deleting the first part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startoff(ep, del_endoff);
temp = got.br_blockcount - del->br_blockcount;
xfs_bmbt_set_blockcount(ep, temp);
- ifp->if_lastex = idx;
if (delay) {
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
da_old);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
da_new = temp;
break;
}
xfs_bmbt_set_startblock(ep, del_endblock);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
if (!cur) {
flags |= xfs_ilog_fext(whichfork);
break;
@@ -2998,18 +2965,17 @@ xfs_bmap_del_extent(
* Deleting the last part of the extent.
*/
temp = got.br_blockcount - del->br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- ifp->if_lastex = idx;
if (delay) {
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
da_old);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
da_new = temp;
break;
}
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
if (!cur) {
flags |= xfs_ilog_fext(whichfork);
break;
@@ -3026,7 +2992,7 @@ xfs_bmap_del_extent(
* Deleting the middle of the extent.
*/
temp = del->br_startoff - got.br_startoff;
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
new.br_startoff = del_endoff;
temp2 = got_endoff - del_endoff;
@@ -3113,9 +3079,9 @@ xfs_bmap_del_extent(
}
}
}
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
- xfs_iext_insert(ip, idx + 1, 1, &new, state);
- ifp->if_lastex = idx + 1;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+ ++*idx;
break;
}
/*
@@ -3142,7 +3108,7 @@ xfs_bmap_del_extent(
ASSERT(da_old >= da_new);
if (da_old > da_new) {
xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- (int64_t)(da_old - da_new), rsvd);
+ (int64_t)(da_old - da_new), 0);
}
done:
*logflagsp = flags;
@@ -4562,29 +4528,24 @@ xfs_bmapi(
if (rt) {
error = xfs_mod_incore_sb(mp,
XFS_SBS_FREXTENTS,
- -((int64_t)extsz), (flags &
- XFS_BMAPI_RSVBLOCKS));
+ -((int64_t)extsz), 0);
} else {
error = xfs_icsb_modify_counters(mp,
XFS_SBS_FDBLOCKS,
- -((int64_t)alen), (flags &
- XFS_BMAPI_RSVBLOCKS));
+ -((int64_t)alen), 0);
}
if (!error) {
error = xfs_icsb_modify_counters(mp,
XFS_SBS_FDBLOCKS,
- -((int64_t)indlen), (flags &
- XFS_BMAPI_RSVBLOCKS));
+ -((int64_t)indlen), 0);
if (error && rt)
xfs_mod_incore_sb(mp,
XFS_SBS_FREXTENTS,
- (int64_t)extsz, (flags &
- XFS_BMAPI_RSVBLOCKS));
+ (int64_t)extsz, 0);
else if (error)
xfs_icsb_modify_counters(mp,
XFS_SBS_FDBLOCKS,
- (int64_t)alen, (flags &
- XFS_BMAPI_RSVBLOCKS));
+ (int64_t)alen, 0);
}
if (error) {
@@ -4701,13 +4662,12 @@ xfs_bmapi(
if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
got.br_state = XFS_EXT_UNWRITTEN;
}
- error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
+ error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
firstblock, flist, &tmp_logflags,
- whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
+ whichfork);
logflags |= tmp_logflags;
if (error)
goto error0;
- lastx = ifp->if_lastex;
ep = xfs_iext_get_ext(ifp, lastx);
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
xfs_bmbt_get_all(ep, &got);
@@ -4803,13 +4763,12 @@ xfs_bmapi(
mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
? XFS_EXT_NORM
: XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
+ error = xfs_bmap_add_extent(ip, &lastx, &cur, mval,
firstblock, flist, &tmp_logflags,
- whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
+ whichfork);
logflags |= tmp_logflags;
if (error)
goto error0;
- lastx = ifp->if_lastex;
ep = xfs_iext_get_ext(ifp, lastx);
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
xfs_bmbt_get_all(ep, &got);
@@ -4868,14 +4827,14 @@ xfs_bmapi(
/*
* Else go on to the next record.
*/
- ep = xfs_iext_get_ext(ifp, ++lastx);
prev = got;
- if (lastx >= nextents)
- eof = 1;
- else
+ if (++lastx < nextents) {
+ ep = xfs_iext_get_ext(ifp, lastx);
xfs_bmbt_get_all(ep, &got);
+ } else {
+ eof = 1;
+ }
}
- ifp->if_lastex = lastx;
*nmap = n;
/*
* Transform from btree to extents, give it cur.
@@ -4984,7 +4943,6 @@ xfs_bmapi_single(
ASSERT(!isnullstartblock(got.br_startblock));
ASSERT(bno < got.br_startoff + got.br_blockcount);
*fsb = got.br_startblock + (bno - got.br_startoff);
- ifp->if_lastex = lastx;
return 0;
}
@@ -5026,7 +4984,6 @@ xfs_bunmapi(
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
- int rsvd; /* OK to allocate reserved blocks */
xfs_fsblock_t sum;
trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -5044,7 +5001,7 @@ xfs_bunmapi(
mp = ip->i_mount;
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
+
ASSERT(len > 0);
ASSERT(nexts >= 0);
ASSERT(ifp->if_ext_max ==
@@ -5160,9 +5117,9 @@ xfs_bunmapi(
del.br_blockcount = mod;
}
del.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
+ error = xfs_bmap_add_extent(ip, &lastx, &cur, &del,
firstblock, flist, &logflags,
- XFS_DATA_FORK, 0);
+ XFS_DATA_FORK);
if (error)
goto error0;
goto nodelete;
@@ -5188,9 +5145,12 @@ xfs_bunmapi(
*/
ASSERT(bno >= del.br_blockcount);
bno -= del.br_blockcount;
- if (bno < got.br_startoff) {
- if (--lastx >= 0)
- xfs_bmbt_get_all(--ep, &got);
+ if (got.br_startoff > bno) {
+ if (--lastx >= 0) {
+ ep = xfs_iext_get_ext(ifp,
+ lastx);
+ xfs_bmbt_get_all(ep, &got);
+ }
}
continue;
} else if (del.br_state == XFS_EXT_UNWRITTEN) {
@@ -5214,18 +5174,19 @@ xfs_bunmapi(
prev.br_startoff = start;
}
prev.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
+ lastx--;
+ error = xfs_bmap_add_extent(ip, &lastx, &cur,
&prev, firstblock, flist, &logflags,
- XFS_DATA_FORK, 0);
+ XFS_DATA_FORK);
if (error)
goto error0;
goto nodelete;
} else {
ASSERT(del.br_state == XFS_EXT_NORM);
del.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx, &cur,
+ error = xfs_bmap_add_extent(ip, &lastx, &cur,
&del, firstblock, flist, &logflags,
- XFS_DATA_FORK, 0);
+ XFS_DATA_FORK);
if (error)
goto error0;
goto nodelete;
@@ -5240,13 +5201,13 @@ xfs_bunmapi(
rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
do_div(rtexts, mp->m_sb.sb_rextsize);
xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
- (int64_t)rtexts, rsvd);
+ (int64_t)rtexts, 0);
(void)xfs_trans_reserve_quota_nblks(NULL,
ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_RTBLKS);
} else {
xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- (int64_t)del.br_blockcount, rsvd);
+ (int64_t)del.br_blockcount, 0);
(void)xfs_trans_reserve_quota_nblks(NULL,
ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_REGBLKS);
@@ -5277,31 +5238,29 @@ xfs_bunmapi(
error = XFS_ERROR(ENOSPC);
goto error0;
}
- error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
- &tmp_logflags, whichfork, rsvd);
+ error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+ &tmp_logflags, whichfork);
logflags |= tmp_logflags;
if (error)
goto error0;
bno = del.br_startoff - 1;
nodelete:
- lastx = ifp->if_lastex;
/*
* If not done go on to the next (previous) record.
- * Reset ep in case the extents array was re-alloced.
*/
- ep = xfs_iext_get_ext(ifp, lastx);
if (bno != (xfs_fileoff_t)-1 && bno >= start) {
- if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) ||
- xfs_bmbt_get_startoff(ep) > bno) {
- if (--lastx >= 0)
- ep = xfs_iext_get_ext(ifp, lastx);
- }
- if (lastx >= 0)
+ if (lastx >= 0) {
+ ep = xfs_iext_get_ext(ifp, lastx);
+ if (xfs_bmbt_get_startoff(ep) > bno) {
+ if (--lastx >= 0)
+ ep = xfs_iext_get_ext(ifp,
+ lastx);
+ }
xfs_bmbt_get_all(ep, &got);
+ }
extno++;
}
}
- ifp->if_lastex = lastx;
*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
ASSERT(ifp->if_ext_max ==
XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 3651191daea1..c62234bde053 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -69,7 +69,6 @@ typedef struct xfs_bmap_free
#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
-#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
/* combine contig. space */
@@ -87,7 +86,6 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
{ XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
- { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c8e3349c287c..a098a20ca63e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -920,7 +920,6 @@ xfs_iread_extents(
/*
* We know that the size is valid (it's checked in iformat_btree)
*/
- ifp->if_lastex = NULLEXTNUM;
ifp->if_bytes = ifp->if_real_bytes = 0;
ifp->if_flags |= XFS_IFEXTENTS;
xfs_iext_add(ifp, 0, nextents);
@@ -2558,12 +2557,9 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_EXTENTS:
ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
!(iip->ili_format.ilf_fields & extflag[whichfork]));
- ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
- (ifp->if_bytes == 0));
- ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
- (ifp->if_bytes > 0));
if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
(ifp->if_bytes > 0)) {
+ ASSERT(xfs_iext_get_ext(ifp, 0));
ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
whichfork);
@@ -3112,6 +3108,8 @@ xfs_iext_get_ext(
xfs_extnum_t idx) /* index of target extent */
{
ASSERT(idx >= 0);
+ ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
return ifp->if_u1.if_ext_irec->er_extbuf;
} else if (ifp->if_flags & XFS_IFEXTIREC) {
@@ -3191,7 +3189,6 @@ xfs_iext_add(
}
ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
ifp->if_real_bytes = 0;
- ifp->if_lastex = nextents + ext_diff;
}
/*
* Otherwise use a linear (direct) extent list.
@@ -3886,8 +3883,10 @@ xfs_iext_idx_to_irec(
xfs_extnum_t page_idx = *idxp; /* extent index in target list */
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- ASSERT(page_idx >= 0 && page_idx <=
- ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
+ ASSERT(page_idx >= 0);
+ ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+ ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
erp_idx = 0;
low = 0;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ff4e2a30227d..3ae6d58e5473 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -67,7 +67,6 @@ typedef struct xfs_ifork {
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
unsigned char if_ext_max; /* max # of extent records */
- xfs_extnum_t if_lastex; /* last if_extents used */
union {
xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 7d56e88a3f0e..c7755d5a5fbe 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -29,6 +29,7 @@
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
+#include "xfs_discard.h"
/*
* Perform initial CIL structure initialisation. If the CIL is not
@@ -361,18 +362,28 @@ xlog_cil_committed(
int abort)
{
struct xfs_cil_ctx *ctx = args;
+ struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
ctx->start_lsn, abort);
xfs_alloc_busy_sort(&ctx->busy_extents);
- xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents);
+ xfs_alloc_busy_clear(mp, &ctx->busy_extents,
+ (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
spin_lock(&ctx->cil->xc_cil_lock);
list_del(&ctx->committing);
spin_unlock(&ctx->cil->xc_cil_lock);
xlog_cil_free_logvec(ctx->lv_chain);
+
+ if (!list_empty(&ctx->busy_extents)) {
+ ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
+
+ xfs_discard_extents(mp, &ctx->busy_extents);
+ xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
+ }
+
kmem_free(ctx);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 19af0ab0d0c6..3d68bb267c5f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -224,6 +224,7 @@ typedef struct xfs_mount {
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
operations, typically for
disk errors in metadata */
+#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
user */
#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d1f24858ccc4..7c7bc2b786bd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -609,7 +609,7 @@ xfs_trans_free(
struct xfs_trans *tp)
{
xfs_alloc_busy_sort(&tp->t_busy);
- xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy);
+ xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
atomic_dec(&tp->t_mountp->m_active_trans);
xfs_trans_free_dqinfo(tp);