summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Kconfig5
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/9p/vfs_inode_dotl.c11
-rw-r--r--fs/Kconfig19
-rw-r--r--fs/affs/namei.c5
-rw-r--r--fs/afs/dir.c5
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/bfs/dir.c3
-rw-r--r--fs/binfmt_flat.c8
-rw-r--r--fs/block_dev.c34
-rw-r--r--fs/btrfs/extent_io.c1
-rw-r--r--fs/btrfs/relocation.c2
-rw-r--r--fs/buffer.c59
-rw-r--r--fs/ceph/addr.c5
-rw-r--r--fs/ceph/caps.c61
-rw-r--r--fs/ceph/dir.c7
-rw-r--r--fs/ceph/export.c25
-rw-r--r--fs/ceph/mds_client.c7
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/cifs/Kconfig35
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/README12
-rw-r--r--fs/cifs/cifs_debug.c2
-rw-r--r--fs/cifs/cifs_fs_sb.h4
-rw-r--r--fs/cifs/cifs_unicode.h3
-rw-r--r--fs/cifs/cifsacl.c483
-rw-r--r--fs/cifs/cifsacl.h25
-rw-r--r--fs/cifs/cifsencrypt.c12
-rw-r--r--fs/cifs/cifsfs.c119
-rw-r--r--fs/cifs/cifsfs.h20
-rw-r--r--fs/cifs/cifsglob.h10
-rw-r--r--fs/cifs/cifspdu.h37
-rw-r--r--fs/cifs/cifsproto.h30
-rw-r--r--fs/cifs/cifssmb.c377
-rw-r--r--fs/cifs/connect.c279
-rw-r--r--fs/cifs/export.c4
-rw-r--r--fs/cifs/file.c167
-rw-r--r--fs/cifs/inode.c129
-rw-r--r--fs/cifs/misc.c12
-rw-r--r--fs/cifs/netmisc.c7
-rw-r--r--fs/cifs/sess.c9
-rw-r--r--fs/cifs/smbdes.c418
-rw-r--r--fs/cifs/smbencrypt.c124
-rw-r--r--fs/cifs/transport.c66
-rw-r--r--fs/cifs/xattr.c20
-rw-r--r--fs/coda/dir.c5
-rw-r--r--fs/compat.c235
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/dcache.c9
-rw-r--r--fs/debugfs/file.c19
-rw-r--r--fs/dlm/config.c9
-rw-r--r--fs/dlm/config.h1
-rw-r--r--fs/dlm/dlm_internal.h3
-rw-r--r--fs/dlm/lock.c182
-rw-r--r--fs/dlm/lock.h1
-rw-r--r--fs/dlm/lockspace.c6
-rw-r--r--fs/dlm/plock.c65
-rw-r--r--fs/dlm/user.c1
-rw-r--r--fs/drop_caches.c5
-rw-r--r--fs/ecryptfs/inode.c5
-rw-r--r--fs/exec.c139
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext3/namei.c80
-rw-r--r--fs/ext4/Makefile3
-rw-r--r--fs/ext4/balloc.c146
-rw-r--r--fs/ext4/ext4.h127
-rw-r--r--fs/ext4/ext4_jbd2.c14
-rw-r--r--fs/ext4/ext4_jbd2.h5
-rw-r--r--fs/ext4/extents.c1410
-rw-r--r--fs/ext4/file.c1
-rw-r--r--fs/ext4/fsync.c25
-rw-r--r--fs/ext4/inode.c114
-rw-r--r--fs/ext4/mballoc.c459
-rw-r--r--fs/ext4/mballoc.h6
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/mmp.c351
-rw-r--r--fs/ext4/move_extent.c3
-rw-r--r--fs/ext4/namei.c82
-rw-r--r--fs/ext4/page-io.c39
-rw-r--r--fs/ext4/super.c204
-rw-r--r--fs/ext4/xattr.c4
-rw-r--r--fs/fat/cache.c7
-rw-r--r--fs/fat/dir.c32
-rw-r--r--fs/fat/fat.h15
-rw-r--r--fs/fat/fatent.c4
-rw-r--r--fs/fat/inode.c74
-rw-r--r--fs/fat/misc.c44
-rw-r--r--fs/fat/namei_msdos.c9
-rw-r--r--fs/fat/namei_vfat.c9
-rw-r--r--fs/freevxfs/vxfs_inode.c2
-rw-r--r--fs/fscache/operation.c10
-rw-r--r--fs/fscache/page.c13
-rw-r--r--fs/fuse/dir.c6
-rw-r--r--fs/gfs2/Makefile4
-rw-r--r--fs/gfs2/aops.c8
-rw-r--r--fs/gfs2/bmap.c2
-rw-r--r--fs/gfs2/dir.c197
-rw-r--r--fs/gfs2/dir.h4
-rw-r--r--fs/gfs2/export.c2
-rw-r--r--fs/gfs2/file.c46
-rw-r--r--fs/gfs2/glock.c99
-rw-r--r--fs/gfs2/glock.h3
-rw-r--r--fs/gfs2/glops.c172
-rw-r--r--fs/gfs2/glops.h2
-rw-r--r--fs/gfs2/incore.h8
-rw-r--r--fs/gfs2/inode.c1510
-rw-r--r--fs/gfs2/inode.h8
-rw-r--r--fs/gfs2/log.c208
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/lops.c39
-rw-r--r--fs/gfs2/main.c1
-rw-r--r--fs/gfs2/meta_io.c2
-rw-r--r--fs/gfs2/meta_io.h2
-rw-r--r--fs/gfs2/ops_fstype.c32
-rw-r--r--fs/gfs2/ops_inode.c1344
-rw-r--r--fs/gfs2/quota.c12
-rw-r--r--fs/gfs2/quota.h4
-rw-r--r--fs/gfs2/rgrp.c24
-rw-r--r--fs/gfs2/super.c138
-rw-r--r--fs/gfs2/sys.c6
-rw-r--r--fs/gfs2/trace_gfs2.h38
-rw-r--r--fs/hfs/dir.c6
-rw-r--r--fs/hfsplus/dir.c8
-rw-r--r--fs/hostfs/hostfs_kern.c5
-rw-r--r--fs/hpfs/namei.c9
-rw-r--r--fs/hugetlbfs/inode.c7
-rw-r--r--fs/inode.c10
-rw-r--r--fs/jbd/commit.c15
-rw-r--r--fs/jbd/journal.c16
-rw-r--r--fs/jbd/transaction.c3
-rw-r--r--fs/jbd2/commit.c28
-rw-r--r--fs/jbd2/journal.c58
-rw-r--r--fs/jbd2/transaction.c22
-rw-r--r--fs/jffs2/dir.c5
-rw-r--r--fs/jfs/namei.c5
-rw-r--r--fs/logfs/dev_bdev.c1
-rw-r--r--fs/logfs/dir.c5
-rw-r--r--fs/logfs/readwrite.c2
-rw-r--r--fs/mbcache.c10
-rw-r--r--fs/minix/namei.c5
-rw-r--r--fs/namei.c382
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/ncpfs/dir.c5
-rw-r--r--fs/ncpfs/inode.c4
-rw-r--r--fs/nfs/dir.c5
-rw-r--r--fs/nfs/internal.h2
-rw-r--r--fs/nfsd/stats.c2
-rw-r--r--fs/nilfs2/alloc.c12
-rw-r--r--fs/nilfs2/bmap.c4
-rw-r--r--fs/nilfs2/btnode.c19
-rw-r--r--fs/nilfs2/btnode.h4
-rw-r--r--fs/nilfs2/btree.c38
-rw-r--r--fs/nilfs2/cpfile.c24
-rw-r--r--fs/nilfs2/dat.c4
-rw-r--r--fs/nilfs2/file.c1
-rw-r--r--fs/nilfs2/gcinode.c25
-rw-r--r--fs/nilfs2/ifile.c4
-rw-r--r--fs/nilfs2/inode.c23
-rw-r--r--fs/nilfs2/ioctl.c61
-rw-r--r--fs/nilfs2/mdt.c8
-rw-r--r--fs/nilfs2/mdt.h9
-rw-r--r--fs/nilfs2/namei.c5
-rw-r--r--fs/nilfs2/nilfs.h7
-rw-r--r--fs/nilfs2/page.c79
-rw-r--r--fs/nilfs2/page.h7
-rw-r--r--fs/nilfs2/recovery.c12
-rw-r--r--fs/nilfs2/segbuf.c17
-rw-r--r--fs/nilfs2/segment.c190
-rw-r--r--fs/nilfs2/segment.h2
-rw-r--r--fs/nilfs2/sufile.c274
-rw-r--r--fs/nilfs2/sufile.h4
-rw-r--r--fs/nilfs2/super.c131
-rw-r--r--fs/nilfs2/the_nilfs.c24
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/omfs/dir.c11
-rw-r--r--fs/partitions/check.c8
-rw-r--r--fs/partitions/ldm.c7
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/base.c20
-rw-r--r--fs/proc/generic.c1
-rw-r--r--fs/proc/inode.c7
-rw-r--r--fs/proc/internal.h26
-rw-r--r--fs/proc/namespaces.c198
-rw-r--r--fs/proc/task_mmu.c206
-rw-r--r--fs/pstore/platform.c12
-rw-r--r--fs/quota/dquot.c5
-rw-r--r--fs/reiserfs/namei.c5
-rw-r--r--fs/reiserfs/xattr.c1
-rw-r--r--fs/splice.c33
-rw-r--r--fs/squashfs/Kconfig4
-rw-r--r--fs/squashfs/cache.c2
-rw-r--r--fs/super.c3
-rw-r--r--fs/sysfs/file.c12
-rw-r--r--fs/sysfs/group.c6
-rw-r--r--fs/sysv/namei.c5
-rw-r--r--fs/timerfd.c102
-rw-r--r--fs/ubifs/budget.c104
-rw-r--r--fs/ubifs/commit.c2
-rw-r--r--fs/ubifs/debug.c167
-rw-r--r--fs/ubifs/debug.h178
-rw-r--r--fs/ubifs/dir.c9
-rw-r--r--fs/ubifs/file.c28
-rw-r--r--fs/ubifs/find.c10
-rw-r--r--fs/ubifs/gc.c71
-rw-r--r--fs/ubifs/io.c33
-rw-r--r--fs/ubifs/journal.c29
-rw-r--r--fs/ubifs/log.c28
-rw-r--r--fs/ubifs/lprops.c115
-rw-r--r--fs/ubifs/lpt_commit.c55
-rw-r--r--fs/ubifs/master.c8
-rw-r--r--fs/ubifs/misc.h17
-rw-r--r--fs/ubifs/orphan.c3
-rw-r--r--fs/ubifs/recovery.c354
-rw-r--r--fs/ubifs/replay.c468
-rw-r--r--fs/ubifs/sb.c153
-rw-r--r--fs/ubifs/super.c46
-rw-r--r--fs/ubifs/tnc.c10
-rw-r--r--fs/ubifs/tnc_commit.c18
-rw-r--r--fs/ubifs/ubifs-media.h30
-rw-r--r--fs/ubifs/ubifs.h86
-rw-r--r--fs/ubifs/xattr.c8
-rw-r--r--fs/udf/namei.c5
-rw-r--r--fs/ufs/inode.c2
-rw-r--r--fs/ufs/namei.c5
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c26
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.c29
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_message.c20
-rw-r--r--fs/xfs/linux-2.6/xfs_message.h7
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c22
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h76
-rw-r--r--fs/xfs/quota/xfs_qm.c6
-rw-r--r--fs/xfs/xfs_ag.h4
-rw-r--r--fs/xfs/xfs_alloc.c871
-rw-r--r--fs/xfs/xfs_alloc.h18
-rw-r--r--fs/xfs/xfs_alloc_btree.c16
-rw-r--r--fs/xfs/xfs_bmap.c549
-rw-r--r--fs/xfs/xfs_bmap.h2
-rw-r--r--fs/xfs/xfs_dfrag.c6
-rw-r--r--fs/xfs/xfs_inode.c19
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_inode_item.c1
-rw-r--r--fs/xfs/xfs_log.c15
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_cil.c16
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c75
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_trans.c6
-rw-r--r--fs/xfs/xfs_types.h2
257 files changed, 9570 insertions, 7157 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 814ac4e213a8..0a93dc1cb4ac 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -1,6 +1,6 @@
config 9P_FS
- tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
- depends on INET && NET_9P && EXPERIMENTAL
+ tristate "Plan 9 Resource Sharing Support (9P2000)"
+ depends on INET && NET_9P
help
If you say Y here, you will get experimental support for
Plan 9 resource sharing via the 9P2000 protocol.
@@ -10,7 +10,6 @@ config 9P_FS
If unsure, say N.
if 9P_FS
-
config 9P_FSCACHE
bool "Enable 9P client caching support (EXPERIMENTAL)"
depends on EXPERIMENTAL
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7f6c67703195..8d7f3e69ae29 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,6 +814,7 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
{
+ dentry_unhash(d);
return v9fs_remove(i, d, 1);
}
@@ -839,6 +840,9 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct p9_fid *newdirfid;
struct p9_wstat wstat;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
P9_DPRINTK(P9_DEBUG_VFS, "\n");
retval = 0;
old_inode = old_dentry->d_inode;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 82a7c38ddad0..691c78f58bef 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -259,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
if (IS_ERR(inode_fid)) {
err = PTR_ERR(inode_fid);
mutex_unlock(&v9inode->v_mutex);
- goto error;
+ goto err_clunk_old_fid;
}
v9inode->writeback_fid = (void *) inode_fid;
}
@@ -267,8 +267,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
/* Since we are opening a file, assign the open fid to the file */
filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
if (IS_ERR(filp)) {
- p9_client_clunk(ofid);
- return PTR_ERR(filp);
+ err = PTR_ERR(filp);
+ goto err_clunk_old_fid;
}
filp->private_data = ofid;
#ifdef CONFIG_9P_FSCACHE
@@ -278,10 +278,11 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
return 0;
error:
- if (ofid)
- p9_client_clunk(ofid);
if (fid)
p9_client_clunk(fid);
+err_clunk_old_fid:
+ if (ofid)
+ p9_client_clunk(ofid);
return err;
}
diff --git a/fs/Kconfig b/fs/Kconfig
index f3aa9b08b228..19891aab9c6e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
def_bool n
config EXPORTFS
- bool
+ tristate
config FILE_LOCKING
bool "Enable POSIX file locking API" if EXPERT
@@ -124,6 +124,7 @@ config TMPFS
config TMPFS_POSIX_ACL
bool "Tmpfs POSIX Access Control Lists"
depends on TMPFS
+ select TMPFS_XATTR
select GENERIC_ACL
help
POSIX Access Control Lists (ACLs) support permissions for users and
@@ -134,6 +135,22 @@ config TMPFS_POSIX_ACL
If you don't know what Access Control Lists are, say N.
+config TMPFS_XATTR
+ bool "Tmpfs extended attributes"
+ depends on TMPFS
+ default n
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ Currently this enables support for the trusted.* and
+ security.* namespaces.
+
+ You need this for POSIX ACL support on tmpfs.
+
+ If unsure, say N.
+
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index e3e9efc1fdd8..03330e2e390c 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,6 +320,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
dentry->d_inode->i_ino,
(int)dentry->d_name.len, dentry->d_name.name);
+ dentry_unhash(dentry);
+
return affs_remove_header(dentry);
}
@@ -417,6 +419,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh = NULL;
int retval;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
(u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
(u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 20c106f24927..2c4e05160042 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,6 +845,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
_enter("{%x:%u},{%s}",
dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
+ dentry_unhash(dentry);
+
ret = -ENAMETOOLONG;
if (dentry->d_name.len >= AFSNAMEMAX)
goto error;
@@ -1146,6 +1148,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct key *key;
int ret;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
vnode = AFS_FS_I(old_dentry->d_inode);
orig_dvnode = AFS_FS_I(old_dir);
new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index f55ae23b137e..87d95a8cddbc 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,6 +583,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EACCES;
+ dentry_unhash(dentry);
+
if (atomic_dec_and_test(&ino->count)) {
p_ino = autofs4_dentry_ino(dentry->d_parent);
if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index b14cebfd9047..c7d1d06b0483 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,6 +224,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct bfs_sb_info *info;
int error = -ENOENT;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
old_bh = new_bh = NULL;
old_inode = old_dentry->d_inode;
if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 397d3057d336..1bffbe0ed778 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -820,6 +820,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
int res;
char buf[16];
+ memset(&bprm, 0, sizeof(bprm));
+
/* Create the file name */
sprintf(buf, "/lib/lib%d.so", id);
@@ -835,6 +837,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
if (!bprm.cred)
goto out;
+ /* We don't really care about recalculating credentials at this point
+ * as we're past the point of no return and are dealing with shared
+ * libraries.
+ */
+ bprm.cred_prepared = 1;
+
res = prepare_binprm(&bprm);
if (!IS_ERR_VALUE(res))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 257b00e98428..1f2b19978333 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1120,6 +1120,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto restart;
}
}
+
+ if (!ret && !bdev->bd_openers) {
+ bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+ bdi = blk_get_backing_dev_info(bdev);
+ if (bdi == NULL)
+ bdi = &default_backing_dev_info;
+ bdev_inode_switch_bdi(bdev->bd_inode, bdi);
+ }
+
/*
* If the device is invalidated, rescan partition
* if open succeeded or failed with -ENOMEDIUM.
@@ -1130,14 +1139,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
rescan_partitions(disk, bdev);
if (ret)
goto out_clear;
-
- if (!bdev->bd_openers) {
- bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
- bdi = blk_get_backing_dev_info(bdev);
- if (bdi == NULL)
- bdi = &default_backing_dev_info;
- bdev_inode_switch_bdi(bdev->bd_inode, bdi);
- }
} else {
struct block_device *whole;
whole = bdget_disk(disk, 0);
@@ -1237,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
res = __blkdev_get(bdev, mode, 0);
if (whole) {
+ struct gendisk *disk = whole->bd_disk;
+
/* finish claiming */
mutex_lock(&bdev->bd_mutex);
spin_lock(&bdev_lock);
@@ -1263,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
spin_unlock(&bdev_lock);
/*
- * Block event polling for write claims. Any write
- * holder makes the write_holder state stick until all
- * are released. This is good enough and tracking
- * individual writeable reference is too fragile given
- * the way @mode is used in blkdev_get/put().
+ * Block event polling for write claims if requested. Any
+ * write holder makes the write_holder state stick until
+ * all are released. This is good enough and tracking
+ * individual writeable reference is too fragile given the
+ * way @mode is used in blkdev_get/put().
*/
- if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+ if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
+ !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
bdev->bd_write_holder = true;
- disk_block_events(bdev->bd_disk);
+ disk_block_events(disk);
}
mutex_unlock(&bdev->bd_mutex);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0cee46e01081..4f9893243dae 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -10,6 +10,7 @@
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
+#include <linux/prefetch.h>
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 199a80134312..f340f7c99d09 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -709,7 +709,7 @@ again:
WARN_ON(cur->checked);
if (!list_empty(&cur->upper)) {
/*
- * the backref was added previously when processsing
+ * the backref was added previously when processing
* backref of type BTRFS_TREE_BLOCK_REF_KEY
*/
BUG_ON(!list_is_singular(&cur->upper));
diff --git a/fs/buffer.c b/fs/buffer.c
index de05703b184b..698c6b2cc462 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2336,24 +2336,26 @@ EXPORT_SYMBOL(block_commit_write);
* page lock we can determine safely if the page is beyond EOF. If it is not
* beyond EOF, then the page is guaranteed safe against truncation until we
* unlock the page.
+ *
+ * Direct callers of this function should call vfs_check_frozen() so that page
+ * fault does not busyloop until the fs is thawed.
*/
-int
-block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
+int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
{
struct page *page = vmf->page;
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
unsigned long end;
loff_t size;
- int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
+ int ret;
lock_page(page);
size = i_size_read(inode);
if ((page->mapping != inode->i_mapping) ||
(page_offset(page) > size)) {
- /* page got truncated out from underneath us */
- unlock_page(page);
- goto out;
+ /* We overload EFAULT to mean page got truncated */
+ ret = -EFAULT;
+ goto out_unlock;
}
/* page is wholly or partially inside EOF */
@@ -2366,18 +2368,41 @@ block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
if (!ret)
ret = block_commit_write(page, 0, end);
- if (unlikely(ret)) {
- unlock_page(page);
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else /* -ENOSPC, -EIO, etc */
- ret = VM_FAULT_SIGBUS;
- } else
- ret = VM_FAULT_LOCKED;
-
-out:
+ if (unlikely(ret < 0))
+ goto out_unlock;
+ /*
+ * Freezing in progress? We check after the page is marked dirty and
+ * with page lock held so if the test here fails, we are sure freezing
+ * code will wait during syncing until the page fault is done - at that
+ * point page will be dirty and unlocked so freezing code will write it
+ * and writeprotect it again.
+ */
+ set_page_dirty(page);
+ if (inode->i_sb->s_frozen != SB_UNFROZEN) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
+ return 0;
+out_unlock:
+ unlock_page(page);
return ret;
}
+EXPORT_SYMBOL(__block_page_mkwrite);
+
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
+{
+ int ret;
+ struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
+
+ /*
+ * This check is racy but catches the common case. The check in
+ * __block_page_mkwrite() is reliable.
+ */
+ vfs_check_frozen(sb, SB_FREEZE_WRITE);
+ ret = __block_page_mkwrite(vma, vmf, get_block);
+ return block_page_mkwrite_return(ret);
+}
EXPORT_SYMBOL(block_page_mkwrite);
/*
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 38b8ab554924..33da49dc3cc6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -848,7 +848,8 @@ get_more_pages:
op->payload_len = cpu_to_le32(len);
req->r_request->hdr.data_len = cpu_to_le32(len);
- ceph_osdc_start_request(&fsc->client->osdc, req, true);
+ rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
+ BUG_ON(rc);
req = NULL;
/* continue? */
@@ -880,8 +881,6 @@ release_pvec_pages:
out:
if (req)
ceph_osdc_put_request(req);
- if (rc > 0)
- rc = 0; /* vfs expects us to return 0 */
ceph_put_snap_context(snapc);
dout("writepages done, rc = %d\n", rc);
return rc;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2a5404c1c42f..1f72b00447c4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -569,7 +569,8 @@ retry:
list_add_tail(&cap->session_caps, &session->s_caps);
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
- }
+ } else if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
if (!ci->i_snap_realm) {
/*
@@ -2634,6 +2635,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
struct ceph_mds_session *session,
int *open_target_sessions)
{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
unsigned mseq = le32_to_cpu(ex->migrate_seq);
@@ -2670,6 +2672,19 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
* export targets, so that we get the matching IMPORT
*/
*open_target_sessions = 1;
+
+ /*
+ * we can't flush dirty caps that we've seen the
+ * EXPORT but no IMPORT for
+ */
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&ci->i_dirty_item)) {
+ dout(" moving %p to cap_dirty_migrating\n",
+ inode);
+ list_move(&ci->i_dirty_item,
+ &mdsc->cap_dirty_migrating);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
}
__ceph_remove_cap(cap);
}
@@ -2707,6 +2722,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
ci->i_cap_exporting_issued = 0;
ci->i_cap_exporting_mseq = 0;
ci->i_cap_exporting_mds = -1;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (!list_empty(&ci->i_dirty_item)) {
+ dout(" moving %p back to cap_dirty\n", inode);
+ list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
} else {
dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
inode, ci, mds, mseq);
@@ -2910,38 +2932,16 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
*/
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
- struct ceph_inode_info *ci, *nci = NULL;
- struct inode *inode, *ninode = NULL;
- struct list_head *p, *n;
+ struct ceph_inode_info *ci;
+ struct inode *inode;
dout("flush_dirty_caps\n");
spin_lock(&mdsc->cap_dirty_lock);
- list_for_each_safe(p, n, &mdsc->cap_dirty) {
- if (nci) {
- ci = nci;
- inode = ninode;
- ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
- dout("flush_dirty_caps inode %p (was next inode)\n",
- inode);
- } else {
- ci = list_entry(p, struct ceph_inode_info,
- i_dirty_item);
- inode = igrab(&ci->vfs_inode);
- BUG_ON(!inode);
- dout("flush_dirty_caps inode %p\n", inode);
- }
- if (n != &mdsc->cap_dirty) {
- nci = list_entry(n, struct ceph_inode_info,
- i_dirty_item);
- ninode = igrab(&nci->vfs_inode);
- BUG_ON(!ninode);
- nci->i_ceph_flags |= CEPH_I_NOFLUSH;
- dout("flush_dirty_caps next inode %p, noflush\n",
- ninode);
- } else {
- nci = NULL;
- ninode = NULL;
- }
+ while (!list_empty(&mdsc->cap_dirty)) {
+ ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
+ i_dirty_item);
+ inode = igrab(&ci->vfs_inode);
+ dout("flush_dirty_caps %p\n", inode);
spin_unlock(&mdsc->cap_dirty_lock);
if (inode) {
ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
@@ -2951,6 +2951,7 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
spin_lock(&mdsc->cap_dirty_lock);
}
spin_unlock(&mdsc->cap_dirty_lock);
+ dout("flush_dirty_caps done\n");
}
/*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1a867a3601ae..33729e822bb9 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -360,7 +360,7 @@ more:
rinfo = &fi->last_readdir->r_reply_info;
dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
rinfo->dir_nr, off, fi->offset);
- while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
+ while (off >= fi->offset && off - fi->offset < rinfo->dir_nr) {
u64 pos = ceph_make_fpos(frag, off);
struct ceph_mds_reply_inode *in =
rinfo->dir_in[off - fi->offset].in;
@@ -1066,16 +1066,17 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
struct inode *inode = file->f_dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
int left;
+ const int bufsize = 1024;
if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
return -EISDIR;
if (!cf->dir_info) {
- cf->dir_info = kmalloc(1024, GFP_NOFS);
+ cf->dir_info = kmalloc(bufsize, GFP_NOFS);
if (!cf->dir_info)
return -ENOMEM;
cf->dir_info_len =
- sprintf(cf->dir_info,
+ snprintf(cf->dir_info, bufsize,
"entries: %20lld\n"
" files: %20lld\n"
" subdirs: %20lld\n"
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e41056174bf8..a610d3d67488 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -86,6 +86,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
static struct dentry *__fh_to_dentry(struct super_block *sb,
struct ceph_nfs_fh *fh)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
struct dentry *dentry;
struct ceph_vino vino;
@@ -95,8 +96,24 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
vino.ino = fh->ino;
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
- if (!inode)
- return ERR_PTR(-ESTALE);
+ if (!inode) {
+ struct ceph_mds_request *req;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
+
+ req->r_ino1 = vino;
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ igrab(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ESTALE);
+ }
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
@@ -148,8 +165,10 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ igrab(inode);
ceph_mdsc_put_request(req);
- inode = ceph_find_inode(sb, vino);
if (!inode)
return ERR_PTR(err ? err : -ESTALE);
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d0fae4ce9ba5..79743d146be6 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -578,6 +578,7 @@ static void __register_request(struct ceph_mds_client *mdsc,
if (dir) {
struct ceph_inode_info *ci = ceph_inode(dir);
+ ihold(dir);
spin_lock(&ci->i_unsafe_lock);
req->r_unsafe_dir = dir;
list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
@@ -598,6 +599,9 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
+
+ iput(req->r_unsafe_dir);
+ req->r_unsafe_dir = NULL;
}
ceph_mdsc_put_request(req);
@@ -2691,7 +2695,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
{
struct super_block *sb = mdsc->fsc->sb;
struct inode *inode;
- struct ceph_inode_info *ci;
struct dentry *parent, *dentry;
struct ceph_dentry_info *di;
int mds = session->s_mds;
@@ -2728,7 +2731,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
dout("handle_lease no inode %llx\n", vino.ino);
goto release;
}
- ci = ceph_inode(inode);
/* dentry */
parent = d_find_alias(inode);
@@ -3002,6 +3004,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->cap_flush_seq = 0;
INIT_LIST_HEAD(&mdsc->cap_dirty);
+ INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
mdsc->num_cap_flushing = 0;
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4e3a9cc0bba6..7d8a0d662d56 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -278,6 +278,7 @@ struct ceph_mds_client {
u64 cap_flush_seq;
struct list_head cap_dirty; /* inodes with dirty caps */
+ struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
spinlock_t cap_dirty_lock; /* protects above items */
wait_queue_head_t cap_flushing_wq;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 7cb0f7f847e4..75c47cd8d086 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -7,6 +7,7 @@ config CIFS
select CRYPTO_MD5
select CRYPTO_HMAC
select CRYPTO_ARC4
+ select CRYPTO_DES
help
This is the client VFS module for the Common Internet File System
(CIFS) protocol which is the successor to the Server Message Block
@@ -152,16 +153,28 @@ config CIFS_ACL
Allows to fetch CIFS/NTFS ACL from the server. The DACL blob
is handed over to the application/caller.
-config CIFS_EXPERIMENTAL
- bool "CIFS Experimental Features (EXPERIMENTAL)"
+config CIFS_SMB2
+ bool "SMB2 network file system support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL && INET && BROKEN
+ select NLS
+ select KEYS
+ select FSCACHE
+ select DNS_RESOLVER
+
+ help
+ This enables experimental support for the SMB2 (Server Message Block
+ version 2) protocol. The SMB2 protocol is the successor to the
+ popular CIFS and SMB network file sharing protocols. SMB2 is the
+ native file sharing mechanism for recent versions of Windows
+ operating systems (since Vista). SMB2 enablement will eventually
+ allow users better performance, security and features, than would be
+ possible with cifs. Note that smb2 mount options also are simpler
+ (compared to cifs) due to protocol improvements.
+
+ Unless you are a developer or tester, say N.
+
+config CIFS_NFSD_EXPORT
+ bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)"
depends on CIFS && EXPERIMENTAL
help
- Enables cifs features under testing. These features are
- experimental and currently include DFS support and directory
- change notification ie fcntl(F_DNOTIFY), as well as the upcall
- mechanism which will be used for Kerberos session negotiation
- and uid remapping. Some of these features also may depend on
- setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental
- (which is disabled by default). See the file fs/cifs/README
- for more details. If unsure, say N.
-
+ Allows NFS server to export a CIFS mounted share (nfsd over cifs)
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index d87558448e3d..005d524c3a4a 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -4,7 +4,7 @@
obj-$(CONFIG_CIFS) += cifs.o
cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
- link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
+ link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
readdir.o ioctl.o sess.o export.o
diff --git a/fs/cifs/README b/fs/cifs/README
index 74ab165fc646..4a3ca0e5ca24 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -704,18 +704,6 @@ the start of smb requests and responses can be enabled via:
echo 1 > /proc/fs/cifs/traceSMB
-Two other experimental features are under development. To test these
-requires enabling CONFIG_CIFS_EXPERIMENTAL
-
- cifsacl support needed to retrieve approximated mode bits based on
- the contents on the CIFS ACL.
-
- lease support: cifs will check the oplock state before calling into
- the vfs to see if we can grant a lease on a file.
-
- DNOTIFY fcntl: needed for support of directory change
- notification and perhaps later for file leases)
-
Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
if the kernel was configured with cifs statistics enabled. The statistics
represent the number of successful (ie non-zero return code from the server)
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 30d01bc90855..18f4272d9047 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -63,7 +63,7 @@ void cifs_dump_detail(struct smb_hdr *smb)
cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d",
smb->Command, smb->Status.CifsError,
smb->Flags, smb->Flags2, smb->Mid, smb->Pid);
- cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb));
+ cERROR(1, "smb buf %p len %d", smb, smbCalcSize(smb));
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index ac51cd2d33ae..a9d5692e0c20 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -58,9 +58,7 @@ struct cifs_sb_info {
unsigned int mnt_cifs_flags;
int prepathlen;
char *prepath; /* relative path under the share to mount to */
-#ifdef CONFIG_CIFS_DFS_UPCALL
- char *mountdata; /* mount options received at mount time */
-#endif
+ char *mountdata; /* options received at mount time or via DFS refs */
struct backing_dev_info bdi;
struct delayed_work prune_tlinks;
};
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 644dd882a560..6d02fd560566 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -82,6 +82,9 @@ int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
char *cifs_strndup_from_ucs(const char *src, const int maxlen,
const bool is_unicode,
const struct nls_table *codepage);
+extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
+ const struct nls_table *cp, int mapChars);
+
#endif
/*
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index beeebf194234..f3c6fb9942ac 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -23,24 +23,16 @@
#include <linux/fs.h>
#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/keyctl.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsacl.h"
#include "cifsproto.h"
#include "cifs_debug.h"
-
-static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
- {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
- {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
-;
-
-
/* security id for everyone/world system group */
static const struct cifs_sid sid_everyone = {
1, 1, {0, 0, 0, 0, 0, 1}, {0} };
@@ -50,50 +42,385 @@ static const struct cifs_sid sid_authusers = {
/* group users */
static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
+const struct cred *root_cred;
-int match_sid(struct cifs_sid *ctsid)
+static void
+shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
+ int *nr_del)
{
- int i, j;
- int num_subauth, num_sat, num_saw;
- struct cifs_sid *cwsid;
+ struct rb_node *node;
+ struct rb_node *tmp;
+ struct cifs_sid_id *psidid;
+
+ node = rb_first(root);
+ while (node) {
+ tmp = node;
+ node = rb_next(tmp);
+ psidid = rb_entry(tmp, struct cifs_sid_id, rbnode);
+ if (nr_to_scan == 0 || *nr_del == nr_to_scan)
+ ++(*nr_rem);
+ else {
+ if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE)
+ && psidid->refcount == 0) {
+ rb_erase(tmp, root);
+ ++(*nr_del);
+ } else
+ ++(*nr_rem);
+ }
+ }
+}
+
+/*
+ * Shrinker callback for the SID-to-id caches.
+ *
+ * Walks the uid tree and then the gid tree, each under its own spinlock,
+ * and lets shrink_idmap_tree() erase entries and count the ones it keeps.
+ * The deletion counter is shared by both walks, so nr_to_scan limits the
+ * total number of entries erased across the two trees.
+ *
+ * Returns the number of entries left in both trees.
+ */
+static int
+cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+{
+	int remaining = 0;
+	int deleted = 0;
+
+	spin_lock(&siduidlock);
+	shrink_idmap_tree(&uidtree, nr_to_scan, &remaining, &deleted);
+	spin_unlock(&siduidlock);
+
+	spin_lock(&sidgidlock);
+	shrink_idmap_tree(&gidtree, nr_to_scan, &remaining, &deleted);
+	spin_unlock(&sidgidlock);
+
+	return remaining;
+}
+
+static struct shrinker cifs_shrinker = {
+ .shrink = cifs_idmap_shrinker,
+ .seeks = DEFAULT_SEEKS,
+};
+
+/*
+ * Instantiate a cifs.idmap key by storing a private copy of the supplied
+ * payload in key->payload.data.
+ *
+ * sid_to_id() reads the mapped id through the key payload, so an empty
+ * payload is refused here rather than stored: kmalloc(0) would return a
+ * pointer that must never be dereferenced.
+ *
+ * Returns 0 on success, -EINVAL for an empty payload, or -ENOMEM when the
+ * copy cannot be allocated.
+ */
+static int
+cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen)
+{
+	char *payload;
+
+	if (!data || datalen == 0)
+		return -EINVAL;
+
+	payload = kmemdup(data, datalen, GFP_KERNEL);
+	if (!payload)
+		return -ENOMEM;
+
+	key->payload.data = payload;
+	return 0;
+}
+
+static inline void
+cifs_idmap_key_destroy(struct key *key)
+{
+ kfree(key->payload.data);
+}
- if (!ctsid)
- return -1;
+struct key_type cifs_idmap_key_type = {
+ .name = "cifs.idmap",
+ .instantiate = cifs_idmap_key_instantiate,
+ .destroy = cifs_idmap_key_destroy,
+ .describe = user_describe,
+ .match = user_match,
+};
+
+static void
+sid_to_str(struct cifs_sid *sidptr, char *sidstr)
+{
+ int i;
+ unsigned long saval;
+ char *strptr;
- for (i = 0; i < NUM_WK_SIDS; ++i) {
- cwsid = &(wksidarr[i].cifssid);
+ strptr = sidstr;
- /* compare the revision */
- if (ctsid->revision != cwsid->revision)
- continue;
+ sprintf(strptr, "%s", "S");
+ strptr = sidstr + strlen(sidstr);
- /* compare all of the six auth values */
- for (j = 0; j < 6; ++j) {
- if (ctsid->authority[j] != cwsid->authority[j])
- break;
+ sprintf(strptr, "-%d", sidptr->revision);
+ strptr = sidstr + strlen(sidstr);
+
+ for (i = 0; i < 6; ++i) {
+ if (sidptr->authority[i]) {
+ sprintf(strptr, "-%d", sidptr->authority[i]);
+ strptr = sidstr + strlen(sidstr);
}
- if (j < 6)
- continue; /* all of the auth values did not match */
-
- /* compare all of the subauth values if any */
- num_sat = ctsid->num_subauth;
- num_saw = cwsid->num_subauth;
- num_subauth = num_sat < num_saw ? num_sat : num_saw;
- if (num_subauth) {
- for (j = 0; j < num_subauth; ++j) {
- if (ctsid->sub_auth[j] != cwsid->sub_auth[j])
- break;
- }
- if (j < num_subauth)
- continue; /* all sub_auth values do not match */
+ }
+
+ for (i = 0; i < sidptr->num_subauth; ++i) {
+ saval = le32_to_cpu(sidptr->sub_auth[i]);
+ sprintf(strptr, "-%ld", saval);
+ strptr = sidstr + strlen(sidstr);
+ }
+}
+
+/*
+ * Initialise the caller-allocated node *psidid for sidptr and link it into
+ * the rb-tree at root.
+ *
+ * root    - tree to insert into; the caller holds the matching spinlock
+ * sidptr  - SID to copy into the node and to order the node by
+ * psidid  - node to fill in; (*psidid)->sidstr must already be allocated
+ * typestr - prefix written in front of the textual SID in sidstr
+ *
+ * The ordering matches id_rb_search(): a positive compare_sids() result
+ * descends left, anything else descends right.
+ */
+static void
+id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr,
+		struct cifs_sid_id **psidid, char *typestr)
+{
+	int rc;
+	char *strptr;
+	struct rb_node *node = root->rb_node;
+	struct rb_node *parent = NULL;
+	struct rb_node **linkto = &(root->rb_node);
+	struct cifs_sid_id *lsidid;
+
+	while (node) {
+		lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
+		parent = node;
+		rc = compare_sids(sidptr, &((lsidid)->sid));
+		if (rc > 0) {
+			linkto = &(node->rb_left);
+			node = node->rb_left;
+		} else {
+			/*
+			 * rc < 0, or rc == 0 for a SID that compares equal.
+			 * Always move down so the walk terminates; the old
+			 * code never advanced on rc == 0 and looped forever.
+			 */
+			linkto = &(node->rb_right);
+			node = node->rb_right;
+		}
+	}
+
+	memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid));
+	/* back-date the stamp so the SID_MAP_RETRY check lets the first
+	 * lookup through immediately */
+	(*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
+	(*psidid)->refcount = 0;
+
+	/* sidstr = typestr followed by the textual SID */
+	sprintf((*psidid)->sidstr, "%s", typestr);
+	strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
+	sid_to_str(&(*psidid)->sid, strptr);
+
+	clear_bit(SID_ID_PENDING, &(*psidid)->state);
+	clear_bit(SID_ID_MAPPED, &(*psidid)->state);
+
+	rb_link_node(&(*psidid)->rbnode, parent, linkto);
+	rb_insert_color(&(*psidid)->rbnode, root);
+}
+
+/*
+ * Look up sidptr in the rb-tree at root.
+ *
+ * Uses the same ordering as id_rb_insert(): a positive compare_sids()
+ * result descends left, a negative one descends right.
+ *
+ * Returns the matching node, or NULL if no node compares equal.
+ */
+static struct cifs_sid_id *
+id_rb_search(struct rb_root *root, struct cifs_sid *sidptr)
+{
+	struct rb_node *cur;
+
+	for (cur = root->rb_node; cur; ) {
+		struct cifs_sid_id *entry =
+			rb_entry(cur, struct cifs_sid_id, rbnode);
+		int cmp = compare_sids(sidptr, &entry->sid);
+
+		if (cmp == 0)
+			return entry;	/* node found */
+
+		cur = cmp > 0 ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Wait action passed to wait_on_bit() for SID_ID_PENDING: yield the CPU,
+ * then report -ERESTARTSYS if a signal is pending, 0 otherwise.
+ */
+static int
+sidid_pending_wait(void *unused)
+{
+	schedule();
+
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+
+	return 0;
+}
+
+static int
+sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
+ struct cifs_fattr *fattr, uint sidtype)
+{
+ int rc;
+ unsigned long cid;
+ struct key *idkey;
+ const struct cred *saved_cred;
+ struct cifs_sid_id *psidid, *npsidid;
+ struct rb_root *cidtree;
+ spinlock_t *cidlock;
+
+ if (sidtype == SIDOWNER) {
+ cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */
+ cidlock = &siduidlock;
+ cidtree = &uidtree;
+ } else if (sidtype == SIDGROUP) {
+ cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */
+ cidlock = &sidgidlock;
+ cidtree = &gidtree;
+ } else
+ return -ENOENT;
+
+ spin_lock(cidlock);
+ psidid = id_rb_search(cidtree, psid);
+
+ if (!psidid) { /* node does not exist, allocate one & attempt adding */
+ spin_unlock(cidlock);
+ npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
+ if (!npsidid)
+ return -ENOMEM;
+
+ npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
+ if (!npsidid->sidstr) {
+ kfree(npsidid);
+ return -ENOMEM;
+ }
+
+ spin_lock(cidlock);
+ psidid = id_rb_search(cidtree, psid);
+ if (psidid) { /* node happened to get inserted meanwhile */
+ ++psidid->refcount;
+ spin_unlock(cidlock);
+ kfree(npsidid->sidstr);
+ kfree(npsidid);
+ } else {
+ psidid = npsidid;
+ id_rb_insert(cidtree, psid, &psidid,
+ sidtype == SIDOWNER ? "os:" : "gs:");
+ ++psidid->refcount;
+ spin_unlock(cidlock);
}
+ } else {
+ ++psidid->refcount;
+ spin_unlock(cidlock);
+ }
+
+ /*
+ * If we are here, it is safe to access psidid and its fields
+ * since a reference was taken earlier while holding the spinlock.
+ * A reference on the node is put without holding the spinlock
+ * and it is OK to do so in this case, shrinker will not erase
+ * this node until all references are put and we do not access
+ * any fields of the node after a reference is put.
+ */
+ if (test_bit(SID_ID_MAPPED, &psidid->state)) {
+ cid = psidid->id;
+ psidid->time = jiffies; /* update ts for accessing */
+ goto sid_to_id_out;
+ }
- cFYI(1, "matching sid: %s\n", wksidarr[i].sidname);
- return 0; /* sids compare/match */
+ if (time_after(psidid->time + SID_MAP_RETRY, jiffies))
+ goto sid_to_id_out;
+
+ if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) {
+ saved_cred = override_creds(root_cred);
+ idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, "");
+ if (IS_ERR(idkey))
+ cFYI(1, "%s: Can't map SID to an id", __func__);
+ else {
+ cid = *(unsigned long *)idkey->payload.value;
+ psidid->id = cid;
+ set_bit(SID_ID_MAPPED, &psidid->state);
+ key_put(idkey);
+ kfree(psidid->sidstr);
+ }
+ revert_creds(saved_cred);
+ psidid->time = jiffies; /* update ts for accessing */
+ clear_bit(SID_ID_PENDING, &psidid->state);
+ wake_up_bit(&psidid->state, SID_ID_PENDING);
+ } else {
+ rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
+ sidid_pending_wait, TASK_INTERRUPTIBLE);
+ if (rc) {
+ cFYI(1, "%s: sidid_pending_wait interrupted %d",
+ __func__, rc);
+ --psidid->refcount; /* decremented without spinlock */
+ return rc;
+ }
+ if (test_bit(SID_ID_MAPPED, &psidid->state))
+ cid = psidid->id;
}
- cFYI(1, "No matching sid");
- return -1;
+sid_to_id_out:
+ --psidid->refcount; /* decremented without spinlock */
+ if (sidtype == SIDOWNER)
+ fattr->cf_uid = cid;
+ else
+ fattr->cf_gid = cid;
+
+ return 0;
+}
+
+/*
+ * Set up SID-to-id mapping: allocate the override credentials and their
+ * private keyring, register the cifs.idmap key type, initialise both
+ * lookup trees with their locks, and register the cache shrinker.
+ *
+ * Returns 0 on success or a negative errno; on failure the keyring and
+ * credentials acquired so far are released.
+ */
+int
+init_cifs_idmap(void)
+{
+	struct cred *kcred;
+	struct key *cache_ring;
+	int rc;
+
+	cFYI(1, "Registering the %s key type\n", cifs_idmap_key_type.name);
+
+	/*
+	 * Build an override credential set carrying a special thread keyring
+	 * in which requests are cached.  This prevents malicious redirections
+	 * from being installed with add_key().
+	 */
+	kcred = prepare_kernel_cred(NULL);
+	if (!kcred)
+		return -ENOMEM;
+
+	cache_ring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, kcred,
+			       (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+			       KEY_USR_VIEW | KEY_USR_READ,
+			       KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(cache_ring)) {
+		rc = PTR_ERR(cache_ring);
+		goto err_put_cred;
+	}
+
+	rc = key_instantiate_and_link(cache_ring, NULL, 0, NULL, NULL);
+	if (rc < 0)
+		goto err_put_keyring;
+
+	rc = register_key_type(&cifs_idmap_key_type);
+	if (rc < 0)
+		goto err_put_keyring;
+
+	/* tell request_key() to use the special keyring as its result cache */
+	kcred->thread_keyring = cache_ring;
+	kcred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
+	root_cred = kcred;
+
+	/* locks and trees are set up before the shrinker is registered */
+	spin_lock_init(&siduidlock);
+	uidtree = RB_ROOT;
+	spin_lock_init(&sidgidlock);
+	gidtree = RB_ROOT;
+
+	register_shrinker(&cifs_shrinker);
+
+	cFYI(1, "cifs idmap keyring: %d\n", key_serial(cache_ring));
+	return 0;
+
+err_put_keyring:
+	key_put(cache_ring);
+err_put_cred:
+	put_cred(kcred);
+	return rc;
+}
+
+void
+exit_cifs_idmap(void)
+{
+ key_revoke(root_cred->thread_keyring);
+ unregister_key_type(&cifs_idmap_key_type);
+ put_cred(root_cred);
+ unregister_shrinker(&cifs_shrinker);
+ cFYI(1, "Unregistered %s key type\n", cifs_idmap_key_type.name);
+}
+
+/*
+ * Unlink and free every node of one SID-to-id tree.  The caller holds the
+ * spinlock that protects root.
+ */
+static void
+free_idmap_tree(struct rb_root *root)
+{
+	struct rb_node *node;
+	struct cifs_sid_id *psidid;
+
+	while ((node = rb_first(root))) {
+		psidid = rb_entry(node, struct cifs_sid_id, rbnode);
+		rb_erase(node, root);
+		/*
+		 * sid_to_id() frees sidstr as soon as the SID is mapped and
+		 * leaves the pointer stale, so only unmapped nodes still own
+		 * their string.
+		 */
+		if (!test_bit(SID_ID_MAPPED, &psidid->state))
+			kfree(psidid->sidstr);
+		kfree(psidid);
+	}
+}
+
+/*
+ * Tear down the uid and gid SID-to-id caches.
+ *
+ * The nodes were previously only erased from the trees and never freed,
+ * which leaked every cached mapping; they are now released as well.
+ *
+ * NOTE(review): assumes no sid_to_id() caller still holds a reference to
+ * a node when this runs (it is called from the module exit path) - confirm.
+ */
+void
+cifs_destroy_idmaptrees(void)
+{
+	spin_lock(&siduidlock);
+	free_idmap_tree(&uidtree);
+	spin_unlock(&siduidlock);
+
+	spin_lock(&sidgidlock);
+	free_idmap_tree(&gidtree);
+	spin_unlock(&sidgidlock);
}
/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
@@ -104,16 +431,24 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
int num_subauth, num_sat, num_saw;
if ((!ctsid) || (!cwsid))
- return 0;
+ return 1;
/* compare the revision */
- if (ctsid->revision != cwsid->revision)
- return 0;
+ if (ctsid->revision != cwsid->revision) {
+ if (ctsid->revision > cwsid->revision)
+ return 1;
+ else
+ return -1;
+ }
/* compare all of the six auth values */
for (i = 0; i < 6; ++i) {
- if (ctsid->authority[i] != cwsid->authority[i])
- return 0;
+ if (ctsid->authority[i] != cwsid->authority[i]) {
+ if (ctsid->authority[i] > cwsid->authority[i])
+ return 1;
+ else
+ return -1;
+ }
}
/* compare all of the subauth values if any */
@@ -122,12 +457,16 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
num_subauth = num_sat < num_saw ? num_sat : num_saw;
if (num_subauth) {
for (i = 0; i < num_subauth; ++i) {
- if (ctsid->sub_auth[i] != cwsid->sub_auth[i])
- return 0;
+ if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
+ if (ctsid->sub_auth[i] > cwsid->sub_auth[i])
+ return 1;
+ else
+ return -1;
+ }
}
}
- return 1; /* sids compare/match */
+ return 0; /* sids compare/match */
}
@@ -382,22 +721,22 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
#ifdef CONFIG_CIFS_DEBUG2
dump_ace(ppace[i], end_of_acl);
#endif
- if (compare_sids(&(ppace[i]->sid), pownersid))
+ if (compare_sids(&(ppace[i]->sid), pownersid) == 0)
access_flags_to_mode(ppace[i]->access_req,
ppace[i]->type,
&fattr->cf_mode,
&user_mask);
- if (compare_sids(&(ppace[i]->sid), pgrpsid))
+ if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0)
access_flags_to_mode(ppace[i]->access_req,
ppace[i]->type,
&fattr->cf_mode,
&group_mask);
- if (compare_sids(&(ppace[i]->sid), &sid_everyone))
+ if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0)
access_flags_to_mode(ppace[i]->access_req,
ppace[i]->type,
&fattr->cf_mode,
&other_mask);
- if (compare_sids(&(ppace[i]->sid), &sid_authusers))
+ if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0)
access_flags_to_mode(ppace[i]->access_req,
ppace[i]->type,
&fattr->cf_mode,
@@ -475,10 +814,10 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
/* Convert CIFS ACL to POSIX form */
-static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
- struct cifs_fattr *fattr)
+static int parse_sec_desc(struct cifs_sb_info *cifs_sb,
+ struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr)
{
- int rc;
+ int rc = 0;
struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
struct cifs_acl *dacl_ptr; /* no need for SACL ptr */
char *end_of_acl = ((char *)pntsd) + acl_len;
@@ -500,12 +839,26 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
le32_to_cpu(pntsd->sacloffset), dacloffset);
/* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */
rc = parse_sid(owner_sid_ptr, end_of_acl);
- if (rc)
+ if (rc) {
+ cFYI(1, "%s: Error %d parsing Owner SID", __func__, rc);
+ return rc;
+ }
+ rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER);
+ if (rc) {
+ cFYI(1, "%s: Error %d mapping Owner SID to uid", __func__, rc);
return rc;
+ }
rc = parse_sid(group_sid_ptr, end_of_acl);
- if (rc)
+ if (rc) {
+ cFYI(1, "%s: Error %d mapping Owner SID to gid", __func__, rc);
return rc;
+ }
+ rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP);
+ if (rc) {
+ cFYI(1, "%s: Error %d mapping Group SID to gid", __func__, rc);
+ return rc;
+ }
if (dacloffset)
parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr,
@@ -520,7 +873,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr,
sizeof(struct cifs_sid)); */
- return 0;
+ return rc;
}
@@ -688,7 +1041,7 @@ out:
}
/* Set an ACL on the server */
-static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
struct inode *inode, const char *path)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -727,7 +1080,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
rc = PTR_ERR(pntsd);
cERROR(1, "%s: error %d getting sec desc", __func__, rc);
} else {
- rc = parse_sec_desc(pntsd, acllen, fattr);
+ rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr);
kfree(pntsd);
if (rc)
cERROR(1, "parse sec desc failed rc = %d", rc);
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index c4ae7d036563..5c902c7ce524 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -39,6 +39,15 @@
#define ACCESS_ALLOWED 0
#define ACCESS_DENIED 1
+#define SIDOWNER 1
+#define SIDGROUP 2
+#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */
+
+#define SID_ID_MAPPED 0
+#define SID_ID_PENDING 1
+#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */
+#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */
+
struct cifs_ntsd {
__le16 revision; /* revision level */
__le16 type;
@@ -74,7 +83,21 @@ struct cifs_wksid {
char sidname[SIDNAMELENGTH];
} __attribute__((packed));
-extern int match_sid(struct cifs_sid *);
+struct cifs_sid_id {
+ unsigned int refcount; /* increment with spinlock, decrement without */
+ unsigned long id;
+ unsigned long time;
+ unsigned long state;
+ char *sidstr;
+ struct rb_node rbnode;
+ struct cifs_sid sid;
+};
+
+#ifdef __KERNEL__
+extern struct key_type cifs_idmap_key_type;
+extern const struct cred *root_cred;
+#endif /* KERNEL */
+
extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index d1a016be73ba..45c3f78c8f81 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -60,7 +60,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
server->session_key.response, server->session_key.len);
crypto_shash_update(&server->secmech.sdescmd5->shash,
- cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+ cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length));
rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature);
@@ -268,10 +268,11 @@ int setup_ntlm_response(struct cifsSesInfo *ses)
}
#ifdef CONFIG_CIFS_WEAK_PW_HASH
-void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
+int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
char *lnm_session_key)
{
int i;
+ int rc;
char password_with_pad[CIFS_ENCPWD_SIZE];
memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
@@ -282,7 +283,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
memcpy(lnm_session_key, password_with_pad,
CIFS_ENCPWD_SIZE);
- return;
+ return 0;
}
/* calculate old style session key */
@@ -299,10 +300,9 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
for (i = 0; i < CIFS_ENCPWD_SIZE; i++)
password_with_pad[i] = toupper(password_with_pad[i]);
- SMBencrypt(password_with_pad, cryptkey, lnm_session_key);
+ rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key);
- /* clear password before we return/free memory */
- memset(password_with_pad, 0, CIFS_ENCPWD_SIZE);
+ return rc;
}
#endif /* CIFS_WEAK_PW_HASH */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5c412b33cd7c..493b74ca5648 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -128,29 +128,22 @@ cifs_read_super(struct super_block *sb, void *data,
}
cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- /* copy mount params to sb for use in submounts */
- /* BB: should we move this after the mount so we
- * do not have to do the copy on failed mounts?
- * BB: May be it is better to do simple copy before
- * complex operation (mount), and in case of fail
- * just exit instead of doing mount and attempting
- * undo it if this copy fails?*/
+ /*
+ * Copy mount params to sb for use in submounts. Better to do
+ * the copy here and deal with the error before cleanup gets
+ * complicated post-mount.
+ */
if (data) {
- int len = strlen(data);
- cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
+ cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
if (cifs_sb->mountdata == NULL) {
bdi_destroy(&cifs_sb->bdi);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
return -ENOMEM;
}
- strncpy(cifs_sb->mountdata, data, len + 1);
- cifs_sb->mountdata[len] = '\0';
}
-#endif
- rc = cifs_mount(sb, cifs_sb, data, devname);
+ rc = cifs_mount(sb, cifs_sb, devname);
if (rc) {
if (!silent)
@@ -163,7 +156,7 @@ cifs_read_super(struct super_block *sb, void *data,
sb->s_bdi = &cifs_sb->bdi;
sb->s_blocksize = CIFS_MAX_MSGSIZE;
sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
- inode = cifs_root_iget(sb, ROOT_I);
+ inode = cifs_root_iget(sb);
if (IS_ERR(inode)) {
rc = PTR_ERR(inode);
@@ -184,12 +177,12 @@ cifs_read_super(struct super_block *sb, void *data,
else
sb->s_d_op = &cifs_dentry_ops;
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CIFS_NFSD_EXPORT
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cFYI(1, "export ops supported");
sb->s_export_op = &cifs_export_ops;
}
-#endif /* EXPERIMENTAL */
+#endif /* CIFS_NFSD_EXPORT */
return 0;
@@ -202,12 +195,10 @@ out_no_root:
out_mount_failed:
if (cifs_sb) {
-#ifdef CONFIG_CIFS_DFS_UPCALL
if (cifs_sb->mountdata) {
kfree(cifs_sb->mountdata);
cifs_sb->mountdata = NULL;
}
-#endif
unload_nls(cifs_sb->local_nls);
bdi_destroy(&cifs_sb->bdi);
kfree(cifs_sb);
@@ -231,12 +222,10 @@ cifs_put_super(struct super_block *sb)
rc = cifs_umount(sb, cifs_sb);
if (rc)
cERROR(1, "cifs_umount failed with return code %d", rc);
-#ifdef CONFIG_CIFS_DFS_UPCALL
if (cifs_sb->mountdata) {
kfree(cifs_sb->mountdata);
cifs_sb->mountdata = NULL;
}
-#endif
unload_nls(cifs_sb->local_nls);
bdi_destroy(&cifs_sb->bdi);
@@ -618,16 +607,31 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
{
/* origin == SEEK_END => we must revalidate the cached file length */
if (origin == SEEK_END) {
- int retval;
-
- /* some applications poll for the file length in this strange
- way so we must seek to end on non-oplocked files by
- setting the revalidate time to zero */
- CIFS_I(file->f_path.dentry->d_inode)->time = 0;
-
- retval = cifs_revalidate_file(file);
- if (retval < 0)
- return (loff_t)retval;
+ int rc;
+ struct inode *inode = file->f_path.dentry->d_inode;
+
+ /*
+ * We need to be sure that all dirty pages are written and the
+ * server has the newest file length.
+ */
+ if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
+ inode->i_mapping->nrpages != 0) {
+ rc = filemap_fdatawait(inode->i_mapping);
+ if (rc) {
+ mapping_set_error(inode->i_mapping, rc);
+ return rc;
+ }
+ }
+ /*
+ * Some applications poll for the file length in this strange
+ * way so we must seek to end on non-oplocked files by
+ * setting the revalidate time to zero.
+ */
+ CIFS_I(inode)->time = 0;
+
+ rc = cifs_revalidate_file_attr(file);
+ if (rc < 0)
+ return (loff_t)rc;
}
return generic_file_llseek_unlocked(file, offset, origin);
}
@@ -760,10 +764,11 @@ const struct file_operations cifs_file_strict_ops = {
};
const struct file_operations cifs_file_direct_ops = {
- /* no aio, no readv -
- BB reevaluate whether they can be done with directio, no cache */
- .read = cifs_user_read,
- .write = cifs_user_write,
+ /* BB reevaluate whether they can be done with directio, no cache */
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = cifs_user_readv,
+ .aio_write = cifs_user_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -815,10 +820,11 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
};
const struct file_operations cifs_file_direct_nobrl_ops = {
- /* no mmap, no aio, no readv -
- BB reevaluate whether they can be done with directio, no cache */
- .read = cifs_user_read,
- .write = cifs_user_write,
+ /* BB reevaluate whether they can be done with directio, no cache */
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = cifs_user_readv,
+ .aio_write = cifs_user_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
@@ -981,10 +987,10 @@ init_cifs(void)
int rc = 0;
cifs_proc_init();
INIT_LIST_HEAD(&cifs_tcp_ses_list);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
INIT_LIST_HEAD(&GlobalDnotifyReqList);
INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
-#endif
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
/*
* Initialize Global counters
*/
@@ -1033,22 +1039,33 @@ init_cifs(void)
if (rc)
goto out_destroy_mids;
- rc = register_filesystem(&cifs_fs_type);
- if (rc)
- goto out_destroy_request_bufs;
#ifdef CONFIG_CIFS_UPCALL
rc = register_key_type(&cifs_spnego_key_type);
if (rc)
- goto out_unregister_filesystem;
-#endif
+ goto out_destroy_request_bufs;
+#endif /* CONFIG_CIFS_UPCALL */
+
+#ifdef CONFIG_CIFS_ACL
+ rc = init_cifs_idmap();
+ if (rc)
+ goto out_register_key_type;
+#endif /* CONFIG_CIFS_ACL */
+
+ rc = register_filesystem(&cifs_fs_type);
+ if (rc)
+ goto out_init_cifs_idmap;
return 0;
-#ifdef CONFIG_CIFS_UPCALL
-out_unregister_filesystem:
- unregister_filesystem(&cifs_fs_type);
+out_init_cifs_idmap:
+#ifdef CONFIG_CIFS_ACL
+ exit_cifs_idmap();
+out_register_key_type:
#endif
+#ifdef CONFIG_CIFS_UPCALL
+ unregister_key_type(&cifs_spnego_key_type);
out_destroy_request_bufs:
+#endif
cifs_destroy_request_bufs();
out_destroy_mids:
cifs_destroy_mids();
@@ -1070,6 +1087,10 @@ exit_cifs(void)
#ifdef CONFIG_CIFS_DFS_UPCALL
cifs_dfs_release_automount_timer();
#endif
+#ifdef CONFIG_CIFS_ACL
+ cifs_destroy_idmaptrees();
+ exit_cifs_idmap();
+#endif
#ifdef CONFIG_CIFS_UPCALL
unregister_key_type(&cifs_spnego_key_type);
#endif
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index a9371b6578c0..64313f778ebf 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -47,7 +47,7 @@ extern void cifs_sb_deactive(struct super_block *sb);
/* Functions related to inodes */
extern const struct inode_operations cifs_dir_inode_ops;
-extern struct inode *cifs_root_iget(struct super_block *, unsigned long);
+extern struct inode *cifs_root_iget(struct super_block *);
extern int cifs_create(struct inode *, struct dentry *, int,
struct nameidata *);
extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
@@ -59,9 +59,11 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
extern int cifs_rmdir(struct inode *, struct dentry *);
extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
struct dentry *);
+extern int cifs_revalidate_file_attr(struct file *filp);
+extern int cifs_revalidate_dentry_attr(struct dentry *);
extern int cifs_revalidate_file(struct file *filp);
extern int cifs_revalidate_dentry(struct dentry *);
-extern void cifs_invalidate_mapping(struct inode *inode);
+extern int cifs_invalidate_mapping(struct inode *inode);
extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
extern int cifs_setattr(struct dentry *, struct iattr *);
@@ -80,12 +82,12 @@ extern const struct file_operations cifs_file_strict_nobrl_ops;
extern int cifs_open(struct inode *inode, struct file *file);
extern int cifs_close(struct inode *inode, struct file *file);
extern int cifs_closedir(struct inode *inode, struct file *file);
-extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
- size_t read_size, loff_t *poffset);
+extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos);
extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
-extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
- size_t write_size, loff_t *poffset);
+extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos);
extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
extern int cifs_lock(struct file *, int, struct file_lock *);
@@ -123,9 +125,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CIFS_NFSD_EXPORT
extern const struct export_operations cifs_export_ops;
-#endif /* EXPERIMENTAL */
+#endif /* CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "1.71"
+#define CIFS_VERSION "1.72"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a5d1106fcbde..76b4517e74b0 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -274,7 +274,8 @@ struct cifsSesInfo {
int capabilities;
char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
TCP names - will ipv6 and sctp addresses fit? */
- char *user_name;
+ char *user_name; /* must not be null except during init of sess;
+ it is filled in after mount option parsing */
char *domainName;
char *password;
struct session_key auth_key;
@@ -780,10 +781,12 @@ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock;
*/
GLOBAL_EXTERN spinlock_t cifs_file_list_lock;
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
/* Outstanding dir notify requests */
GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
/* DirNotify response queue */
GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
/*
* Global transaction id (XID) information
@@ -830,6 +833,11 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/
/* reconnect after this many failed echo attempts */
GLOBAL_EXTERN unsigned short echo_retries;
+GLOBAL_EXTERN struct rb_root uidtree;
+GLOBAL_EXTERN struct rb_root gidtree;
+GLOBAL_EXTERN spinlock_t siduidlock;
+GLOBAL_EXTERN spinlock_t sidgidlock;
+
void cifs_oplock_break(struct work_struct *work);
void cifs_oplock_break_get(struct cifsFileInfo *cfile);
void cifs_oplock_break_put(struct cifsFileInfo *cfile);
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b5c8cc5d7a7f..de3aa285de03 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -397,9 +397,9 @@
#define GETU32(var) (*((__u32 *)var)) /* BB check for endian issues */
struct smb_hdr {
- __u32 smb_buf_length; /* big endian on wire *//* BB length is only two
- or three bytes - with one or two byte type preceding it that are
- zero - we could mask the type byte off just in case BB */
+ __be32 smb_buf_length; /* BB length is only two (rarely three) bytes,
+ with one or two byte "type" preceding it that will be
+ zero - we could mask the type byte off */
__u8 Protocol[4];
__u8 Command;
union {
@@ -428,43 +428,28 @@ struct smb_hdr {
__u8 WordCount;
} __attribute__((packed));
-/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */
-#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \
- (2 * (smb_var)->WordCount))
+/*
+ * Given a pointer to an smb_hdr, return a void pointer to the ByteCount
+ * field, which follows the header and its WordCount 16-bit parameter words.
+ */
+static inline void *
+BCC(struct smb_hdr *smb)
+{
+	char *base = (char *)smb;
+
+	return base + sizeof(struct smb_hdr) + 2 * smb->WordCount;
+}
/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
#define pByteArea(smb_var) (BCC(smb_var) + 2)
-/* get the converted ByteCount for a SMB packet and return it */
-static inline __u16
-get_bcc(struct smb_hdr *hdr)
-{
- __u16 *bc_ptr = (__u16 *)BCC(hdr);
-
- return get_unaligned(bc_ptr);
-}
-
/* read the (possibly unaligned) little-endian ByteCount of a SMB packet and return it converted to host byte order */
static inline __u16
-get_bcc_le(struct smb_hdr *hdr)
+get_bcc(struct smb_hdr *hdr)
{
__le16 *bc_ptr = (__le16 *)BCC(hdr);
return get_unaligned_le16(bc_ptr);
}
-/* set the ByteCount for a SMB packet in host-byte order */
-static inline void
-put_bcc(__u16 count, struct smb_hdr *hdr)
-{
- __u16 *bc_ptr = (__u16 *)BCC(hdr);
-
- put_unaligned(count, bc_ptr);
-}
-
/* set the ByteCount for a SMB packet in little-endian */
static inline void
-put_bcc_le(__u16 count, struct smb_hdr *hdr)
+put_bcc(__u16 count, struct smb_hdr *hdr)
{
__le16 *bc_ptr = (__le16 *)BCC(hdr);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8096f27ad9a8..6e69e06a30b3 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -53,6 +53,9 @@ do { \
cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \
__func__, curr_xid, (int)rc); \
} while (0)
+extern int init_cifs_idmap(void);
+extern void exit_cifs_idmap(void);
+extern void cifs_destroy_idmaptrees(void);
extern char *build_path_from_dentry(struct dentry *);
extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb,
struct cifsTconInfo *tcon);
@@ -90,7 +93,6 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool);
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern unsigned int smbCalcSize(struct smb_hdr *ptr);
-extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
@@ -143,8 +145,10 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64);
extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
const char *, u32 *);
+extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
+ const char *);
-extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
+extern int cifs_mount(struct super_block *, struct cifs_sb_info *,
const char *);
extern int cifs_umount(struct super_block *, struct cifs_sb_info *);
extern void cifs_dfs_release_automount_timer(void);
@@ -304,12 +308,13 @@ extern int CIFSSMBUnixQuerySymLink(const int xid,
struct cifsTconInfo *tcon,
const unsigned char *searchName, char **syminfo,
const struct nls_table *nls_codepage);
+#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
extern int CIFSSMBQueryReparseLinkInfo(const int xid,
struct cifsTconInfo *tcon,
const unsigned char *searchName,
char *symlinkinfo, const int buflen, __u16 fid,
const struct nls_table *nls_codepage);
-
+#endif /* temporarily unused until cifs_symlink fixed */
extern int CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon,
const char *fileName, const int disposition,
const int access_flags, const int omode,
@@ -348,8 +353,6 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName, __u64 *inode_number,
const struct nls_table *nls_codepage,
int remap_special_chars);
-extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
- const struct nls_table *cp, int mapChars);
extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
const __u16 netfid, const __u64 len,
@@ -383,9 +386,15 @@ extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
extern int calc_seckey(struct cifsSesInfo *);
#ifdef CONFIG_CIFS_WEAK_PW_HASH
-extern void calc_lanman_hash(const char *password, const char *cryptkey,
+extern int calc_lanman_hash(const char *password, const char *cryptkey,
bool encrypt, char *lnm_session_key);
#endif /* CIFS_WEAK_PW_HASH */
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */
+extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
+ const int notify_subdirs, const __u16 netfid,
+ __u32 filter, struct file *file, int multishot,
+ const struct nls_table *nls_codepage);
+#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */
extern int CIFSSMBCopy(int xid,
struct cifsTconInfo *source_tcon,
const char *fromName,
@@ -393,10 +402,6 @@ extern int CIFSSMBCopy(int xid,
const char *toName, const int flags,
const struct nls_table *nls_codepage,
int remap_special_chars);
-extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
- const int notify_subdirs, const __u16 netfid,
- __u32 filter, struct file *file, int multishot,
- const struct nls_table *nls_codepage);
extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName,
const unsigned char *ea_name, char *EAData,
@@ -427,9 +432,6 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
struct cifs_sb_info *cifs_sb, int xid);
extern int mdfour(unsigned char *, unsigned char *, int);
extern int E_md4hash(const unsigned char *passwd, unsigned char *p16);
-extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
- unsigned char *p24);
-extern void E_P16(unsigned char *p14, unsigned char *p16);
-extern void E_P24(unsigned char *p21, const unsigned char *c8,
+extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8,
unsigned char *p24);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index df959bae6728..83df937b814e 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -339,12 +339,13 @@ static int validate_t2(struct smb_t2_rsp *pSMB)
get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024)
goto vt2_err;
- /* check that bcc is at least as big as parms + data */
- /* check that bcc is less than negotiated smb buffer */
total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount);
if (total_size >= 512)
goto vt2_err;
+ /* check that bcc is at least as big as parms + data, and that it is
+ * less than negotiated smb buffer
+ */
total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount);
if (total_size > get_bcc(&pSMB->hdr) ||
total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE)
@@ -357,6 +358,13 @@ vt2_err:
return -EINVAL;
}
+static inline void inc_rfc1001_len(void *pSMB, int count)
+{
+ struct smb_hdr *hdr = (struct smb_hdr *)pSMB;
+ /* smb_buf_length is held big-endian (__be32) in the header, so grow it by count with be32_add_cpu rather than a plain += */
+ be32_add_cpu(&hdr->smb_buf_length, count);
+}
+
int
CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
{
@@ -409,7 +417,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
count += strlen(protocols[i].name) + 1;
/* null at end of source and target buffers anyway */
}
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
@@ -541,10 +549,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
server->secType = RawNTLMSSP;
else if (secFlags & CIFSSEC_MAY_LANMAN)
server->secType = LANMAN;
-/* #ifdef CONFIG_CIFS_EXPERIMENTAL
- else if (secFlags & CIFSSEC_MAY_PLNTXT)
- server->secType = ??
-#endif */
else {
rc = -EOPNOTSUPP;
cERROR(1, "Invalid security type");
@@ -578,7 +582,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) &&
(server->capabilities & CAP_EXTENDED_SECURITY)) {
- count = pSMBr->ByteCount;
+ count = get_bcc(&pSMBr->hdr);
if (count < 16) {
rc = -EIO;
goto neg_err_exit;
@@ -732,9 +736,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
smb->hdr.Tid = 0xffff;
smb->hdr.WordCount = 1;
put_unaligned_le16(1, &smb->EchoCount);
- put_bcc_le(1, &smb->hdr);
+ put_bcc(1, &smb->hdr);
smb->Data[0] = 'a';
- smb->hdr.smb_buf_length += 3;
+ inc_rfc1001_len(smb, 3);
rc = cifs_call_async(server, (struct smb_hdr *)smb,
cifs_echo_callback, server);
@@ -852,7 +856,7 @@ PsxDelete:
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_UNLINK);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -898,7 +902,7 @@ DelFileRetry:
pSMB->SearchAttributes =
cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM);
pSMB->BufferFormat = 0x04;
- pSMB->hdr.smb_buf_length += name_len + 1;
+ inc_rfc1001_len(pSMB, name_len + 1);
pSMB->ByteCount = cpu_to_le16(name_len + 1);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -942,7 +946,7 @@ RmDirRetry:
}
pSMB->BufferFormat = 0x04;
- pSMB->hdr.smb_buf_length += name_len + 1;
+ inc_rfc1001_len(pSMB, name_len + 1);
pSMB->ByteCount = cpu_to_le16(name_len + 1);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -985,7 +989,7 @@ MkDirRetry:
}
pSMB->BufferFormat = 0x04;
- pSMB->hdr.smb_buf_length += name_len + 1;
+ inc_rfc1001_len(pSMB, name_len + 1);
pSMB->ByteCount = cpu_to_le16(name_len + 1);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -1063,7 +1067,7 @@ PsxCreat:
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -1075,7 +1079,7 @@ PsxCreat:
cFYI(1, "copying inode info");
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) {
+ if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) {
rc = -EIO; /* bad smb */
goto psx_create_err;
}
@@ -1096,7 +1100,7 @@ PsxCreat:
pRetData->Type = cpu_to_le32(-1); /* unknown */
cFYI(DBG2, "unknown type");
} else {
- if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP)
+ if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)
+ sizeof(FILE_UNIX_BASIC_INFO)) {
cERROR(1, "Open response data too small");
pRetData->Type = cpu_to_le32(-1);
@@ -1228,7 +1232,7 @@ OldOpenRetry:
pSMB->Sattr = cpu_to_le16(ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY);
pSMB->OpenFunction = cpu_to_le16(convert_disposition(openDisposition));
count += name_len;
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
/* long_op set to 1 to allow for oplock break timeouts */
@@ -1341,7 +1345,7 @@ openRetry:
SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY;
count += name_len;
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
/* long_op set to 1 to allow for oplock break timeouts */
@@ -1426,7 +1430,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid,
}
iov[0].iov_base = (char *)pSMB;
- iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+ iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
&resp_buf_type, CIFS_LOG_ERROR);
cifs_stats_inc(&tcon->num_reads);
@@ -1560,7 +1564,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon,
pSMB->DataLengthLow = cpu_to_le16(bytes_sent & 0xFFFF);
pSMB->DataLengthHigh = cpu_to_le16(bytes_sent >> 16);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
if (wct == 14)
pSMB->ByteCount = cpu_to_le16(byte_count);
@@ -1644,11 +1648,12 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon,
pSMB->DataLengthLow = cpu_to_le16(count & 0xFFFF);
pSMB->DataLengthHigh = cpu_to_le16(count >> 16);
- smb_hdr_len = pSMB->hdr.smb_buf_length + 1; /* hdr + 1 byte pad */
+ /* header + 1 byte pad */
+ smb_hdr_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 1;
if (wct == 14)
- pSMB->hdr.smb_buf_length += count+1;
+ inc_rfc1001_len(pSMB, count + 1);
else /* wct == 12 */
- pSMB->hdr.smb_buf_length += count+5; /* smb data starts later */
+ inc_rfc1001_len(pSMB, count + 5); /* smb data starts later */
if (wct == 14)
pSMB->ByteCount = cpu_to_le16(count + 1);
else /* wct == 12 */ /* bigger pad, smaller smb hdr, keep offset ok */ {
@@ -1748,7 +1753,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
/* oplock break */
count = 0;
}
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
if (waitFlag) {
@@ -1839,14 +1844,14 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
pSMB->Fid = smb_file_id;
pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_LOCK);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
if (waitFlag) {
rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned);
} else {
iov[0].iov_base = (char *)pSMB;
- iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+ iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
&resp_buf_type, timeout);
pSMB = NULL; /* request buf already freed by SendReceive2. Do
@@ -1862,7 +1867,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
__u16 data_count;
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < sizeof(struct cifs_posix_lock))) {
+ if (rc || get_bcc(&pSMBr->hdr) < sizeof(*parm_data)) {
rc = -EIO; /* bad smb */
goto plk_err_exit;
}
@@ -2012,7 +2017,7 @@ renameRetry:
}
count = 1 /* 1st signature byte */ + name_len + name_len2;
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2092,7 +2097,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
pSMB->InformationLevel =
cpu_to_le16(SMB_SET_FILE_RENAME_INFORMATION);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2159,7 +2164,7 @@ copyRetry:
}
count = 1 /* 1st signature byte */ + name_len + name_len2;
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2249,7 +2254,7 @@ createSymLinkRetry:
pSMB->DataOffset = cpu_to_le16(offset);
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_LINK);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2335,7 +2340,7 @@ createHardLinkRetry:
pSMB->DataOffset = cpu_to_le16(offset);
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_HLINK);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2406,7 +2411,7 @@ winCreateHardLinkRetry:
}
count = 1 /* string type byte */ + name_len + name_len2;
- pSMB->hdr.smb_buf_length += count;
+ inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2477,7 +2482,7 @@ querySymLinkRetry:
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_LINK);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2489,7 +2494,7 @@ querySymLinkRetry:
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
/* BB also check enough total bytes returned */
- if (rc || (pSMBr->ByteCount < 2))
+ if (rc || get_bcc(&pSMBr->hdr) < 2)
rc = -EIO;
else {
bool is_unicode;
@@ -2516,7 +2521,17 @@ querySymLinkRetry:
return rc;
}
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
+/*
+ * Recent Windows versions now create symlinks more frequently
+ * and they use the "reparse point" mechanism below. We can of course
+ * do symlinks nicely to Samba and other servers which support the
+ * CIFS Unix Extensions and we can also do SFU symlinks and "client only"
+ * "MF" symlinks optionally, but for recent Windows we really need to
+ * reenable the code below and fix the cifs_symlink callers to handle this.
+ * In the interim this code has been moved to its own config option so
+ * it is not compiled in by default until callers fixed up and more tested.
+ */
int
CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName,
@@ -2561,14 +2576,14 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon,
} else { /* decode response */
__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
__u32 data_count = le32_to_cpu(pSMBr->DataCount);
- if ((pSMBr->ByteCount < 2) || (data_offset > 512)) {
- /* BB also check enough total bytes returned */
+ if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
+ /* BB also check enough total bytes returned */
rc = -EIO; /* bad smb */
goto qreparse_out;
}
if (data_count && (data_count < 2048)) {
char *end_of_smb = 2 /* sizeof byte count */ +
- pSMBr->ByteCount + (char *)&pSMBr->ByteCount;
+ get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
struct reparse_data *reparse_buf =
(struct reparse_data *)
@@ -2618,7 +2633,7 @@ qreparse_out:
return rc;
}
-#endif /* CIFS_EXPERIMENTAL */
+#endif /* CIFS_SYMLINK_EXPERIMENTAL */ /* BB temporarily unused */
#ifdef CONFIG_CIFS_POSIX
@@ -2814,7 +2829,7 @@ queryAclRetry:
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2826,8 +2841,8 @@ queryAclRetry:
/* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 2))
/* BB also check enough total bytes returned */
+ if (rc || get_bcc(&pSMBr->hdr) < 2)
rc = -EIO; /* bad smb */
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -2908,7 +2923,7 @@ setAclRetry:
pSMB->ParameterCount = cpu_to_le16(params);
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -2966,7 +2981,7 @@ GetExtAttrRetry:
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_ATTR_FLAGS);
pSMB->Pad = 0;
pSMB->Fid = netfid;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->t2.ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -2976,8 +2991,8 @@ GetExtAttrRetry:
} else {
/* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 2))
/* BB also check enough total bytes returned */
+ if (rc || get_bcc(&pSMBr->hdr) < 2)
/* If rc should we check for EOPNOSUPP and
disable the srvino flag? or in caller? */
rc = -EIO; /* bad smb */
@@ -3052,6 +3067,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
char *end_of_smb;
__u32 data_count, data_offset, parm_count, parm_offset;
struct smb_com_ntransact_rsp *pSMBr;
+ u16 bcc;
*pdatalen = 0;
*pparmlen = 0;
@@ -3061,8 +3077,8 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
pSMBr = (struct smb_com_ntransact_rsp *)buf;
- /* ByteCount was converted from little endian in SendReceive */
- end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount +
+ bcc = get_bcc(&pSMBr->hdr);
+ end_of_smb = 2 /* sizeof byte count */ + bcc +
(char *)&pSMBr->ByteCount;
data_offset = le32_to_cpu(pSMBr->DataOffset);
@@ -3088,7 +3104,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
*ppdata, data_count, (data_count + *ppdata),
end_of_smb, pSMBr);
return -EINVAL;
- } else if (parm_count + data_count > pSMBr->ByteCount) {
+ } else if (parm_count + data_count > bcc) {
cFYI(1, "parm count and data count larger than SMB");
return -EINVAL;
}
@@ -3124,9 +3140,9 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid,
pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP |
CIFS_ACL_DACL);
pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */
- pSMB->hdr.smb_buf_length += 11;
+ inc_rfc1001_len(pSMB, 11);
iov[0].iov_base = (char *)pSMB;
- iov[0].iov_len = pSMB->hdr.smb_buf_length + 4;
+ iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
0);
@@ -3235,10 +3251,9 @@ setCifsAclRetry:
memcpy((char *) &pSMBr->hdr.Protocol + data_offset,
(char *) pntsd,
acllen);
- pSMB->hdr.smb_buf_length += (byte_count + data_count);
-
+ inc_rfc1001_len(pSMB, byte_count + data_count);
} else
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -3289,7 +3304,7 @@ QInfRetry:
}
pSMB->BufferFormat = 0x04;
name_len++; /* account for buffer type byte */
- pSMB->hdr.smb_buf_length += (__u16) name_len;
+ inc_rfc1001_len(pSMB, (__u16)name_len);
pSMB->ByteCount = cpu_to_le16(name_len);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3364,7 +3379,7 @@ QFileInfoRetry:
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
pSMB->Pad = 0;
pSMB->Fid = netfid;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -3375,7 +3390,7 @@ QFileInfoRetry:
if (rc) /* BB add auto retry on EOPNOTSUPP? */
rc = -EIO;
- else if (pSMBr->ByteCount < 40)
+ else if (get_bcc(&pSMBr->hdr) < 40)
rc = -EIO; /* bad smb */
else if (pFindData) {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -3451,7 +3466,7 @@ QPathInfoRetry:
else
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3463,9 +3478,9 @@ QPathInfoRetry:
if (rc) /* BB add auto retry on EOPNOTSUPP? */
rc = -EIO;
- else if (!legacy && (pSMBr->ByteCount < 40))
+ else if (!legacy && get_bcc(&pSMBr->hdr) < 40)
rc = -EIO; /* bad smb */
- else if (legacy && (pSMBr->ByteCount < 24))
+ else if (legacy && get_bcc(&pSMBr->hdr) < 24)
rc = -EIO; /* 24 or 26 expected but we do not read
last field */
else if (pFindData) {
@@ -3532,7 +3547,7 @@ UnixQFileInfoRetry:
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
pSMB->Pad = 0;
pSMB->Fid = netfid;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -3541,7 +3556,7 @@ UnixQFileInfoRetry:
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+ if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
"Unix Extensions can be disabled on mount "
"by specifying the nosfu mount option.");
@@ -3617,7 +3632,7 @@ UnixQPathInfoRetry:
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3627,7 +3642,7 @@ UnixQPathInfoRetry:
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
+ if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) {
cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n"
"Unix Extensions can be disabled on mount "
"by specifying the nosfu mount option.");
@@ -3731,7 +3746,7 @@ findFirstRetry:
/* BB what should we set StorageType to? Does it matter? BB */
pSMB->SearchStorageType = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -3860,7 +3875,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
byte_count = params + 1 /* pad */ ;
pSMB->TotalParameterCount = cpu_to_le16(params);
pSMB->ParameterCount = pSMB->TotalParameterCount;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4022,7 +4037,7 @@ GetInodeNumberRetry:
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_INTERNAL_INFO);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4032,8 +4047,8 @@ GetInodeNumberRetry:
} else {
/* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 2))
/* BB also check enough total bytes returned */
+ if (rc || get_bcc(&pSMBr->hdr) < 2)
/* If rc should we check for EOPNOSUPP and
disable the srvino flag? or in caller? */
rc = -EIO; /* bad smb */
@@ -4246,7 +4261,7 @@ getDFSRetry:
pSMB->ParameterCount = cpu_to_le16(params);
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->MaxReferralLevel = cpu_to_le16(3);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB,
@@ -4258,13 +4273,13 @@ getDFSRetry:
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
/* BB Also check if enough total bytes returned? */
- if (rc || (pSMBr->ByteCount < 17)) {
+ if (rc || get_bcc(&pSMBr->hdr) < 17) {
rc = -EIO; /* bad smb */
goto GetDFSRefExit;
}
cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d",
- pSMBr->ByteCount,
+ get_bcc(&pSMBr->hdr),
le16_to_cpu(pSMBr->t2.DataOffset));
/* parse returned result into more usable form */
@@ -4320,7 +4335,7 @@ oldQFSInfoRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_INFO_ALLOCATION);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4330,12 +4345,12 @@ oldQFSInfoRetry:
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 18))
+ if (rc || get_bcc(&pSMBr->hdr) < 18)
rc = -EIO; /* bad smb */
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
cFYI(1, "qfsinf resp BCC: %d Offset %d",
- pSMBr->ByteCount, data_offset);
+ get_bcc(&pSMBr->hdr), data_offset);
response_data = (FILE_SYSTEM_ALLOC_INFO *)
(((char *) &pSMBr->hdr.Protocol) + data_offset);
@@ -4399,7 +4414,7 @@ QFSInfoRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_SIZE_INFO);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4409,7 +4424,7 @@ QFSInfoRetry:
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 24))
+ if (rc || get_bcc(&pSMBr->hdr) < 24)
rc = -EIO; /* bad smb */
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4479,7 +4494,7 @@ QFSAttributeRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_ATTRIBUTE_INFO);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4489,7 +4504,7 @@ QFSAttributeRetry:
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 13)) {
+ if (rc || get_bcc(&pSMBr->hdr) < 13) {
/* BB also check if enough bytes returned */
rc = -EIO; /* bad smb */
} else {
@@ -4550,7 +4565,7 @@ QFSDeviceRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_DEVICE_INFO);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4560,7 +4575,8 @@ QFSDeviceRetry:
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < sizeof(FILE_SYSTEM_DEVICE_INFO)))
+ if (rc || get_bcc(&pSMBr->hdr) <
+ sizeof(FILE_SYSTEM_DEVICE_INFO))
rc = -EIO; /* bad smb */
else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4619,7 +4635,7 @@ QFSUnixRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_CIFS_UNIX_INFO);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4629,7 +4645,7 @@ QFSUnixRetry:
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 13)) {
+ if (rc || get_bcc(&pSMBr->hdr) < 13) {
rc = -EIO; /* bad smb */
} else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4702,7 +4718,7 @@ SETFSUnixRetry:
pSMB->ClientUnixMinor = cpu_to_le16(CIFS_UNIX_MINOR_VERSION);
pSMB->ClientUnixCap = cpu_to_le64(cap);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4764,7 +4780,7 @@ QFSPosixRetry:
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION);
pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_FS_INFO);
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4774,7 +4790,7 @@ QFSPosixRetry:
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 13)) {
+ if (rc || get_bcc(&pSMBr->hdr) < 13) {
rc = -EIO; /* bad smb */
} else {
__u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
@@ -4890,7 +4906,7 @@ SetEOFRetry:
pSMB->ParameterCount = cpu_to_le16(params);
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
parm_data->FileSize = cpu_to_le64(size);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -4969,7 +4985,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO);
}
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
if (rc) {
@@ -5037,7 +5053,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
else
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
@@ -5096,7 +5112,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
pSMB->Fid = fid;
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
*data_offset = delete_file ? 1 : 0;
rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
@@ -5169,7 +5185,7 @@ SetTimesRetry:
else
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -5221,7 +5237,7 @@ SetAttrLgcyRetry:
}
pSMB->attr = cpu_to_le16(dos_attrs);
pSMB->BufferFormat = 0x04;
- pSMB->hdr.smb_buf_length += name_len + 1;
+ inc_rfc1001_len(pSMB, name_len + 1);
pSMB->ByteCount = cpu_to_le16(name_len + 1);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -5326,7 +5342,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon,
pSMB->Fid = fid;
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
cifs_fill_unix_set_info(data_offset, args);
@@ -5402,7 +5418,7 @@ setPermsRetry:
pSMB->TotalDataCount = pSMB->DataCount;
pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
cifs_fill_unix_set_info(data_offset, args);
@@ -5418,79 +5434,6 @@ setPermsRetry:
return rc;
}
-int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
- const int notify_subdirs, const __u16 netfid,
- __u32 filter, struct file *pfile, int multishot,
- const struct nls_table *nls_codepage)
-{
- int rc = 0;
- struct smb_com_transaction_change_notify_req *pSMB = NULL;
- struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL;
- struct dir_notify_req *dnotify_req;
- int bytes_returned;
-
- cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
- rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
- (void **) &pSMBr);
- if (rc)
- return rc;
-
- pSMB->TotalParameterCount = 0 ;
- pSMB->TotalDataCount = 0;
- pSMB->MaxParameterCount = cpu_to_le32(2);
- /* BB find exact data count max from sess structure BB */
- pSMB->MaxDataCount = 0; /* same in little endian or be */
-/* BB VERIFY verify which is correct for above BB */
- pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
- MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
-
- pSMB->MaxSetupCount = 4;
- pSMB->Reserved = 0;
- pSMB->ParameterOffset = 0;
- pSMB->DataCount = 0;
- pSMB->DataOffset = 0;
- pSMB->SetupCount = 4; /* single byte does not need le conversion */
- pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE);
- pSMB->ParameterCount = pSMB->TotalParameterCount;
- if (notify_subdirs)
- pSMB->WatchTree = 1; /* one byte - no le conversion needed */
- pSMB->Reserved2 = 0;
- pSMB->CompletionFilter = cpu_to_le32(filter);
- pSMB->Fid = netfid; /* file handle always le */
- pSMB->ByteCount = 0;
-
- rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *)pSMBr, &bytes_returned,
- CIFS_ASYNC_OP);
- if (rc) {
- cFYI(1, "Error in Notify = %d", rc);
- } else {
- /* Add file to outstanding requests */
- /* BB change to kmem cache alloc */
- dnotify_req = kmalloc(
- sizeof(struct dir_notify_req),
- GFP_KERNEL);
- if (dnotify_req) {
- dnotify_req->Pid = pSMB->hdr.Pid;
- dnotify_req->PidHigh = pSMB->hdr.PidHigh;
- dnotify_req->Mid = pSMB->hdr.Mid;
- dnotify_req->Tid = pSMB->hdr.Tid;
- dnotify_req->Uid = pSMB->hdr.Uid;
- dnotify_req->netfid = netfid;
- dnotify_req->pfile = pfile;
- dnotify_req->filter = filter;
- dnotify_req->multishot = multishot;
- spin_lock(&GlobalMid_Lock);
- list_add_tail(&dnotify_req->lhead,
- &GlobalDnotifyReqList);
- spin_unlock(&GlobalMid_Lock);
- } else
- rc = -ENOMEM;
- }
- cifs_buf_release(pSMB);
- return rc;
-}
-
#ifdef CONFIG_CIFS_XATTR
/*
* Do a path-based QUERY_ALL_EAS call and parse the result. This is a common
@@ -5560,7 +5503,7 @@ QAllEAsRetry:
pSMB->ParameterCount = pSMB->TotalParameterCount;
pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS);
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
@@ -5576,7 +5519,7 @@ QAllEAsRetry:
of these trans2 responses */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- if (rc || (pSMBr->ByteCount < 4)) {
+ if (rc || get_bcc(&pSMBr->hdr) < 4) {
rc = -EIO; /* bad smb */
goto QAllEAsOut;
}
@@ -5773,7 +5716,7 @@ SetEARetry:
pSMB->ParameterCount = cpu_to_le16(params);
pSMB->TotalParameterCount = pSMB->ParameterCount;
pSMB->Reserved4 = 0;
- pSMB->hdr.smb_buf_length += byte_count;
+ inc_rfc1001_len(pSMB, byte_count);
pSMB->ByteCount = cpu_to_le16(byte_count);
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
@@ -5787,5 +5730,99 @@ SetEARetry:
return rc;
}
-
#endif
+
+#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* BB unused temporarily */
+/*
+ * Years ago the kernel added a "dnotify" function for the Samba server,
+ * to allow network clients (such as Windows) to display updated
+ * lists of files in directory listings automatically when
+ * files are added by one user when another user has the
+ * same directory open on their desktop. The Linux cifs kernel
+ * client hooked into the kernel side of this interface for
+ * the same reason, but ironically when the VFS moved from
+ * "dnotify" to "inotify" it became harder to plug in Linux
+ * network file system clients (the most obvious use case
+ * for notify interfaces is when multiple users can update
+ * the contents of the same directory - exactly what network
+ * file systems can do) although the server (Samba) could
+ * still use it. For the short term we leave the worker
+ * function ifdeffed out (below) until inotify is fixed
+ * in the VFS to make it easier to plug in network file
+ * system clients. If inotify turns out to be permanently
+ * incompatible for network fs clients, we could instead simply
+ * expose this config flag by adding a future cifs (and smb2) notify ioctl.
+ */
+int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon,
+ const int notify_subdirs, const __u16 netfid,
+ __u32 filter, struct file *pfile, int multishot,
+ const struct nls_table *nls_codepage)
+{
+ int rc = 0;
+ struct smb_com_transaction_change_notify_req *pSMB = NULL;
+ struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL;
+ struct dir_notify_req *dnotify_req;
+ int bytes_returned;
+
+ cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid);
+ rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
+ (void **) &pSMBr);
+ if (rc)
+ return rc;
+
+ pSMB->TotalParameterCount = 0 ;
+ pSMB->TotalDataCount = 0;
+ pSMB->MaxParameterCount = cpu_to_le32(2);
+ /* BB find exact data count max from sess structure BB */
+ pSMB->MaxDataCount = 0; /* same in little endian or be */
+/* BB VERIFY verify which is correct for above BB */
+ pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf -
+ MAX_CIFS_HDR_SIZE) & 0xFFFFFF00);
+
+ pSMB->MaxSetupCount = 4;
+ pSMB->Reserved = 0;
+ pSMB->ParameterOffset = 0;
+ pSMB->DataCount = 0;
+ pSMB->DataOffset = 0;
+ pSMB->SetupCount = 4; /* single byte does not need le conversion */
+ pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE);
+ pSMB->ParameterCount = pSMB->TotalParameterCount;
+ if (notify_subdirs)
+ pSMB->WatchTree = 1; /* one byte - no le conversion needed */
+ pSMB->Reserved2 = 0;
+ pSMB->CompletionFilter = cpu_to_le32(filter);
+ pSMB->Fid = netfid; /* file handle always le */
+ pSMB->ByteCount = 0;
+
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+ (struct smb_hdr *)pSMBr, &bytes_returned,
+ CIFS_ASYNC_OP);
+ if (rc) {
+ cFYI(1, "Error in Notify = %d", rc);
+ } else {
+ /* Add file to outstanding requests */
+ /* BB change to kmem cache alloc */
+ dnotify_req = kmalloc(
+ sizeof(struct dir_notify_req),
+ GFP_KERNEL);
+ if (dnotify_req) {
+ dnotify_req->Pid = pSMB->hdr.Pid;
+ dnotify_req->PidHigh = pSMB->hdr.PidHigh;
+ dnotify_req->Mid = pSMB->hdr.Mid;
+ dnotify_req->Tid = pSMB->hdr.Tid;
+ dnotify_req->Uid = pSMB->hdr.Uid;
+ dnotify_req->netfid = netfid;
+ dnotify_req->pfile = pfile;
+ dnotify_req->filter = filter;
+ dnotify_req->multishot = multishot;
+ spin_lock(&GlobalMid_Lock);
+ list_add_tail(&dnotify_req->lhead,
+ &GlobalDnotifyReqList);
+ spin_unlock(&GlobalMid_Lock);
+ } else
+ rc = -ENOMEM;
+ }
+ cifs_buf_release(pSMB);
+ return rc;
+}
+#endif /* was needed for dnotify, and will be needed for inotify once VFS is fixed */
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 277262a8e82f..da284e3cb653 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -102,6 +102,7 @@ struct smb_vol {
bool fsc:1; /* enable fscache */
bool mfsymlinks:1; /* use Minshall+French Symlinks */
bool multiuser:1;
+ bool use_smb2:1; /* force smb2 use on mount instead of cifs */
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -316,19 +317,19 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
/* fix up the BCC */
- byte_count = get_bcc_le(pTargetSMB);
+ byte_count = get_bcc(pTargetSMB);
byte_count += total_in_buf2;
/* is the result too big for the field? */
if (byte_count > USHRT_MAX)
return -EPROTO;
- put_bcc_le(byte_count, pTargetSMB);
+ put_bcc(byte_count, pTargetSMB);
- byte_count = pTargetSMB->smb_buf_length;
+ byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
byte_count += total_in_buf2;
/* don't allow buffer to overflow */
if (byte_count > CIFSMaxBufSize)
return -ENOBUFS;
- pTargetSMB->smb_buf_length = byte_count;
+ pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
@@ -495,8 +496,7 @@ incomplete_rcv:
/* Note that FC 1001 length is big endian on the wire,
but we convert it here so it is always manipulated
as host byte order */
- pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
- smb_buffer->smb_buf_length = pdu_length;
+ pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
@@ -735,7 +735,7 @@ multi_t2_fnd:
sock_release(csocket);
server->ssocket = NULL;
}
- /* buffer usuallly freed in free_mid - need to free it here on exit */
+ /* buffer usually freed in free_mid - need to free it here on exit */
cifs_buf_release(bigbuf);
if (smallbuf) /* no sense logging a debug message if NULL */
cifs_small_buf_release(smallbuf);
@@ -818,10 +818,11 @@ extract_hostname(const char *unc)
}
static int
-cifs_parse_mount_options(char *options, const char *devname,
+cifs_parse_mount_options(const char *mountdata, const char *devname,
struct smb_vol *vol)
{
char *value, *data, *end;
+ char *mountdata_copy, *options;
unsigned int temp_len, i, j;
char separator[2];
short int override_uid = -1;
@@ -861,9 +862,14 @@ cifs_parse_mount_options(char *options, const char *devname,
vol->actimeo = CIFS_DEF_ACTIMEO;
- if (!options)
- return 1;
+ if (!mountdata)
+ goto cifs_parse_mount_err;
+
+ mountdata_copy = kstrndup(mountdata, PAGE_SIZE, GFP_KERNEL);
+ if (!mountdata_copy)
+ goto cifs_parse_mount_err;
+ options = mountdata_copy;
end = options + strlen(options);
if (strncmp(options, "sep=", 4) == 0) {
if (options[4] != 0) {
@@ -889,17 +895,22 @@ cifs_parse_mount_options(char *options, const char *devname,
if (!value) {
printk(KERN_WARNING
"CIFS: invalid or missing username\n");
- return 1; /* needs_arg; */
+ goto cifs_parse_mount_err;
} else if (!*value) {
/* null user, ie anonymous, authentication */
vol->nullauth = 1;
}
if (strnlen(value, MAX_USERNAME_SIZE) <
MAX_USERNAME_SIZE) {
- vol->username = value;
+ vol->username = kstrdup(value, GFP_KERNEL);
+ if (!vol->username) {
+ printk(KERN_WARNING "CIFS: no memory "
+ "for username\n");
+ goto cifs_parse_mount_err;
+ }
} else {
printk(KERN_WARNING "CIFS: username too long\n");
- return 1;
+ goto cifs_parse_mount_err;
}
} else if (strnicmp(data, "pass", 4) == 0) {
if (!value) {
@@ -963,7 +974,7 @@ cifs_parse_mount_options(char *options, const char *devname,
if (vol->password == NULL) {
printk(KERN_WARNING "CIFS: no memory "
"for password\n");
- return 1;
+ goto cifs_parse_mount_err;
}
for (i = 0, j = 0; i < temp_len; i++, j++) {
vol->password[j] = value[i];
@@ -979,7 +990,7 @@ cifs_parse_mount_options(char *options, const char *devname,
if (vol->password == NULL) {
printk(KERN_WARNING "CIFS: no memory "
"for password\n");
- return 1;
+ goto cifs_parse_mount_err;
}
strcpy(vol->password, value);
}
@@ -989,11 +1000,16 @@ cifs_parse_mount_options(char *options, const char *devname,
vol->UNCip = NULL;
} else if (strnlen(value, INET6_ADDRSTRLEN) <
INET6_ADDRSTRLEN) {
- vol->UNCip = value;
+ vol->UNCip = kstrdup(value, GFP_KERNEL);
+ if (!vol->UNCip) {
+ printk(KERN_WARNING "CIFS: no memory "
+ "for UNC IP\n");
+ goto cifs_parse_mount_err;
+ }
} else {
printk(KERN_WARNING "CIFS: ip address "
"too long\n");
- return 1;
+ goto cifs_parse_mount_err;
}
} else if (strnicmp(data, "sec", 3) == 0) {
if (!value || !*value) {
@@ -1006,7 +1022,7 @@ cifs_parse_mount_options(char *options, const char *devname,
/* vol->secFlg |= CIFSSEC_MUST_SEAL |
CIFSSEC_MAY_KRB5; */
cERROR(1, "Krb5 cifs privacy not supported");
- return 1;
+ goto cifs_parse_mount_err;
} else if (strnicmp(value, "krb5", 4) == 0) {
vol->secFlg |= CIFSSEC_MAY_KRB5;
} else if (strnicmp(value, "ntlmsspi", 8) == 0) {
@@ -1036,7 +1052,23 @@ cifs_parse_mount_options(char *options, const char *devname,
vol->nullauth = 1;
} else {
cERROR(1, "bad security option: %s", value);
- return 1;
+ goto cifs_parse_mount_err;
+ }
+ } else if (strnicmp(data, "vers", 3) == 0) {
+ if (!value || !*value) {
+ cERROR(1, "no protocol version specified"
+ " after vers= mount option");
+ } else if ((strnicmp(value, "cifs", 4) == 0) ||
+ (strnicmp(value, "1", 1) == 0)) {
+ /* this is the default */
+ continue;
+ } else if ((strnicmp(value, "smb2", 4) == 0) ||
+ (strnicmp(value, "2", 1) == 0)) {
+#ifdef CONFIG_CIFS_SMB2
+ vol->use_smb2 = true;
+#else
+ cERROR(1, "smb2 support not enabled");
+#endif /* CONFIG_CIFS_SMB2 */
}
} else if ((strnicmp(data, "unc", 3) == 0)
|| (strnicmp(data, "target", 6) == 0)
@@ -1044,12 +1076,12 @@ cifs_parse_mount_options(char *options, const char *devname,
if (!value || !*value) {
printk(KERN_WARNING "CIFS: invalid path to "
"network resource\n");
- return 1; /* needs_arg; */
+ goto cifs_parse_mount_err;
}
if ((temp_len = strnlen(value, 300)) < 300) {
vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
if (vol->UNC == NULL)
- return 1;
+ goto cifs_parse_mount_err;
strcpy(vol->UNC, value);
if (strncmp(vol->UNC, "//", 2) == 0) {
vol->UNC[0] = '\\';
@@ -1058,27 +1090,32 @@ cifs_parse_mount_options(char *options, const char *devname,
printk(KERN_WARNING
"CIFS: UNC Path does not begin "
"with // or \\\\ \n");
- return 1;
+ goto cifs_parse_mount_err;
}
} else {
printk(KERN_WARNING "CIFS: UNC name too long\n");
- return 1;
+ goto cifs_parse_mount_err;
}
} else if ((strnicmp(data, "domain", 3) == 0)
|| (strnicmp(data, "workgroup", 5) == 0)) {
if (!value || !*value) {
printk(KERN_WARNING "CIFS: invalid domain name\n");
- return 1; /* needs_arg; */
+ goto cifs_parse_mount_err;
}
/* BB are there cases in which a comma can be valid in
a domain name and need special handling? */
if (strnlen(value, 256) < 256) {
- vol->domainname = value;
+ vol->domainname = kstrdup(value, GFP_KERNEL);
+ if (!vol->domainname) {
+ printk(KERN_WARNING "CIFS: no memory "
+ "for domainname\n");
+ goto cifs_parse_mount_err;
+ }
cFYI(1, "Domain name set");
} else {
printk(KERN_WARNING "CIFS: domain name too "
"long\n");
- return 1;
+ goto cifs_parse_mount_err;
}
} else if (strnicmp(data, "srcaddr", 7) == 0) {
vol->srcaddr.ss_family = AF_UNSPEC;
@@ -1086,7 +1123,7 @@ cifs_parse_mount_options(char *options, const char *devname,
if (!value || !*value) {
printk(KERN_WARNING "CIFS: srcaddr value"
" not specified.\n");
- return 1; /* needs_arg; */
+ goto cifs_parse_mount_err;
}
i = cifs_convert_address((struct sockaddr *)&vol->srcaddr,
value, strlen(value));
@@ -1094,20 +1131,20 @@ cifs_parse_mount_options(char *options, const char *devname,
printk(KERN_WARNING "CIFS: Could not parse"
" srcaddr: %s\n",
value);
- return 1;
+ goto cifs_parse_mount_err;
}
} else if (strnicmp(data, "prefixpath", 10) == 0) {
if (!value || !*value) {
printk(KERN_WARNING
"CIFS: invalid path prefix\n");
- return 1; /* needs_argument */
+ goto cifs_parse_mount_err;
}
if ((temp_len = strnlen(value, 1024)) < 1024) {
if (value[0] != '/')
temp_len++; /* missing leading slash */
vol->prepath = kmalloc(temp_len+1, GFP_KERNEL);
if (vol->prepath == NULL)
- return 1;
+ goto cifs_parse_mount_err;
if (value[0] != '/') {
vol->prepath[0] = '/';
strcpy(vol->prepath+1, value);
@@ -1116,24 +1153,33 @@ cifs_parse_mount_options(char *options, const char *devname,
cFYI(1, "prefix path %s", vol->prepath);
} else {
printk(KERN_WARNING "CIFS: prefix too long\n");
- return 1;
+ goto cifs_parse_mount_err;
}
} else if (strnicmp(data, "iocharset", 9) == 0) {
if (!value || !*value) {
printk(KERN_WARNING "CIFS: invalid iocharset "
"specified\n");
- return 1; /* needs_arg; */
+ goto cifs_parse_mount_err;
}
if (strnlen(value, 65) < 65) {
- if (strnicmp(value, "default", 7))
- vol->iocharset = value;
+ if (strnicmp(value, "default", 7)) {
+ vol->iocharset = kstrdup(value,
+ GFP_KERNEL);
+
+ if (!vol->iocharset) {
+ printk(KERN_WARNING "CIFS: no "
+ "memory for"
+ "charset\n");
+ goto cifs_parse_mount_err;
+ }
+ }
/* if iocharset not set then load_nls_default
is used by caller */
cFYI(1, "iocharset set to %s", value);
} else {
printk(KERN_WARNING "CIFS: iocharset name "
"too long.\n");
- return 1;
+ goto cifs_parse_mount_err;
}
} else if (!strnicmp(data, "uid", 3) && value && *value) {
vol->linux_uid = simple_strtoul(value, &value, 0);
@@ -1246,7 +1292,7 @@ cifs_parse_mount_options(char *options, const char *devname,
if (vol->actimeo > CIFS_MAX_ACTIMEO) {
cERROR(1, "CIFS: attribute cache"
"timeout too large");
- return 1;
+ goto cifs_parse_mount_err;
}
}
} else if (strnicmp(data, "credentials", 4) == 0) {
@@ -1390,7 +1436,7 @@ cifs_parse_mount_options(char *options, const char *devname,
#ifndef CONFIG_CIFS_FSCACHE
cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE"
"kernel config option set");
- return 1;
+ goto cifs_parse_mount_err;
#endif
vol->fsc = true;
} else if (strnicmp(data, "mfsymlinks", 10) == 0) {
@@ -1405,12 +1451,12 @@ cifs_parse_mount_options(char *options, const char *devname,
if (devname == NULL) {
printk(KERN_WARNING "CIFS: Missing UNC name for mount "
"target\n");
- return 1;
+ goto cifs_parse_mount_err;
}
if ((temp_len = strnlen(devname, 300)) < 300) {
vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
if (vol->UNC == NULL)
- return 1;
+ goto cifs_parse_mount_err;
strcpy(vol->UNC, devname);
if (strncmp(vol->UNC, "//", 2) == 0) {
vol->UNC[0] = '\\';
@@ -1418,21 +1464,21 @@ cifs_parse_mount_options(char *options, const char *devname,
} else if (strncmp(vol->UNC, "\\\\", 2) != 0) {
printk(KERN_WARNING "CIFS: UNC Path does not "
"begin with // or \\\\ \n");
- return 1;
+ goto cifs_parse_mount_err;
}
value = strpbrk(vol->UNC+2, "/\\");
if (value)
*value = '\\';
} else {
printk(KERN_WARNING "CIFS: UNC name too long\n");
- return 1;
+ goto cifs_parse_mount_err;
}
}
if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
cERROR(1, "Multiuser mounts currently require krb5 "
"authentication!");
- return 1;
+ goto cifs_parse_mount_err;
}
if (vol->UNCip == NULL)
@@ -1450,7 +1496,12 @@ cifs_parse_mount_options(char *options, const char *devname,
printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
"specified with no gid= option.\n");
+ kfree(mountdata_copy);
return 0;
+
+cifs_parse_mount_err:
+ kfree(mountdata_copy);
+ return 1;
}
/** Returns true if srcaddr isn't specified and rhs isn't
@@ -2280,7 +2331,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
smb_buf = (struct smb_hdr *)ses_init_buf;
/* sizeof RFC1002_SESSION_REQUEST with no scope */
- smb_buf->smb_buf_length = 0x81000044;
+ smb_buf->smb_buf_length = cpu_to_be32(0x81000044);
rc = smb_send(server, smb_buf, 0x44);
kfree(ses_init_buf);
/*
@@ -2691,8 +2742,12 @@ cleanup_volume_info(struct smb_vol **pvolume_info)
return;
volume_info = *pvolume_info;
+ kfree(volume_info->username);
kzfree(volume_info->password);
kfree(volume_info->UNC);
+ kfree(volume_info->UNCip);
+ kfree(volume_info->domainname);
+ kfree(volume_info->iocharset);
kfree(volume_info->prepath);
kfree(volume_info);
*pvolume_info = NULL;
@@ -2729,11 +2784,65 @@ build_unc_path_to_root(const struct smb_vol *volume_info,
full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */
return full_path;
}
+
+/*
+ * Perform a dfs referral query for a share and (optionally) prefix
+ *
+ * If a referral is found, cifs_sb->mountdata will be (re-)allocated
+ * to a string containing updated options for the submount. Otherwise it
+ * will be left untouched.
+ *
+ * Returns the rc from get_dfs_path to the caller, which can be used to
+ * determine whether there were referrals.
+ */
+static int
+expand_dfs_referral(int xid, struct cifsSesInfo *pSesInfo,
+ struct smb_vol *volume_info, struct cifs_sb_info *cifs_sb,
+ int check_prefix)
+{
+ int rc;
+ unsigned int num_referrals = 0;
+ struct dfs_info3_param *referrals = NULL;
+ char *full_path = NULL, *ref_path = NULL, *mdata = NULL;
+
+ full_path = build_unc_path_to_root(volume_info, cifs_sb);
+ if (IS_ERR(full_path))
+ return PTR_ERR(full_path);
+
+ /* For DFS paths, skip the first '\' of the UNC */
+ ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1;
+
+ rc = get_dfs_path(xid, pSesInfo , ref_path, cifs_sb->local_nls,
+ &num_referrals, &referrals,
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+ if (!rc && num_referrals > 0) {
+ char *fake_devname = NULL;
+
+ mdata = cifs_compose_mount_options(cifs_sb->mountdata,
+ full_path + 1, referrals,
+ &fake_devname);
+
+ free_dfs_info_array(referrals, num_referrals);
+ kfree(fake_devname);
+
+ if (cifs_sb->mountdata != NULL)
+ kfree(cifs_sb->mountdata);
+
+ if (IS_ERR(mdata)) {
+ rc = PTR_ERR(mdata);
+ mdata = NULL;
+ }
+ cifs_sb->mountdata = mdata;
+ }
+ kfree(full_path);
+ return rc;
+}
#endif
int
cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
- char *mount_data_global, const char *devname)
+ const char *devname)
{
int rc;
int xid;
@@ -2742,13 +2851,20 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
struct cifsTconInfo *tcon;
struct TCP_Server_Info *srvTcp;
char *full_path;
- char *mount_data = mount_data_global;
struct tcon_link *tlink;
#ifdef CONFIG_CIFS_DFS_UPCALL
- struct dfs_info3_param *referrals = NULL;
- unsigned int num_referrals = 0;
int referral_walks_count = 0;
try_mount_again:
+ /* cleanup activities if we're chasing a referral */
+ if (referral_walks_count) {
+ if (tcon)
+ cifs_put_tcon(tcon);
+ else if (pSesInfo)
+ cifs_put_smb_ses(pSesInfo);
+
+ cleanup_volume_info(&volume_info);
+ FreeXid(xid);
+ }
#endif
rc = 0;
tcon = NULL;
@@ -2765,7 +2881,8 @@ try_mount_again:
goto out;
}
- if (cifs_parse_mount_options(mount_data, devname, volume_info)) {
+ if (cifs_parse_mount_options(cifs_sb->mountdata, devname,
+ volume_info)) {
rc = -EINVAL;
goto out;
}
@@ -2861,6 +2978,24 @@ try_mount_again:
(tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
remote_path_check:
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ /*
+ * Perform an unconditional check for whether there are DFS
+ * referrals for this path without prefix, to provide support
+ * for DFS referrals from w2k8 servers which don't seem to respond
+ * with PATH_NOT_COVERED to requests that include the prefix.
+ * Chase the referral if found, otherwise continue normally.
+ */
+ if (referral_walks_count == 0) {
+ int refrc = expand_dfs_referral(xid, pSesInfo, volume_info,
+ cifs_sb, false);
+ if (!refrc) {
+ referral_walks_count++;
+ goto try_mount_again;
+ }
+ }
+#endif
+
/* check if a whole path (including prepath) is not remote */
if (!rc && tcon) {
/* build_path_to_root works only when we have a valid tcon */
@@ -2894,46 +3029,15 @@ remote_path_check:
if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0)
convert_delimiter(cifs_sb->prepath,
CIFS_DIR_SEP(cifs_sb));
- full_path = build_unc_path_to_root(volume_info, cifs_sb);
- if (IS_ERR(full_path)) {
- rc = PTR_ERR(full_path);
- goto mount_fail_check;
- }
-
- cFYI(1, "Getting referral for: %s", full_path);
- rc = get_dfs_path(xid, pSesInfo , full_path + 1,
- cifs_sb->local_nls, &num_referrals, &referrals,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (!rc && num_referrals > 0) {
- char *fake_devname = NULL;
-
- if (mount_data != mount_data_global)
- kfree(mount_data);
- mount_data = cifs_compose_mount_options(
- cifs_sb->mountdata, full_path + 1,
- referrals, &fake_devname);
+ rc = expand_dfs_referral(xid, pSesInfo, volume_info, cifs_sb,
+ true);
- free_dfs_info_array(referrals, num_referrals);
- kfree(fake_devname);
- kfree(full_path);
-
- if (IS_ERR(mount_data)) {
- rc = PTR_ERR(mount_data);
- mount_data = NULL;
- goto mount_fail_check;
- }
-
- if (tcon)
- cifs_put_tcon(tcon);
- else if (pSesInfo)
- cifs_put_smb_ses(pSesInfo);
-
- cleanup_volume_info(&volume_info);
+ if (!rc) {
referral_walks_count++;
- FreeXid(xid);
goto try_mount_again;
}
+ goto mount_fail_check;
#else /* No DFS support, return error on mount */
rc = -EOPNOTSUPP;
#endif
@@ -2966,8 +3070,6 @@ remote_path_check:
mount_fail_check:
/* on error free sesinfo and tcon struct if needed */
if (rc) {
- if (mount_data != mount_data_global)
- kfree(mount_data);
/* If find_unc succeeded then rc == 0 so we can not end */
/* up accidentally freeing someone elses tcon struct */
if (tcon)
@@ -3083,7 +3185,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
bcc_ptr += strlen("?????");
bcc_ptr += 1;
count = bcc_ptr - &pSMB->Password[0];
- pSMB->hdr.smb_buf_length += count;
+ pSMB->hdr.smb_buf_length = cpu_to_be32(be32_to_cpu(
+ pSMB->hdr.smb_buf_length) + count);
pSMB->ByteCount = cpu_to_le16(count);
rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length,
@@ -3258,7 +3361,9 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
struct cifsSesInfo *ses;
struct cifsTconInfo *tcon = NULL;
struct smb_vol *vol_info;
- char username[MAX_USERNAME_SIZE + 1];
+ char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */
+ /* We used to have this as MAX_USERNAME which is */
+ /* way too big now (256 instead of 32) */
vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
if (vol_info == NULL) {
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 993f82045bf6..55d87ac52000 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -45,7 +45,7 @@
#include "cifs_debug.h"
#include "cifsfs.h"
-#ifdef CONFIG_CIFS_EXPERIMENTAL
+#ifdef CIFS_NFSD_EXPORT
static struct dentry *cifs_get_parent(struct dentry *dentry)
{
/* BB need to add code here eventually to enable export via NFSD */
@@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = {
.encode_fs = */
};
-#endif /* EXPERIMENTAL */
+#endif /* CIFS_NFSD_EXPORT */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index faf59529e847..c672afef0c09 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -857,95 +857,6 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
cifsi->server_eof = end_of_write;
}
-ssize_t cifs_user_write(struct file *file, const char __user *write_data,
- size_t write_size, loff_t *poffset)
-{
- struct inode *inode = file->f_path.dentry->d_inode;
- int rc = 0;
- unsigned int bytes_written = 0;
- unsigned int total_written;
- struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
- int xid;
- struct cifsFileInfo *open_file;
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
-
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-
- /* cFYI(1, " write %d bytes to offset %lld of %s", write_size,
- *poffset, file->f_path.dentry->d_name.name); */
-
- if (file->private_data == NULL)
- return -EBADF;
-
- open_file = file->private_data;
- pTcon = tlink_tcon(open_file->tlink);
-
- rc = generic_write_checks(file, poffset, &write_size, 0);
- if (rc)
- return rc;
-
- xid = GetXid();
-
- for (total_written = 0; write_size > total_written;
- total_written += bytes_written) {
- rc = -EAGAIN;
- while (rc == -EAGAIN) {
- if (file->private_data == NULL) {
- /* file has been closed on us */
- FreeXid(xid);
- /* if we have gotten here we have written some data
- and blocked, and the file has been freed on us while
- we blocked so return what we managed to write */
- return total_written;
- }
- if (open_file->invalidHandle) {
- /* we could deadlock if we called
- filemap_fdatawait from here so tell
- reopen_file not to flush data to server
- now */
- rc = cifs_reopen_file(open_file, false);
- if (rc != 0)
- break;
- }
-
- rc = CIFSSMBWrite(xid, pTcon,
- open_file->netfid,
- min_t(const int, cifs_sb->wsize,
- write_size - total_written),
- *poffset, &bytes_written,
- NULL, write_data + total_written, 0);
- }
- if (rc || (bytes_written == 0)) {
- if (total_written)
- break;
- else {
- FreeXid(xid);
- return rc;
- }
- } else {
- cifs_update_eof(cifsi, *poffset, bytes_written);
- *poffset += bytes_written;
- }
- }
-
- cifs_stats_bytes_written(pTcon, total_written);
-
-/* Do not update local mtime - server will set its actual value on write
- * inode->i_ctime = inode->i_mtime =
- * current_fs_time(inode->i_sb);*/
- if (total_written > 0) {
- spin_lock(&inode->i_lock);
- if (*poffset > inode->i_size)
- i_size_write(inode, *poffset);
- spin_unlock(&inode->i_lock);
- }
- mark_inode_dirty_sync(inode);
-
- FreeXid(xid);
- return total_written;
-}
-
static ssize_t cifs_write(struct cifsFileInfo *open_file,
const char *write_data, size_t write_size,
loff_t *poffset)
@@ -1420,9 +1331,10 @@ retry_write:
return rc;
}
-static int cifs_writepage(struct page *page, struct writeback_control *wbc)
+static int
+cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
{
- int rc = -EFAULT;
+ int rc;
int xid;
xid = GetXid();
@@ -1442,15 +1354,29 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
* to fail to update with the state of the page correctly.
*/
set_page_writeback(page);
+retry_write:
rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE);
- SetPageUptodate(page); /* BB add check for error and Clearuptodate? */
- unlock_page(page);
+ if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL)
+ goto retry_write;
+ else if (rc == -EAGAIN)
+ redirty_page_for_writepage(wbc, page);
+ else if (rc != 0)
+ SetPageError(page);
+ else
+ SetPageUptodate(page);
end_page_writeback(page);
page_cache_release(page);
FreeXid(xid);
return rc;
}
+static int cifs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ int rc = cifs_writepage_locked(page, wbc);
+ unlock_page(page);
+ return rc;
+}
+
static int cifs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
@@ -1519,8 +1445,13 @@ int cifs_strict_fsync(struct file *file, int datasync)
cFYI(1, "Sync file - name: %s datasync: 0x%x",
file->f_path.dentry->d_name.name, datasync);
- if (!CIFS_I(inode)->clientCanCacheRead)
- cifs_invalidate_mapping(inode);
+ if (!CIFS_I(inode)->clientCanCacheRead) {
+ rc = cifs_invalidate_mapping(inode);
+ if (rc) {
+ cFYI(1, "rc: %d during invalidate phase", rc);
+ rc = 0; /* don't care about it in fsync */
+ }
+ }
tcon = tlink_tcon(smbfile->tlink);
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
@@ -1726,7 +1657,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
return total_written;
}
-static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
+ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
ssize_t written;
@@ -1849,17 +1780,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
return total_read;
}
-ssize_t cifs_user_read(struct file *file, char __user *read_data,
- size_t read_size, loff_t *poffset)
-{
- struct iovec iov;
- iov.iov_base = read_data;
- iov.iov_len = read_size;
-
- return cifs_iovec_read(file, &iov, 1, poffset);
-}
-
-static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
+ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
ssize_t read;
@@ -1987,8 +1908,11 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
xid = GetXid();
- if (!CIFS_I(inode)->clientCanCacheRead)
- cifs_invalidate_mapping(inode);
+ if (!CIFS_I(inode)->clientCanCacheRead) {
+ rc = cifs_invalidate_mapping(inode);
+ if (rc)
+ return rc;
+ }
rc = generic_file_mmap(file, vma);
if (rc == 0)
@@ -2415,6 +2339,27 @@ static void cifs_invalidate_page(struct page *page, unsigned long offset)
cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
}
+/*
+ * If @page is still dirty, write it out with a WB_SYNC_ALL writeback control
+ * that covers exactly this page; in every case invalidate the page's fscache
+ * copy afterwards.
+ *
+ * Returns 0 when nothing had to be written, otherwise the result of
+ * cifs_writepage_locked().
+ */
+static int cifs_launder_page(struct page *page)
+{
+	const loff_t first_byte = page_offset(page);
+	const loff_t last_byte = first_byte + (loff_t)(PAGE_CACHE_SIZE - 1);
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0,
+		.range_start = first_byte,
+		.range_end = last_byte,
+	};
+	int rc;
+
+	cFYI(1, "Launder page: %p", page);
+
+	rc = clear_page_dirty_for_io(page) ?
+		cifs_writepage_locked(page, &wbc) : 0;
+
+	cifs_fscache_invalidate_page(page, page->mapping->host);
+	return rc;
+}
+
void cifs_oplock_break(struct work_struct *work)
{
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -2486,7 +2431,7 @@ const struct address_space_operations cifs_addr_ops = {
.set_page_dirty = __set_page_dirty_nobuffers,
.releasepage = cifs_release_page,
.invalidatepage = cifs_invalidate_page,
- /* .direct_IO = */
+ .launder_page = cifs_launder_page,
};
/*
@@ -2503,5 +2448,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.set_page_dirty = __set_page_dirty_nobuffers,
.releasepage = cifs_release_page,
.invalidatepage = cifs_invalidate_page,
- /* .direct_IO = */
+ .launder_page = cifs_launder_page,
};
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8852470b4fbb..de02ed5e25c2 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -878,7 +878,7 @@ retry_iget5_locked:
}
/* gets root inode */
-struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
+struct inode *cifs_root_iget(struct super_block *sb)
{
int xid;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -1683,71 +1683,70 @@ cifs_inode_needs_reval(struct inode *inode)
/*
* Zap the cache. Called when invalid_mapping flag is set.
*/
-void
+int
cifs_invalidate_mapping(struct inode *inode)
{
- int rc;
+ int rc = 0;
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
cifs_i->invalid_mapping = false;
- /* write back any cached data */
if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
- rc = filemap_write_and_wait(inode->i_mapping);
- mapping_set_error(inode->i_mapping, rc);
+ rc = invalidate_inode_pages2(inode->i_mapping);
+ if (rc) {
+ cERROR(1, "%s: could not invalidate inode %p", __func__,
+ inode);
+ cifs_i->invalid_mapping = true;
+ }
}
- invalidate_remote_inode(inode);
+
cifs_fscache_reset_inode_cookie(inode);
+ return rc;
}
-int cifs_revalidate_file(struct file *filp)
+int cifs_revalidate_file_attr(struct file *filp)
{
int rc = 0;
struct inode *inode = filp->f_path.dentry->d_inode;
struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
if (!cifs_inode_needs_reval(inode))
- goto check_inval;
+ return rc;
if (tlink_tcon(cfile->tlink)->unix_ext)
rc = cifs_get_file_info_unix(filp);
else
rc = cifs_get_file_info(filp);
-check_inval:
- if (CIFS_I(inode)->invalid_mapping)
- cifs_invalidate_mapping(inode);
-
return rc;
}
-/* revalidate a dentry's inode attributes */
-int cifs_revalidate_dentry(struct dentry *dentry)
+int cifs_revalidate_dentry_attr(struct dentry *dentry)
{
int xid;
int rc = 0;
- char *full_path = NULL;
struct inode *inode = dentry->d_inode;
struct super_block *sb = dentry->d_sb;
+ char *full_path = NULL;
if (inode == NULL)
return -ENOENT;
- xid = GetXid();
-
if (!cifs_inode_needs_reval(inode))
- goto check_inval;
+ return rc;
+
+ xid = GetXid();
/* can not safely grab the rename sem here if rename calls revalidate
since that would deadlock */
full_path = build_path_from_dentry(dentry);
if (full_path == NULL) {
rc = -ENOMEM;
- goto check_inval;
+ goto out;
}
- cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
- "jiffies %ld", full_path, inode, inode->i_count.counter,
+ cFYI(1, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time "
+ "%ld jiffies %ld", full_path, inode, inode->i_count.counter,
dentry, dentry->d_time, jiffies);
if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
@@ -1756,41 +1755,83 @@ int cifs_revalidate_dentry(struct dentry *dentry)
rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
xid, NULL);
-check_inval:
- if (CIFS_I(inode)->invalid_mapping)
- cifs_invalidate_mapping(inode);
-
+out:
kfree(full_path);
FreeXid(xid);
return rc;
}
+/*
+ * Refresh the attributes of the inode behind @filp and, when that succeeds
+ * and the inode is flagged invalid_mapping, invalidate its cached mapping.
+ * Returns 0 or the first error encountered.
+ */
+int cifs_revalidate_file(struct file *filp)
+{
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int rc = cifs_revalidate_file_attr(filp);
+
+	if (rc == 0 && CIFS_I(inode)->invalid_mapping)
+		rc = cifs_invalidate_mapping(inode);
+
+	return rc;
+}
+
+/*
+ * Revalidate a dentry's inode attributes and, when that succeeds and the
+ * inode is flagged invalid_mapping, invalidate its cached mapping as well.
+ * Returns 0 or the first error encountered.
+ */
+int cifs_revalidate_dentry(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int rc = cifs_revalidate_dentry_attr(dentry);
+
+	/* the inode is only looked at once the attribute refresh succeeded */
+	if (rc == 0 && CIFS_I(inode)->invalid_mapping)
+		rc = cifs_invalidate_mapping(inode);
+
+	return rc;
+}
+
int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb);
- int err = cifs_revalidate_dentry(dentry);
-
- if (!err) {
- generic_fillattr(dentry->d_inode, stat);
- stat->blksize = CIFS_MAX_MSGSIZE;
- stat->ino = CIFS_I(dentry->d_inode)->uniqueid;
+ struct inode *inode = dentry->d_inode;
+ int rc;
- /*
- * If on a multiuser mount without unix extensions, and the
- * admin hasn't overridden them, set the ownership to the
- * fsuid/fsgid of the current process.
- */
- if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
- !tcon->unix_ext) {
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
- stat->uid = current_fsuid();
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
- stat->gid = current_fsgid();
+ /*
+ * We need to be sure that all dirty pages are written and the server
+ * has actual ctime, mtime and file length.
+ */
+ if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
+ inode->i_mapping->nrpages != 0) {
+ rc = filemap_fdatawait(inode->i_mapping);
+ if (rc) {
+ mapping_set_error(inode->i_mapping, rc);
+ return rc;
}
}
- return err;
+
+ rc = cifs_revalidate_dentry_attr(dentry);
+ if (rc)
+ return rc;
+
+ generic_fillattr(inode, stat);
+ stat->blksize = CIFS_MAX_MSGSIZE;
+ stat->ino = CIFS_I(inode)->uniqueid;
+
+ /*
+ * If on a multiuser mount without unix extensions, and the admin hasn't
+ * overridden them, set the ownership to the fsuid/fsgid of the current
+ * process.
+ */
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
+ !tcon->unix_ext) {
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
+ stat->uid = current_fsuid();
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID))
+ stat->gid = current_fsgid();
+ }
+ return rc;
}
static int cifs_truncate_page(struct address_space *mapping, loff_t from)
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 0c684ae4c071..907531ac5888 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -304,12 +304,10 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */
- buffer->smb_buf_length =
+ buffer->smb_buf_length = cpu_to_be32(
(2 * word_count) + sizeof(struct smb_hdr) -
4 /* RFC 1001 length field does not count */ +
- 2 /* for bcc field itself */ ;
- /* Note that this is the only network field that has to be converted
- to big endian and it is done just before we send it */
+ 2 /* for bcc field itself */) ;
buffer->Protocol[0] = 0xFF;
buffer->Protocol[1] = 'S';
@@ -424,7 +422,7 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
int
checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
{
- __u32 len = smb->smb_buf_length;
+ __u32 len = be32_to_cpu(smb->smb_buf_length);
__u32 clc_len; /* calculated length */
cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len);
@@ -464,7 +462,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
if (check_smb_hdr(smb, mid))
return 1;
- clc_len = smbCalcSize_LE(smb);
+ clc_len = smbCalcSize(smb);
if (4 + len != length) {
cERROR(1, "Length read does not match RFC1001 length %d",
@@ -521,7 +519,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
(struct smb_com_transaction_change_notify_rsp *)buf;
struct file_notify_information *pnotify;
__u32 data_offset = 0;
- if (get_bcc_le(buf) > sizeof(struct file_notify_information)) {
+ if (get_bcc(buf) > sizeof(struct file_notify_information)) {
data_offset = le32_to_cpu(pSMBr->DataOffset);
pnotify = (struct file_notify_information *)
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 79f641eeda30..79b71c2c7c9d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -919,13 +919,6 @@ smbCalcSize(struct smb_hdr *ptr)
2 /* size of the bcc field */ + get_bcc(ptr));
}
-unsigned int
-smbCalcSize_LE(struct smb_hdr *ptr)
-{
- return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) +
- 2 /* size of the bcc field */ + get_bcc_le(ptr));
-}
-
/* The following are taken from fs/ntfs/util.c */
#define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 645114ad0a10..7dd462100378 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -621,7 +621,7 @@ ssetup_ntlmssp_authenticate:
and rest of bcc area. This allows us to avoid
a large buffer 17K allocation */
iov[0].iov_base = (char *)pSMB;
- iov[0].iov_len = smb_buf->smb_buf_length + 4;
+ iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
/* setting this here allows the code at the end of the function
to free the request buffer if there's an error */
@@ -656,7 +656,7 @@ ssetup_ntlmssp_authenticate:
* to use challenge/response method (i.e. Password bit is 1).
*/
- calc_lanman_hash(ses->password, ses->server->cryptkey,
+ rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
ses->server->secMode & SECMODE_PW_ENCRYPT ?
true : false, lnm_session_key);
@@ -859,9 +859,10 @@ ssetup_ntlmssp_authenticate:
iov[2].iov_len = (long) bcc_ptr - (long) str_area;
count = iov[1].iov_len + iov[2].iov_len;
- smb_buf->smb_buf_length += count;
+ smb_buf->smb_buf_length =
+ cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
- put_bcc_le(count, smb_buf);
+ put_bcc(count, smb_buf);
rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type,
CIFS_LOG_ERROR);
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
deleted file mode 100644
index 04721485925d..000000000000
--- a/fs/cifs/smbdes.c
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- Unix SMB/Netbios implementation.
- Version 1.9.
-
- a partial implementation of DES designed for use in the
- SMB authentication protocol
-
- Copyright (C) Andrew Tridgell 1998
- Modified by Steve French (sfrench@us.ibm.com) 2002,2004
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-*/
-
-/* NOTES:
-
- This code makes no attempt to be fast! In fact, it is a very
- slow implementation
-
- This code is NOT a complete DES implementation. It implements only
- the minimum necessary for SMB authentication, as used by all SMB
- products (including every copy of Microsoft Windows95 ever sold)
-
- In particular, it can only do a unchained forward DES pass. This
- means it is not possible to use this code for encryption/decryption
- of data, instead it is only useful as a "hash" algorithm.
-
- There is no entry point into this code that allows normal DES operation.
-
- I believe this means that this code does not come under ITAR
- regulations but this is NOT a legal opinion. If you are concerned
- about the applicability of ITAR regulations to this code then you
- should confirm it for yourself (and maybe let me know if you come
- up with a different answer to the one above)
-*/
-#include <linux/slab.h>
-#define uchar unsigned char
-
-static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9,
- 1, 58, 50, 42, 34, 26, 18,
- 10, 2, 59, 51, 43, 35, 27,
- 19, 11, 3, 60, 52, 44, 36,
- 63, 55, 47, 39, 31, 23, 15,
- 7, 62, 54, 46, 38, 30, 22,
- 14, 6, 61, 53, 45, 37, 29,
- 21, 13, 5, 28, 20, 12, 4
-};
-
-static uchar perm2[48] = { 14, 17, 11, 24, 1, 5,
- 3, 28, 15, 6, 21, 10,
- 23, 19, 12, 4, 26, 8,
- 16, 7, 27, 20, 13, 2,
- 41, 52, 31, 37, 47, 55,
- 30, 40, 51, 45, 33, 48,
- 44, 49, 39, 56, 34, 53,
- 46, 42, 50, 36, 29, 32
-};
-
-static uchar perm3[64] = { 58, 50, 42, 34, 26, 18, 10, 2,
- 60, 52, 44, 36, 28, 20, 12, 4,
- 62, 54, 46, 38, 30, 22, 14, 6,
- 64, 56, 48, 40, 32, 24, 16, 8,
- 57, 49, 41, 33, 25, 17, 9, 1,
- 59, 51, 43, 35, 27, 19, 11, 3,
- 61, 53, 45, 37, 29, 21, 13, 5,
- 63, 55, 47, 39, 31, 23, 15, 7
-};
-
-static uchar perm4[48] = { 32, 1, 2, 3, 4, 5,
- 4, 5, 6, 7, 8, 9,
- 8, 9, 10, 11, 12, 13,
- 12, 13, 14, 15, 16, 17,
- 16, 17, 18, 19, 20, 21,
- 20, 21, 22, 23, 24, 25,
- 24, 25, 26, 27, 28, 29,
- 28, 29, 30, 31, 32, 1
-};
-
-static uchar perm5[32] = { 16, 7, 20, 21,
- 29, 12, 28, 17,
- 1, 15, 23, 26,
- 5, 18, 31, 10,
- 2, 8, 24, 14,
- 32, 27, 3, 9,
- 19, 13, 30, 6,
- 22, 11, 4, 25
-};
-
-static uchar perm6[64] = { 40, 8, 48, 16, 56, 24, 64, 32,
- 39, 7, 47, 15, 55, 23, 63, 31,
- 38, 6, 46, 14, 54, 22, 62, 30,
- 37, 5, 45, 13, 53, 21, 61, 29,
- 36, 4, 44, 12, 52, 20, 60, 28,
- 35, 3, 43, 11, 51, 19, 59, 27,
- 34, 2, 42, 10, 50, 18, 58, 26,
- 33, 1, 41, 9, 49, 17, 57, 25
-};
-
-static uchar sc[16] = { 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 };
-
-static uchar sbox[8][4][16] = {
- {{14, 4, 13, 1, 2, 15, 11, 8, 3, 10, 6, 12, 5, 9, 0, 7},
- {0, 15, 7, 4, 14, 2, 13, 1, 10, 6, 12, 11, 9, 5, 3, 8},
- {4, 1, 14, 8, 13, 6, 2, 11, 15, 12, 9, 7, 3, 10, 5, 0},
- {15, 12, 8, 2, 4, 9, 1, 7, 5, 11, 3, 14, 10, 0, 6, 13} },
-
- {{15, 1, 8, 14, 6, 11, 3, 4, 9, 7, 2, 13, 12, 0, 5, 10},
- {3, 13, 4, 7, 15, 2, 8, 14, 12, 0, 1, 10, 6, 9, 11, 5},
- {0, 14, 7, 11, 10, 4, 13, 1, 5, 8, 12, 6, 9, 3, 2, 15},
- {13, 8, 10, 1, 3, 15, 4, 2, 11, 6, 7, 12, 0, 5, 14, 9} },
-
- {{10, 0, 9, 14, 6, 3, 15, 5, 1, 13, 12, 7, 11, 4, 2, 8},
- {13, 7, 0, 9, 3, 4, 6, 10, 2, 8, 5, 14, 12, 11, 15, 1},
- {13, 6, 4, 9, 8, 15, 3, 0, 11, 1, 2, 12, 5, 10, 14, 7},
- {1, 10, 13, 0, 6, 9, 8, 7, 4, 15, 14, 3, 11, 5, 2, 12} },
-
- {{7, 13, 14, 3, 0, 6, 9, 10, 1, 2, 8, 5, 11, 12, 4, 15},
- {13, 8, 11, 5, 6, 15, 0, 3, 4, 7, 2, 12, 1, 10, 14, 9},
- {10, 6, 9, 0, 12, 11, 7, 13, 15, 1, 3, 14, 5, 2, 8, 4},
- {3, 15, 0, 6, 10, 1, 13, 8, 9, 4, 5, 11, 12, 7, 2, 14} },
-
- {{2, 12, 4, 1, 7, 10, 11, 6, 8, 5, 3, 15, 13, 0, 14, 9},
- {14, 11, 2, 12, 4, 7, 13, 1, 5, 0, 15, 10, 3, 9, 8, 6},
- {4, 2, 1, 11, 10, 13, 7, 8, 15, 9, 12, 5, 6, 3, 0, 14},
- {11, 8, 12, 7, 1, 14, 2, 13, 6, 15, 0, 9, 10, 4, 5, 3} },
-
- {{12, 1, 10, 15, 9, 2, 6, 8, 0, 13, 3, 4, 14, 7, 5, 11},
- {10, 15, 4, 2, 7, 12, 9, 5, 6, 1, 13, 14, 0, 11, 3, 8},
- {9, 14, 15, 5, 2, 8, 12, 3, 7, 0, 4, 10, 1, 13, 11, 6},
- {4, 3, 2, 12, 9, 5, 15, 10, 11, 14, 1, 7, 6, 0, 8, 13} },
-
- {{4, 11, 2, 14, 15, 0, 8, 13, 3, 12, 9, 7, 5, 10, 6, 1},
- {13, 0, 11, 7, 4, 9, 1, 10, 14, 3, 5, 12, 2, 15, 8, 6},
- {1, 4, 11, 13, 12, 3, 7, 14, 10, 15, 6, 8, 0, 5, 9, 2},
- {6, 11, 13, 8, 1, 4, 10, 7, 9, 5, 0, 15, 14, 2, 3, 12} },
-
- {{13, 2, 8, 4, 6, 15, 11, 1, 10, 9, 3, 14, 5, 0, 12, 7},
- {1, 15, 13, 8, 10, 3, 7, 4, 12, 5, 6, 11, 0, 14, 9, 2},
- {7, 11, 4, 1, 9, 12, 14, 2, 0, 6, 10, 13, 15, 3, 5, 8},
- {2, 1, 14, 7, 4, 10, 8, 13, 15, 12, 9, 0, 3, 5, 6, 11} }
-};
-
-static void
-permute(char *out, char *in, uchar *p, int n)
-{
- int i;
- for (i = 0; i < n; i++)
- out[i] = in[p[i] - 1];
-}
-
-static void
-lshift(char *d, int count, int n)
-{
- char out[64];
- int i;
- for (i = 0; i < n; i++)
- out[i] = d[(i + count) % n];
- for (i = 0; i < n; i++)
- d[i] = out[i];
-}
-
-static void
-concat(char *out, char *in1, char *in2, int l1, int l2)
-{
- while (l1--)
- *out++ = *in1++;
- while (l2--)
- *out++ = *in2++;
-}
-
-static void
-xor(char *out, char *in1, char *in2, int n)
-{
- int i;
- for (i = 0; i < n; i++)
- out[i] = in1[i] ^ in2[i];
-}
-
-static void
-dohash(char *out, char *in, char *key, int forw)
-{
- int i, j, k;
- char *pk1;
- char c[28];
- char d[28];
- char *cd;
- char (*ki)[48];
- char *pd1;
- char l[32], r[32];
- char *rl;
-
- /* Have to reduce stack usage */
- pk1 = kmalloc(56+56+64+64, GFP_KERNEL);
- if (pk1 == NULL)
- return;
-
- ki = kmalloc(16*48, GFP_KERNEL);
- if (ki == NULL) {
- kfree(pk1);
- return;
- }
-
- cd = pk1 + 56;
- pd1 = cd + 56;
- rl = pd1 + 64;
-
- permute(pk1, key, perm1, 56);
-
- for (i = 0; i < 28; i++)
- c[i] = pk1[i];
- for (i = 0; i < 28; i++)
- d[i] = pk1[i + 28];
-
- for (i = 0; i < 16; i++) {
- lshift(c, sc[i], 28);
- lshift(d, sc[i], 28);
-
- concat(cd, c, d, 28, 28);
- permute(ki[i], cd, perm2, 48);
- }
-
- permute(pd1, in, perm3, 64);
-
- for (j = 0; j < 32; j++) {
- l[j] = pd1[j];
- r[j] = pd1[j + 32];
- }
-
- for (i = 0; i < 16; i++) {
- char *er; /* er[48] */
- char *erk; /* erk[48] */
- char b[8][6];
- char *cb; /* cb[32] */
- char *pcb; /* pcb[32] */
- char *r2; /* r2[32] */
-
- er = kmalloc(48+48+32+32+32, GFP_KERNEL);
- if (er == NULL) {
- kfree(pk1);
- kfree(ki);
- return;
- }
- erk = er+48;
- cb = erk+48;
- pcb = cb+32;
- r2 = pcb+32;
-
- permute(er, r, perm4, 48);
-
- xor(erk, er, ki[forw ? i : 15 - i], 48);
-
- for (j = 0; j < 8; j++)
- for (k = 0; k < 6; k++)
- b[j][k] = erk[j * 6 + k];
-
- for (j = 0; j < 8; j++) {
- int m, n;
- m = (b[j][0] << 1) | b[j][5];
-
- n = (b[j][1] << 3) | (b[j][2] << 2) | (b[j][3] <<
- 1) | b[j][4];
-
- for (k = 0; k < 4; k++)
- b[j][k] =
- (sbox[j][m][n] & (1 << (3 - k))) ? 1 : 0;
- }
-
- for (j = 0; j < 8; j++)
- for (k = 0; k < 4; k++)
- cb[j * 4 + k] = b[j][k];
- permute(pcb, cb, perm5, 32);
-
- xor(r2, l, pcb, 32);
-
- for (j = 0; j < 32; j++)
- l[j] = r[j];
-
- for (j = 0; j < 32; j++)
- r[j] = r2[j];
-
- kfree(er);
- }
-
- concat(rl, r, l, 32, 32);
-
- permute(out, rl, perm6, 64);
- kfree(pk1);
- kfree(ki);
-}
-
-static void
-str_to_key(unsigned char *str, unsigned char *key)
-{
- int i;
-
- key[0] = str[0] >> 1;
- key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2);
- key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3);
- key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4);
- key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5);
- key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6);
- key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7);
- key[7] = str[6] & 0x7F;
- for (i = 0; i < 8; i++)
- key[i] = (key[i] << 1);
-}
-
-static void
-smbhash(unsigned char *out, const unsigned char *in, unsigned char *key,
- int forw)
-{
- int i;
- char *outb; /* outb[64] */
- char *inb; /* inb[64] */
- char *keyb; /* keyb[64] */
- unsigned char key2[8];
-
- outb = kmalloc(64 * 3, GFP_KERNEL);
- if (outb == NULL)
- return;
-
- inb = outb + 64;
- keyb = inb + 64;
-
- str_to_key(key, key2);
-
- for (i = 0; i < 64; i++) {
- inb[i] = (in[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
- keyb[i] = (key2[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0;
- outb[i] = 0;
- }
-
- dohash(outb, inb, keyb, forw);
-
- for (i = 0; i < 8; i++)
- out[i] = 0;
-
- for (i = 0; i < 64; i++) {
- if (outb[i])
- out[i / 8] |= (1 << (7 - (i % 8)));
- }
- kfree(outb);
-}
-
-void
-E_P16(unsigned char *p14, unsigned char *p16)
-{
- unsigned char sp8[8] =
- { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };
- smbhash(p16, sp8, p14, 1);
- smbhash(p16 + 8, sp8, p14 + 7, 1);
-}
-
-void
-E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
-{
- smbhash(p24, c8, p21, 1);
- smbhash(p24 + 8, c8, p21 + 7, 1);
- smbhash(p24 + 16, c8, p21 + 14, 1);
-}
-
-#if 0 /* currently unused */
-static void
-D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
-{
- smbhash(out, in, p14, 0);
- smbhash(out + 8, in + 8, p14 + 7, 0);
-}
-
-static void
-E_old_pw_hash(unsigned char *p14, unsigned char *in, unsigned char *out)
-{
- smbhash(out, in, p14, 1);
- smbhash(out + 8, in + 8, p14 + 7, 1);
-}
-/* these routines are currently unneeded, but may be
- needed later */
-void
-cred_hash1(unsigned char *out, unsigned char *in, unsigned char *key)
-{
- unsigned char buf[8];
-
- smbhash(buf, in, key, 1);
- smbhash(out, buf, key + 9, 1);
-}
-
-void
-cred_hash2(unsigned char *out, unsigned char *in, unsigned char *key)
-{
- unsigned char buf[8];
- static unsigned char key2[8];
-
- smbhash(buf, in, key, 1);
- key2[0] = key[7];
- smbhash(out, buf, key2, 1);
-}
-
-void
-cred_hash3(unsigned char *out, unsigned char *in, unsigned char *key, int forw)
-{
- static unsigned char key2[8];
-
- smbhash(out, in, key, forw);
- key2[0] = key[7];
- smbhash(out + 8, in + 8, key2, forw);
-}
-#endif /* unneeded routines */
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index b5041c849981..1525d5e662b6 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -47,6 +47,88 @@
#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
#define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val)))
+/*
+ * Spread the 56 bits held in str[0..6] over key[0..7], seven bits per output
+ * byte, and place them in bits 7..1 of each byte; bit 0 of every output byte
+ * is left clear.
+ */
+static void
+str_to_key(unsigned char *str, unsigned char *key)
+{
+	int idx;
+
+	key[0] = (str[0] >> 1) << 1;
+
+	/* bytes 1..6 each take the tail of one input byte and the head of the next */
+	for (idx = 1; idx < 7; idx++) {
+		unsigned int seven;
+
+		seven = ((str[idx - 1] << (7 - idx)) | (str[idx] >> (idx + 1)))
+			& 0x7F;
+		key[idx] = seven << 1;
+	}
+
+	key[7] = (str[6] & 0x7F) << 1;
+}
+
+/*
+ * Encrypt the 8 bytes at @in into @out with "ecb(des)", using a key that
+ * str_to_key() expands from the 7 bytes at @key.
+ *
+ * Returns 0 on success or a negative error code if the cipher cannot be
+ * allocated, keyed or run.  The cipher handle is released on every path
+ * after a successful allocation.
+ */
+static int
+smbhash(unsigned char *out, const unsigned char *in, unsigned char *key)
+{
+	int rc;
+	unsigned char key2[8];
+	struct crypto_blkcipher *tfm_des;
+	struct scatterlist sgin, sgout;
+	struct blkcipher_desc desc;
+
+	str_to_key(key, key2);
+
+	tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm_des)) {
+		rc = PTR_ERR(tfm_des);
+		cERROR(1, "could not allocate des crypto API\n");
+		goto smbhash_err;
+	}
+
+	desc.tfm = tfm_des;
+	/* desc lives on the stack: do not pass random flag bits down */
+	desc.flags = 0;
+
+	rc = crypto_blkcipher_setkey(tfm_des, key2, 8);
+	if (rc) {
+		cERROR(1, "could not set des key rc: %d\n", rc);
+		goto smbhash_free;
+	}
+
+	sg_init_one(&sgin, in, 8);
+	sg_init_one(&sgout, out, 8);
+
+	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8);
+	if (rc)
+		cERROR(1, "could not encrypt crypt key rc: %d\n", rc);
+
+smbhash_free:
+	/* release the cipher on success as well as on failure */
+	crypto_free_blkcipher(tfm_des);
+smbhash_err:
+	return rc;
+}
+
+/*
+ * Fill the 16 bytes at @p16 by running smbhash() twice over the fixed
+ * 8-byte block "KGS!@#$%": once keyed from @p14 and once keyed from
+ * @p14 + 7.  Stops at, and returns, the first smbhash() error; 0 otherwise.
+ */
+static int
+E_P16(unsigned char *p14, unsigned char *p16)
+{
+	unsigned char sp8[8] =
+	    { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 };
+	int rc = smbhash(p16, sp8, p14);
+
+	if (rc == 0)
+		rc = smbhash(p16 + 8, sp8, p14 + 7);
+
+	return rc;
+}
+
+/*
+ * Fill the 24 bytes at @p24 with three smbhash() results over the 8-byte
+ * block @c8, keyed from @p21, @p21 + 7 and @p21 + 14 in turn.  Stops at,
+ * and returns, the first smbhash() error; 0 otherwise.
+ */
+static int
+E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
+{
+	int rc = 0;
+	int part;
+
+	for (part = 0; part < 3; part++) {
+		rc = smbhash(p24 + 8 * part, c8, p21 + 7 * part);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+
/* produce a md4 message digest from data of length n bytes */
int
mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
@@ -87,40 +169,30 @@ mdfour_err:
return rc;
}
-/* Does the des encryption from the NT or LM MD4 hash. */
-static void
-SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8,
- unsigned char p24[24])
-{
- unsigned char p21[21];
-
- memset(p21, '\0', 21);
-
- memcpy(p21, passwd, 16);
- E_P24(p21, c8, p24);
-}
-
/*
This implements the X/Open SMB password encryption
It takes a password, a 8 byte "crypt key" and puts 24 bytes of
encrypted password into p24 */
/* Note that password must be uppercased and null terminated */
-void
+int
SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24)
{
- unsigned char p14[15], p21[21];
+ int rc;
+ unsigned char p14[14], p16[16], p21[21];
- memset(p21, '\0', 21);
memset(p14, '\0', 14);
- strncpy((char *) p14, (char *) passwd, 14);
+ memset(p16, '\0', 16);
+ memset(p21, '\0', 21);
-/* strupper((char *)p14); *//* BB at least uppercase the easy range */
- E_P16(p14, p21);
+ memcpy(p14, passwd, 14);
+ rc = E_P16(p14, p16);
+ if (rc)
+ return rc;
- SMBOWFencrypt(p21, c8, p24);
+ memcpy(p21, p16, 16);
+ rc = E_P24(p21, c8, p24);
- memset(p14, 0, 15);
- memset(p21, 0, 21);
+ return rc;
}
/* Routines for Windows NT MD4 Hash functions. */
@@ -279,16 +351,18 @@ int
SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24)
{
int rc;
- unsigned char p21[21];
+ unsigned char p16[16], p21[21];
+ memset(p16, '\0', 16);
memset(p21, '\0', 21);
- rc = E_md4hash(passwd, p21);
+ rc = E_md4hash(passwd, p16);
if (rc) {
cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc);
return rc;
}
- SMBOWFencrypt(p21, c8, p24);
+ memcpy(p21, p16, 16);
+ rc = E_P24(p21, c8, p24);
return rc;
}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 46d8756f2b24..f2513fb8c391 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -129,7 +129,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
unsigned int len = iov[0].iov_len;
unsigned int total_len;
int first_vec = 0;
- unsigned int smb_buf_length = smb_buffer->smb_buf_length;
+ unsigned int smb_buf_length = be32_to_cpu(smb_buffer->smb_buf_length);
struct socket *ssocket = server->ssocket;
if (ssocket == NULL)
@@ -144,17 +144,10 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
else
smb_msg.msg_flags = MSG_NOSIGNAL;
- /* smb header is converted in header_assemble. bcc and rest of SMB word
- area, and byte area if necessary, is converted to littleendian in
- cifssmb.c and RFC1001 len is converted to bigendian in smb_send
- Flags2 is converted in SendReceive */
-
-
total_len = 0;
for (i = 0; i < n_vec; i++)
total_len += iov[i].iov_len;
- smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length);
cFYI(1, "Sending smb: total_len %d", total_len);
dump_smb(smb_buffer, len);
@@ -243,7 +236,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
/* Don't want to modify the buffer as a
side effect of this call. */
- smb_buffer->smb_buf_length = smb_buf_length;
+ smb_buffer->smb_buf_length = cpu_to_be32(smb_buf_length);
return rc;
}
@@ -387,7 +380,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
#ifdef CONFIG_CIFS_STATS2
atomic_inc(&server->inSend);
#endif
- rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+ rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
#ifdef CONFIG_CIFS_STATS2
atomic_dec(&server->inSend);
mid->when_sent = jiffies;
@@ -422,7 +415,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses,
int resp_buf_type;
iov[0].iov_base = (char *)in_buf;
- iov[0].iov_len = in_buf->smb_buf_length + 4;
+ iov[0].iov_len = be32_to_cpu(in_buf->smb_buf_length) + 4;
flags |= CIFS_NO_RESP;
rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc);
@@ -488,10 +481,10 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
int rc = 0;
/* -4 for RFC1001 length and +2 for BCC field */
- in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2;
+ in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4 + 2);
in_buf->Command = SMB_COM_NT_CANCEL;
in_buf->WordCount = 0;
- put_bcc_le(0, in_buf);
+ put_bcc(0, in_buf);
mutex_lock(&server->srv_mutex);
rc = cifs_sign_smb(in_buf, server, &mid->sequence_number);
@@ -499,7 +492,7 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
mutex_unlock(&server->srv_mutex);
return rc;
}
- rc = smb_send(server, in_buf, in_buf->smb_buf_length);
+ rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
mutex_unlock(&server->srv_mutex);
cFYI(1, "issued NT_CANCEL for mid %u, rc = %d",
@@ -612,7 +605,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
return rc;
}
- receive_len = midQ->resp_buf->smb_buf_length;
+ receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length);
if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
cERROR(1, "Frame too large received. Length: %d Xid: %d",
@@ -651,11 +644,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
rc = map_smb_to_linux_error(midQ->resp_buf,
flags & CIFS_LOG_ERROR);
- /* convert ByteCount if necessary */
- if (receive_len >= sizeof(struct smb_hdr) - 4
- /* do not count RFC1001 header */ +
- (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ )
- put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
if ((flags & CIFS_NO_RESP) == 0)
midQ->resp_buf = NULL; /* mark it so buf will
not be freed by
@@ -698,9 +686,10 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
to the same server. We may make this configurable later or
use ses->maxReq */
- if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
+ MAX_CIFS_HDR_SIZE - 4) {
cERROR(1, "Illegal length, greater than maximum frame, %d",
- in_buf->smb_buf_length);
+ be32_to_cpu(in_buf->smb_buf_length));
return -EIO;
}
@@ -733,7 +722,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
#ifdef CONFIG_CIFS_STATS2
atomic_inc(&ses->server->inSend);
#endif
- rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
+ rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
#ifdef CONFIG_CIFS_STATS2
atomic_dec(&ses->server->inSend);
midQ->when_sent = jiffies;
@@ -768,7 +757,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
return rc;
}
- receive_len = midQ->resp_buf->smb_buf_length;
+ receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length);
if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
cERROR(1, "Frame too large received. Length: %d Xid: %d",
@@ -781,7 +770,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
if (midQ->resp_buf && out_buf
&& (midQ->midState == MID_RESPONSE_RECEIVED)) {
- out_buf->smb_buf_length = receive_len;
+ out_buf->smb_buf_length = cpu_to_be32(receive_len);
memcpy((char *)out_buf + 4,
(char *)midQ->resp_buf + 4,
receive_len);
@@ -800,16 +789,10 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
}
}
- *pbytes_returned = out_buf->smb_buf_length;
+ *pbytes_returned = be32_to_cpu(out_buf->smb_buf_length);
/* BB special case reconnect tid and uid here? */
rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
-
- /* convert ByteCount if necessary */
- if (receive_len >= sizeof(struct smb_hdr) - 4
- /* do not count RFC1001 header */ +
- (2 * out_buf->WordCount) + 2 /* bcc */ )
- put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf);
} else {
rc = -EIO;
cERROR(1, "Bad MID state?");
@@ -877,9 +860,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
to the same server. We may make this configurable later or
use ses->maxReq */
- if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
+ MAX_CIFS_HDR_SIZE - 4) {
cERROR(1, "Illegal length, greater than maximum frame, %d",
- in_buf->smb_buf_length);
+ be32_to_cpu(in_buf->smb_buf_length));
return -EIO;
}
@@ -910,7 +894,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
#ifdef CONFIG_CIFS_STATS2
atomic_inc(&ses->server->inSend);
#endif
- rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length);
+ rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
#ifdef CONFIG_CIFS_STATS2
atomic_dec(&ses->server->inSend);
midQ->when_sent = jiffies;
@@ -977,7 +961,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
if (rc != 0)
return rc;
- receive_len = midQ->resp_buf->smb_buf_length;
+ receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length);
if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) {
cERROR(1, "Frame too large received. Length: %d Xid: %d",
receive_len, xid);
@@ -993,7 +977,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
goto out;
}
- out_buf->smb_buf_length = receive_len;
+ out_buf->smb_buf_length = cpu_to_be32(receive_len);
memcpy((char *)out_buf + 4,
(char *)midQ->resp_buf + 4,
receive_len);
@@ -1012,17 +996,11 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
}
}
- *pbytes_returned = out_buf->smb_buf_length;
+ *pbytes_returned = be32_to_cpu(out_buf->smb_buf_length);
/* BB special case reconnect tid and uid here? */
rc = map_smb_to_linux_error(out_buf, 0 /* no log */ );
- /* convert ByteCount if necessary */
- if (receive_len >= sizeof(struct smb_hdr) - 4
- /* do not count RFC1001 header */ +
- (2 * out_buf->WordCount) + 2 /* bcc */ )
- put_bcc(get_bcc_le(out_buf), out_buf);
-
out:
delete_mid(midQ);
if (rstart && rc == -EACCES)
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index eae2a1491608..912995e013ec 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -112,6 +112,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
struct cifsTconInfo *pTcon;
struct super_block *sb;
char *full_path;
+ struct cifs_ntsd *pacl;
if (direntry == NULL)
return -EIO;
@@ -166,6 +167,25 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
(__u16)value_size, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
+ strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
+ pacl = kmalloc(value_size, GFP_KERNEL);
+ if (!pacl) {
+ cFYI(1, "%s: Can't allocate memory for ACL",
+ __func__);
+ rc = -ENOMEM;
+ } else {
+#ifdef CONFIG_CIFS_ACL
+ memcpy(pacl, ea_value, value_size);
+ rc = set_cifs_acl(pacl, value_size,
+ direntry->d_inode, full_path);
+ if (rc == 0) /* force revalidate of the inode */
+ CIFS_I(direntry->d_inode)->time = 0;
+#else
+ cFYI(1, "Set CIFS ACL not supported yet");
+#endif /* CONFIG_CIFS_ACL */
+ /*
+ * The buffer is allocated whether or not CONFIG_CIFS_ACL
+ * is set, so it must be released on both paths.
+ */
+ kfree(pacl);
+ }
} else {
int temp;
temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 2b8dae4d121e..a46126fd5735 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,6 +336,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
int len = de->d_name.len;
int error;
+ dentry_unhash(de);
+
error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
if (!error) {
/* VFS may delete the child */
@@ -359,6 +361,9 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
int new_length = new_dentry->d_name.len;
int error;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
coda_i2f(new_dir), old_length, new_length,
(const char *) old_name, (const char *)new_name);
diff --git a/fs/compat.c b/fs/compat.c
index 72fe6cda9108..0ea00832de23 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1306,241 +1306,6 @@ compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int
return do_sys_open(dfd, filename, flags, mode);
}
-/*
- * compat_count() counts the number of arguments/envelopes. It is basically
- * a copy of count() from fs/exec.c, except that it works with 32 bit argv
- * and envp pointers.
- */
-static int compat_count(compat_uptr_t __user *argv, int max)
-{
- int i = 0;
-
- if (argv != NULL) {
- for (;;) {
- compat_uptr_t p;
-
- if (get_user(p, argv))
- return -EFAULT;
- if (!p)
- break;
- argv++;
- if (i++ >= max)
- return -E2BIG;
-
- if (fatal_signal_pending(current))
- return -ERESTARTNOHAND;
- cond_resched();
- }
- }
- return i;
-}
-
-/*
- * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c
- * except that it works with 32 bit argv and envp pointers.
- */
-static int compat_copy_strings(int argc, compat_uptr_t __user *argv,
- struct linux_binprm *bprm)
-{
- struct page *kmapped_page = NULL;
- char *kaddr = NULL;
- unsigned long kpos = 0;
- int ret;
-
- while (argc-- > 0) {
- compat_uptr_t str;
- int len;
- unsigned long pos;
-
- if (get_user(str, argv+argc) ||
- !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) {
- ret = -EFAULT;
- goto out;
- }
-
- if (len > MAX_ARG_STRLEN) {
- ret = -E2BIG;
- goto out;
- }
-
- /* We're going to work our way backwords. */
- pos = bprm->p;
- str += len;
- bprm->p -= len;
-
- while (len > 0) {
- int offset, bytes_to_copy;
-
- if (fatal_signal_pending(current)) {
- ret = -ERESTARTNOHAND;
- goto out;
- }
- cond_resched();
-
- offset = pos % PAGE_SIZE;
- if (offset == 0)
- offset = PAGE_SIZE;
-
- bytes_to_copy = offset;
- if (bytes_to_copy > len)
- bytes_to_copy = len;
-
- offset -= bytes_to_copy;
- pos -= bytes_to_copy;
- str -= bytes_to_copy;
- len -= bytes_to_copy;
-
- if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
- struct page *page;
-
- page = get_arg_page(bprm, pos, 1);
- if (!page) {
- ret = -E2BIG;
- goto out;
- }
-
- if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
- kunmap(kmapped_page);
- put_page(kmapped_page);
- }
- kmapped_page = page;
- kaddr = kmap(kmapped_page);
- kpos = pos & PAGE_MASK;
- flush_cache_page(bprm->vma, kpos,
- page_to_pfn(kmapped_page));
- }
- if (copy_from_user(kaddr+offset, compat_ptr(str),
- bytes_to_copy)) {
- ret = -EFAULT;
- goto out;
- }
- }
- }
- ret = 0;
-out:
- if (kmapped_page) {
- flush_kernel_dcache_page(kmapped_page);
- kunmap(kmapped_page);
- put_page(kmapped_page);
- }
- return ret;
-}
-
-/*
- * compat_do_execve() is mostly a copy of do_execve(), with the exception
- * that it processes 32 bit argv and envp pointers.
- */
-int compat_do_execve(char * filename,
- compat_uptr_t __user *argv,
- compat_uptr_t __user *envp,
- struct pt_regs * regs)
-{
- struct linux_binprm *bprm;
- struct file *file;
- struct files_struct *displaced;
- bool clear_in_exec;
- int retval;
-
- retval = unshare_files(&displaced);
- if (retval)
- goto out_ret;
-
- retval = -ENOMEM;
- bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
- if (!bprm)
- goto out_files;
-
- retval = prepare_bprm_creds(bprm);
- if (retval)
- goto out_free;
-
- retval = check_unsafe_exec(bprm);
- if (retval < 0)
- goto out_free;
- clear_in_exec = retval;
- current->in_execve = 1;
-
- file = open_exec(filename);
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- goto out_unmark;
-
- sched_exec();
-
- bprm->file = file;
- bprm->filename = filename;
- bprm->interp = filename;
-
- retval = bprm_mm_init(bprm);
- if (retval)
- goto out_file;
-
- bprm->argc = compat_count(argv, MAX_ARG_STRINGS);
- if ((retval = bprm->argc) < 0)
- goto out;
-
- bprm->envc = compat_count(envp, MAX_ARG_STRINGS);
- if ((retval = bprm->envc) < 0)
- goto out;
-
- retval = prepare_binprm(bprm);
- if (retval < 0)
- goto out;
-
- retval = copy_strings_kernel(1, &bprm->filename, bprm);
- if (retval < 0)
- goto out;
-
- bprm->exec = bprm->p;
- retval = compat_copy_strings(bprm->envc, envp, bprm);
- if (retval < 0)
- goto out;
-
- retval = compat_copy_strings(bprm->argc, argv, bprm);
- if (retval < 0)
- goto out;
-
- retval = search_binary_handler(bprm, regs);
- if (retval < 0)
- goto out;
-
- /* execve succeeded */
- current->fs->in_exec = 0;
- current->in_execve = 0;
- acct_update_integrals(current);
- free_bprm(bprm);
- if (displaced)
- put_files_struct(displaced);
- return retval;
-
-out:
- if (bprm->mm) {
- acct_arg_size(bprm, 0);
- mmput(bprm->mm);
- }
-
-out_file:
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- }
-
-out_unmark:
- if (clear_in_exec)
- current->fs->in_exec = 0;
- current->in_execve = 0;
-
-out_free:
- free_bprm(bprm);
-
-out_files:
- if (displaced)
- reset_files_struct(displaced);
-out_ret:
- return retval;
-}
-
#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t))
static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9a37a9b6de3a..9d17d350abc5 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,6 +1359,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
struct module *subsys_owner = NULL, *dead_item_owner = NULL;
int ret;
+ dentry_unhash(dentry);
+
if (dentry->d_parent == configfs_sb->s_root)
return -EPERM;
diff --git a/fs/dcache.c b/fs/dcache.c
index 22a0ef41bad1..37f72ee5bf7c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -35,6 +35,7 @@
#include <linux/hardirq.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
+#include <linux/prefetch.h>
#include "internal.h"
/*
@@ -1219,7 +1220,7 @@ void shrink_dcache_parent(struct dentry * parent)
EXPORT_SYMBOL(shrink_dcache_parent);
/*
- * Scan `nr' dentries and return the number which remain.
+ * Scan `sc->nr_to_scan' dentries and return the number which remain.
*
* We need to avoid reentering the filesystem if the caller is performing a
* GFP_NOFS allocation attempt. One example deadlock is:
@@ -1230,8 +1231,12 @@ EXPORT_SYMBOL(shrink_dcache_parent);
*
* In this case we return -1 to tell the caller that we baled.
*/
-static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink,
+ struct shrink_control *sc)
{
+ int nr = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
+
if (nr) {
if (!(gfp_mask & __GFP_FS))
return -1;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 89d394d8fe24..90f76575c056 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -428,26 +428,17 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[32];
- int buf_size;
+ size_t buf_size;
+ bool bv;
u32 *val = file->private_data;
buf_size = min(count, (sizeof(buf)-1));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;
- switch (buf[0]) {
- case 'y':
- case 'Y':
- case '1':
- *val = 1;
- break;
- case 'n':
- case 'N':
- case '0':
- *val = 0;
- break;
- }
-
+ if (strtobool(buf, &bv) == 0)
+ *val = bv;
+
return count;
}
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 0d329ff8ed4c..9b026ea8baa9 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -100,6 +100,7 @@ struct dlm_cluster {
unsigned int cl_log_debug;
unsigned int cl_protocol;
unsigned int cl_timewarn_cs;
+ unsigned int cl_waitwarn_us;
};
enum {
@@ -114,6 +115,7 @@ enum {
CLUSTER_ATTR_LOG_DEBUG,
CLUSTER_ATTR_PROTOCOL,
CLUSTER_ATTR_TIMEWARN_CS,
+ CLUSTER_ATTR_WAITWARN_US,
};
struct cluster_attribute {
@@ -166,6 +168,7 @@ CLUSTER_ATTR(scan_secs, 1);
CLUSTER_ATTR(log_debug, 0);
CLUSTER_ATTR(protocol, 0);
CLUSTER_ATTR(timewarn_cs, 1);
+CLUSTER_ATTR(waitwarn_us, 0);
static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -179,6 +182,7 @@ static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr,
[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
+ [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
NULL,
};
@@ -439,6 +443,7 @@ static struct config_group *make_cluster(struct config_group *g,
cl->cl_log_debug = dlm_config.ci_log_debug;
cl->cl_protocol = dlm_config.ci_protocol;
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
+ cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
space_list = &sps->ss_group;
comm_list = &cms->cs_group;
@@ -986,6 +991,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_LOG_DEBUG 0
#define DEFAULT_PROTOCOL 0
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
+#define DEFAULT_WAITWARN_US 0
struct dlm_config_info dlm_config = {
.ci_tcp_port = DEFAULT_TCP_PORT,
@@ -998,6 +1004,7 @@ struct dlm_config_info dlm_config = {
.ci_scan_secs = DEFAULT_SCAN_SECS,
.ci_log_debug = DEFAULT_LOG_DEBUG,
.ci_protocol = DEFAULT_PROTOCOL,
- .ci_timewarn_cs = DEFAULT_TIMEWARN_CS
+ .ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
+ .ci_waitwarn_us = DEFAULT_WAITWARN_US
};
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 4f1d6fce58c5..dd0ce24d5a80 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -28,6 +28,7 @@ struct dlm_config_info {
int ci_log_debug;
int ci_protocol;
int ci_timewarn_cs;
+ int ci_waitwarn_us;
};
extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index b94204913011..0262451eb9c6 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -209,6 +209,7 @@ struct dlm_args {
#define DLM_IFL_WATCH_TIMEWARN 0x00400000
#define DLM_IFL_TIMEOUT_CANCEL 0x00800000
#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
+#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */
#define DLM_IFL_USER 0x00000001
#define DLM_IFL_ORPHAN 0x00000002
@@ -245,6 +246,7 @@ struct dlm_lkb {
int8_t lkb_wait_type; /* type of reply waiting for */
int8_t lkb_wait_count;
+ int lkb_wait_nodeid; /* for debugging */
struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -254,6 +256,7 @@ struct dlm_lkb {
struct list_head lkb_ownqueue; /* list of locks for a process */
struct list_head lkb_time_list;
ktime_t lkb_timestamp;
+ ktime_t lkb_wait_time;
unsigned long lkb_timeout_cs;
struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 56d6bfcc1e48..f71d0b5abd95 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -799,10 +799,84 @@ static int msg_reply_type(int mstype)
return -1;
}
+/*
+ * Look up nodeid in the warned[] table of num_nodes entries, recording it
+ * in the first unused (zero) slot if it is not found before that slot.
+ * Returns 1 if nodeid was already recorded, otherwise 0.
+ */
+static int nodeid_warned(int nodeid, int num_nodes, int *warned)
+{
+ int i;
+
+ for (i = 0; i < num_nodes; i++) {
+ /* a zero entry is an unused slot: claim it for this nodeid */
+ if (!warned[i]) {
+ warned[i] = nodeid;
+ return 0;
+ }
+ if (warned[i] == nodeid)
+ return 1;
+ }
+ /* every slot holds some other node id; nodeid is not recorded */
+ return 0;
+}
+
+/*
+ * Report lkbs on ls->ls_waiters whose reply has been outstanding for at
+ * least dlm_config.ci_waitwarn_us microseconds. Does nothing when
+ * ci_waitwarn_us is zero. An lkb that has expired gets its lkb_wait_time
+ * reset to zero, and entries with a zero lkb_wait_time are skipped, so an
+ * lkb is reported once. A "waitwarn" error is not repeated for a node id
+ * that nodeid_warned() already has recorded during this scan.
+ */
+void dlm_scan_waiters(struct dlm_ls *ls)
+{
+ struct dlm_lkb *lkb;
+ ktime_t zero = ktime_set(0, 0);
+ s64 us;
+ s64 debug_maxus = 0;
+ u32 debug_scanned = 0;
+ u32 debug_expired = 0;
+ int num_nodes = 0;
+ int *warned = NULL;
+
+ if (!dlm_config.ci_waitwarn_us)
+ return;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+
+ list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+ if (ktime_equal(lkb->lkb_wait_time, zero))
+ continue;
+
+ debug_scanned++;
+
+ us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
+
+ if (us < dlm_config.ci_waitwarn_us)
+ continue;
+
+ lkb->lkb_wait_time = zero;
+
+ debug_expired++;
+ if (us > debug_maxus)
+ debug_maxus = us;
+
+ /* allocate the zeroed per-node table on the first expired lkb */
+ if (!num_nodes) {
+ num_nodes = ls->ls_num_nodes;
+ warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL);
+ }
+ /* without the table we still count, but log no waitwarn lines */
+ if (!warned)
+ continue;
+ if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
+ continue;
+
+ log_error(ls, "waitwarn %x %lld %d us check connection to "
+ "node %d", lkb->lkb_id, (long long)us,
+ dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
+
+ /* kfree(NULL) is a no-op, so no guard is needed */
+ kfree(warned);
+
+ if (debug_expired)
+ log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
+ debug_scanned, debug_expired,
+ dlm_config.ci_waitwarn_us, (long long)debug_maxus);
+}
+
/* add/remove lkb from global waiters list of lkb's waiting for
a reply from a remote node */
-static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
+static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int error = 0;
@@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
lkb->lkb_wait_count++;
lkb->lkb_wait_type = mstype;
+ lkb->lkb_wait_time = ktime_get();
+ lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
hold_lkb(lkb);
list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
out:
@@ -961,10 +1037,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int error;
- if (ms != &ls->ls_stub_ms)
+ if (ms->m_flags != DLM_IFL_STUB_MS)
mutex_lock(&ls->ls_waiters_mutex);
error = _remove_from_waiters(lkb, ms->m_type, ms);
- if (ms != &ls->ls_stub_ms)
+ if (ms->m_flags != DLM_IFL_STUB_MS)
mutex_unlock(&ls->ls_waiters_mutex);
return error;
}
@@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
mutex_unlock(&ls->ls_timeout_mutex);
+
+ if (!dlm_config.ci_waitwarn_us)
+ return;
+
+ mutex_lock(&ls->ls_waiters_mutex);
+ list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
+ if (ktime_to_us(lkb->lkb_wait_time))
+ lkb->lkb_wait_time = ktime_get();
+ }
+ mutex_unlock(&ls->ls_waiters_mutex);
}
/* lkb is master or local copy */
@@ -1376,14 +1462,8 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
compatible with other granted locks */
-static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void munge_demoted(struct dlm_lkb *lkb)
{
- if (ms->m_type != DLM_MSG_CONVERT_REPLY) {
- log_print("munge_demoted %x invalid reply type %d",
- lkb->lkb_id, ms->m_type);
- return;
- }
-
if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
log_print("munge_demoted %x invalid modes gr %d rq %d",
lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
@@ -2844,12 +2924,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
struct dlm_mhandle *mh;
int to_nodeid, error;
- error = add_to_waiters(lkb, mstype);
+ to_nodeid = r->res_nodeid;
+
+ error = add_to_waiters(lkb, mstype, to_nodeid);
if (error)
return error;
- to_nodeid = r->res_nodeid;
-
error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
if (error)
goto fail;
@@ -2880,9 +2960,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
/* down conversions go without a reply from the master */
if (!error && down_conversion(lkb)) {
remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
+ r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS;
r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
r->res_ls->ls_stub_ms.m_result = 0;
- r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags;
__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
}
@@ -2951,12 +3031,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
struct dlm_mhandle *mh;
int to_nodeid, error;
- error = add_to_waiters(lkb, DLM_MSG_LOOKUP);
+ to_nodeid = dlm_dir_nodeid(r);
+
+ error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
if (error)
return error;
- to_nodeid = dlm_dir_nodeid(r);
-
error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
if (error)
goto fail;
@@ -3070,6 +3150,9 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
+ if (ms->m_flags == DLM_IFL_STUB_MS)
+ return;
+
lkb->lkb_sbflags = ms->m_sbflags;
lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
(ms->m_flags & 0x0000FFFF);
@@ -3612,7 +3695,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
/* convert was queued on remote master */
receive_flags_reply(lkb, ms);
if (is_demoted(lkb))
- munge_demoted(lkb, ms);
+ munge_demoted(lkb);
del_lkb(r, lkb);
add_lkb(r, lkb, DLM_LKSTS_CONVERT);
add_timeout(lkb);
@@ -3622,7 +3705,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
/* convert was granted on remote master */
receive_flags_reply(lkb, ms);
if (is_demoted(lkb))
- munge_demoted(lkb, ms);
+ munge_demoted(lkb);
grant_lock_pc(r, lkb, ms);
queue_cast(r, lkb, 0);
break;
@@ -3996,15 +4079,17 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid)
dlm_put_lockspace(ls);
}
-static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
+static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
+ struct dlm_message *ms_stub)
{
if (middle_conversion(lkb)) {
hold_lkb(lkb);
- ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY;
- ls->ls_stub_ms.m_result = -EINPROGRESS;
- ls->ls_stub_ms.m_flags = lkb->lkb_flags;
- ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
- _receive_convert_reply(lkb, &ls->ls_stub_ms);
+ memset(ms_stub, 0, sizeof(struct dlm_message));
+ ms_stub->m_flags = DLM_IFL_STUB_MS;
+ ms_stub->m_type = DLM_MSG_CONVERT_REPLY;
+ ms_stub->m_result = -EINPROGRESS;
+ ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ _receive_convert_reply(lkb, ms_stub);
/* Same special case as in receive_rcom_lock_args() */
lkb->lkb_grmode = DLM_LOCK_IV;
@@ -4045,13 +4130,27 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
void dlm_recover_waiters_pre(struct dlm_ls *ls)
{
struct dlm_lkb *lkb, *safe;
+ struct dlm_message *ms_stub;
int wait_type, stub_unlock_result, stub_cancel_result;
+ /* scratch message handed to the stub reply handlers below */
+ ms_stub = kmalloc(sizeof(*ms_stub), GFP_KERNEL);
+ if (!ms_stub) {
+ log_error(ls, "dlm_recover_waiters_pre no mem");
+ return;
+ }
+
mutex_lock(&ls->ls_waiters_mutex);
list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
- log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
- lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);
+
+ /* exclude debug messages about unlocks because there can be so
+ many and they aren't very interesting */
+
+ if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
+ log_debug(ls, "recover_waiter %x nodeid %d "
+ "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid,
+ lkb->lkb_wait_type, lkb->lkb_wait_nodeid);
+ }
/* all outstanding lookups, regardless of destination will be
resent after recovery is done */
@@ -4097,26 +4196,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
break;
case DLM_MSG_CONVERT:
- recover_convert_waiter(ls, lkb);
+ recover_convert_waiter(ls, lkb, ms_stub);
break;
case DLM_MSG_UNLOCK:
hold_lkb(lkb);
- ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY;
- ls->ls_stub_ms.m_result = stub_unlock_result;
- ls->ls_stub_ms.m_flags = lkb->lkb_flags;
- ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
- _receive_unlock_reply(lkb, &ls->ls_stub_ms);
+ memset(ms_stub, 0, sizeof(struct dlm_message));
+ ms_stub->m_flags = DLM_IFL_STUB_MS;
+ ms_stub->m_type = DLM_MSG_UNLOCK_REPLY;
+ ms_stub->m_result = stub_unlock_result;
+ ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ _receive_unlock_reply(lkb, ms_stub);
dlm_put_lkb(lkb);
break;
case DLM_MSG_CANCEL:
hold_lkb(lkb);
- ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY;
- ls->ls_stub_ms.m_result = stub_cancel_result;
- ls->ls_stub_ms.m_flags = lkb->lkb_flags;
- ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid;
- _receive_cancel_reply(lkb, &ls->ls_stub_ms);
+ memset(ms_stub, 0, sizeof(struct dlm_message));
+ ms_stub->m_flags = DLM_IFL_STUB_MS;
+ ms_stub->m_type = DLM_MSG_CANCEL_REPLY;
+ ms_stub->m_result = stub_cancel_result;
+ ms_stub->m_header.h_nodeid = lkb->lkb_nodeid;
+ _receive_cancel_reply(lkb, ms_stub);
dlm_put_lkb(lkb);
break;
@@ -4127,6 +4228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
schedule();
}
mutex_unlock(&ls->ls_waiters_mutex);
+ kfree(ms_stub);
}
static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
@@ -4191,8 +4293,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
ou = is_overlap_unlock(lkb);
err = 0;
- log_debug(ls, "recover_waiters_post %x type %d flags %x %s",
- lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name);
+ log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d",
+ lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid);
/* At this point we assume that we won't get a reply to any
previous op or overlap op on this lock. First, do a big
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 88e93c80cc22..265017a7c3e7 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,6 +24,7 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
void dlm_scan_rsbs(struct dlm_ls *ls);
int dlm_lock_recovery_try(struct dlm_ls *ls);
void dlm_unlock_recovery(struct dlm_ls *ls);
+void dlm_scan_waiters(struct dlm_ls *ls);
void dlm_scan_timeout(struct dlm_ls *ls);
void dlm_adjust_timeouts(struct dlm_ls *ls);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index f994a7dfda85..14cbf4099753 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -243,7 +243,6 @@ static struct dlm_ls *find_ls_to_scan(void)
static int dlm_scand(void *data)
{
struct dlm_ls *ls;
- int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
while (!kthread_should_stop()) {
ls = find_ls_to_scan();
@@ -252,13 +251,14 @@ static int dlm_scand(void *data)
ls->ls_scan_time = jiffies;
dlm_scan_rsbs(ls);
dlm_scan_timeout(ls);
+ dlm_scan_waiters(ls);
dlm_unlock_recovery(ls);
} else {
ls->ls_scan_time += HZ;
}
- } else {
- schedule_timeout_interruptible(timeout_jiffies);
+ continue;
}
+ schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
}
return 0;
}
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 30d8b85febbf..e2b878004364 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -71,6 +71,36 @@ static void send_op(struct plock_op *op)
wake_up(&send_wq);
}
+/* If a process was killed while waiting for the only plock on a file,
+ locks_remove_posix will not see any lock on the file so it won't
+ send an unlock-close to us to pass on to userspace to clean up the
+ abandoned waiter. So, we have to insert the unlock-close when the
+ lock call is interrupted. */
+
+/*
+ * Queue an unlock op flagged DLM_PLOCK_FL_CLOSE covering 0..OFFSET_MAX for
+ * the given lockspace and number, with pid and owner taken from fl. The op
+ * is handed to send_op() and not waited for. If the op cannot be allocated
+ * the function returns without sending anything. The file argument is not
+ * used by this function.
+ */
+static void do_unlock_close(struct dlm_ls *ls, u64 number,
+ struct file *file, struct file_lock *fl)
+{
+ struct plock_op *op;
+
+ op = kzalloc(sizeof(*op), GFP_NOFS);
+ if (!op)
+ return;
+
+ op->info.optype = DLM_PLOCK_OP_UNLOCK;
+ op->info.pid = fl->fl_pid;
+ op->info.fsid = ls->ls_global_id;
+ op->info.number = number;
+ op->info.start = 0;
+ op->info.end = OFFSET_MAX;
+ /* owner is the pid when fl has an fl_grant callback, else fl_owner */
+ if (fl->fl_lmops && fl->fl_lmops->fl_grant)
+ op->info.owner = (__u64) fl->fl_pid;
+ else
+ op->info.owner = (__u64)(long) fl->fl_owner;
+
+ /* dev_read() unlinks and frees ops that carry this flag */
+ op->info.flags |= DLM_PLOCK_FL_CLOSE;
+ send_op(op);
+}
+
int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
int cmd, struct file_lock *fl)
{
@@ -114,9 +144,19 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
- if (xop->callback == NULL)
- wait_event(recv_wq, (op->done != 0));
- else {
+ if (xop->callback == NULL) {
+ rv = wait_event_killable(recv_wq, (op->done != 0));
+ if (rv == -ERESTARTSYS) {
+ log_debug(ls, "dlm_posix_lock: wait killed %llx",
+ (unsigned long long)number);
+ spin_lock(&ops_lock);
+ list_del(&op->list);
+ spin_unlock(&ops_lock);
+ kfree(xop);
+ do_unlock_close(ls, number, file, fl);
+ goto out;
+ }
+ } else {
rv = FILE_LOCK_DEFERRED;
goto out;
}
@@ -233,6 +273,13 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
else
op->info.owner = (__u64)(long) fl->fl_owner;
+ if (fl->fl_flags & FL_CLOSE) {
+ op->info.flags |= DLM_PLOCK_FL_CLOSE;
+ send_op(op);
+ rv = 0;
+ goto out;
+ }
+
send_op(op);
wait_event(recv_wq, (op->done != 0));
@@ -334,7 +381,10 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
spin_lock(&ops_lock);
if (!list_empty(&send_list)) {
op = list_entry(send_list.next, struct plock_op, list);
- list_move(&op->list, &recv_list);
+ if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+ list_del(&op->list);
+ else
+ list_move(&op->list, &recv_list);
memcpy(&info, &op->info, sizeof(info));
}
spin_unlock(&ops_lock);
@@ -342,6 +392,13 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
if (!op)
return -EAGAIN;
+ /* there is no need to get a reply from userspace for unlocks
+ that were generated by the vfs cleaning up for a close
+ (the process did not make an unlock call). */
+
+ if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+ kfree(op);
+
if (copy_to_user(u, &info, sizeof(info)))
return -EFAULT;
return sizeof(info);
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d5ab3fe7c198..e96bf3e9be88 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -611,7 +611,6 @@ static ssize_t device_write(struct file *file, const char __user *buf,
out_sig:
sigprocmask(SIG_SETMASK, &tmpsig, NULL);
- recalc_sigpending();
out_free:
kfree(kbuf);
return error;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 98b77c89494c..c00e055b6282 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -40,9 +40,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
static void drop_slab(void)
{
int nr_objects;
+ struct shrink_control shrink = {
+ .gfp_mask = GFP_KERNEL,
+ };
do {
- nr_objects = shrink_slab(1000, GFP_KERNEL, 1000);
+ nr_objects = shrink_slab(&shrink, 1000, 1000);
} while (nr_objects > 10);
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 4d4cc6a90cd5..227b409b8406 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -521,6 +521,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
struct dentry *lower_dir_dentry;
int rc;
+ dentry_unhash(dentry);
+
lower_dentry = ecryptfs_dentry_to_lower(dentry);
dget(dentry);
lower_dir_dentry = lock_parent(lower_dentry);
@@ -571,6 +573,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *lower_new_dir_dentry;
struct dentry *trap = NULL;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
dget(lower_old_dentry);
diff --git a/fs/exec.c b/fs/exec.c
index 5e62d26a4fec..936f5776655c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -55,6 +55,7 @@
#include <linux/fs_struct.h>
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
+#include <linux/compat.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -166,8 +167,13 @@ out:
}
#ifdef CONFIG_MMU
-
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+/*
+ * The nascent bprm->mm is not visible until exec_mmap() but it can
+ * use a lot of memory, account these pages in current->mm temporary
+ * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
+ * change the counter back via acct_arg_size(0).
+ */
+static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
struct mm_struct *mm = current->mm;
long diff = (long)(pages - bprm->vma_pages);
@@ -186,7 +192,7 @@ void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
#endif
}
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
int write)
{
struct page *page;
@@ -194,7 +200,7 @@ struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
#ifdef CONFIG_STACK_GROWSUP
if (write) {
- ret = expand_stack_downwards(bprm->vma, pos);
+ ret = expand_downwards(bprm->vma, pos);
if (ret < 0)
return NULL;
}
@@ -305,11 +311,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len)
#else
-void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
+static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}
-struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
+static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
int write)
{
struct page *page;
@@ -398,22 +404,56 @@ err:
return err;
}
+struct user_arg_ptr { /* argv/envp array pointer, tagged as native or compat */
+#ifdef CONFIG_COMPAT
+ bool is_compat; /* true: read entries via ptr.compat; otherwise via ptr.native */
+#endif
+ union {
+ const char __user *const __user *native; /* user array of native string pointers */
+#ifdef CONFIG_COMPAT
+ compat_uptr_t __user *compat; /* user array of compat_uptr_t entries */
+#endif
+ } ptr;
+};
+
+static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
+{ /* Fetch entry nr of the user array; returns ERR_PTR(-EFAULT) if get_user() fails. */
+ const char __user *native;
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(argv.is_compat)) {
+ compat_uptr_t compat;
+
+ if (get_user(compat, argv.ptr.compat + nr))
+ return ERR_PTR(-EFAULT);
+
+ return compat_ptr(compat); /* turn the fetched compat_uptr_t into a user pointer */
+ }
+#endif
+
+ if (get_user(native, argv.ptr.native + nr))
+ return ERR_PTR(-EFAULT);
+
+ return native; /* may be NULL; count() stops at a NULL entry */
+}
+
/*
* count() counts the number of strings in array ARGV.
*/
-static int count(const char __user * const __user * argv, int max)
+static int count(struct user_arg_ptr argv, int max)
{
int i = 0;
- if (argv != NULL) {
+ if (argv.ptr.native != NULL) {
for (;;) {
- const char __user * p;
+ const char __user *p = get_user_arg_ptr(argv, i);
- if (get_user(p, argv))
- return -EFAULT;
if (!p)
break;
- argv++;
+
+ if (IS_ERR(p))
+ return -EFAULT;
+
if (i++ >= max)
return -E2BIG;
@@ -430,7 +470,7 @@ static int count(const char __user * const __user * argv, int max)
* processes's memory to the new process's stack. The call to get_user_pages()
* ensures the destination page is created and not swapped out.
*/
-static int copy_strings(int argc, const char __user *const __user *argv,
+static int copy_strings(int argc, struct user_arg_ptr argv,
struct linux_binprm *bprm)
{
struct page *kmapped_page = NULL;
@@ -443,16 +483,18 @@ static int copy_strings(int argc, const char __user *const __user *argv,
int len;
unsigned long pos;
- if (get_user(str, argv+argc) ||
- !(len = strnlen_user(str, MAX_ARG_STRLEN))) {
- ret = -EFAULT;
+ ret = -EFAULT;
+ str = get_user_arg_ptr(argv, argc);
+ if (IS_ERR(str))
goto out;
- }
- if (!valid_arg_len(bprm, len)) {
- ret = -E2BIG;
+ len = strnlen_user(str, MAX_ARG_STRLEN);
+ if (!len)
+ goto out;
+
+ ret = -E2BIG;
+ if (!valid_arg_len(bprm, len))
goto out;
- }
/* We're going to work our way backwords. */
pos = bprm->p;
@@ -519,14 +561,19 @@ out:
/*
* Like copy_strings, but get argv and its values from kernel memory.
*/
-int copy_strings_kernel(int argc, const char *const *argv,
+int copy_strings_kernel(int argc, const char *const *__argv,
struct linux_binprm *bprm)
{
int r;
mm_segment_t oldfs = get_fs();
+ struct user_arg_ptr argv = {
+ .ptr.native = (const char __user *const __user *)__argv,
+ };
+
set_fs(KERNEL_DS);
- r = copy_strings(argc, (const char __user *const __user *)argv, bprm);
+ r = copy_strings(argc, argv, bprm);
set_fs(oldfs);
+
return r;
}
EXPORT_SYMBOL(copy_strings_kernel);
@@ -553,7 +600,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
unsigned long length = old_end - old_start;
unsigned long new_start = old_start - shift;
unsigned long new_end = old_end - shift;
- struct mmu_gather *tlb;
+ struct mmu_gather tlb;
BUG_ON(new_start > new_end);
@@ -579,12 +626,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
return -ENOMEM;
lru_add_drain();
- tlb = tlb_gather_mmu(mm, 0);
+ tlb_gather_mmu(&tlb, mm, 0);
if (new_end > old_start) {
/*
* when the old and new regions overlap clear from new_end.
*/
- free_pgd_range(tlb, new_end, old_end, new_end,
+ free_pgd_range(&tlb, new_end, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
} else {
/*
@@ -593,10 +640,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* have constraints on va-space that make this illegal (IA64) -
* for the others its just a little faster.
*/
- free_pgd_range(tlb, old_start, old_end, new_end,
+ free_pgd_range(&tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
}
- tlb_finish_mmu(tlb, new_end, old_end);
+ tlb_finish_mmu(&tlb, new_end, old_end);
/*
* Shrink the vma to just the new range. Always succeeds.
@@ -1004,6 +1051,7 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
task_unlock(tsk);
return buf;
}
+EXPORT_SYMBOL_GPL(get_task_comm);
void set_task_comm(struct task_struct *tsk, char *buf)
{
@@ -1379,10 +1427,10 @@ EXPORT_SYMBOL(search_binary_handler);
/*
* sys_execve() executes a new program.
*/
-int do_execve(const char * filename,
- const char __user *const __user *argv,
- const char __user *const __user *envp,
- struct pt_regs * regs)
+static int do_execve_common(const char *filename,
+ struct user_arg_ptr argv,
+ struct user_arg_ptr envp,
+ struct pt_regs *regs)
{
struct linux_binprm *bprm;
struct file *file;
@@ -1489,6 +1537,34 @@ out_ret:
return retval;
}
+int do_execve(const char *filename,
+ const char __user *const __user *__argv,
+ const char __user *const __user *__envp,
+ struct pt_regs *regs)
+{ /* Native entry point: wrap argv/envp as native user_arg_ptr and run the common path. */
+ struct user_arg_ptr argv = { .ptr.native = __argv }; /* is_compat, where present, stays zero */
+ struct user_arg_ptr envp = { .ptr.native = __envp };
+ return do_execve_common(filename, argv, envp, regs);
+}
+
+#ifdef CONFIG_COMPAT
+int compat_do_execve(char *filename,
+ compat_uptr_t __user *__argv,
+ compat_uptr_t __user *__envp,
+ struct pt_regs *regs)
+{ /* Compat entry point: tag both arrays as compat, then run the same common path as do_execve(). */
+ struct user_arg_ptr argv = {
+ .is_compat = true, /* makes get_user_arg_ptr() read compat_uptr_t entries */
+ .ptr.compat = __argv,
+ };
+ struct user_arg_ptr envp = {
+ .is_compat = true,
+ .ptr.compat = __envp,
+ };
+ return do_execve_common(filename, argv, envp, regs);
+}
+#endif
+
void set_binfmt(struct linux_binfmt *new)
{
struct mm_struct *mm = current->mm;
@@ -1659,6 +1735,7 @@ static int zap_process(struct task_struct *start, int exit_code)
t = start;
do {
+ task_clear_group_stop_pending(t);
if (t != current && t->mm) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 0a78dae7e2cb..1dd62ed35b85 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
if (!sb_set_blocksize(sb, blocksize)) {
- ext2_msg(sb, KERN_ERR, "error: blocksize is too small");
+ ext2_msg(sb, KERN_ERR,
+ "error: bad blocksize %d", blocksize);
goto failed_sbi;
}
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 32f3b8695859..34b6d9bfc48a 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->at = entries;
frame->bh = bh;
bh = bh2;
+ /*
+ * Mark buffers dirty here so that if do_split() fails we write a
+ * consistent set of buffers to disk.
+ */
+ ext3_journal_dirty_metadata(handle, frame->bh);
+ ext3_journal_dirty_metadata(handle, bh);
de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
- dx_release (frames);
- if (!(de))
+ if (!de) {
+ ext3_mark_inode_dirty(handle, dir);
+ dx_release(frames);
return retval;
+ }
+ dx_release(frames);
return add_dirent_to_buf(handle, dentry, inode, de, bh);
}
@@ -2189,6 +2198,7 @@ static int ext3_symlink (struct inode * dir,
handle_t *handle;
struct inode * inode;
int l, err, retries = 0;
+ int credits;
l = strlen(symname)+1;
if (l > dir->i_sb->s_blocksize)
@@ -2196,10 +2206,26 @@ static int ext3_symlink (struct inode * dir,
dquot_initialize(dir);
+ if (l > EXT3_N_BLOCKS * 4) {
+ /*
+ * For non-fast symlinks, we just allocate inode and put it on
+ * orphan list in the first transaction => we need bitmap,
+ * group descriptor, sb, inode block, quota blocks.
+ */
+ credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ } else {
+ /*
+ * Fast symlink. We have to add entry to directory
+ * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS),
+ * allocate new inode (bitmap, group descriptor, inode block,
+ * quota blocks, sb is already counted in previous macros).
+ */
+ credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ }
retry:
- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ handle = ext3_journal_start(dir, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2211,21 +2237,45 @@ retry:
if (IS_ERR(inode))
goto out_stop;
- if (l > sizeof (EXT3_I(inode)->i_data)) {
+ if (l > EXT3_N_BLOCKS * 4) {
inode->i_op = &ext3_symlink_inode_operations;
ext3_set_aops(inode);
/*
- * page_symlink() calls into ext3_prepare/commit_write.
- * We have a transaction open. All is sweetness. It also sets
- * i_size in generic_commit_write().
+ * We cannot call page_symlink() with transaction started
+ * because it calls into ext3_write_begin() which acquires page
+ * lock which ranks below transaction start (and it can also
+ * wait for journal commit if we are running out of space). So
+ * we have to stop transaction now and restart it when symlink
+ * contents are written.
+ *
+ * To keep fs consistent in case of crash, we have to put inode
+ * on the orphan list in the meantime.
*/
+ drop_nlink(inode);
+ err = ext3_orphan_add(handle, inode);
+ ext3_journal_stop(handle);
+ if (err)
+ goto err_drop_inode;
err = __page_symlink(inode, symname, l, 1);
+ if (err)
+ goto err_drop_inode;
+ /*
+ * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS
+ * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+ */
+ handle = ext3_journal_start(dir,
+ EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto err_drop_inode;
+ }
+ inc_nlink(inode);
+ err = ext3_orphan_del(handle, inode);
if (err) {
+ ext3_journal_stop(handle);
drop_nlink(inode);
- unlock_new_inode(inode);
- ext3_mark_inode_dirty(handle, inode);
- iput (inode);
- goto out_stop;
+ goto err_drop_inode;
}
} else {
inode->i_op = &ext3_fast_symlink_inode_operations;
@@ -2239,6 +2289,10 @@ out_stop:
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
+err_drop_inode:
+ unlock_new_inode(inode);
+ iput(inode);
+ return err;
}
static int ext3_link (struct dentry * old_dentry,
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index c947e36eda6c..04109460ba9e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
- ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o
+ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+ mmp.o
ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1c67139ad4b4..264f6949511e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -362,130 +362,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
}
/**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle: handle to this transaction
- * @sb: super block
- * @block: start physcial block to add to the block group
- * @count: number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count)
-{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
- ext4_group_t block_group;
- ext4_grpblk_t bit;
- unsigned int i;
- struct ext4_group_desc *desc;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int err = 0, ret, blk_free_count;
- ext4_grpblk_t blocks_freed;
- struct ext4_group_info *grp;
-
- ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
- ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- grp = ext4_get_group_info(sb, block_group);
- /*
- * Check to see if we are freeing blocks across a group
- * boundary.
- */
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
- goto error_return;
- }
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh)
- goto error_return;
- desc = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!desc)
- goto error_return;
-
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
- ext4_error(sb, "Adding blocks in system zones - "
- "Block = %llu, count = %lu",
- block, count);
- goto error_return;
- }
-
- /*
- * We are about to add blocks to the bitmap,
- * so we need undo access.
- */
- BUFFER_TRACE(bitmap_bh, "getting undo access");
- err = ext4_journal_get_undo_access(handle, bitmap_bh);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
- if (err)
- goto error_return;
- /*
- * make sure we don't allow a parallel init on other groups in the
- * same buddy cache
- */
- down_write(&grp->alloc_sem);
- for (i = 0, blocks_freed = 0; i < count; i++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
- bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- blocks_freed++;
- }
- }
- ext4_lock_group(sb, block_group);
- blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
- ext4_free_blks_set(sb, desc, blk_free_count);
- desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
- ext4_unlock_group(sb, block_group);
- percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- atomic_add(blocks_freed,
- &sbi->s_flex_groups[flex_group].free_blocks);
- }
- /*
- * request to reload the buddy with the
- * new bitmap information
- */
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- grp->bb_free += blocks_freed;
- up_write(&grp->alloc_sem);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
- if (!err)
- err = ret;
-
-error_return:
- brelse(bitmap_bh);
- ext4_std_error(sb, err);
- return;
-}
-
-/**
* ext4_has_free_blocks()
* @sbi: in-core super block structure.
* @nblocks: number of needed blocks
@@ -493,7 +369,8 @@ error_return:
* Check if filesystem has nblocks free & available for allocation.
* On success return 1, return 0 on failure.
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
+static int ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ s64 nblocks, unsigned int flags)
{
s64 free_blocks, dirty_blocks, root_blocks;
struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
@@ -507,11 +384,6 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
EXT4_FREEBLOCKS_WATERMARK) {
free_blocks = percpu_counter_sum_positive(fbc);
dirty_blocks = percpu_counter_sum_positive(dbc);
- if (dirty_blocks < 0) {
- printk(KERN_CRIT "Dirty block accounting "
- "went wrong %lld\n",
- (long long)dirty_blocks);
- }
}
/* Check whether we have space after
* accounting for current dirty blocks & root reserved blocks.
@@ -522,7 +394,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
/* Hm, nope. Are (enough) root reserved blocks available? */
if (sbi->s_resuid == current_fsuid() ||
((sbi->s_resgid != 0) && in_group_p(sbi->s_resgid)) ||
- capable(CAP_SYS_RESOURCE)) {
+ capable(CAP_SYS_RESOURCE) ||
+ (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+
if (free_blocks >= (nblocks + dirty_blocks))
return 1;
}
@@ -531,9 +405,9 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks)
}
int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
- s64 nblocks)
+ s64 nblocks, unsigned int flags)
{
- if (ext4_has_free_blocks(sbi, nblocks)) {
+ if (ext4_has_free_blocks(sbi, nblocks, flags)) {
percpu_counter_add(&sbi->s_dirtyblocks_counter, nblocks);
return 0;
} else
@@ -554,7 +428,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1, 0) ||
(*retries)++ > 3 ||
!EXT4_SB(sb)->s_journal)
return 0;
@@ -577,7 +451,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* error stores in errp pointer
*/
ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp)
+ ext4_fsblk_t goal, unsigned int flags,
+ unsigned long *count, int *errp)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
@@ -587,6 +462,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ar.inode = inode;
ar.goal = goal;
ar.len = count ? *count : 1;
+ ar.flags = flags;
ret = ext4_mb_new_blocks(handle, &ar, errp);
if (count)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4daaf2b753f4..a74b89c09f90 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -108,7 +108,8 @@ typedef unsigned int ext4_group_t;
#define EXT4_MB_DELALLOC_RESERVED 0x0400
/* We are doing stream allocation */
#define EXT4_MB_STREAM_ALLOC 0x0800
-
+/* Use reserved root blocks if needed */
+#define EXT4_MB_USE_ROOT_BLOCKS 0x1000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -209,6 +210,8 @@ struct ext4_io_submit {
*/
#define EXT4_BAD_INO 1 /* Bad blocks inode */
#define EXT4_ROOT_INO 2 /* Root inode */
+#define EXT4_USR_QUOTA_INO 3 /* User quota inode */
+#define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */
#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */
#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */
#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */
@@ -512,6 +515,10 @@ struct ext4_new_group_data {
/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Punch out blocks of an extent */
+#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT 0x0020
+ /* Don't normalize allocation size (used for fallocate) */
+#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
/*
* Flags used by ext4_free_blocks
@@ -1028,7 +1035,7 @@ struct ext4_super_block {
__le16 s_want_extra_isize; /* New inodes should reserve # bytes */
__le32 s_flags; /* Miscellaneous flags */
__le16 s_raid_stride; /* RAID stride */
- __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
+ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
__u8 s_log_groups_per_flex; /* FLEX_BG group size */
@@ -1144,6 +1151,9 @@ struct ext4_sb_info {
unsigned long s_ext_blocks;
unsigned long s_ext_extents;
#endif
+ /* ext4 extent cache stats */
+ unsigned long extent_cache_hits;
+ unsigned long extent_cache_misses;
/* for buddy allocator */
struct ext4_group_info ***s_group_info;
@@ -1201,6 +1211,9 @@ struct ext4_sb_info {
struct ext4_li_request *s_li_request;
/* Wait multiplier for lazy initialization thread */
unsigned int s_li_wait_mult;
+
+ /* Kernel thread for multiple mount protection */
+ struct task_struct *s_mmp_tsk;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1338,6 +1351,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
+#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1351,13 +1365,29 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
+#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
+#define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
+#define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
+ EXT4_FEATURE_INCOMPAT_RECOVER| \
+ EXT4_FEATURE_INCOMPAT_META_BG)
+#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
+ EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
+
#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG| \
EXT4_FEATURE_INCOMPAT_EXTENTS| \
EXT4_FEATURE_INCOMPAT_64BIT| \
- EXT4_FEATURE_INCOMPAT_FLEX_BG)
+ EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+ EXT4_FEATURE_INCOMPAT_MMP)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1590,12 +1620,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
*/
struct ext4_lazy_init {
unsigned long li_state;
-
- wait_queue_head_t li_wait_daemon;
- wait_queue_head_t li_wait_task;
- struct timer_list li_timer;
- struct task_struct *li_task;
-
struct list_head li_request_list;
struct mutex li_list_mtx;
};
@@ -1615,6 +1639,67 @@ struct ext4_features {
};
/*
+ * This structure will be used for multiple mount protection. It will be
+ * written into the block number saved in the s_mmp_block field in the
+ * superblock. Programs that check MMP should assume that if
+ * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
+ * to use the filesystem, regardless of how old the timestamp is.
+ */
+#define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */
+#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
+#define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */
+#define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */
+
+struct mmp_struct {
+ __le32 mmp_magic; /* Magic number for MMP */
+ __le32 mmp_seq; /* Sequence no. updated periodically */
+
+ /*
+ * mmp_time, mmp_nodename & mmp_bdevname are only used for information
+ * purposes and do not affect the correctness of the algorithm
+ */
+ __le64 mmp_time; /* Time last updated */
+ char mmp_nodename[64]; /* Node which last updated MMP block */
+ char mmp_bdevname[32]; /* Bdev which last updated MMP block */
+
+ /*
+ * mmp_check_interval is used to verify if the MMP block has been
+ * updated on the block device. The value is updated based on the
+ * maximum time to write the MMP block during an update cycle.
+ */
+ __le16 mmp_check_interval;
+
+ __le16 mmp_pad1; /* padding, judging by the name */
+ __le32 mmp_pad2[227]; /* padding (per name): 116 bytes of fields above + 227 * 4 = 1024 bytes */
+};
+
+/* arguments passed to the mmp thread */
+struct mmpd_data {
+ struct buffer_head *bh; /* bh from initial read_mmp_block() */
+ struct super_block *sb; /* super block of the fs */
+};
+
+/*
+ * Check interval multiplier
+ * The MMP block is written every update interval and initially checked every
+ * update interval x the multiplier (the value is then adapted based on the
+ * write latency). The reason is that writes can be delayed under load and we
+ * don't want readers to incorrectly assume that the filesystem is no longer
+ * in use.
+ */
+#define EXT4_MMP_CHECK_MULT 2UL
+
+/*
+ * Minimum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MIN_CHECK_INTERVAL 5UL
+
+/*
+ * Maximum interval for MMP checking in seconds.
+ */
+#define EXT4_MMP_MAX_CHECK_INTERVAL 300UL
+
+/*
* Function prototypes
*/
@@ -1638,10 +1723,12 @@ extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count);
+ ext4_fsblk_t goal,
+ unsigned int flags,
+ unsigned long *count,
+ int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+ s64 nblocks, unsigned int flags);
extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1706,6 +1793,8 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
unsigned long count, int flags);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
/* inode.c */
@@ -1729,6 +1818,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -1738,6 +1828,8 @@ extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1788,6 +1880,10 @@ extern void __ext4_warning(struct super_block *, const char *, unsigned int,
__LINE__, ## message)
extern void ext4_msg(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
+extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
+ const char *, unsigned int, const char *);
+#define dump_mmp_msg(sb, mmp, msg) __dump_mmp_msg(sb, mmp, __func__, \
+ __LINE__, msg)
extern void __ext4_grp_locked_error(const char *, unsigned int, \
struct super_block *, ext4_group_t, \
unsigned long, ext4_fsblk_t, \
@@ -2064,6 +2160,8 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern void ext4_ext_truncate(struct inode *);
+extern int ext4_ext_punch_hole(struct file *file, loff_t offset,
+ loff_t length);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
@@ -2092,6 +2190,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
int len,
struct writeback_control *wbc);
+/* mmp.c */
+extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+
/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
enum ext4_state_bits {
BH_Uninit /* blocks are allocated but uninitialized on disk */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6e272ef6ba96..f5240aa15601 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -6,20 +6,6 @@
#include <trace/events/ext4.h>
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh)
-{
- int err = 0;
-
- if (ext4_handle_valid(handle)) {
- err = jbd2_journal_get_undo_access(handle, bh);
- if (err)
- ext4_journal_abort_handle(where, line, __func__, bh,
- handle, err);
- }
- return err;
-}
-
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh)
{
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d0f53538a57f..bb85757689b6 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -126,9 +126,6 @@ void ext4_journal_abort_handle(const char *caller, unsigned int line,
const char *err_fn,
struct buffer_head *bh, handle_t *handle, int err);
-int __ext4_journal_get_undo_access(const char *where, unsigned int line,
- handle_t *handle, struct buffer_head *bh);
-
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh);
@@ -146,8 +143,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
int __ext4_handle_dirty_super(const char *where, unsigned int line,
handle_t *handle, struct super_block *sb);
-#define ext4_journal_get_undo_access(handle, bh) \
- __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
__ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4890d6f3ad15..5199bac7fc62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -46,6 +46,13 @@
#include <trace/events/ext4.h>
+static int ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag,
+ int flags);
+
static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode,
int needed)
@@ -192,12 +199,13 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
static ext4_fsblk_t
ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
- struct ext4_extent *ex, int *err)
+ struct ext4_extent *ex, int *err, unsigned int flags)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err);
+ newblock = ext4_new_meta_blocks(handle, inode, goal, flags,
+ NULL, err);
return newblock;
}
@@ -474,9 +482,43 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
}
ext_debug("\n");
}
+
+static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
+ ext4_fsblk_t newblock, int level)
+{ /* Debug helper: ext_debug() every entry from path[level]'s current position onward, tagged with newblock. */
+ int depth = ext_depth(inode);
+ struct ext4_extent *ex;
+
+ if (depth != level) { /* level is not the tree depth: walk index entries */
+ struct ext4_extent_idx *idx;
+ idx = path[level].p_idx;
+ while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) {
+ ext_debug("%d: move %d:%llu in new index %llu\n", level,
+ le32_to_cpu(idx->ei_block),
+ ext4_idx_pblock(idx),
+ newblock);
+ idx++;
+ }
+
+ return;
+ }
+
+ ex = path[depth].p_ext; /* level == depth here: walk extent entries */
+ while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) {
+ ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
+ le32_to_cpu(ex->ee_block),
+ ext4_ext_pblock(ex),
+ ext4_ext_is_uninitialized(ex),
+ ext4_ext_get_actual_len(ex),
+ newblock);
+ ex++;
+ }
+}
+
#else
#define ext4_ext_show_path(inode, path)
#define ext4_ext_show_leaf(inode, path)
+#define ext4_ext_show_move(inode, path, newblock, level)
#endif
void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -792,14 +834,14 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
* - initializes subtree
*/
static int ext4_ext_split(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext, int at)
+ unsigned int flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext, int at)
{
struct buffer_head *bh = NULL;
int depth = ext_depth(inode);
struct ext4_extent_header *neh;
struct ext4_extent_idx *fidx;
- struct ext4_extent *ex;
int i = at, k, m, a;
ext4_fsblk_t newblock, oldblock;
__le32 border;
@@ -847,7 +889,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
newblock = ext4_ext_new_meta_block(handle, inode, path,
- newext, &err);
+ newext, &err, flags);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -876,7 +918,6 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
neh->eh_magic = EXT4_EXT_MAGIC;
neh->eh_depth = 0;
- ex = EXT_FIRST_EXTENT(neh);
/* move remainder of path[depth] to the new leaf */
if (unlikely(path[depth].p_hdr->eh_entries !=
@@ -888,25 +929,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
goto cleanup;
}
/* start copy from next extent */
- /* TODO: we could do it by single memmove */
- m = 0;
- path[depth].p_ext++;
- while (path[depth].p_ext <=
- EXT_MAX_EXTENT(path[depth].p_hdr)) {
- ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
- le32_to_cpu(path[depth].p_ext->ee_block),
- ext4_ext_pblock(path[depth].p_ext),
- ext4_ext_is_uninitialized(path[depth].p_ext),
- ext4_ext_get_actual_len(path[depth].p_ext),
- newblock);
- /*memmove(ex++, path[depth].p_ext++,
- sizeof(struct ext4_extent));
- neh->eh_entries++;*/
- path[depth].p_ext++;
- m++;
- }
+ m = EXT_MAX_EXTENT(path[depth].p_hdr) - path[depth].p_ext++;
+ ext4_ext_show_move(inode, path, newblock, depth);
if (m) {
- memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m);
+ struct ext4_extent *ex;
+ ex = EXT_FIRST_EXTENT(neh);
+ memmove(ex, path[depth].p_ext, sizeof(struct ext4_extent) * m);
le16_add_cpu(&neh->eh_entries, m);
}
@@ -968,12 +996,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
ext_debug("int.index at %d (block %llu): %u -> %llu\n",
i, newblock, le32_to_cpu(border), oldblock);
- /* copy indexes */
- m = 0;
- path[i].p_idx++;
- ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
- EXT_MAX_INDEX(path[i].p_hdr));
+ /* move remainder of path[i] to the new index block */
if (unlikely(EXT_MAX_INDEX(path[i].p_hdr) !=
EXT_LAST_INDEX(path[i].p_hdr))) {
EXT4_ERROR_INODE(inode,
@@ -982,20 +1006,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
err = -EIO;
goto cleanup;
}
- while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) {
- ext_debug("%d: move %d:%llu in new index %llu\n", i,
- le32_to_cpu(path[i].p_idx->ei_block),
- ext4_idx_pblock(path[i].p_idx),
- newblock);
- /*memmove(++fidx, path[i].p_idx++,
- sizeof(struct ext4_extent_idx));
- neh->eh_entries++;
- BUG_ON(neh->eh_entries > neh->eh_max);*/
- path[i].p_idx++;
- m++;
- }
+ /* start copy indexes */
+ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++;
+ ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx,
+ EXT_MAX_INDEX(path[i].p_hdr));
+ ext4_ext_show_move(inode, path, newblock, i);
if (m) {
- memmove(++fidx, path[i].p_idx - m,
+ memmove(++fidx, path[i].p_idx,
sizeof(struct ext4_extent_idx) * m);
le16_add_cpu(&neh->eh_entries, m);
}
@@ -1056,8 +1073,9 @@ cleanup:
* just created block
*/
static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ unsigned int flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
{
struct ext4_ext_path *curp = path;
struct ext4_extent_header *neh;
@@ -1065,7 +1083,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err, flags);
if (newblock == 0)
return err;
@@ -1140,8 +1159,9 @@ out:
* if no free index is found, then it requests in-depth growing.
*/
static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path,
- struct ext4_extent *newext)
+ unsigned int flags,
+ struct ext4_ext_path *path,
+ struct ext4_extent *newext)
{
struct ext4_ext_path *curp;
int depth, i, err = 0;
@@ -1161,7 +1181,7 @@ repeat:
if (EXT_HAS_FREE_INDEX(curp)) {
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
- err = ext4_ext_split(handle, inode, path, newext, i);
+ err = ext4_ext_split(handle, inode, flags, path, newext, i);
if (err)
goto out;
@@ -1174,7 +1194,8 @@ repeat:
err = PTR_ERR(path);
} else {
/* tree is full, time to grow in depth */
- err = ext4_ext_grow_indepth(handle, inode, path, newext);
+ err = ext4_ext_grow_indepth(handle, inode, flags,
+ path, newext);
if (err)
goto out;
@@ -1563,7 +1584,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
* Returns 0 if the extents (ex and ex+1) were _not_ merged and returns
* 1 if they got merged.
*/
-static int ext4_ext_try_to_merge(struct inode *inode,
+static int ext4_ext_try_to_merge_right(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex)
{
@@ -1603,6 +1624,31 @@ static int ext4_ext_try_to_merge(struct inode *inode,
}
/*
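+ * This function tries to merge the @ex extent with its neighbours in the tree.
+ * A merge with the left neighbour (ex - 1) is tried first; only if that does
+ * not happen is a merge with the right neighbour tried.  Returns 1 if the
+ * right-hand merge happened, otherwise 0 (including the case where @ex was
+ * merged into its left neighbour).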
+ */
+static int ext4_ext_try_to_merge(struct inode *inode,
+				 struct ext4_ext_path *path,
+				 struct ext4_extent *ex)
+{
+	unsigned int leaf_level = ext_depth(inode);
+	struct ext4_extent_header *leaf_hdr;
+
+	BUG_ON(path[leaf_level].p_hdr == NULL);
+	leaf_hdr = path[leaf_level].p_hdr;
+
+	/*
+	 * Left merge: the helper only merges an extent with the one after
+	 * it, so hand it the previous slot.  When that succeeds nothing
+	 * else is attempted and 0 is reported.
+	 */
+	if (ex > EXT_FIRST_EXTENT(leaf_hdr) &&
+	    ext4_ext_try_to_merge_right(inode, path, ex - 1))
+		return 0;
+
+	/* No left merge happened: report the outcome of the right merge. */
+	return ext4_ext_try_to_merge_right(inode, path, ex);
+}
+
+/*
* check if a portion of the "newext" extent overlaps with an
* existing extent.
*
@@ -1668,6 +1714,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
int depth, len, err;
ext4_lblk_t next;
unsigned uninitialized = 0;
+ int flags = 0;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1742,7 +1789,9 @@ repeat:
* There is no free space in the found leaf.
* We're gonna add a new leaf in the tree.
*/
- err = ext4_ext_create_new_leaf(handle, inode, path, newext);
+ if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
+ flags = EXT4_MB_USE_ROOT_BLOCKS;
+ err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
if (err)
goto cleanup;
depth = ext_depth(inode);
@@ -2003,13 +2052,25 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
}
/*
+ * ext4_ext_check_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * cache extent pointer. If the cached extent is a hole,
+ * this routine should be used instead of
+ * ext4_ext_in_cache if the calling function needs to
+ * know the size of the hole.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex: Pointer where the cached extent will be stored
+ * if it contains block
+ *
* Return 0 if cache is invalid; 1 if the cache is valid
*/
-static int
-ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
- struct ext4_extent *ex)
-{
+static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
+ struct ext4_ext_cache *ex){
struct ext4_ext_cache *cex;
+ struct ext4_sb_info *sbi;
int ret = 0;
/*
@@ -2017,26 +2078,60 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
*/
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
cex = &EXT4_I(inode)->i_cached_extent;
+ sbi = EXT4_SB(inode->i_sb);
/* has cache valid data? */
if (cex->ec_len == 0)
goto errout;
if (in_range(block, cex->ec_block, cex->ec_len)) {
- ex->ee_block = cpu_to_le32(cex->ec_block);
- ext4_ext_store_pblock(ex, cex->ec_start);
- ex->ee_len = cpu_to_le16(cex->ec_len);
+ memcpy(ex, cex, sizeof(struct ext4_ext_cache));
ext_debug("%u cached by %u:%u:%llu\n",
block,
cex->ec_block, cex->ec_len, cex->ec_start);
ret = 1;
}
errout:
+ if (!ret)
+ sbi->extent_cache_misses++;
+ else
+ sbi->extent_cache_hits++;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
return ret;
}
/*
+ * ext4_ext_in_cache()
+ * Checks to see if the given block is in the cache.
+ * If it is, the cached extent is stored in the given
+ * extent pointer.
+ *
+ * @inode: The files inode
+ * @block: The block to look for in the cache
+ * @ex: Pointer where the cached extent will be stored
+ * if it contains block
+ *
+ * Return 0 if cache is invalid; 1 if the cache is valid
+ */
+static int
+ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
+			struct ext4_extent *ex)
+{
+	struct ext4_ext_cache hit;
+
+	/* Cache miss: leave @ex untouched. */
+	if (!ext4_ext_check_cache(inode, block, &hit))
+		return 0;
+
+	/* Cache hit: convert the cached entry into on-disk extent layout. */
+	ex->ee_block = cpu_to_le32(hit.ec_block);
+	ext4_ext_store_pblock(ex, hit.ec_start);
+	ex->ee_len = cpu_to_le16(hit.ec_len);
+
+	return 1;
+}
+
+
+/*
* ext4_ext_rm_idx:
* removes index from the index block.
* It's used in truncate case only, thus all requests are for
@@ -2163,8 +2258,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
ext4_free_blocks(handle, inode, NULL, start, num, flags);
} else if (from == le32_to_cpu(ex->ee_block)
&& to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
- printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
- from, to, le32_to_cpu(ex->ee_block), ee_len);
+ /* head removal */
+ ext4_lblk_t num;
+ ext4_fsblk_t start;
+
+ num = to - from;
+ start = ext4_ext_pblock(ex);
+
+ ext_debug("free first %u blocks starting %llu\n", num, start);
+ ext4_free_blocks(handle, inode, 0, start, num, flags);
+
} else {
printk(KERN_INFO "strange request: removal(2) "
"%u-%u from %u:%u\n",
@@ -2173,9 +2276,22 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
return 0;
}
+
+/*
+ * ext4_ext_rm_leaf() Removes the extents associated with the
+ * blocks appearing between "start" and "end", and splits the extents
+ * if "start" and "end" appear in the same extent
+ *
+ * @handle: The journal handle
+ * @inode: The files inode
+ * @path: The path to the leaf
+ * @start: The first block to remove
+ * @end: The last block to remove
+ */
static int
ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
- struct ext4_ext_path *path, ext4_lblk_t start)
+ struct ext4_ext_path *path, ext4_lblk_t start,
+ ext4_lblk_t end)
{
int err = 0, correct_index = 0;
int depth = ext_depth(inode), credits;
@@ -2186,6 +2302,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unsigned short ex_ee_len;
unsigned uninitialized = 0;
struct ext4_extent *ex;
+ struct ext4_map_blocks map;
/* the header must be checked already in ext4_ext_remove_space() */
ext_debug("truncate since %u in leaf\n", start);
@@ -2215,31 +2332,95 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
path[depth].p_ext = ex;
a = ex_ee_block > start ? ex_ee_block : start;
- b = ex_ee_block + ex_ee_len - 1 < EXT_MAX_BLOCK ?
- ex_ee_block + ex_ee_len - 1 : EXT_MAX_BLOCK;
+ b = ex_ee_block+ex_ee_len - 1 < end ?
+ ex_ee_block+ex_ee_len - 1 : end;
ext_debug(" border %u:%u\n", a, b);
- if (a != ex_ee_block && b != ex_ee_block + ex_ee_len - 1) {
- block = 0;
- num = 0;
- BUG();
+ /* If this extent is beyond the end of the hole, skip it */
+ if (end <= ex_ee_block) {
+ ex--;
+ ex_ee_block = le32_to_cpu(ex->ee_block);
+ ex_ee_len = ext4_ext_get_actual_len(ex);
+ continue;
+ } else if (a != ex_ee_block &&
+ b != ex_ee_block + ex_ee_len - 1) {
+ /*
+ * If this is a truncate, then this condition should
+ * never happen because at least one of the end points
+ * needs to be on the edge of the extent.
+ */
+ if (end == EXT_MAX_BLOCK) {
+ ext_debug(" bad truncate %u:%u\n",
+ start, end);
+ block = 0;
+ num = 0;
+ err = -EIO;
+ goto out;
+ }
+ /*
+ * else this is a hole punch, so the extent needs to
+ * be split since neither edge of the hole is on the
+ * extent edge
+ */
+ else{
+ map.m_pblk = ext4_ext_pblock(ex);
+ map.m_lblk = ex_ee_block;
+ map.m_len = b - ex_ee_block;
+
+ err = ext4_split_extent(handle,
+ inode, path, &map, 0,
+ EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+ EXT4_GET_BLOCKS_PRE_IO);
+
+ if (err < 0)
+ goto out;
+
+ ex_ee_len = ext4_ext_get_actual_len(ex);
+
+ b = ex_ee_block+ex_ee_len - 1 < end ?
+ ex_ee_block+ex_ee_len - 1 : end;
+
+ /* Then remove tail of this extent */
+ block = ex_ee_block;
+ num = a - block;
+ }
} else if (a != ex_ee_block) {
/* remove tail of the extent */
block = ex_ee_block;
num = a - block;
} else if (b != ex_ee_block + ex_ee_len - 1) {
/* remove head of the extent */
- block = a;
- num = b - a;
- /* there is no "make a hole" API yet */
- BUG();
+ block = b;
+ num = ex_ee_block + ex_ee_len - b;
+
+ /*
+ * If this is a truncate, this condition
+ * should never happen
+ */
+ if (end == EXT_MAX_BLOCK) {
+ ext_debug(" bad truncate %u:%u\n",
+ start, end);
+ err = -EIO;
+ goto out;
+ }
} else {
/* remove whole extent: excellent! */
block = ex_ee_block;
num = 0;
- BUG_ON(a != ex_ee_block);
- BUG_ON(b != ex_ee_block + ex_ee_len - 1);
+ if (a != ex_ee_block) {
+ ext_debug(" bad truncate %u:%u\n",
+ start, end);
+ err = -EIO;
+ goto out;
+ }
+
+ if (b != ex_ee_block + ex_ee_len - 1) {
+ ext_debug(" bad truncate %u:%u\n",
+ start, end);
+ err = -EIO;
+ goto out;
+ }
}
/*
@@ -2270,7 +2451,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (num == 0) {
/* this extent is removed; mark slot entirely unused */
ext4_ext_store_pblock(ex, 0);
- le16_add_cpu(&eh->eh_entries, -1);
+ } else if (block != ex_ee_block) {
+ /*
+ * If this was a head removal, then we need to update
+ * the physical block since it is now at a different
+ * location
+ */
+ ext4_ext_store_pblock(ex, ext4_ext_pblock(ex) + (b-a));
}
ex->ee_block = cpu_to_le32(block);
@@ -2286,6 +2473,27 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
if (err)
goto out;
+ /*
+ * If the extent was completely released,
+ * we need to remove it from the leaf
+ */
+ if (num == 0) {
+ if (end != EXT_MAX_BLOCK) {
+ /*
+ * For hole punching, we need to scoot all the
+ * extents up when an extent is removed so that
+ * we dont have blank extents in the middle
+ */
+ memmove(ex, ex+1, (EXT_LAST_EXTENT(eh) - ex) *
+ sizeof(struct ext4_extent));
+
+ /* Now get rid of the one at the end */
+ memset(EXT_LAST_EXTENT(eh), 0,
+ sizeof(struct ext4_extent));
+ }
+ le16_add_cpu(&eh->eh_entries, -1);
+ }
+
ext_debug("new extent: %u:%u:%llu\n", block, num,
ext4_ext_pblock(ex));
ex--;
@@ -2326,7 +2534,8 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
return 1;
}
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t end)
{
struct super_block *sb = inode->i_sb;
int depth = ext_depth(inode);
@@ -2365,7 +2574,8 @@ again:
while (i >= 0 && err == 0) {
if (i == depth) {
/* this is leaf block */
- err = ext4_ext_rm_leaf(handle, inode, path, start);
+ err = ext4_ext_rm_leaf(handle, inode, path,
+ start, end);
/* root level has p_bh == NULL, brelse() eats this */
brelse(path[i].p_bh);
path[i].p_bh = NULL;
@@ -2529,6 +2739,195 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
return ret;
}
+/*
+ * Bits for the split_flag argument of ext4_split_extent_at() and
+ * ext4_split_extent().  They select whether a failed split may be recovered
+ * by zeroing the extent out, and which of the two resulting halves is
+ * marked uninitialized.
+ */
+#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
+ due to ENOSPC */
+#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
+#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
+
+/*
+ * ext4_split_extent_at() splits an extent at a given block.
+ *
+ * @handle: the journal handle
+ * @inode: the file inode
+ * @path: the path to the extent
+ * @split: the logical block at which the extent is split.
+ * @split_flag: indicates whether the extent may be zeroed out if the split
+ * fails (EXT4_EXT_MAY_ZEROOUT), and the state (init or uninit)
+ * of the resulting extents (EXT4_EXT_MARK_UNINIT1/2).
+ * @flags: flags passed on when inserting the new extent into the extent tree.
+ *
+ * Splits extent [a, b] into two extents [a, @split) and [@split, b], the
+ * states of which are determined by @split_flag.
+ *
+ * There are two cases:
+ * a> the extent is split into two extents.
+ * b> @split is the first block of the extent: no split is needed, only the
+ * state of the extent is changed.
+ *
+ * @split must lie inside the extent (BUG otherwise).
+ *
+ * Returns 0 on success, otherwise the error from the failing step.
+ */
+static int ext4_split_extent_at(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t split,
+ int split_flag,
+ int flags)
+{
+ ext4_fsblk_t newblock;
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex, newex, orig_ex;
+ struct ext4_extent *ex2 = NULL;
+ unsigned int ee_len, depth;
+ int err = 0;
+
+ ext_debug("ext4_split_extents_at: inode %lu, logical"
+ "block %llu\n", inode->i_ino, (unsigned long long)split);
+
+ ext4_ext_show_leaf(inode, path);
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ /* physical block that corresponds to logical block @split */
+ newblock = split - ee_block + ext4_ext_pblock(ex);
+
+ BUG_ON(split < ee_block || split >= (ee_block + ee_len));
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (split == ee_block) {
+ /*
+ * case b: block @split is the block that the extent begins with
+ * then we just change the state of the extent, and splitting
+ * is not needed.
+ */
+ if (split_flag & EXT4_EXT_MARK_UNINIT2)
+ ext4_ext_mark_uninitialized(ex);
+ else
+ ext4_ext_mark_initialized(ex);
+
+ /* with EXT4_GET_BLOCKS_PRE_IO the extent is left unmerged */
+ if (!(flags & EXT4_GET_BLOCKS_PRE_IO))
+ ext4_ext_try_to_merge(inode, path, ex);
+
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ goto out;
+ }
+
+ /* case a */
+ /* keep a copy of the unsplit extent for the recovery paths below */
+ memcpy(&orig_ex, ex, sizeof(orig_ex));
+ /* shrink the existing extent to the first half [ee_block, split) */
+ ex->ee_len = cpu_to_le16(split - ee_block);
+ if (split_flag & EXT4_EXT_MARK_UNINIT1)
+ ext4_ext_mark_uninitialized(ex);
+
+ /*
+ * path may lead to new leaf, not to original leaf any more
+ * after ext4_ext_insert_extent() returns,
+ */
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto fix_extent_len;
+
+ /* build the second half [split, ee_block + ee_len) and insert it */
+ ex2 = &newex;
+ ex2->ee_block = cpu_to_le32(split);
+ ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
+ ext4_ext_store_pblock(ex2, newblock);
+ if (split_flag & EXT4_EXT_MARK_UNINIT2)
+ ext4_ext_mark_uninitialized(ex2);
+
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ /*
+ * No room for the second half: zero out the whole original
+ * extent instead and keep it as one extent.
+ */
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+ /*
+ * update the extent length and mark as initialized.
+ * ee_len is stored with the 16-bit conversion everywhere else
+ * in this function; a 32-bit conversion truncated into it
+ * yields the wrong value on big-endian hosts.
+ */
+ ex->ee_len = cpu_to_le16(ee_len);
+ ext4_ext_try_to_merge(inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ goto out;
+ } else if (err)
+ goto fix_extent_len;
+
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+
+fix_extent_len:
+ /* undo the shrink of the first half; the original error is returned */
+ ex->ee_len = orig_ex.ee_len;
+ ext4_ext_dirty(handle, inode, path + depth);
+ return err;
+}
+
+/*
+ * ext4_split_extent() splits an extent and marks the part which is covered
+ * by @map as @split_flag indicates.
+ *
+ * It may result in splitting the extent into multiple extents (up to three).
+ * There are three possibilities:
+ * a> There is no split required
+ * b> Splits in two extents: Split is happening at either end of the extent
+ * c> Splits in three extents: Someone is splitting in middle of the extent
+ *
+ * The split is done in up to two steps: first at the end of the mapped range
+ * (map->m_lblk + map->m_len), then at its start (map->m_lblk).
+ *
+ * Returns map->m_len on success, the error returned by
+ * ext4_split_extent_at() if a split step fails, or PTR_ERR() of the path if
+ * looking the path up again fails.
+ */
+static int ext4_split_extent(handle_t *handle,
+ struct inode *inode,
+ struct ext4_ext_path *path,
+ struct ext4_map_blocks *map,
+ int split_flag,
+ int flags)
+{
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex;
+ unsigned int ee_len, depth;
+ int err = 0;
+ int uninitialized;
+ int split_flag1, flags1;
+
+ /*
+ * Sample the extent once, up front.  NOTE(review): ee_block, ee_len and
+ * uninitialized are not re-read after the first split or after the path
+ * is looked up again below; they describe the extent as it was on entry.
+ */
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+ uninitialized = ext4_ext_is_uninitialized(ex);
+
+ /*
+ * Step 1: the mapped range ends before the extent does, so split the
+ * tail off at map->m_lblk + map->m_len.  Only the MAY_ZEROOUT bit of the
+ * caller's split_flag is carried over; if the extent was uninitialized,
+ * both resulting pieces stay uninitialized.  EXT4_GET_BLOCKS_PRE_IO is
+ * added to the flags for this step.
+ */
+ if (map->m_lblk + map->m_len < ee_block + ee_len) {
+ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
+ if (uninitialized)
+ split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
+ EXT4_EXT_MARK_UNINIT2;
+ err = ext4_split_extent_at(handle, inode, path,
+ map->m_lblk + map->m_len, split_flag1, flags1);
+ if (err)
+ goto out;
+ }
+
+ /*
+ * Look the path to map->m_lblk up again before the second step; after an
+ * insert the old path may no longer lead to the original leaf (see the
+ * comment in ext4_split_extent_at()).  This runs even when step 1 was
+ * skipped.
+ */
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ /*
+ * Step 2: split at the start of the mapped range.  The piece in front of
+ * it keeps the uninitialized state the extent had on entry; the mapped
+ * piece is marked uninitialized only if the caller asked for
+ * EXT4_EXT_MARK_UNINIT2.  The caller's original flags are used here.
+ */
+ if (map->m_lblk >= ee_block) {
+ split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ if (uninitialized)
+ split_flag1 |= EXT4_EXT_MARK_UNINIT1;
+ if (split_flag & EXT4_EXT_MARK_UNINIT2)
+ split_flag1 |= EXT4_EXT_MARK_UNINIT2;
+ err = ext4_split_extent_at(handle, inode, path,
+ map->m_lblk, split_flag1, flags);
+ if (err)
+ goto out;
+ }
+
+ ext4_ext_show_leaf(inode, path);
+out:
+ return err ? err : map->m_len;
+}
+
#define EXT4_EXT_ZERO_LEN 7
/*
* This function is called by ext4_ext_map_blocks() if someone tries to write
@@ -2545,17 +2944,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
struct ext4_map_blocks *map,
struct ext4_ext_path *path)
{
- struct ext4_extent *ex, newex, orig_ex;
- struct ext4_extent *ex1 = NULL;
- struct ext4_extent *ex2 = NULL;
- struct ext4_extent *ex3 = NULL;
- struct ext4_extent_header *eh;
+ struct ext4_map_blocks split_map;
+ struct ext4_extent zero_ex;
+ struct ext4_extent *ex;
ext4_lblk_t ee_block, eof_block;
unsigned int allocated, ee_len, depth;
- ext4_fsblk_t newblock;
int err = 0;
- int ret = 0;
- int may_zeroout;
+ int split_flag = 0;
ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino,
@@ -2567,280 +2962,86 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
eof_block = map->m_lblk + map->m_len;
depth = ext_depth(inode);
- eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (map->m_lblk - ee_block);
- newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
-
- ex2 = ex;
- orig_ex.ee_block = ex->ee_block;
- orig_ex.ee_len = cpu_to_le16(ee_len);
- ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
+ WARN_ON(map->m_lblk < ee_block);
/*
* It is safe to convert extent to initialized via explicit
* zeroout only if extent is fully insde i_size or new_size.
*/
- may_zeroout = ee_block + ee_len <= eof_block;
+ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
- if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
+ if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
+ (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ err = ext4_ext_zeroout(inode, ex);
if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zeroed the full extent */
- return allocated;
- }
-
- /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
- if (map->m_lblk > ee_block) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /*
- * for sanity, update the length of the ex2 extent before
- * we insert ex3, if ex1 is NULL. This is to avoid temporary
- * overlap of blocks.
- */
- if (!ex1 && allocated > map->m_len)
- ex2->ee_len = cpu_to_le16(map->m_len);
- /* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > map->m_len) {
- unsigned int newdepth;
- /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
- if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
- /*
- * map->m_lblk == ee_block is handled by the zerouout
- * at the beginning.
- * Mark first half uninitialized.
- * Mark second half initialized and zero out the
- * initialized extent
- */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = cpu_to_le16(ee_len - allocated);
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
-
- ex3 = &newex;
- ex3->ee_block = cpu_to_le32(map->m_lblk);
- ext4_ext_store_pblock(ex3, newblock);
- ex3->ee_len = cpu_to_le16(allocated);
- err = ext4_ext_insert_extent(handle, inode, path,
- ex3, 0);
- if (err == -ENOSPC) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex,
- ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* blocks available from map->m_lblk */
- return allocated;
-
- } else if (err)
- goto fix_extent_len;
-
- /*
- * We need to zero out the second half because
- * an fallocate request can update file size and
- * converting the second half to initialized extent
- * implies that we can leak some junk data to user
- * space.
- */
- err = ext4_ext_zeroout(inode, ex3);
- if (err) {
- /*
- * We should actually mark the
- * second half as uninit and return error
- * Insert would have changed the extent
- */
- depth = ext_depth(inode);
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk,
- path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- return err;
- }
- /* get the second half extent details */
- ex = path[depth].p_ext;
- err = ext4_ext_get_access(handle, inode,
- path + depth);
- if (err)
- return err;
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_dirty(handle, inode, path + depth);
- return err;
- }
-
- /* zeroed the second half */
- return allocated;
- }
- ex3 = &newex;
- ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
- ext4_ext_store_pblock(ex3, newblock + map->m_len);
- ex3->ee_len = cpu_to_le16(allocated - map->m_len);
- ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zeroed the full extent */
- /* blocks available from map->m_lblk */
- return allocated;
-
- } else if (err)
- goto fix_extent_len;
- /*
- * The depth, and hence eh & ex might change
- * as part of the insert above.
- */
- newdepth = ext_depth(inode);
- /*
- * update the extent length after successful insert of the
- * split extent
- */
- ee_len -= ext4_ext_get_actual_len(ex3);
- orig_ex.ee_len = cpu_to_le16(ee_len);
- may_zeroout = ee_block + ee_len <= eof_block;
-
- depth = newdepth;
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
goto out;
- }
- eh = path[depth].p_hdr;
- ex = path[depth].p_ext;
- if (ex2 != &newex)
- ex2 = ex;
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
-
- allocated = map->m_len;
-
- /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
- * to insert a extent in the middle zerout directly
- * otherwise give the extent a chance to merge to left
- */
- if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
- map->m_lblk != ee_block && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zero out the first half */
- /* blocks available from map->m_lblk */
- return allocated;
- }
- }
- /*
- * If there was a change of depth as part of the
- * insertion of ex3 above, we need to update the length
- * of the ex1 extent again here
- */
- if (ex1 && ex1 != ex) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
- ex2->ee_block = cpu_to_le32(map->m_lblk);
- ext4_ext_store_pblock(ex2, newblock);
- ex2->ee_len = cpu_to_le16(allocated);
- if (ex2 != ex)
- goto insert;
- /*
- * New (initialized) extent starts from the first block
- * in the current extent. i.e., ex2 == ex
- * We have to see if it can be merged with the extent
- * on the left.
- */
- if (ex2 > EXT_FIRST_EXTENT(eh)) {
- /*
- * To merge left, pass "ex2 - 1" to try_to_merge(),
- * since it merges towards right _only_.
- */
- ret = ext4_ext_try_to_merge(inode, path, ex2 - 1);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
- if (err)
- goto out;
- depth = ext_depth(inode);
- ex2--;
- }
+ ext4_ext_mark_initialized(ex);
+ ext4_ext_try_to_merge(inode, path, ex);
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ goto out;
}
+
/*
- * Try to Merge towards right. This might be required
- * only when the whole extent is being written to.
- * i.e. ex2 == ex and ex3 == NULL.
+ * four cases:
+ * 1. split the extent into three extents.
+ * 2. split the extent into two extents, zeroout the first half.
+ * 3. split the extent into two extents, zeroout the second half.
+ * 4. split the extent into two extents with out zeroout.
*/
- if (!ex3) {
- ret = ext4_ext_try_to_merge(inode, path, ex2);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
+ split_map.m_lblk = map->m_lblk;
+ split_map.m_len = map->m_len;
+
+ if (allocated > map->m_len) {
+ if (allocated <= EXT4_EXT_ZERO_LEN &&
+ (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ /* case 3 */
+ zero_ex.ee_block =
+ cpu_to_le32(map->m_lblk);
+ zero_ex.ee_len = cpu_to_le16(allocated);
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex) + map->m_lblk - ee_block);
+ err = ext4_ext_zeroout(inode, &zero_ex);
if (err)
goto out;
+ split_map.m_lblk = map->m_lblk;
+ split_map.m_len = allocated;
+ } else if ((map->m_lblk - ee_block + map->m_len <
+ EXT4_EXT_ZERO_LEN) &&
+ (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
+ /* case 2 */
+ if (map->m_lblk != ee_block) {
+ zero_ex.ee_block = ex->ee_block;
+ zero_ex.ee_len = cpu_to_le16(map->m_lblk -
+ ee_block);
+ ext4_ext_store_pblock(&zero_ex,
+ ext4_ext_pblock(ex));
+ err = ext4_ext_zeroout(inode, &zero_ex);
+ if (err)
+ goto out;
+ }
+
+ split_map.m_lblk = ee_block;
+ split_map.m_len = map->m_lblk - ee_block + map->m_len;
+ allocated = map->m_len;
}
}
- /* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
- goto out;
-insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zero out the first half */
- return allocated;
- } else if (err)
- goto fix_extent_len;
+
+ allocated = ext4_split_extent(handle, inode, path,
+ &split_map, split_flag, 0);
+ if (allocated < 0)
+ err = allocated;
+
out:
- ext4_ext_show_leaf(inode, path);
return err ? err : allocated;
-
-fix_extent_len:
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_dirty(handle, inode, path + depth);
- return err;
}
/*
@@ -2871,15 +3072,11 @@ static int ext4_split_unwritten_extents(handle_t *handle,
struct ext4_ext_path *path,
int flags)
{
- struct ext4_extent *ex, newex, orig_ex;
- struct ext4_extent *ex1 = NULL;
- struct ext4_extent *ex2 = NULL;
- struct ext4_extent *ex3 = NULL;
- ext4_lblk_t ee_block, eof_block;
- unsigned int allocated, ee_len, depth;
- ext4_fsblk_t newblock;
- int err = 0;
- int may_zeroout;
+ ext4_lblk_t eof_block;
+ ext4_lblk_t ee_block;
+ struct ext4_extent *ex;
+ unsigned int ee_len;
+ int split_flag = 0, depth;
ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
"block %llu, max_blocks %u\n", inode->i_ino,
@@ -2889,156 +3086,22 @@ static int ext4_split_unwritten_extents(handle_t *handle,
inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
eof_block = map->m_lblk + map->m_len;
-
- depth = ext_depth(inode);
- ex = path[depth].p_ext;
- ee_block = le32_to_cpu(ex->ee_block);
- ee_len = ext4_ext_get_actual_len(ex);
- allocated = ee_len - (map->m_lblk - ee_block);
- newblock = map->m_lblk - ee_block + ext4_ext_pblock(ex);
-
- ex2 = ex;
- orig_ex.ee_block = ex->ee_block;
- orig_ex.ee_len = cpu_to_le16(ee_len);
- ext4_ext_store_pblock(&orig_ex, ext4_ext_pblock(ex));
-
/*
* It is safe to convert extent to initialized via explicit
* zeroout only if extent is fully inside i_size or new_size.
*/
- may_zeroout = ee_block + ee_len <= eof_block;
-
- /*
- * If the uninitialized extent begins at the same logical
- * block where the write begins, and the write completely
- * covers the extent, then we don't need to split it.
- */
- if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
- return allocated;
-
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
- /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
- if (map->m_lblk > ee_block) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /*
- * for sanity, update the length of the ex2 extent before
- * we insert ex3, if ex1 is NULL. This is to avoid temporary
- * overlap of blocks.
- */
- if (!ex1 && allocated > map->m_len)
- ex2->ee_len = cpu_to_le16(map->m_len);
- /* ex3: to ee_block + ee_len : uninitialised */
- if (allocated > map->m_len) {
- unsigned int newdepth;
- ex3 = &newex;
- ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
- ext4_ext_store_pblock(ex3, newblock + map->m_len);
- ex3->ee_len = cpu_to_le16(allocated - map->m_len);
- ext4_ext_mark_uninitialized(ex3);
- err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zeroed the full extent */
- /* blocks available from map->m_lblk */
- return allocated;
-
- } else if (err)
- goto fix_extent_len;
- /*
- * The depth, and hence eh & ex might change
- * as part of the insert above.
- */
- newdepth = ext_depth(inode);
- /*
- * update the extent length after successful insert of the
- * split extent
- */
- ee_len -= ext4_ext_get_actual_len(ex3);
- orig_ex.ee_len = cpu_to_le16(ee_len);
- may_zeroout = ee_block + ee_len <= eof_block;
-
- depth = newdepth;
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, map->m_lblk, path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out;
- }
- ex = path[depth].p_ext;
- if (ex2 != &newex)
- ex2 = ex;
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
+ split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag |= EXT4_EXT_MARK_UNINIT2;
- allocated = map->m_len;
- }
- /*
- * If there was a change of depth as part of the
- * insertion of ex3 above, we need to update the length
- * of the ex1 extent again here
- */
- if (ex1 && ex1 != ex) {
- ex1 = ex;
- ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
- ext4_ext_mark_uninitialized(ex1);
- ex2 = &newex;
- }
- /*
- * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
- * using direct I/O, uninitialised still.
- */
- ex2->ee_block = cpu_to_le32(map->m_lblk);
- ext4_ext_store_pblock(ex2, newblock);
- ex2->ee_len = cpu_to_le16(allocated);
- ext4_ext_mark_uninitialized(ex2);
- if (ex2 != ex)
- goto insert;
- /* Mark modified extent as dirty */
- err = ext4_ext_dirty(handle, inode, path + depth);
- ext_debug("out here\n");
- goto out;
-insert:
- err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
- if (err == -ENOSPC && may_zeroout) {
- err = ext4_ext_zeroout(inode, &orig_ex);
- if (err)
- goto fix_extent_len;
- /* update the extent length and mark as initialized */
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_dirty(handle, inode, path + depth);
- /* zero out the first half */
- return allocated;
- } else if (err)
- goto fix_extent_len;
-out:
- ext4_ext_show_leaf(inode, path);
- return err ? err : allocated;
-
-fix_extent_len:
- ex->ee_block = orig_ex.ee_block;
- ex->ee_len = orig_ex.ee_len;
- ext4_ext_store_pblock(ex, ext4_ext_pblock(&orig_ex));
- ext4_ext_mark_uninitialized(ex);
- ext4_ext_dirty(handle, inode, path + depth);
- return err;
+ flags |= EXT4_GET_BLOCKS_PRE_IO;
+ return ext4_split_extent(handle, inode, path, map, split_flag, flags);
}
+
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct inode *inode,
struct ext4_ext_path *path)
@@ -3047,46 +3110,27 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct ext4_extent_header *eh;
int depth;
int err = 0;
- int ret = 0;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
ex = path[depth].p_ext;
+ ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
+ "block %llu, max_blocks %u\n", inode->i_ino,
+ (unsigned long long)le32_to_cpu(ex->ee_block),
+ ext4_ext_get_actual_len(ex));
+
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
goto out;
/* first mark the extent as initialized */
ext4_ext_mark_initialized(ex);
- /*
- * We have to see if it can be merged with the extent
- * on the left.
- */
- if (ex > EXT_FIRST_EXTENT(eh)) {
- /*
- * To merge left, pass "ex - 1" to try_to_merge(),
- * since it merges towards right _only_.
- */
- ret = ext4_ext_try_to_merge(inode, path, ex - 1);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
- if (err)
- goto out;
- depth = ext_depth(inode);
- ex--;
- }
- }
- /*
- * Try to Merge towards right.
+ /* note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
*/
- ret = ext4_ext_try_to_merge(inode, path, ex);
- if (ret) {
- err = ext4_ext_correct_indexes(handle, inode, path);
- if (err)
- goto out;
- depth = ext_depth(inode);
- }
+ ext4_ext_try_to_merge(inode, path, ex);
+
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + depth);
out:
@@ -3302,15 +3346,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock = 0;
int err = 0, depth, ret;
unsigned int allocated = 0;
+ unsigned int punched_out = 0;
+ unsigned int result = 0;
struct ext4_allocation_request ar;
ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+ struct ext4_map_blocks punch_map;
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* check in cache */
- if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+ if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
+ ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
if (!newex.ee_start_lo && !newex.ee_start_hi) {
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
/*
@@ -3375,16 +3423,84 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock);
- /* Do not put uninitialized extent in the cache */
- if (!ext4_ext_is_uninitialized(ex)) {
- ext4_ext_put_in_cache(inode, ee_block,
- ee_len, ee_start);
- goto out;
+ if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
+ /*
+ * Do not put uninitialized extent
+ * in the cache
+ */
+ if (!ext4_ext_is_uninitialized(ex)) {
+ ext4_ext_put_in_cache(inode, ee_block,
+ ee_len, ee_start);
+ goto out;
+ }
+ ret = ext4_ext_handle_uninitialized_extents(
+ handle, inode, map, path, flags,
+ allocated, newblock);
+ return ret;
}
- ret = ext4_ext_handle_uninitialized_extents(handle,
- inode, map, path, flags, allocated,
- newblock);
- return ret;
+
+ /*
+ * Punch out the map length, but only to the
+ * end of the extent
+ */
+ punched_out = allocated < map->m_len ?
+ allocated : map->m_len;
+
+ /*
+ * Since extents need to be converted to
+ * uninitialized, they must fit in an
+ * uninitialized extent
+ */
+ if (punched_out > EXT_UNINIT_MAX_LEN)
+ punched_out = EXT_UNINIT_MAX_LEN;
+
+ punch_map.m_lblk = map->m_lblk;
+ punch_map.m_pblk = newblock;
+ punch_map.m_len = punched_out;
+ punch_map.m_flags = 0;
+
+ /* Check to see if the extent needs to be split */
+ if (punch_map.m_len != ee_len ||
+ punch_map.m_lblk != ee_block) {
+
+ ret = ext4_split_extent(handle, inode,
+ path, &punch_map, 0,
+ EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
+ EXT4_GET_BLOCKS_PRE_IO);
+
+ if (ret < 0) {
+ err = ret;
+ goto out2;
+ }
+ /*
+ * find extent for the block at
+ * the start of the hole
+ */
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ path = ext4_ext_find_extent(inode,
+ map->m_lblk, NULL);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+ goto out2;
+ }
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_len = ext4_ext_get_actual_len(ex);
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_start = ext4_ext_pblock(ex);
+
+ }
+
+ ext4_ext_mark_uninitialized(ex);
+
+ err = ext4_ext_remove_space(inode, map->m_lblk,
+ map->m_lblk + punched_out);
+
+ goto out2;
}
}
@@ -3446,6 +3562,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
else
/* disable in-core preallocation for non-regular files */
ar.flags = 0;
+ if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE)
+ ar.flags |= EXT4_MB_HINT_NOPREALLOC;
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out2;
@@ -3529,7 +3647,11 @@ out2:
}
trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
newblock, map->m_len, err ? err : allocated);
- return err ? err : allocated;
+
+ result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
+ punched_out : allocated;
+
+ return err ? err : result;
}
void ext4_ext_truncate(struct inode *inode)
@@ -3577,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
- err = ext4_ext_remove_space(inode, last_block);
+ err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK);
/* In a multi-transaction truncate, we only make the final
* transaction synchronous.
@@ -3585,8 +3707,9 @@ void ext4_ext_truncate(struct inode *inode)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
-out_stop:
up_write(&EXT4_I(inode)->i_data_sem);
+
+out_stop:
/*
* If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
@@ -3651,10 +3774,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
struct ext4_map_blocks map;
unsigned int credits, blkbits = inode->i_blkbits;
- /* We only support the FALLOC_FL_KEEP_SIZE mode */
- if (mode & ~FALLOC_FL_KEEP_SIZE)
- return -EOPNOTSUPP;
-
/*
* currently supporting (pre)allocate mode for extent-based
* files _only_
@@ -3662,6 +3781,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
+ /* Return error if mode is not supported */
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return ext4_punch_hole(file, offset, len);
+
trace_ext4_fallocate_enter(inode, offset, len, mode);
map.m_lblk = offset >> blkbits;
/*
@@ -3691,7 +3817,8 @@ retry:
break;
}
ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT |
+ EXT4_GET_BLOCKS_NO_NORMALIZE);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
@@ -3822,6 +3949,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
pgoff_t last_offset;
pgoff_t offset;
pgoff_t index;
+ pgoff_t start_index = 0;
struct page **pages = NULL;
struct buffer_head *bh = NULL;
struct buffer_head *head = NULL;
@@ -3848,39 +3976,57 @@ out:
kfree(pages);
return EXT_CONTINUE;
}
+ index = 0;
+next_page:
/* Try to find the 1st mapped buffer. */
- end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
+ end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
blksize_bits;
- if (!page_has_buffers(pages[0]))
+ if (!page_has_buffers(pages[index]))
goto out;
- head = page_buffers(pages[0]);
+ head = page_buffers(pages[index]);
if (!head)
goto out;
+ index++;
bh = head;
do {
- if (buffer_mapped(bh)) {
+ if (end >= newex->ec_block +
+ newex->ec_len)
+ /* The buffer is out of
+ * the request range.
+ */
+ goto out;
+
+ if (buffer_mapped(bh) &&
+ end >= newex->ec_block) {
+ start_index = index - 1;
/* get the 1st mapped buffer. */
- if (end > newex->ec_block +
- newex->ec_len)
- /* The buffer is out of
- * the request range.
- */
- goto out;
goto found_mapped_buffer;
}
+
bh = bh->b_this_page;
end++;
} while (bh != head);
- /* No mapped buffer found. */
- goto out;
+ /* No mapped buffer in the range found in this page,
+ * We need to look up next page.
+ */
+ if (index >= ret) {
+ /* There is no page left, but we need to limit
+ * newex->ec_len.
+ */
+ newex->ec_len = end - newex->ec_block;
+ goto out;
+ }
+ goto next_page;
} else {
/*Find contiguous delayed buffers. */
if (ret > 0 && pages[0]->index == last_offset)
head = page_buffers(pages[0]);
bh = head;
+ index = 1;
+ start_index = 0;
}
found_mapped_buffer:
@@ -3903,7 +4049,7 @@ found_mapped_buffer:
end++;
} while (bh != head);
- for (index = 1; index < ret; index++) {
+ for (; index < ret; index++) {
if (!page_has_buffers(pages[index])) {
bh = NULL;
break;
@@ -3913,8 +4059,10 @@ found_mapped_buffer:
bh = NULL;
break;
}
+
if (pages[index]->index !=
- pages[0]->index + index) {
+ pages[start_index]->index + index
+ - start_index) {
/* Blocks are not contiguous. */
bh = NULL;
break;
@@ -4006,6 +4154,177 @@ static int ext4_xattr_fiemap(struct inode *inode,
return (error < 0 ? error : 0);
}
+/*
+ * ext4_ext_punch_hole
+ *
+ * Punches a hole of "length" bytes in a file starting
+ * at byte "offset"
+ *
+ * @file: The file to punch a hole in
+ * @offset: The starting byte offset of the hole
+ * @length: The length of the hole
+ *
+ * Dirty pages are written back and the pages lying entirely inside the
+ * range are dropped from the page cache; the partial blocks at either
+ * edge of the range are zeroed; then every whole block in the range is
+ * handed to ext4_ext_map_blocks() with EXT4_GET_BLOCKS_PUNCH_OUT_EXT,
+ * stepping over existing holes with the help of the extent cache.
+ *
+ * NOTE(review): first_block and last_block are ext4_lblk_t; if that
+ * type is narrower than loff_t, the shifts that compute
+ * first_block_offset and last_block_offset would wrap for large
+ * offsets - confirm.
+ *
+ * Returns 0 on success or a negative error code on failure
+ */
+int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ struct ext4_ext_cache cache_ex;
+ ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
+ struct address_space *mapping = inode->i_mapping;
+ struct ext4_map_blocks map;
+ handle_t *handle;
+ loff_t first_block_offset, last_block_offset, block_len;
+ loff_t first_page, last_page, first_page_offset, last_page_offset;
+ int ret, credits, blocks_released, err = 0;
+
+ first_block = (offset + sb->s_blocksize - 1) >>
+ EXT4_BLOCK_SIZE_BITS(sb);
+ last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ first_block_offset = first_block << EXT4_BLOCK_SIZE_BITS(sb);
+ last_block_offset = last_block << EXT4_BLOCK_SIZE_BITS(sb);
+
+ first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ last_page = (offset + length) >> PAGE_CACHE_SHIFT;
+
+ first_page_offset = first_page << PAGE_CACHE_SHIFT;
+ last_page_offset = last_page << PAGE_CACHE_SHIFT;
+
+ /*
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
+ */
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ err = filemap_write_and_wait_range(mapping,
+ first_page_offset == 0 ? 0 : first_page_offset-1,
+ last_page_offset);
+
+ if (err)
+ return err;
+ }
+
+ /* Now release the pages */
+ if (last_page_offset > first_page_offset) {
+ truncate_inode_pages_range(mapping, first_page_offset,
+ last_page_offset-1);
+ }
+
+ /* finish any pending end_io work */
+ ext4_flush_completed_IO(inode);
+
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ err = ext4_orphan_add(handle, inode);
+ if (err)
+ goto out;
+
+ /*
+ * Now we need to zero out the data that is not block aligned.
+ * If the hole starts and ends inside the same block
+ * (first_block, rounded up, is past last_block, rounded down),
+ * just zero out the middle
+ */
+ if (first_block > last_block)
+ ext4_block_zero_page_range(handle, mapping, offset, length);
+ else {
+ /* zero out the head of the hole before the first block */
+ block_len = first_block_offset - offset;
+ if (block_len > 0)
+ ext4_block_zero_page_range(handle, mapping,
+ offset, block_len);
+
+ /* zero out the tail of the hole after the last block */
+ block_len = offset + length - last_block_offset;
+ if (block_len > 0) {
+ ext4_block_zero_page_range(handle, mapping,
+ last_block_offset, block_len);
+ }
+ }
+
+ /* If there are no blocks to remove, return now */
+ if (first_block >= last_block)
+ goto out;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_ext_invalidate_cache(inode);
+ ext4_discard_preallocations(inode);
+
+ /*
+ * Loop over all the blocks and identify blocks
+ * that need to be punched out
+ */
+ iblock = first_block;
+ blocks_released = 0;
+ while (iblock < last_block) {
+ max_blocks = last_block - iblock;
+ num_blocks = 1;
+ memset(&map, 0, sizeof(map));
+ map.m_lblk = iblock;
+ map.m_len = max_blocks;
+ ret = ext4_ext_map_blocks(handle, inode, &map,
+ EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+
+ if (ret > 0) {
+ blocks_released += ret;
+ num_blocks = ret;
+ } else if (ret == 0) {
+ /*
+ * If map blocks could not find the block,
+ * then it is in a hole. If the hole was
+ * not already cached, then map blocks should
+ * put it in the cache. So we can get the hole
+ * out of the cache
+ */
+ memset(&cache_ex, 0, sizeof(cache_ex));
+ if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
+ !cache_ex.ec_start) {
+
+ /* The hole is cached */
+ num_blocks = cache_ex.ec_block +
+ cache_ex.ec_len - iblock;
+
+ } else {
+ /* The block could not be identified */
+ err = -EIO;
+ break;
+ }
+ } else {
+ /* Map blocks error */
+ err = ret;
+ break;
+ }
+
+ if (num_blocks == 0) {
+ /* This condition should never happen */
+ ext_debug("Block lookup failed");
+ err = -EIO;
+ break;
+ }
+
+ iblock += num_blocks;
+ }
+
+ if (blocks_released > 0) {
+ ext4_ext_invalidate_cache(inode);
+ ext4_discard_preallocations(inode);
+ }
+
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+out:
+ ext4_orphan_del(handle, inode);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+ return err;
+}
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
@@ -4042,4 +4361,3 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return error;
}
-
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7b80d543b89e..2c0972322009 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -272,7 +272,6 @@ const struct file_operations ext4_file_operations = {
};
const struct inode_operations ext4_file_inode_operations = {
- .truncate = ext4_truncate,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
#ifdef CONFIG_EXT4_FS_XATTR
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e9473cbe80df..ce66d2fe826c 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -36,7 +36,7 @@
static void dump_completed_IO(struct inode * inode)
{
-#ifdef EXT4_DEBUG
+#ifdef EXT4FS_DEBUG
struct list_head *cur, *before, *after;
ext4_io_end_t *io, *io0, *io1;
unsigned long flags;
@@ -172,6 +172,7 @@ int ext4_sync_file(struct file *file, int datasync)
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret;
tid_t commit_tid;
+ bool needs_barrier = false;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -211,22 +212,12 @@ int ext4_sync_file(struct file *file, int datasync)
}
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
- if (jbd2_log_start_commit(journal, commit_tid)) {
- /*
- * When the journal is on a different device than the
- * fs data disk, we need to issue the barrier in
- * writeback mode. (In ordered mode, the jbd2 layer
- * will take care of issuing the barrier. In
- * data=journal, all of the data blocks are written to
- * the journal device.)
- */
- if (ext4_should_writeback_data(inode) &&
- (journal->j_fs_dev != journal->j_dev) &&
- (journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
- NULL);
- ret = jbd2_log_wait_commit(journal, commit_tid);
- } else if (journal->j_flags & JBD2_BARRIER)
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = true;
+ jbd2_log_start_commit(journal, commit_tid);
+ ret = jbd2_log_wait_commit(journal, commit_tid);
+ if (needs_barrier)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
out:
trace_ext4_sync_file_exit(inode, ret);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f2fa5e8a582c..50d0e9c64584 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -639,8 +639,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_meta_blocks(handle, inode,
- goal, &count, err);
+ current_block = ext4_new_meta_blocks(handle, inode, goal,
+ 0, &count, err);
if (*err)
goto failed_out;
@@ -1930,7 +1930,7 @@ repeat:
* We do still charge estimated metadata to the sb though;
* we cannot afford to run out of free blocks.
*/
- if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+ if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) {
dquot_release_reservation_block(inode, 1);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
@@ -2796,9 +2796,7 @@ static int write_cache_pages_da(struct address_space *mapping,
continue;
}
- if (PageWriteback(page))
- wait_on_page_writeback(page);
-
+ wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
if (mpd->next_page != page->index)
@@ -3513,7 +3511,7 @@ retry:
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
- vmtruncate(inode, isize);
+ ext4_truncate_failed_write(inode);
}
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -3916,9 +3914,30 @@ void ext4_set_aops(struct inode *inode)
int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
+ struct inode *inode = mapping->host;
+
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
+
+ return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+/*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained within one block. If the specified range exceeds
+ * the end of the block it will be shortened to the end of the block
+ * that corresponds to 'from'
+ */
+int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
+{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, length, pos;
+ unsigned blocksize, max, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3931,7 +3950,15 @@ int ext4_block_truncate_page(handle_t *handle,
return -EINVAL;
blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
+ max = blocksize - (offset & (blocksize - 1));
+
+ /*
+ * correct length if it does not fall between
+ * 'from' and the end of the block
+ */
+ if (length > max || length < 0)
+ length = max;
+
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
if (!page_has_buffers(page))
@@ -4380,8 +4407,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
int ext4_can_truncate(struct inode *inode)
{
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return 0;
if (S_ISREG(inode->i_mode))
return 1;
if (S_ISDIR(inode->i_mode))
@@ -4392,6 +4417,31 @@ int ext4_can_truncate(struct inode *inode)
}
/*
+ * ext4_punch_hole: punches a hole in a file by releasing the blocks
+ * associated with the given offset and length
+ *
+ * @file: The file to punch the hole in
+ * @offset: The byte offset where the hole will begin
+ * @length: The length of the hole in bytes
+ *
+ * Only regular, extent-mapped files are handled. Anything else is
+ * rejected with -EOPNOTSUPP, the same error ext4_fallocate() returns
+ * for modes and inodes it does not support.
+ *
+ * Returns: 0 on success or negative on failure
+ */
+
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ /* TODO: Add support for non extent hole punching */
+ return -EOPNOTSUPP;
+ }
+
+ return ext4_ext_punch_hole(file, offset, length);
+}
+
+/*
* ext4_truncate()
*
* We block out ext4_get_block() block instantiations across the entire
@@ -4617,7 +4667,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
/*
* Figure out the offset within the block group inode table
*/
- inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((inode->i_ino - 1) %
EXT4_INODES_PER_GROUP(sb));
block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
@@ -5311,8 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(inode->i_mode) &&
attr->ia_valid & ATTR_SIZE &&
- (attr->ia_size < inode->i_size ||
- (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
+ (attr->ia_size < inode->i_size)) {
handle_t *handle;
handle = ext4_journal_start(inode, 3);
@@ -5346,14 +5395,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
goto err_out;
}
}
- /* ext4_truncate will clear the flag */
- if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
- ext4_truncate(inode);
}
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode))
- rc = vmtruncate(inode, attr->ia_size);
+ if (attr->ia_valid & ATTR_SIZE) {
+ if (attr->ia_size != i_size_read(inode)) {
+ truncate_setsize(inode, attr->ia_size);
+ ext4_truncate(inode);
+ } else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
+ ext4_truncate(inode);
+ }
if (!rc) {
setattr_copy(inode, attr);
@@ -5811,15 +5861,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out_unlock;
}
ret = 0;
- if (PageMappedToDisk(page))
- goto out_unlock;
+
+ lock_page(page);
+ wait_on_page_writeback(page);
+ if (PageMappedToDisk(page)) {
+ up_read(&inode->i_alloc_sem);
+ return VM_FAULT_LOCKED;
+ }
if (page->index == size >> PAGE_CACHE_SHIFT)
len = size & ~PAGE_CACHE_MASK;
else
len = PAGE_CACHE_SIZE;
- lock_page(page);
/*
* return if we have all the buffers mapped. This avoid
* the need to call write_begin/write_end which does a
@@ -5829,8 +5883,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (page_has_buffers(page)) {
if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
ext4_bh_unmapped)) {
- unlock_page(page);
- goto out_unlock;
+ up_read(&inode->i_alloc_sem);
+ return VM_FAULT_LOCKED;
}
}
unlock_page(page);
@@ -5850,6 +5904,16 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret < 0)
goto out_unlock;
ret = 0;
+
+ /*
+ * write_begin/end might have created a dirty page and someone
+ * could wander in and start the IO. Make sure that hasn't
+ * happened.
+ */
+ lock_page(page);
+ wait_on_page_writeback(page);
+ up_read(&inode->i_alloc_sem);
+ return VM_FAULT_LOCKED;
out_unlock:
if (ret)
ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8a16eecf1d5..859f2ae8864e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -787,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
struct inode *inode;
char *data;
char *bitmap;
+ struct ext4_group_info *grinfo;
mb_debug(1, "init page %lu\n", page->index);
@@ -819,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (first_group + i >= ngroups)
break;
+ grinfo = ext4_get_group_info(sb, first_group + i);
+ /*
+ * If page is uptodate then we came here after online resize
+ * which added some new uninitialized group info structs, so
+ * we must skip all initialized uptodate buddies on the page,
+ * which may be currently in use by an allocating task.
+ */
+ if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+ bh[i] = NULL;
+ continue;
+ }
+
err = -EIO;
desc = ext4_get_group_desc(sb, first_group + i, NULL);
if (desc == NULL)
@@ -871,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
}
/* wait for I/O completion */
- for (i = 0; i < groups_per_page && bh[i]; i++)
- wait_on_buffer(bh[i]);
+ for (i = 0; i < groups_per_page; i++)
+ if (bh[i])
+ wait_on_buffer(bh[i]);
err = -EIO;
- for (i = 0; i < groups_per_page && bh[i]; i++)
- if (!buffer_uptodate(bh[i]))
+ for (i = 0; i < groups_per_page; i++)
+ if (bh[i] && !buffer_uptodate(bh[i]))
goto out;
err = 0;
first_block = page->index * blocks_per_page;
- /* init the page */
- memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
for (i = 0; i < blocks_per_page; i++) {
int group;
- struct ext4_group_info *grinfo;
group = (first_block + i) >> 1;
if (group >= ngroups)
break;
+ if (!bh[group - first_group])
+ /* skip initialized uptodate buddy */
+ continue;
+
/*
* data carry information regarding this
* particular group in the format specified
@@ -919,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
* incore got set to the group block bitmap below
*/
ext4_lock_group(sb, group);
+ /* init the buddy */
+ memset(data, 0xff, blocksize);
ext4_mb_generate_buddy(sb, data, incore, group);
ext4_unlock_group(sb, group);
incore = NULL;
@@ -948,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
out:
if (bh) {
- for (i = 0; i < groups_per_page && bh[i]; i++)
+ for (i = 0; i < groups_per_page; i++)
brelse(bh[i]);
if (bh != &bhs)
kfree(bh);
@@ -957,22 +974,21 @@ out:
}
/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen whild holding the buddy cache
- * lock
+ * Lock the buddy and bitmap pages. This makes sure other parallel init_group
+ * on the same buddy page doesn't happen while holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
*/
-static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
- ext4_group_t group)
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+ ext4_group_t group, struct ext4_buddy *e4b)
{
- int i;
- int block, pnum;
+ struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+ int block, pnum, poff;
int blocks_per_page;
- int groups_per_page;
- ext4_group_t ngroups = ext4_get_groups_count(sb);
- ext4_group_t first_group;
- struct ext4_group_info *grp;
+ struct page *page;
+
+ e4b->bd_buddy_page = NULL;
+ e4b->bd_bitmap_page = NULL;
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
/*
@@ -982,57 +998,40 @@ static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
*/
block = group * 2;
pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
-
- groups_per_page = blocks_per_page >> 1;
- if (groups_per_page == 0)
- groups_per_page = 1;
- /* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (!page)
+ return -EIO;
+ BUG_ON(page->mapping != inode->i_mapping);
+ e4b->bd_bitmap_page = page;
+ e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- if ((first_group + i) >= ngroups)
- break;
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- down_write_nested(&grp->alloc_sem, i);
+ if (blocks_per_page >= 2) {
+ /* buddy and bitmap are on the same page */
+ return 0;
}
- return i;
+
+ block++;
+ pnum = block / blocks_per_page;
+ poff = block % blocks_per_page;
+ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+ if (!page)
+ return -EIO;
+ BUG_ON(page->mapping != inode->i_mapping);
+ e4b->bd_buddy_page = page;
+ return 0;
}
-static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
- ext4_group_t group, int locked_group)
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
{
- int i;
- int block, pnum;
- int blocks_per_page;
- ext4_group_t first_group;
- struct ext4_group_info *grp;
-
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
- /* release locks on all the groups */
- for (i = 0; i < locked_group; i++) {
-
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- up_write(&grp->alloc_sem);
+ if (e4b->bd_bitmap_page) {
+ unlock_page(e4b->bd_bitmap_page);
+ page_cache_release(e4b->bd_bitmap_page);
+ }
+ if (e4b->bd_buddy_page) {
+ unlock_page(e4b->bd_buddy_page);
+ page_cache_release(e4b->bd_buddy_page);
}
-
}
/*
@@ -1044,93 +1043,60 @@ static noinline_for_stack
int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
{
- int ret = 0;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
+ struct ext4_buddy e4b;
+ struct page *page;
+ int ret = 0;
mb_debug(1, "init group %u\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
this_grp = ext4_get_group_info(sb, group);
/*
* This ensures that we don't reinit the buddy cache
* page which map to the group from which we are already
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
- * would have taken the alloc_sem lock.
+ * would have pinned buddy page to page cache.
*/
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+ if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
/*
* somebody initialized the group
* return without doing anything
*/
- ret = 0;
goto err;
}
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
+
+ page = e4b.bd_bitmap_page;
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret)
+ goto err;
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
+ if (e4b.bd_buddy_page == NULL) {
/*
* If both the bitmap and buddy are in
* the same page we don't need to force
* init the buddy
*/
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
+ ret = 0;
+ goto err;
}
- if (page == NULL || !PageUptodate(page)) {
+ /* init buddy cache */
+ page = e4b.bd_buddy_page;
+ ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+ if (ret)
+ goto err;
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
mark_page_accessed(page);
err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
+ ext4_mb_put_buddy_page_lock(&e4b);
return ret;
}
@@ -1164,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
e4b->bd_group = group;
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
- e4b->alloc_semp = &grp->alloc_sem;
-
- /* Take the read lock on the group alloc
- * sem. This would make sure a parallel
- * ext4_mb_init_group happening on other
- * groups mapped by the page is blocked
- * till we are done with allocation
- */
-repeat_load_buddy:
- down_read(e4b->alloc_semp);
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
- /* we need to check for group need init flag
- * with alloc_semp held so that we can be sure
- * that new blocks didn't get added to the group
- * when we are loading the buddy cache
- */
- up_read(e4b->alloc_semp);
/*
* we need full data about the group
* to make a good selection
@@ -1189,7 +1139,6 @@ repeat_load_buddy:
ret = ext4_mb_init_group(sb, group);
if (ret)
return ret;
- goto repeat_load_buddy;
}
/*
@@ -1273,15 +1222,14 @@ repeat_load_buddy:
return 0;
err:
+ if (page)
+ page_cache_release(page);
if (e4b->bd_bitmap_page)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
-
- /* Done with the buddy cache */
- up_read(e4b->alloc_semp);
return ret;
}
@@ -1291,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
- /* Done with the buddy cache */
- if (e4b->alloc_semp)
- up_read(e4b->alloc_semp);
}
@@ -1606,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page;
get_page(ac->ac_buddy_page);
- /* on allocation we use ac to track the held semaphore */
- ac->alloc_semp = e4b->alloc_semp;
- e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
spin_lock(&sbi->s_md_lock);
@@ -2659,7 +2601,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
struct super_block *sb = journal->j_private;
struct ext4_buddy e4b;
struct ext4_group_info *db;
- int err, ret, count = 0, count2 = 0;
+ int err, count = 0, count2 = 0;
struct ext4_free_data *entry;
struct list_head *l, *ltmp;
@@ -2669,15 +2611,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->count, entry->group, entry);
- if (test_opt(sb, DISCARD)) {
- ret = ext4_issue_discard(sb, entry->group,
- entry->start_blk, entry->count);
- if (unlikely(ret == -EOPNOTSUPP)) {
- ext4_warning(sb, "discard not supported, "
- "disabling");
- clear_opt(sb, DISCARD);
- }
- }
+ if (test_opt(sb, DISCARD))
+ ext4_issue_discard(sb, entry->group,
+ entry->start_blk, entry->count);
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
@@ -4226,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
spin_unlock(&pa->pa_lock);
}
}
- if (ac->alloc_semp)
- up_read(ac->alloc_semp);
if (pa) {
/*
* We want to add the pa to the right bucket.
* Remove it from the list and while adding
* make sure the list to which we are adding
- * doesn't grow big. We need to release
- * alloc_semp before calling ext4_mb_add_n_trim()
+ * doesn't grow big.
*/
if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
spin_lock(pa->pa_obj_lock);
@@ -4303,7 +4236,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
* there is enough free blocks to do block allocation
* and verify allocation doesn't exceed the quota limits.
*/
- while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+ while (ar->len &&
+ ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
+
/* let others to free the space */
yield();
ar->len = ar->len >> 1;
@@ -4313,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
return 0;
}
reserv_blks = ar->len;
- while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
- ar->flags |= EXT4_MB_HINT_NOPREALLOC;
- ar->len--;
+ if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
+ dquot_alloc_block_nofail(ar->inode, ar->len);
+ } else {
+ while (ar->len &&
+ dquot_alloc_block(ar->inode, ar->len)) {
+
+ ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+ ar->len--;
+ }
}
inquota = ar->len;
if (ar->len == 0) {
@@ -4704,6 +4645,127 @@ error_return:
}
/**
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
+ * @handle: handle to this transaction
+ * @sb: super block
+ * @block: start physical block to add to the block group
+ * @count: number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count)
+{
+ struct buffer_head *bitmap_bh = NULL;
+ struct buffer_head *gd_bh;
+ ext4_group_t block_group;
+ ext4_grpblk_t bit;
+ unsigned int i;
+ struct ext4_group_desc *desc;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_buddy e4b;
+ int err = 0, ret, blk_free_count;
+ ext4_grpblk_t blocks_freed;
+ struct ext4_group_info *grp;
+
+ ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+ /* Translate the absolute block number into (group, bit offset in group). */
+ ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+ /* NOTE(review): grp is assigned here but not referenced again below. */
+ grp = ext4_get_group_info(sb, block_group);
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ */
+ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
+ goto error_return;
+
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+ if (!bitmap_bh)
+ goto error_return;
+ desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+ if (!desc)
+ goto error_return;
+
+ /*
+ * Refuse ranges that overlap the group's block bitmap, inode bitmap
+ * or inode table.
+ */
+ if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+ in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+ in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+ in_range(block + count - 1, ext4_inode_table(sb, desc),
+ sbi->s_itb_per_group)) {
+ ext4_error(sb, "Adding blocks in system zones - "
+ "Block = %llu, count = %lu",
+ block, count);
+ goto error_return;
+ }
+
+ BUFFER_TRACE(bitmap_bh, "getting write access");
+ err = ext4_journal_get_write_access(handle, bitmap_bh);
+ if (err)
+ goto error_return;
+
+ /*
+ * We are about to modify some metadata. Call the journal APIs
+ * to unshare ->b_data if a currently-committing transaction is
+ * using it
+ */
+ BUFFER_TRACE(gd_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, gd_bh);
+ if (err)
+ goto error_return;
+
+ /*
+ * Count how many bits in [bit, bit + count) are currently set. Bits
+ * that are already clear are reported through ext4_error() but do not
+ * abort the operation; nothing is modified in this loop.
+ */
+ for (i = 0, blocks_freed = 0; i < count; i++) {
+ BUFFER_TRACE(bitmap_bh, "clear bit");
+ if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+ ext4_error(sb, "bit already cleared for block %llu",
+ (ext4_fsblk_t)(block + i));
+ BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ } else {
+ blocks_freed++;
+ }
+ }
+
+ err = ext4_mb_load_buddy(sb, block_group, &e4b);
+ if (err)
+ goto error_return;
+
+ /*
+ * need to update group_info->bb_free and bitmap
+ * with group lock held. generate_buddy look at
+ * them with group lock_held
+ */
+ ext4_lock_group(sb, block_group);
+ /*
+ * NOTE(review): the bitmap and the buddy are updated for the full
+ * `count`, while the descriptor and the counters below are adjusted
+ * by `blocks_freed`; the two differ when some bits were already clear.
+ */
+ mb_clear_bits(bitmap_bh->b_data, bit, count);
+ mb_free_blocks(NULL, &e4b, bit, count);
+ blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+ ext4_free_blks_set(sb, desc, blk_free_count);
+ desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+ ext4_unlock_group(sb, block_group);
+ percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+
+ /* Keep the per-flex-group free block count in step, if flex groups are in use. */
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ atomic_add(blocks_freed,
+ &sbi->s_flex_groups[flex_group].free_blocks);
+ }
+
+ ext4_mb_unload_buddy(&e4b);
+
+ /* We dirtied the bitmap block */
+ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+ /* And the group descriptor block */
+ BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+ ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+ /* Report the first of the two dirtying errors, if any. */
+ if (!err)
+ err = ret;
+
+ /*
+ * NOTE(review): the early exits that jump here without setting err
+ * (group-boundary check, bitmap/descriptor lookup failure, system-zone
+ * overlap) reach ext4_std_error() with err == 0. bitmap_bh may still
+ * be NULL at this point.
+ */
+error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, err);
+ return;
+}
+
+/**
* ext4_trim_extent -- function to TRIM one single free extent in the group
* @sb: super block for the file system
* @start: starting block of the free extent in the alloc. group
@@ -4715,11 +4777,10 @@ error_return:
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+ ext4_group_t group, struct ext4_buddy *e4b)
{
struct ext4_free_extent ex;
- int ret = 0;
assert_spin_locked(ext4_group_lock_ptr(sb, group));
@@ -4733,12 +4794,9 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
-
- ret = ext4_issue_discard(sb, group, start, count);
-
+ ext4_issue_discard(sb, group, start, count);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
- return ret;
}
/**
@@ -4760,21 +4818,26 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
* the group buddy bitmap. This is done until whole group is scanned.
*/
static ext4_grpblk_t
-ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
- ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+ ext4_grpblk_t start, ext4_grpblk_t max,
+ ext4_grpblk_t minblocks)
{
void *bitmap;
ext4_grpblk_t next, count = 0;
- ext4_group_t group;
- int ret = 0;
+ struct ext4_buddy e4b;
+ int ret;
- BUG_ON(e4b == NULL);
+ ret = ext4_mb_load_buddy(sb, group, &e4b);
+ if (ret) {
+ ext4_error(sb, "Error in loading buddy "
+ "information for %u", group);
+ return ret;
+ }
+ bitmap = e4b.bd_bitmap;
- bitmap = e4b->bd_bitmap;
- group = e4b->bd_group;
- start = (e4b->bd_info->bb_first_free > start) ?
- e4b->bd_info->bb_first_free : start;
ext4_lock_group(sb, group);
+ start = (e4b.bd_info->bb_first_free > start) ?
+ e4b.bd_info->bb_first_free : start;
while (start < max) {
start = mb_find_next_zero_bit(bitmap, max, start);
@@ -4783,10 +4846,8 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
next = mb_find_next_bit(bitmap, max, start);
if ((next - start) >= minblocks) {
- ret = ext4_trim_extent(sb, start,
- next - start, group, e4b);
- if (ret < 0)
- break;
+ ext4_trim_extent(sb, start,
+ next - start, group, &e4b);
count += next - start;
}
start = next + 1;
@@ -4802,17 +4863,15 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
ext4_lock_group(sb, group);
}
- if ((e4b->bd_info->bb_free - count) < minblocks)
+ if ((e4b.bd_info->bb_free - count) < minblocks)
break;
}
ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
ext4_debug("trimmed %d blocks in the group %d\n",
count, group);
- if (ret < 0)
- count = ret;
-
return count;
}
@@ -4830,11 +4889,11 @@ ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
*/
int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
- struct ext4_buddy e4b;
+ struct ext4_group_info *grp;
ext4_group_t first_group, last_group;
ext4_group_t group, ngroups = ext4_get_groups_count(sb);
ext4_grpblk_t cnt = 0, first_block, last_block;
- uint64_t start, len, minlen, trimmed;
+ uint64_t start, len, minlen, trimmed = 0;
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
int ret = 0;
@@ -4842,7 +4901,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
start = range->start >> sb->s_blocksize_bits;
len = range->len >> sb->s_blocksize_bits;
minlen = range->minlen >> sb->s_blocksize_bits;
- trimmed = 0;
if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
return -EINVAL;
@@ -4863,11 +4921,12 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
return -EINVAL;
for (group = first_group; group <= last_group; group++) {
- ret = ext4_mb_load_buddy(sb, group, &e4b);
- if (ret) {
- ext4_error(sb, "Error in loading buddy "
- "information for %u", group);
- break;
+ grp = ext4_get_group_info(sb, group);
+ /* We only do this if the grp has never been initialized */
+ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+ ret = ext4_mb_init_group(sb, group);
+ if (ret)
+ break;
}
/*
@@ -4880,16 +4939,14 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
last_block = first_block + len;
len -= last_block - first_block;
- if (e4b.bd_info->bb_free >= minlen) {
- cnt = ext4_trim_all_free(sb, &e4b, first_block,
+ if (grp->bb_free >= minlen) {
+ cnt = ext4_trim_all_free(sb, group, first_block,
last_block, minlen);
if (cnt < 0) {
ret = cnt;
- ext4_mb_unload_buddy(&e4b);
break;
}
}
- ext4_mb_unload_buddy(&e4b);
trimmed += cnt;
first_block = 0;
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 22bd4d7f289b..20b5e7bfebd1 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -193,11 +193,6 @@ struct ext4_allocation_context {
__u8 ac_op; /* operation, for history only */
struct page *ac_bitmap_page;
struct page *ac_buddy_page;
- /*
- * pointer to the held semaphore upon successful
- * block allocation
- */
- struct rw_semaphore *alloc_semp;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
};
@@ -215,7 +210,6 @@ struct ext4_buddy {
struct super_block *bd_sb;
__u16 bd_blkbits;
ext4_group_t bd_group;
- struct rw_semaphore *alloc_semp;
};
#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 92816b4e0f16..b57b98fb44d1 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
* We have the extent map build with the tmp inode.
* Now copy the i_data across
*/
- ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data));
/*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
new file mode 100644
index 000000000000..9bdef3f537c5
--- /dev/null
+++ b/fs/ext4/mmp.c
@@ -0,0 +1,351 @@
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/buffer_head.h>
+#include <linux/utsname.h>
+#include <linux/kthread.h>
+
+#include "ext4.h"
+
+/*
+ * Synchronously push the MMP buffer to disk.  WRITE_SYNC is used to try to
+ * get the block on-disk faster.
+ *
+ * Returns 0 if the buffer is still uptodate once the I/O has completed,
+ * 1 otherwise.
+ */
+static int write_mmp_block(struct buffer_head *bh)
+{
+	mark_buffer_dirty(bh);
+	lock_buffer(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	/* extra reference for the in-flight I/O */
+	get_bh(bh);
+	submit_bh(WRITE_SYNC, bh);
+	wait_on_buffer(bh);
+
+	return unlikely(!buffer_uptodate(bh)) ? 1 : 0;
+}
+
+/*
+ * Fetch the MMP block from disk.
+ * @sb:        super block of the filesystem
+ * @bh:        in/out buffer head.  If *bh is NULL a buffer is obtained with
+ *             sb_getblk(); otherwise the given buffer is reused after its
+ *             uptodate flag has been cleared, so that the block really is
+ *             read from disk again.
+ * @mmp_block: block number of the MMP block
+ *
+ * Returns 0 on success, -EIO if the block could not be read (*bh is NULL
+ * in that case) and -EINVAL if the block does not carry EXT4_MMP_MAGIC
+ * (*bh is left set in that case).
+ */
+static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
+			  ext4_fsblk_t mmp_block)
+{
+	struct buffer_head *buf = *bh;
+	struct mmp_struct *mmp;
+
+	/*
+	 * This would be sb_bread(sb, mmp_block), except we need to be sure
+	 * that the MD RAID device cache has been bypassed, and that the read
+	 * is not blocked in the elevator.
+	 */
+	if (buf)
+		clear_buffer_uptodate(buf);
+	else
+		buf = sb_getblk(sb, mmp_block);
+
+	if (buf) {
+		get_bh(buf);
+		lock_buffer(buf);
+		buf->b_end_io = end_buffer_read_sync;
+		submit_bh(READ_SYNC, buf);
+		wait_on_buffer(buf);
+		if (!buffer_uptodate(buf)) {
+			/* read failed: give the buffer up entirely */
+			brelse(buf);
+			buf = NULL;
+		}
+	}
+
+	*bh = buf;
+	if (!buf) {
+		ext4_warning(sb, "Error while reading MMP block %llu",
+			     mmp_block);
+		return -EIO;
+	}
+
+	mmp = (struct mmp_struct *)(buf->b_data);
+	if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Dump as much information as possible to help the admin.
+ * @sb:       super block the warning is reported against
+ * @mmp:      MMP block contents whose time, node name and device name are
+ *            printed
+ * @function: name of the reporting function
+ * @line:     line number of the report
+ * @msg:      plain message text printed before the MMP details
+ *
+ * @msg is printed through an explicit "%s" so that it is never interpreted
+ * as a format string.
+ */
+void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
+		    const char *function, unsigned int line, const char *msg)
+{
+	__ext4_warning(sb, function, line, "%s", msg);
+	__ext4_warning(sb, function, line,
+		       "MMP failure info: last update time: %llu, last update "
+		       "node: %s, last update device: %s\n",
+		       (long long unsigned int) le64_to_cpu(mmp->mmp_time),
+		       mmp->mmp_nodename, mmp->mmp_bdevname);
+}
+
+/*
+ * kmmpd will update the MMP sequence every s_mmp_update_interval seconds
+ * @data: kmalloc'ed struct mmpd_data holding the super block and the MMP
+ *        buffer head.  Both the structure and the buffer reference are
+ *        released by this thread before it returns.
+ *
+ * Each iteration bumps the sequence number and timestamp in the MMP block
+ * and writes it out.  If more than mmp_check_interval seconds passed since
+ * the write, the block is read back and compared with what was written; a
+ * mismatch means somebody else is using the filesystem.
+ *
+ * The thread stops when asked to via kthread_stop(), when the MMP feature
+ * is switched off, when the filesystem is remounted read-only, or on an MMP
+ * read error / multi-mount detection.  On those self-initiated exits
+ * s_mmp_tsk is cleared.
+ *
+ * Returns the status of the last MMP block read or write performed.
+ */
+static int kmmpd(void *data)
+{
+	struct super_block *sb = ((struct mmpd_data *) data)->sb;
+	struct buffer_head *bh = ((struct mmpd_data *) data)->bh;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct mmp_struct *mmp;
+	ext4_fsblk_t mmp_block;
+	u32 seq = 0;
+	unsigned long failed_writes = 0;
+	int mmp_update_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned mmp_check_interval;
+	unsigned long last_update_time;
+	unsigned long diff;
+	int retval;
+
+	mmp_block = le64_to_cpu(es->s_mmp_block);
+	mmp = (struct mmp_struct *)(bh->b_data);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+	/*
+	 * Start with the higher mmp_check_interval and reduce it if
+	 * the MMP block is being updated on time.
+	 */
+	mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
+				 EXT4_MMP_MIN_CHECK_INTERVAL);
+	mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	bdevname(bh->b_bdev, mmp->mmp_bdevname);
+
+	/*
+	 * Record this host's node name.  The field is compared below to
+	 * detect another node writing the block, so it has to differ between
+	 * hosts; sysname would be the same on every Linux node.
+	 */
+	memcpy(mmp->mmp_nodename, init_utsname()->nodename,
+	       sizeof(mmp->mmp_nodename));
+
+	while (!kthread_should_stop()) {
+		if (++seq > EXT4_MMP_SEQ_MAX)
+			seq = 1;
+
+		mmp->mmp_seq = cpu_to_le32(seq);
+		mmp->mmp_time = cpu_to_le64(get_seconds());
+		last_update_time = jiffies;
+
+		retval = write_mmp_block(bh);
+		/*
+		 * Don't spew too many error messages. Print one every
+		 * (s_mmp_update_interval * 60) seconds.  Every failed write
+		 * is counted, the message is emitted for each 60th one.
+		 */
+		if (retval) {
+			if ((failed_writes % 60) == 0)
+				ext4_error(sb, "Error writing to MMP block");
+			failed_writes++;
+		}
+
+		if (!(le32_to_cpu(es->s_feature_incompat) &
+		    EXT4_FEATURE_INCOMPAT_MMP)) {
+			ext4_warning(sb, "kmmpd being stopped since MMP feature"
+				     " has been disabled.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		if (sb->s_flags & MS_RDONLY) {
+			ext4_warning(sb, "kmmpd being stopped since filesystem "
+				     "has been remounted as readonly.");
+			EXT4_SB(sb)->s_mmp_tsk = NULL;
+			goto failed;
+		}
+
+		/* sleep for whatever is left of the update interval */
+		diff = jiffies - last_update_time;
+		if (diff < mmp_update_interval * HZ)
+			schedule_timeout_interruptible(mmp_update_interval *
+						       HZ - diff);
+
+		/*
+		 * We need to make sure that more than mmp_check_interval
+		 * seconds have not passed since writing. If that has happened
+		 * we need to check if the MMP block is as we left it.
+		 */
+		diff = jiffies - last_update_time;
+		if (diff > mmp_check_interval * HZ) {
+			struct buffer_head *bh_check = NULL;
+			struct mmp_struct *mmp_check;
+
+			retval = read_mmp_block(sb, &bh_check, mmp_block);
+			if (retval) {
+				ext4_error(sb, "error reading MMP data: %d",
+					   retval);
+
+				/*
+				 * read_mmp_block() leaves the buffer set when
+				 * only the magic check failed; brelse() is
+				 * called with a possibly-NULL pointer here,
+				 * as ext4_multi_mount_protect() also does.
+				 */
+				brelse(bh_check);
+				EXT4_SB(sb)->s_mmp_tsk = NULL;
+				goto failed;
+			}
+
+			mmp_check = (struct mmp_struct *)(bh_check->b_data);
+			if (mmp->mmp_seq != mmp_check->mmp_seq ||
+			    memcmp(mmp->mmp_nodename, mmp_check->mmp_nodename,
+				   sizeof(mmp->mmp_nodename))) {
+				dump_mmp_msg(sb, mmp_check,
+					     "Error while updating MMP info. "
+					     "The filesystem seems to have been"
+					     " multiply mounted.");
+				ext4_error(sb, "abort");
+				/* don't leak the check buffer on this exit */
+				put_bh(bh_check);
+				/* as on the other self-initiated exits */
+				EXT4_SB(sb)->s_mmp_tsk = NULL;
+				goto failed;
+			}
+			put_bh(bh_check);
+		}
+
+		/*
+		 * Adjust the mmp_check_interval depending on how much time
+		 * it took for the MMP block to be written.
+		 */
+		mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ,
+					     EXT4_MMP_MAX_CHECK_INTERVAL),
+					 EXT4_MMP_MIN_CHECK_INTERVAL);
+		mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+	}
+
+	/*
+	 * Unmount seems to be clean.
+	 */
+	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
+	mmp->mmp_time = cpu_to_le64(get_seconds());
+
+	retval = write_mmp_block(bh);
+
+failed:
+	kfree(data);
+	brelse(bh);
+	return retval;
+}
+
+/*
+ * Pick a random sequence number for the MMP block.  Random 32-bit values
+ * are drawn until one is found that does not exceed EXT4_MMP_SEQ_MAX.
+ */
+static unsigned int mmp_new_seq(void)
+{
+	u32 candidate;
+
+	for (;;) {
+		get_random_bytes(&candidate, sizeof(candidate));
+		if (candidate <= EXT4_MMP_SEQ_MAX)
+			break;
+	}
+
+	return candidate;
+}
+
+/*
+ * Protect the filesystem from being mounted more than once.
+ * @sb:        super block being mounted
+ * @mmp_block: block number of the MMP block, taken from the superblock
+ *
+ * Reads the MMP block and, unless it is marked clean, waits and re-reads it
+ * to see whether another node is still updating the sequence number.  Then
+ * a fresh random sequence number is written, the wait/re-read is repeated,
+ * and finally the kmmpd thread is started to keep the block updated.
+ *
+ * On success the MMP buffer head and the mmpd_data allocation are handed
+ * over to kmmpd, which releases them when it exits.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+int ext4_multi_mount_protect(struct super_block *sb,
+			     ext4_fsblk_t mmp_block)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct buffer_head *bh = NULL;
+	struct mmp_struct *mmp = NULL;
+	struct mmpd_data *mmpd_data;
+	u32 seq;
+	unsigned int mmp_check_interval = le16_to_cpu(es->s_mmp_update_interval);
+	unsigned int wait_time = 0;
+	int retval;
+
+	if (mmp_block < le32_to_cpu(es->s_first_data_block) ||
+	    mmp_block >= ext4_blocks_count(es)) {
+		ext4_warning(sb, "Invalid MMP block in superblock");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+
+	mmp = (struct mmp_struct *)(bh->b_data);
+
+	if (mmp_check_interval < EXT4_MMP_MIN_CHECK_INTERVAL)
+		mmp_check_interval = EXT4_MMP_MIN_CHECK_INTERVAL;
+
+	/*
+	 * If check_interval in MMP block is larger, use that instead of
+	 * update_interval from the superblock.  The on-disk field is
+	 * little-endian (kmmpd stores it with cpu_to_le16()), so convert
+	 * before comparing or using it.
+	 */
+	if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
+		mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
+
+	seq = le32_to_cpu(mmp->mmp_seq);
+	if (seq == EXT4_MMP_SEQ_CLEAN)
+		goto skip;
+
+	if (seq == EXT4_MMP_SEQ_FSCK) {
+		dump_mmp_msg(sb, mmp, "fsck is running on the filesystem");
+		goto failed;
+	}
+
+	wait_time = min(mmp_check_interval * 2 + 1,
+			mmp_check_interval + 60);
+
+	/* Print MMP interval if more than 20 secs. */
+	if (wait_time > EXT4_MMP_MIN_CHECK_INTERVAL * 4)
+		ext4_warning(sb, "MMP interval %u higher than expected, please"
+			     " wait.\n", wait_time * 2);
+
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	/* Has anybody changed the sequence number while we were waiting? */
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+skip:
+	/*
+	 * write a new random sequence number.  Keep the CPU-endian value in
+	 * seq: it is compared against le32_to_cpu(mmp->mmp_seq) below.
+	 */
+	seq = mmp_new_seq();
+	mmp->mmp_seq = cpu_to_le32(seq);
+
+	retval = write_mmp_block(bh);
+	if (retval)
+		goto failed;
+
+	/*
+	 * wait for MMP interval and check mmp_seq.
+	 */
+	if (schedule_timeout_interruptible(HZ * wait_time) != 0) {
+		ext4_warning(sb, "MMP startup interrupted, failing mount\n");
+		goto failed;
+	}
+
+	retval = read_mmp_block(sb, &bh, mmp_block);
+	if (retval)
+		goto failed;
+	mmp = (struct mmp_struct *)(bh->b_data);
+	if (seq != le32_to_cpu(mmp->mmp_seq)) {
+		dump_mmp_msg(sb, mmp,
+			     "Device is already active on another node.");
+		goto failed;
+	}
+
+	mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL);
+	if (!mmpd_data) {
+		ext4_warning(sb, "not enough memory for mmpd_data");
+		goto failed;
+	}
+	mmpd_data->sb = sb;
+	mmpd_data->bh = bh;
+
+	/*
+	 * Start a kernel thread to update the MMP block periodically.
+	 */
+	EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+					     bdevname(bh->b_bdev,
+						      mmp->mmp_bdevname));
+	if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
+		EXT4_SB(sb)->s_mmp_tsk = NULL;
+		kfree(mmpd_data);
+		ext4_warning(sb, "Unable to create kmmpd thread for %s.",
+			     sb->s_id);
+		goto failed;
+	}
+
+	return 0;
+
+failed:
+	/* bh may still be NULL here (e.g. invalid MMP block number) */
+	brelse(bh);
+	return 1;
+}
+
+
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b9f3e7862f13..2b8304bf3c50 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -876,8 +876,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* It needs to call wait_on_page_writeback() to wait for the
* writeback of the page.
*/
- if (PageWriteback(page))
- wait_on_page_writeback(page);
+ wait_on_page_writeback(page);
/* Release old bh and drop refs */
try_to_release_page(page, 0);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 67fd0b025858..b754b7721f51 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1413,10 +1413,22 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
frame->at = entries;
frame->bh = bh;
bh = bh2;
+
+ ext4_handle_dirty_metadata(handle, dir, frame->bh);
+ ext4_handle_dirty_metadata(handle, dir, bh);
+
de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
- dx_release (frames);
- if (!(de))
+ if (!de) {
+ /*
+ * Even if the block split failed, we have to properly write
+ * out all the changes we did so far. Otherwise we can end up
+ * with corrupted filesystem.
+ */
+ ext4_mark_inode_dirty(handle, dir);
+ dx_release(frames);
return retval;
+ }
+ dx_release(frames);
retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
brelse(bh);
@@ -2240,6 +2252,7 @@ static int ext4_symlink(struct inode *dir,
handle_t *handle;
struct inode *inode;
int l, err, retries = 0;
+ int credits;
l = strlen(symname)+1;
if (l > dir->i_sb->s_blocksize)
@@ -2247,10 +2260,26 @@ static int ext4_symlink(struct inode *dir,
dquot_initialize(dir);
+ if (l > EXT4_N_BLOCKS * 4) {
+ /*
+ * For non-fast symlinks, we just allocate inode and put it on
+ * orphan list in the first transaction => we need bitmap,
+ * group descriptor, sb, inode block, quota blocks.
+ */
+ credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ } else {
+ /*
+ * Fast symlink. We have to add entry to directory
+ * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS),
+ * allocate new inode (bitmap, group descriptor, inode block,
+ * quota blocks, sb is already counted in previous macros).
+ */
+ credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
+ }
retry:
- handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
- EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
- EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ handle = ext4_journal_start(dir, credits);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -2263,21 +2292,44 @@ retry:
if (IS_ERR(inode))
goto out_stop;
- if (l > sizeof(EXT4_I(inode)->i_data)) {
+ if (l > EXT4_N_BLOCKS * 4) {
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
/*
- * page_symlink() calls into ext4_prepare/commit_write.
- * We have a transaction open. All is sweetness. It also sets
- * i_size in generic_commit_write().
+ * We cannot call page_symlink() with transaction started
+ * because it calls into ext4_write_begin() which can wait
+ * for transaction commit if we are running out of space
+ * and thus we deadlock. So we have to stop transaction now
+ * and restart it when symlink contents is written.
+ *
+ * To keep fs consistent in case of crash, we have to put inode
+ * to orphan list in the mean time.
*/
+ drop_nlink(inode);
+ err = ext4_orphan_add(handle, inode);
+ ext4_journal_stop(handle);
+ if (err)
+ goto err_drop_inode;
err = __page_symlink(inode, symname, l, 1);
+ if (err)
+ goto err_drop_inode;
+ /*
+ * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS
+ * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified
+ */
+ handle = ext4_journal_start(dir,
+ EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto err_drop_inode;
+ }
+ inc_nlink(inode);
+ err = ext4_orphan_del(handle, inode);
if (err) {
+ ext4_journal_stop(handle);
clear_nlink(inode);
- unlock_new_inode(inode);
- ext4_mark_inode_dirty(handle, inode);
- iput(inode);
- goto out_stop;
+ goto err_drop_inode;
}
} else {
/* clear the extent format for fast symlink */
@@ -2293,6 +2345,10 @@ out_stop:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
+err_drop_inode:
+ unlock_new_inode(inode);
+ iput(inode);
+ return err;
}
static int ext4_link(struct dentry *old_dentry,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b6dbd056fcb1..7bb8f76d470a 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -203,46 +203,29 @@ static void ext4_end_bio(struct bio *bio, int error)
for (i = 0; i < io_end->num_io_pages; i++) {
struct page *page = io_end->pages[i]->p_page;
struct buffer_head *bh, *head;
- int partial_write = 0;
+ loff_t offset;
+ loff_t io_end_offset;
- head = page_buffers(page);
- if (error)
+ if (error) {
SetPageError(page);
- BUG_ON(!head);
- if (head->b_size != PAGE_CACHE_SIZE) {
- loff_t offset;
- loff_t io_end_offset = io_end->offset + io_end->size;
+ set_bit(AS_EIO, &page->mapping->flags);
+ head = page_buffers(page);
+ BUG_ON(!head);
+
+ io_end_offset = io_end->offset + io_end->size;
offset = (sector_t) page->index << PAGE_CACHE_SHIFT;
bh = head;
do {
if ((offset >= io_end->offset) &&
- (offset+bh->b_size <= io_end_offset)) {
- if (error)
- buffer_io_error(bh);
-
- }
- if (buffer_delay(bh))
- partial_write = 1;
- else if (!buffer_mapped(bh))
- clear_buffer_dirty(bh);
- else if (buffer_dirty(bh))
- partial_write = 1;
+ (offset+bh->b_size <= io_end_offset))
+ buffer_io_error(bh);
+
offset += bh->b_size;
bh = bh->b_this_page;
} while (bh != head);
}
- /*
- * If this is a partial write which happened to make
- * all buffers uptodate then we can optimize away a
- * bogus readpage() for the next read(). Here we
- * 'discover' whether the page went uptodate as a
- * result of this (potentially partial) write.
- */
- if (!partial_write)
- SetPageUptodate(page);
-
put_io_page(io_end->pages[i]);
}
io_end->num_io_pages = 0;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 51476477c818..cc5c157aa11d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -76,11 +76,27 @@ static void ext4_write_super(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data);
+static inline int ext2_feature_set_ok(struct super_block *sb);
+static inline int ext3_feature_set_ok(struct super_block *sb);
static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
+#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
+static struct file_system_type ext2_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext2",
+ .mount = ext4_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#else
+#define IS_EXT2_SB(sb) (0)
+#endif
+
+
#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext3_fs_type = {
.owner = THIS_MODULE,
@@ -807,6 +823,8 @@ static void ext4_put_super(struct super_block *sb)
invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi);
}
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
sb->s_fs_info = NULL;
/*
* Now that we are completely done shutting down the
@@ -1097,7 +1115,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (!test_opt(sb, INIT_INODE_TABLE))
seq_puts(seq, ",noinit_inode_table");
- else if (sbi->s_li_wait_mult)
+ else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
seq_printf(seq, ",init_inode_table=%u",
(unsigned) sbi->s_li_wait_mult);
@@ -1188,9 +1206,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off);
static const struct dquot_operations ext4_quota_operations = {
-#ifdef CONFIG_QUOTA
.get_reserved_space = ext4_get_reserved_space,
-#endif
.write_dquot = ext4_write_dquot,
.acquire_dquot = ext4_acquire_dquot,
.release_dquot = ext4_release_dquot,
@@ -1901,7 +1917,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
ext4_msg(sb, KERN_WARNING,
"warning: mounting fs with errors, "
"running e2fsck is recommended");
- else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+ else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
le16_to_cpu(es->s_mnt_count) >=
(unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
ext4_msg(sb, KERN_WARNING,
@@ -2427,6 +2443,18 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
EXT4_SB(sb)->s_sectors_written_start) >> 1)));
}
+static ssize_t extent_cache_hits_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
+}
+
+static ssize_t extent_cache_misses_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
+}
+
static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
@@ -2484,6 +2512,8 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RO_ATTR(extent_cache_hits);
+EXT4_RO_ATTR(extent_cache_misses);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2499,6 +2529,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(extent_cache_hits),
+ ATTR_LIST(extent_cache_misses),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
@@ -2661,12 +2693,6 @@ static void print_daily_error_info(unsigned long arg)
mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */
}
-static void ext4_lazyinode_timeout(unsigned long data)
-{
- struct task_struct *p = (struct task_struct *)data;
- wake_up_process(p);
-}
-
/* Find next suitable group and run ext4_init_inode_table */
static int ext4_run_li_request(struct ext4_li_request *elr)
{
@@ -2698,11 +2724,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
if (elr->lr_timeout == 0) {
- timeout = jiffies - timeout;
- if (elr->lr_sbi->s_li_wait_mult)
- timeout *= elr->lr_sbi->s_li_wait_mult;
- else
- timeout *= 20;
+ timeout = (jiffies - timeout) *
+ elr->lr_sbi->s_li_wait_mult;
elr->lr_timeout = timeout;
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -2714,7 +2737,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
/*
* Remove lr_request from the list_request and free the
- * request tructure. Should be called with li_list_mtx held
+ * request structure. Should be called with li_list_mtx held
*/
static void ext4_remove_li_request(struct ext4_li_request *elr)
{
@@ -2732,14 +2755,16 @@ static void ext4_remove_li_request(struct ext4_li_request *elr)
static void ext4_unregister_li_request(struct super_block *sb)
{
- struct ext4_li_request *elr = EXT4_SB(sb)->s_li_request;
-
- if (!ext4_li_info)
+ mutex_lock(&ext4_li_mtx);
+ if (!ext4_li_info) {
+ mutex_unlock(&ext4_li_mtx);
return;
+ }
mutex_lock(&ext4_li_info->li_list_mtx);
- ext4_remove_li_request(elr);
+ ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
mutex_unlock(&ext4_li_info->li_list_mtx);
+ mutex_unlock(&ext4_li_mtx);
}
static struct task_struct *ext4_lazyinit_task;
@@ -2758,17 +2783,10 @@ static int ext4_lazyinit_thread(void *arg)
struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
struct list_head *pos, *n;
struct ext4_li_request *elr;
- unsigned long next_wakeup;
- DEFINE_WAIT(wait);
+ unsigned long next_wakeup, cur;
BUG_ON(NULL == eli);
- eli->li_timer.data = (unsigned long)current;
- eli->li_timer.function = ext4_lazyinode_timeout;
-
- eli->li_task = current;
- wake_up(&eli->li_wait_task);
-
cont_thread:
while (true) {
next_wakeup = MAX_JIFFY_OFFSET;
@@ -2799,19 +2817,15 @@ cont_thread:
if (freezing(current))
refrigerator();
- if ((time_after_eq(jiffies, next_wakeup)) ||
+ cur = jiffies;
+ if ((time_after_eq(cur, next_wakeup)) ||
(MAX_JIFFY_OFFSET == next_wakeup)) {
cond_resched();
continue;
}
- eli->li_timer.expires = next_wakeup;
- add_timer(&eli->li_timer);
- prepare_to_wait(&eli->li_wait_daemon, &wait,
- TASK_INTERRUPTIBLE);
- if (time_before(jiffies, next_wakeup))
- schedule();
- finish_wait(&eli->li_wait_daemon, &wait);
+ schedule_timeout_interruptible(next_wakeup - cur);
+
if (kthread_should_stop()) {
ext4_clear_request_list();
goto exit_thread;
@@ -2835,12 +2849,7 @@ exit_thread:
goto cont_thread;
}
mutex_unlock(&eli->li_list_mtx);
- del_timer_sync(&ext4_li_info->li_timer);
- eli->li_task = NULL;
- wake_up(&eli->li_wait_task);
-
kfree(ext4_li_info);
- ext4_lazyinit_task = NULL;
ext4_li_info = NULL;
mutex_unlock(&ext4_li_mtx);
@@ -2868,7 +2877,6 @@ static int ext4_run_lazyinit_thread(void)
if (IS_ERR(ext4_lazyinit_task)) {
int err = PTR_ERR(ext4_lazyinit_task);
ext4_clear_request_list();
- del_timer_sync(&ext4_li_info->li_timer);
kfree(ext4_li_info);
ext4_li_info = NULL;
printk(KERN_CRIT "EXT4: error %d creating inode table "
@@ -2877,8 +2885,6 @@ static int ext4_run_lazyinit_thread(void)
return err;
}
ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
-
- wait_event(ext4_li_info->li_wait_task, ext4_li_info->li_task != NULL);
return 0;
}
@@ -2913,13 +2919,9 @@ static int ext4_li_info_new(void)
if (!eli)
return -ENOMEM;
- eli->li_task = NULL;
INIT_LIST_HEAD(&eli->li_request_list);
mutex_init(&eli->li_list_mtx);
- init_waitqueue_head(&eli->li_wait_daemon);
- init_waitqueue_head(&eli->li_wait_task);
- init_timer(&eli->li_timer);
eli->li_state |= EXT4_LAZYINIT_QUIT;
ext4_li_info = eli;
@@ -2962,20 +2964,19 @@ static int ext4_register_li_request(struct super_block *sb,
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
int ret = 0;
- if (sbi->s_li_request != NULL)
+ if (sbi->s_li_request != NULL) {
+ /*
+ * Reset timeout so it can be computed again, because
+ * s_li_wait_mult might have changed.
+ */
+ sbi->s_li_request->lr_timeout = 0;
return 0;
+ }
if (first_not_zeroed == ngroups ||
(sb->s_flags & MS_RDONLY) ||
- !test_opt(sb, INIT_INODE_TABLE)) {
- sbi->s_li_request = NULL;
+ !test_opt(sb, INIT_INODE_TABLE))
return 0;
- }
-
- if (first_not_zeroed == ngroups) {
- sbi->s_li_request = NULL;
- return 0;
- }
elr = ext4_li_request_new(sb, first_not_zeroed);
if (!elr)
@@ -3168,6 +3169,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
+ /*
+ * set default s_li_wait_mult for lazyinit, for the case there is
+ * no mount option specified.
+ */
+ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+
if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
&journal_devnum, &journal_ioprio, NULL, 0)) {
ext4_msg(sb, KERN_WARNING,
@@ -3189,6 +3196,28 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"feature flags set on rev 0 fs, "
"running e2fsck is recommended");
+ if (IS_EXT2_SB(sb)) {
+ if (ext2_feature_set_ok(sb))
+ ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
+ "using the ext4 subsystem");
+ else {
+ ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
+ "to feature incompatibilities");
+ goto failed_mount;
+ }
+ }
+
+ if (IS_EXT3_SB(sb)) {
+ if (ext3_feature_set_ok(sb))
+ ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
+ "using the ext4 subsystem");
+ else {
+ ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
+ "to feature incompatibilities");
+ goto failed_mount;
+ }
+ }
+
/*
* Check feature flags regardless of the revision level, since we
* previously didn't change the revision level when setting the flags,
@@ -3461,6 +3490,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_RECOVER));
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
+ !(sb->s_flags & MS_RDONLY))
+ if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
+ goto failed_mount3;
+
/*
* The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal!
@@ -3476,7 +3510,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount_wq;
} else {
clear_opt(sb, DATA_FLAGS);
- set_opt(sb, WRITEBACK_DATA);
sbi->s_journal = NULL;
needs_recovery = 0;
goto no_journal;
@@ -3709,6 +3742,8 @@ failed_mount3:
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ if (sbi->s_mmp_tsk)
+ kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -4244,7 +4279,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
int enable_quota = 0;
ext4_group_t g;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- int err;
+ int err = 0;
#ifdef CONFIG_QUOTA
int i;
#endif
@@ -4370,6 +4405,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
if (!ext4_setup_super(sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_MMP))
+ if (ext4_multi_mount_protect(sb,
+ le64_to_cpu(es->s_mmp_block))) {
+ err = -EROFS;
+ goto restore_opts;
+ }
enable_quota = 1;
}
}
@@ -4434,6 +4476,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
u64 fsid;
+ s64 bfree;
if (test_opt(sb, MINIX_DF)) {
sbi->s_overhead_last = 0;
@@ -4477,8 +4520,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
- buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+ bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+ /* prevent underflow in case that few free space is available */
+ buf->f_bfree = max_t(s64, bfree, 0);
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
buf->f_bavail = 0;
@@ -4654,6 +4699,9 @@ static int ext4_quota_off(struct super_block *sb, int type)
if (test_opt(sb, DELALLOC))
sync_filesystem(sb);
+ if (!inode)
+ goto out;
+
/* Update modification times of quota files when userspace can
* start looking at them */
handle = ext4_journal_start(inode, 1);
@@ -4774,14 +4822,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
}
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
-static struct file_system_type ext2_fs_type = {
- .owner = THIS_MODULE,
- .name = "ext2",
- .mount = ext4_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-
static inline void register_as_ext2(void)
{
int err = register_filesystem(&ext2_fs_type);
@@ -4794,10 +4834,22 @@ static inline void unregister_as_ext2(void)
{
unregister_filesystem(&ext2_fs_type);
}
+
+static inline int ext2_feature_set_ok(struct super_block *sb)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+ return 0;
+ if (sb->s_flags & MS_RDONLY)
+ return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+ return 0;
+ return 1;
+}
MODULE_ALIAS("ext2");
#else
static inline void register_as_ext2(void) { }
static inline void unregister_as_ext2(void) { }
+static inline int ext2_feature_set_ok(struct super_block *sb) { return 0; }
#endif
#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
@@ -4813,10 +4865,24 @@ static inline void unregister_as_ext3(void)
{
unregister_filesystem(&ext3_fs_type);
}
+
+static inline int ext3_feature_set_ok(struct super_block *sb)
+{
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+ return 0;
+ if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ return 0;
+ if (sb->s_flags & MS_RDONLY)
+ return 1;
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+ return 0;
+ return 1;
+}
MODULE_ALIAS("ext3");
#else
static inline void register_as_ext3(void) { }
static inline void unregister_as_ext3(void) { }
+static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
#endif
static struct file_system_type ext4_fs_type = {
@@ -4900,8 +4966,8 @@ static int __init ext4_init_fs(void)
err = init_inodecache();
if (err)
goto out1;
- register_as_ext2();
register_as_ext3();
+ register_as_ext2();
err = register_filesystem(&ext4_fs_type);
if (err)
goto out;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b545ca1c459c..c757adc97250 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -820,8 +820,8 @@ inserted:
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
- block = ext4_new_meta_blocks(handle, inode,
- goal, NULL, &error);
+ block = ext4_new_meta_blocks(handle, inode, goal, 0,
+ NULL, &error);
if (error)
goto cleanup;
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index ae8200f84e39..1cc7038e273d 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -151,6 +151,13 @@ static void fat_cache_add(struct inode *inode, struct fat_cache_id *new)
spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
tmp = fat_cache_alloc(inode);
+ if (!tmp) {
+ spin_lock(&MSDOS_I(inode)->cache_lru_lock);
+ MSDOS_I(inode)->nr_caches--;
+ spin_unlock(&MSDOS_I(inode)->cache_lru_lock);
+ return;
+ }
+
spin_lock(&MSDOS_I(inode)->cache_lru_lock);
cache = fat_cache_merge(inode, new);
if (cache != NULL) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index ee42b9e0b16a..4ad64732cbce 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -98,7 +98,7 @@ next:
*bh = sb_bread(sb, phys);
if (*bh == NULL) {
- printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n",
+ fat_msg(sb, KERN_ERR, "Directory bread(block %llu) failed",
(llu)phys);
/* skip this block */
*pos = (iblock + 1) << sb->s_blocksize_bits;
@@ -136,9 +136,10 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
* but ignore that right now.
* Ahem... Stack smashing in ring 0 isn't fun. Fixed.
*/
-static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
- int uni_xlate, struct nls_table *nls)
+static int uni16_to_x8(struct super_block *sb, unsigned char *ascii,
+ const wchar_t *uni, int len, struct nls_table *nls)
{
+ int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate;
const wchar_t *ip;
wchar_t ec;
unsigned char *op;
@@ -166,23 +167,23 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
}
if (unlikely(*ip)) {
- printk(KERN_WARNING "FAT: filename was truncated while "
- "converting.");
+ fat_msg(sb, KERN_WARNING, "filename was truncated while "
+ "converting.");
}
*op = 0;
return (op - ascii);
}
-static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
+static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni,
unsigned char *buf, int size)
{
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
if (sbi->options.utf8)
return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
UTF16_HOST_ENDIAN, buf, size);
else
- return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
- sbi->nls_io);
+ return uni16_to_x8(sb, buf, uni, size, sbi->nls_io);
}
static inline int
@@ -419,7 +420,7 @@ parse_record:
/* Compare shortname */
bufuname[last_u] = 0x0000;
- len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname));
+ len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname));
if (fat_name_match(sbi, name, name_len, bufname, len))
goto found;
@@ -428,7 +429,7 @@ parse_record:
int size = PATH_MAX - FAT_MAX_UNI_SIZE;
/* Compare longname */
- len = fat_uni_to_x8(sbi, unicode, longname, size);
+ len = fat_uni_to_x8(sb, unicode, longname, size);
if (fat_name_match(sbi, name, name_len, longname, len))
goto found;
}
@@ -545,7 +546,7 @@ parse_record:
if (nr_slots) {
void *longname = unicode + FAT_MAX_UNI_CHARS;
int size = PATH_MAX - FAT_MAX_UNI_SIZE;
- int len = fat_uni_to_x8(sbi, unicode, longname, size);
+ int len = fat_uni_to_x8(sb, unicode, longname, size);
fill_name = longname;
fill_len = len;
@@ -621,7 +622,7 @@ parse_record:
if (isvfat) {
bufuname[j] = 0x0000;
- i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname));
+ i = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname));
}
if (nr_slots) {
/* hack for fat_ioctl_filldir() */
@@ -979,6 +980,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
{
+ struct super_block *sb = dir->i_sb;
struct msdos_dir_entry *de;
struct buffer_head *bh;
int err = 0, nr_slots;
@@ -1013,8 +1015,8 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
*/
err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots);
if (err) {
- printk(KERN_WARNING
- "FAT: Couldn't remove the long name slots\n");
+ fat_msg(sb, KERN_WARNING,
+ "Couldn't remove the long name slots");
}
}
@@ -1265,7 +1267,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
if (sbi->fat_bits != 32)
goto error;
} else if (MSDOS_I(dir)->i_start == 0) {
- printk(KERN_ERR "FAT: Corrupted directory (i_pos %lld)\n",
+ fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)",
MSDOS_I(dir)->i_pos);
err = -EIO;
goto error;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index f50408901f7e..8276cc282dec 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -319,19 +319,20 @@ extern struct inode *fat_build_inode(struct super_block *sb,
struct msdos_dir_entry *de, loff_t i_pos);
extern int fat_sync_inode(struct inode *inode);
extern int fat_fill_super(struct super_block *sb, void *data, int silent,
- const struct inode_operations *fs_dir_inode_ops,
- int isvfat, void (*setup)(struct super_block *));
+ int isvfat, void (*setup)(struct super_block *));
extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
struct inode *i2);
/* fat/misc.c */
extern void
-__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
+ __attribute__ ((format (printf, 3, 4))) __cold;
+#define fat_fs_error(sb, fmt, args...) \
+ __fat_fs_error(sb, 1, fmt , ## args)
+#define fat_fs_error_ratelimit(sb, fmt, args...) \
+ __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args)
+void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
__attribute__ ((format (printf, 3, 4))) __cold;
-#define fat_fs_error(s, fmt, args...) \
- __fat_fs_error(s, 1, fmt , ## args)
-#define fat_fs_error_ratelimit(s, fmt, args...) \
- __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
extern int fat_clusters_flush(struct super_block *sb);
extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index b47d2c9f4fa1..2e81ac0df7e2 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -95,7 +95,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
err_brelse:
brelse(bhs[0]);
err:
- printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr);
+ fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr);
return -EIO;
}
@@ -108,7 +108,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
fatent->bhs[0] = sb_bread(sb, blocknr);
if (!fatent->bhs[0]) {
- printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
+ fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
(llu)blocknr);
return -EIO;
}
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8d68690bdcf1..cb8d8391ac0b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -581,7 +581,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = sbi->free_clusters;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
- buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12;
+ buf->f_namelen =
+ (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE;
return 0;
}
@@ -619,8 +620,8 @@ retry:
bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
if (!bh) {
- printk(KERN_ERR "FAT: unable to read inode block "
- "for updating (i_pos %lld)\n", i_pos);
+ fat_msg(sb, KERN_ERR, "unable to read inode block "
+ "for updating (i_pos %lld)", i_pos);
return -EIO;
}
spin_lock(&sbi->inode_hash_lock);
@@ -976,8 +977,8 @@ static const match_table_t vfat_tokens = {
{Opt_err, NULL}
};
-static int parse_options(char *options, int is_vfat, int silent, int *debug,
- struct fat_mount_options *opts)
+static int parse_options(struct super_block *sb, char *options, int is_vfat,
+ int silent, int *debug, struct fat_mount_options *opts)
{
char *p;
substring_t args[MAX_OPT_ARGS];
@@ -1168,15 +1169,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
/* obsolete mount options */
case Opt_obsolate:
- printk(KERN_INFO "FAT: \"%s\" option is obsolete, "
- "not supported now\n", p);
+ fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, "
+ "not supported now", p);
break;
/* unknown option */
default:
if (!silent) {
- printk(KERN_ERR
- "FAT: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
+ fat_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" "
+ "or missing value", p);
}
return -EINVAL;
}
@@ -1185,7 +1186,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
out:
/* UTF-8 doesn't provide FAT semantics */
if (!strcmp(opts->iocharset, "utf8")) {
- printk(KERN_ERR "FAT: utf8 is not a recommended IO charset"
+ fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset"
" for FAT filesystems, filesystem will be "
"case sensitive!\n");
}
@@ -1238,8 +1239,7 @@ static int fat_read_root(struct inode *inode)
/*
* Read the super block of an MS-DOS FS.
*/
-int fat_fill_super(struct super_block *sb, void *data, int silent,
- const struct inode_operations *fs_dir_inode_ops, int isvfat,
+int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
void (*setup)(struct super_block *))
{
struct inode *root_inode = NULL, *fat_inode = NULL;
@@ -1268,11 +1268,10 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
sb->s_magic = MSDOS_SUPER_MAGIC;
sb->s_op = &fat_sops;
sb->s_export_op = &fat_export_ops;
- sbi->dir_ops = fs_dir_inode_ops;
ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- error = parse_options(data, isvfat, silent, &debug, &sbi->options);
+ error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options);
if (error)
goto out_fail;
@@ -1282,20 +1281,20 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
sb_min_blocksize(sb, 512);
bh = sb_bread(sb, 0);
if (bh == NULL) {
- printk(KERN_ERR "FAT: unable to read boot sector\n");
+ fat_msg(sb, KERN_ERR, "unable to read boot sector");
goto out_fail;
}
b = (struct fat_boot_sector *) bh->b_data;
if (!b->reserved) {
if (!silent)
- printk(KERN_ERR "FAT: bogus number of reserved sectors\n");
+ fat_msg(sb, KERN_ERR, "bogus number of reserved sectors");
brelse(bh);
goto out_invalid;
}
if (!b->fats) {
if (!silent)
- printk(KERN_ERR "FAT: bogus number of FAT structure\n");
+ fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
brelse(bh);
goto out_invalid;
}
@@ -1308,7 +1307,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
media = b->media;
if (!fat_valid_media(media)) {
if (!silent)
- printk(KERN_ERR "FAT: invalid media value (0x%02x)\n",
+ fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
media);
brelse(bh);
goto out_invalid;
@@ -1318,7 +1317,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
|| (logical_sector_size < 512)
|| (logical_sector_size > 4096)) {
if (!silent)
- printk(KERN_ERR "FAT: bogus logical sector size %u\n",
+ fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
logical_sector_size);
brelse(bh);
goto out_invalid;
@@ -1326,15 +1325,15 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
sbi->sec_per_clus = b->sec_per_clus;
if (!is_power_of_2(sbi->sec_per_clus)) {
if (!silent)
- printk(KERN_ERR "FAT: bogus sectors per cluster %u\n",
+ fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
sbi->sec_per_clus);
brelse(bh);
goto out_invalid;
}
if (logical_sector_size < sb->s_blocksize) {
- printk(KERN_ERR "FAT: logical sector size too small for device"
- " (logical sector size = %u)\n", logical_sector_size);
+ fat_msg(sb, KERN_ERR, "logical sector size too small for device"
+ " (logical sector size = %u)", logical_sector_size);
brelse(bh);
goto out_fail;
}
@@ -1342,14 +1341,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
brelse(bh);
if (!sb_set_blocksize(sb, logical_sector_size)) {
- printk(KERN_ERR "FAT: unable to set blocksize %u\n",
+ fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
logical_sector_size);
goto out_fail;
}
bh = sb_bread(sb, 0);
if (bh == NULL) {
- printk(KERN_ERR "FAT: unable to read boot sector"
- " (logical sector size = %lu)\n",
+ fat_msg(sb, KERN_ERR, "unable to read boot sector"
+ " (logical sector size = %lu)",
sb->s_blocksize);
goto out_fail;
}
@@ -1385,16 +1384,16 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector);
if (fsinfo_bh == NULL) {
- printk(KERN_ERR "FAT: bread failed, FSINFO block"
- " (sector = %lu)\n", sbi->fsinfo_sector);
+ fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
+ " (sector = %lu)", sbi->fsinfo_sector);
brelse(bh);
goto out_fail;
}
fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data;
if (!IS_FSINFO(fsinfo)) {
- printk(KERN_WARNING "FAT: Invalid FSINFO signature: "
- "0x%08x, 0x%08x (sector = %lu)\n",
+ fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: "
+ "0x%08x, 0x%08x (sector = %lu)",
le32_to_cpu(fsinfo->signature1),
le32_to_cpu(fsinfo->signature2),
sbi->fsinfo_sector);
@@ -1415,8 +1414,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
if (!silent)
- printk(KERN_ERR "FAT: bogus directroy-entries per block"
- " (%u)\n", sbi->dir_entries);
+ fat_msg(sb, KERN_ERR, "bogus directroy-entries per block"
+ " (%u)", sbi->dir_entries);
brelse(bh);
goto out_invalid;
}
@@ -1438,7 +1437,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT);
if (total_clusters > MAX_FAT(sb)) {
if (!silent)
- printk(KERN_ERR "FAT: count of clusters too big (%u)\n",
+ fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
total_clusters);
brelse(bh);
goto out_invalid;
@@ -1471,7 +1470,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
sprintf(buf, "cp%d", sbi->options.codepage);
sbi->nls_disk = load_nls(buf);
if (!sbi->nls_disk) {
- printk(KERN_ERR "FAT: codepage %s not found\n", buf);
+ fat_msg(sb, KERN_ERR, "codepage %s not found", buf);
goto out_fail;
}
@@ -1479,7 +1478,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
if (sbi->options.isvfat) {
sbi->nls_io = load_nls(sbi->options.iocharset);
if (!sbi->nls_io) {
- printk(KERN_ERR "FAT: IO charset %s not found\n",
+ fat_msg(sb, KERN_ERR, "IO charset %s not found",
sbi->options.iocharset);
goto out_fail;
}
@@ -1503,7 +1502,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
insert_inode_hash(root_inode);
sb->s_root = d_alloc_root(root_inode);
if (!sb->s_root) {
- printk(KERN_ERR "FAT: get root inode failed\n");
+ fat_msg(sb, KERN_ERR, "get root inode failed");
goto out_fail;
}
@@ -1512,8 +1511,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
out_invalid:
error = -EINVAL;
if (!silent)
- printk(KERN_INFO "VFS: Can't find a valid FAT filesystem"
- " on dev %s.\n", sb->s_id);
+ fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem");
out_fail:
if (fat_inode)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 970e682ea754..6d93360ca0cc 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,30 +20,46 @@
* In case the file system is remounted read-only, it can be made writable
* again by remounting it.
*/
-void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...)
{
- struct fat_mount_options *opts = &MSDOS_SB(s)->options;
+ struct fat_mount_options *opts = &MSDOS_SB(sb)->options;
va_list args;
+ struct va_format vaf;
if (report) {
- printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
-
- printk(KERN_ERR " ");
va_start(args, fmt);
- vprintk(fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf);
va_end(args);
- printk("\n");
}
if (opts->errors == FAT_ERRORS_PANIC)
- panic("FAT: fs panic from previous error\n");
- else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
- s->s_flags |= MS_RDONLY;
- printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
+ panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id);
+ else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) {
+ sb->s_flags |= MS_RDONLY;
+ printk(KERN_ERR "FAT-fs (%s): Filesystem has been "
+ "set read-only\n", sb->s_id);
}
}
EXPORT_SYMBOL_GPL(__fat_fs_error);
+/**
+ * fat_msg() - print preformatted FAT specific messages. Everything that is
+ * not fat_fs_error() should be fat_msg().
+ */
+void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf);
+ va_end(args);
+}
+
/* Flushes the number of free clusters on FAT32 */
/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
int fat_clusters_flush(struct super_block *sb)
@@ -57,15 +73,15 @@ int fat_clusters_flush(struct super_block *sb)
bh = sb_bread(sb, sbi->fsinfo_sector);
if (bh == NULL) {
- printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
+ fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush");
return -EIO;
}
fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
/* Sanity check */
if (!IS_FSINFO(fsinfo)) {
- printk(KERN_ERR "FAT: Invalid FSINFO signature: "
- "0x%08x, 0x%08x (sector = %lu)\n",
+ fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: "
+ "0x%08x, 0x%08x (sector = %lu)",
le32_to_cpu(fsinfo->signature1),
le32_to_cpu(fsinfo->signature2),
sbi->fsinfo_sector);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 711499040eb6..be15437c272e 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,6 +326,8 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
struct fat_slot_info sinfo;
int err;
+ dentry_unhash(dentry);
+
lock_super(sb);
/*
* Check whether the directory is not in use, then check
@@ -457,6 +459,9 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
err = fat_scan(old_dir, old_name, &old_sinfo);
if (err) {
err = -EIO;
@@ -659,14 +664,14 @@ static const struct inode_operations msdos_dir_inode_operations = {
static void setup(struct super_block *sb)
{
+ MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
sb->s_d_op = &msdos_dentry_operations;
sb->s_flags |= MS_NOATIME;
}
static int msdos_fill_super(struct super_block *sb, void *data, int silent)
{
- return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations,
- 0, setup);
+ return fat_fill_super(sb, data, silent, 0, setup);
}
static struct dentry *msdos_mount(struct file_system_type *fs_type,
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index adae3fb7451a..c61a6789f36c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,6 +824,8 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
struct fat_slot_info sinfo;
int err;
+ dentry_unhash(dentry);
+
lock_super(sb);
err = fat_dir_empty(inode);
@@ -931,6 +933,9 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
int err, is_dir, update_dotdot, corrupt = 0;
struct super_block *sb = old_dir->i_sb;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
@@ -1065,6 +1070,7 @@ static const struct inode_operations vfat_dir_inode_operations = {
static void setup(struct super_block *sb)
{
+ MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations;
if (MSDOS_SB(sb)->options.name_check != 's')
sb->s_d_op = &vfat_ci_dentry_ops;
else
@@ -1073,8 +1079,7 @@ static void setup(struct super_block *sb)
static int vfat_fill_super(struct super_block *sb, void *data, int silent)
{
- return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations,
- 1, setup);
+ return fat_fill_super(sb, data, silent, 1, setup);
}
static struct dentry *vfat_mount(struct file_system_type *fs_type,
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 2ba6719ac612..1a4311437a8b 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -272,7 +272,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
* *ip: VFS inode
*
* Description:
- * vxfs_put_fake_inode frees all data asssociated with @ip.
+ * vxfs_put_fake_inode frees all data associated with @ip.
*/
void
vxfs_put_fake_inode(struct inode *ip)
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 48a18f184d50..30afdfa7aec7 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -33,8 +33,6 @@ void fscache_enqueue_operation(struct fscache_operation *op)
_enter("{OBJ%x OP%x,%u}",
op->object->debug_id, op->debug_id, atomic_read(&op->usage));
- fscache_set_op_state(op, "EnQ");
-
ASSERT(list_empty(&op->pend_link));
ASSERT(op->processor != NULL);
ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
@@ -66,8 +64,6 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
static void fscache_run_op(struct fscache_object *object,
struct fscache_operation *op)
{
- fscache_set_op_state(op, "Run");
-
object->n_in_progress++;
if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -88,8 +84,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
- fscache_set_op_state(op, "SubmitX");
-
spin_lock(&object->lock);
ASSERTCMP(object->n_ops, >=, object->n_in_progress);
ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -194,8 +188,6 @@ int fscache_submit_op(struct fscache_object *object,
ASSERTCMP(atomic_read(&op->usage), >, 0);
- fscache_set_op_state(op, "Submit");
-
spin_lock(&object->lock);
ASSERTCMP(object->n_ops, >=, object->n_in_progress);
ASSERTCMP(object->n_ops, >=, object->n_exclusive);
@@ -335,8 +327,6 @@ void fscache_put_operation(struct fscache_operation *op)
if (!atomic_dec_and_test(&op->usage))
return;
- fscache_set_op_state(op, "Put");
-
_debug("PUT OP");
if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
BUG();
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 41c441c2058d..a2a5d19ece6a 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -155,11 +155,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_stat(&fscache_n_attr_changed_calls);
if (fscache_object_is_active(object)) {
- fscache_set_op_state(op, "CallFS");
fscache_stat(&fscache_n_cop_attr_changed);
ret = object->cache->ops->attr_changed(object);
fscache_stat_d(&fscache_n_cop_attr_changed);
- fscache_set_op_state(op, "Done");
if (ret < 0)
fscache_abort_object(object);
}
@@ -190,7 +188,6 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
fscache_operation_init(op, fscache_attr_changed_op, NULL);
op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
- fscache_set_op_name(op, "Attr");
spin_lock(&cookie->lock);
@@ -257,7 +254,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
op->context = context;
op->start_time = jiffies;
INIT_LIST_HEAD(&op->to_do);
- fscache_set_op_name(&op->op, "Retr");
return op;
}
@@ -368,7 +364,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
_leave(" = -ENOMEM");
return -ENOMEM;
}
- fscache_set_op_name(&op->op, "RetrRA1");
spin_lock(&cookie->lock);
@@ -487,7 +482,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
op = fscache_alloc_retrieval(mapping, end_io_func, context);
if (!op)
return -ENOMEM;
- fscache_set_op_name(&op->op, "RetrRAN");
spin_lock(&cookie->lock);
@@ -589,7 +583,6 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
if (!op)
return -ENOMEM;
- fscache_set_op_name(&op->op, "RetrAL1");
spin_lock(&cookie->lock);
@@ -662,8 +655,6 @@ static void fscache_write_op(struct fscache_operation *_op)
_enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage));
- fscache_set_op_state(&op->op, "GetPage");
-
spin_lock(&object->lock);
cookie = object->cookie;
@@ -698,15 +689,12 @@ static void fscache_write_op(struct fscache_operation *_op)
spin_unlock(&cookie->stores_lock);
spin_unlock(&object->lock);
- fscache_set_op_state(&op->op, "Store");
fscache_stat(&fscache_n_store_pages);
fscache_stat(&fscache_n_cop_write_page);
ret = object->cache->ops->write_page(op, page);
fscache_stat_d(&fscache_n_cop_write_page);
- fscache_set_op_state(&op->op, "EndWrite");
fscache_end_page_write(object, page);
if (ret < 0) {
- fscache_set_op_state(&op->op, "Abort");
fscache_abort_object(object);
} else {
fscache_enqueue_operation(&op->op);
@@ -778,7 +766,6 @@ int __fscache_write_page(struct fscache_cookie *cookie,
fscache_operation_init(&op->op, fscache_write_op,
fscache_release_write_op);
op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING);
- fscache_set_op_name(&op->op, "Write1");
ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
if (ret < 0)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b32eb29a4e6f..0d0e3faddcfa 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,6 +667,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
if (IS_ERR(req))
return PTR_ERR(req);
+ dentry_unhash(entry);
+
req->in.h.opcode = FUSE_RMDIR;
req->in.h.nodeid = get_node_id(dir);
req->in.numargs = 1;
@@ -691,6 +693,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
struct fuse_rename_in inarg;
struct fuse_conn *fc = get_fuse_conn(olddir);
struct fuse_req *req = fuse_get_req(fc);
+
+ if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
+ dentry_unhash(newent);
+
if (IS_ERR(req))
return PTR_ERR(req);
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index f3d23ef4e876..86128202384f 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,9 +1,9 @@
ccflags-y := -I$(src)
obj-$(CONFIG_GFS2_FS) += gfs2.o
gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
- glops.o inode.o log.o lops.o main.o meta_io.o \
+ glops.o log.o lops.o main.o meta_io.o \
aops.o dentry.o export.o file.o \
- ops_fstype.o ops_inode.o quota.o \
+ ops_fstype.o inode.o quota.o \
recovery.o rgrp.o super.o sys.o trans.o util.o
gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 0f5c4f9d5d62..802ac5eeba28 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1076,8 +1076,8 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
bd = bh->b_private;
if (bd && bd->bd_ail)
goto cannot_release;
- gfs2_assert_warn(sdp, !buffer_pinned(bh));
- gfs2_assert_warn(sdp, !buffer_dirty(bh));
+ if (buffer_pinned(bh) || buffer_dirty(bh))
+ goto not_possible;
bh = bh->b_this_page;
} while(bh != head);
gfs2_log_unlock(sdp);
@@ -1107,6 +1107,10 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
} while (bh != head);
return try_to_free_buffers(page);
+
+not_possible: /* Should never happen */
+ WARN_ON(buffer_dirty(bh));
+ WARN_ON(buffer_pinned(bh));
cannot_release:
gfs2_log_unlock(sdp);
return 0;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 74add2ddcc3f..e65493a8ac00 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -780,6 +780,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
metadata = (height != ip->i_height - 1);
if (metadata)
revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
+ else if (ip->i_depth)
+ revokes = sdp->sd_inptrs;
if (ip != GFS2_I(sdp->sd_rindex))
error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index f789c5732b7c..091ee4779538 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,12 +82,9 @@
struct qstr gfs2_qdot __read_mostly;
struct qstr gfs2_qdotdot __read_mostly;
-typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
- u64 leaf_no, void *data);
typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
const struct qstr *name, void *opaque);
-
int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
struct buffer_head **bhp)
{
@@ -1600,7 +1597,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
*/
int gfs2_dir_add(struct inode *inode, const struct qstr *name,
- const struct gfs2_inode *nip, unsigned type)
+ const struct gfs2_inode *nip)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct buffer_head *bh;
@@ -1616,7 +1613,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
return PTR_ERR(dent);
dent = gfs2_init_dirent(inode, dent, name, bh);
gfs2_inum_out(nip, dent);
- dent->de_type = cpu_to_be16(type);
+ dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
if (ip->i_diskflags & GFS2_DIF_EXHASH) {
leaf = (struct gfs2_leaf *)bh->b_data;
be16_add_cpu(&leaf->lf_entries, 1);
@@ -1628,6 +1625,8 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
gfs2_trans_add_bh(ip->i_gl, bh, 1);
ip->i_entries++;
ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ if (S_ISDIR(nip->i_inode.i_mode))
+ inc_nlink(&ip->i_inode);
gfs2_dinode_out(ip, bh->b_data);
brelse(bh);
error = 0;
@@ -1672,8 +1671,9 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
* Returns: 0 on success, error code on failure
*/
-int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
+int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
{
+ const struct qstr *name = &dentry->d_name;
struct gfs2_dirent *dent, *prev = NULL;
struct buffer_head *bh;
int error;
@@ -1714,6 +1714,8 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name)
gfs2_trans_add_bh(dip->i_gl, bh, 1);
dip->i_entries--;
dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
+ if (S_ISDIR(dentry->d_inode->i_mode))
+ drop_nlink(&dip->i_inode);
gfs2_dinode_out(dip, bh->b_data);
brelse(bh);
mark_inode_dirty(&dip->i_inode);
@@ -1768,94 +1770,20 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
}
/**
- * foreach_leaf - call a function for each leaf in a directory
- * @dip: the directory
- * @lc: the function to call for each each
- * @data: private data to pass to it
- *
- * Returns: errno
- */
-
-static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- struct buffer_head *bh;
- struct gfs2_leaf *leaf;
- u32 hsize, len;
- u32 ht_offset, lp_offset, ht_offset_cur = -1;
- u32 index = 0;
- __be64 *lp;
- u64 leaf_no;
- int error = 0;
-
- hsize = 1 << dip->i_depth;
- if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
- gfs2_consist_inode(dip);
- return -EIO;
- }
-
- lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
- if (!lp)
- return -ENOMEM;
-
- while (index < hsize) {
- lp_offset = index & (sdp->sd_hash_ptrs - 1);
- ht_offset = index - lp_offset;
-
- if (ht_offset_cur != ht_offset) {
- error = gfs2_dir_read_data(dip, (char *)lp,
- ht_offset * sizeof(__be64),
- sdp->sd_hash_bsize, 1);
- if (error != sdp->sd_hash_bsize) {
- if (error >= 0)
- error = -EIO;
- goto out;
- }
- ht_offset_cur = ht_offset;
- }
-
- leaf_no = be64_to_cpu(lp[lp_offset]);
- if (leaf_no) {
- error = get_leaf(dip, leaf_no, &bh);
- if (error)
- goto out;
- leaf = (struct gfs2_leaf *)bh->b_data;
- len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
- brelse(bh);
-
- error = lc(dip, index, len, leaf_no, data);
- if (error)
- goto out;
-
- index = (index & ~(len - 1)) + len;
- } else
- index++;
- }
-
- if (index != hsize) {
- gfs2_consist_inode(dip);
- error = -EIO;
- }
-
-out:
- kfree(lp);
-
- return error;
-}
-
-/**
* leaf_dealloc - Deallocate a directory leaf
* @dip: the directory
* @index: the hash table offset in the directory
* @len: the number of pointers to this leaf
* @leaf_no: the leaf number
- * @data: not used
+ * @leaf_bh: buffer_head for the starting leaf
+ * @last_dealloc: 1 if this is the final dealloc for the leaf, else 0
*
* Returns: errno
*/
static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
- u64 leaf_no, void *data)
+ u64 leaf_no, struct buffer_head *leaf_bh,
+ int last_dealloc)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_leaf *tmp_leaf;
@@ -1887,14 +1815,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
goto out_qs;
/* Count the number of leaves */
+ bh = leaf_bh;
for (blk = leaf_no; blk; blk = nblk) {
- error = get_leaf(dip, blk, &bh);
- if (error)
- goto out_rlist;
+ if (blk != leaf_no) {
+ error = get_leaf(dip, blk, &bh);
+ if (error)
+ goto out_rlist;
+ }
tmp_leaf = (struct gfs2_leaf *)bh->b_data;
nblk = be64_to_cpu(tmp_leaf->lf_next);
- brelse(bh);
+ if (blk != leaf_no)
+ brelse(bh);
gfs2_rlist_add(sdp, &rlist, blk);
l_blocks++;
@@ -1918,13 +1850,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (error)
goto out_rg_gunlock;
+ bh = leaf_bh;
+
for (blk = leaf_no; blk; blk = nblk) {
- error = get_leaf(dip, blk, &bh);
- if (error)
- goto out_end_trans;
+ if (blk != leaf_no) {
+ error = get_leaf(dip, blk, &bh);
+ if (error)
+ goto out_end_trans;
+ }
tmp_leaf = (struct gfs2_leaf *)bh->b_data;
nblk = be64_to_cpu(tmp_leaf->lf_next);
- brelse(bh);
+ if (blk != leaf_no)
+ brelse(bh);
gfs2_free_meta(dip, blk, 1);
gfs2_add_inode_blocks(&dip->i_inode, -1);
@@ -1942,6 +1879,10 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
goto out_end_trans;
gfs2_trans_add_bh(dip->i_gl, dibh, 1);
+ /* On the last dealloc, make this a regular file in case we crash.
+ (We don't want to free these blocks a second time.) */
+ if (last_dealloc)
+ dip->i_inode.i_mode = S_IFREG;
gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
@@ -1975,29 +1916,67 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct buffer_head *bh;
- int error;
+ struct gfs2_leaf *leaf;
+ u32 hsize, len;
+ u32 ht_offset, lp_offset, ht_offset_cur = -1;
+ u32 index = 0, next_index;
+ __be64 *lp;
+ u64 leaf_no;
+ int error = 0, last;
- /* Dealloc on-disk leaves to FREEMETA state */
- error = foreach_leaf(dip, leaf_dealloc, NULL);
- if (error)
- return error;
+ hsize = 1 << dip->i_depth;
+ if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
+ gfs2_consist_inode(dip);
+ return -EIO;
+ }
- /* Make this a regular file in case we crash.
- (We don't want to free these blocks a second time.) */
+ lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
+ if (!lp)
+ return -ENOMEM;
- error = gfs2_trans_begin(sdp, RES_DINODE, 0);
- if (error)
- return error;
+ while (index < hsize) {
+ lp_offset = index & (sdp->sd_hash_ptrs - 1);
+ ht_offset = index - lp_offset;
- error = gfs2_meta_inode_buffer(dip, &bh);
- if (!error) {
- gfs2_trans_add_bh(dip->i_gl, bh, 1);
- ((struct gfs2_dinode *)bh->b_data)->di_mode =
- cpu_to_be32(S_IFREG);
- brelse(bh);
+ if (ht_offset_cur != ht_offset) {
+ error = gfs2_dir_read_data(dip, (char *)lp,
+ ht_offset * sizeof(__be64),
+ sdp->sd_hash_bsize, 1);
+ if (error != sdp->sd_hash_bsize) {
+ if (error >= 0)
+ error = -EIO;
+ goto out;
+ }
+ ht_offset_cur = ht_offset;
+ }
+
+ leaf_no = be64_to_cpu(lp[lp_offset]);
+ if (leaf_no) {
+ error = get_leaf(dip, leaf_no, &bh);
+ if (error)
+ goto out;
+ leaf = (struct gfs2_leaf *)bh->b_data;
+ len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
+
+ next_index = (index & ~(len - 1)) + len;
+ last = ((next_index >= hsize) ? 1 : 0);
+ error = leaf_dealloc(dip, index, len, leaf_no, bh,
+ last);
+ brelse(bh);
+ if (error)
+ goto out;
+ index = next_index;
+ } else
+ index++;
}
- gfs2_trans_end(sdp);
+ if (index != hsize) {
+ gfs2_consist_inode(dip);
+ error = -EIO;
+ }
+
+out:
+ kfree(lp);
return error;
}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index a98f644bd3df..e686af11becd 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -22,8 +22,8 @@ extern struct inode *gfs2_dir_search(struct inode *dir,
extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
- const struct gfs2_inode *ip, unsigned int type);
-extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
+ const struct gfs2_inode *ip);
+extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
filldir_t filldir);
extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index b5a5e60df0d5..fe9945f2ff72 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -139,7 +139,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
struct gfs2_sbd *sdp = sb->s_fs_info;
struct inode *inode;
- inode = gfs2_ilookup(sb, inum->no_addr);
+ inode = gfs2_ilookup(sb, inum->no_addr, 0);
if (inode) {
if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
iput(inode);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e48310885c48..a9f5cbe45cd9 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -545,18 +545,10 @@ static int gfs2_close(struct inode *inode, struct file *file)
/**
* gfs2_fsync - sync the dirty data for a file (across the cluster)
* @file: the file that points to the dentry (we ignore this)
- * @dentry: the dentry that points to the inode to sync
+ * @datasync: set if we can ignore timestamp changes
*
- * The VFS will flush "normal" data for us. We only need to worry
- * about metadata here. For journaled data, we just do a log flush
- * as we can't avoid it. Otherwise we can just bale out if datasync
- * is set. For stuffed inodes we must flush the log in order to
- * ensure that all data is on disk.
- *
- * The call to write_inode_now() is there to write back metadata and
- * the inode itself. It does also try and write the data, but thats
- * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite()
- * for us.
+ * The VFS will flush data for us. We only need to worry
+ * about metadata here.
*
* Returns: errno
*/
@@ -565,22 +557,20 @@ static int gfs2_fsync(struct file *file, int datasync)
{
struct inode *inode = file->f_mapping->host;
int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
- int ret = 0;
-
- if (gfs2_is_jdata(GFS2_I(inode))) {
- gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
- return 0;
- }
+ struct gfs2_inode *ip = GFS2_I(inode);
+ int ret;
- if (sync_state != 0) {
- if (!datasync)
- ret = write_inode_now(inode, 0);
+ if (datasync)
+ sync_state &= ~I_DIRTY_SYNC;
- if (gfs2_is_stuffed(GFS2_I(inode)))
- gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl);
+ if (sync_state) {
+ ret = sync_inode_metadata(inode, 1);
+ if (ret)
+ return ret;
+ gfs2_ail_flush(ip->i_gl);
}
- return ret;
+ return 0;
}
/**
@@ -826,6 +816,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t bytes, max_bytes;
struct gfs2_alloc *al;
int error;
+ loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
@@ -833,13 +824,15 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
if (mode & ~FALLOC_FL_KEEP_SIZE)
return -EOPNOTSUPP;
- offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
- sdp->sd_sb.sb_bsize_shift;
+ offset &= bsize_mask;
len = next - offset;
bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
if (!bytes)
bytes = UINT_MAX;
+ bytes &= bsize_mask;
+ if (bytes == 0)
+ bytes = sdp->sd_sb.sb_bsize;
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
error = gfs2_glock_nq(&ip->i_gh);
@@ -870,6 +863,9 @@ retry:
if (error) {
if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
bytes >>= 1;
+ bytes &= bsize_mask;
+ if (bytes == 0)
+ bytes = sdp->sd_sb.sb_bsize;
goto retry;
}
goto out_qunlock;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7a4fb630a320..2792a790e50b 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -143,14 +143,9 @@ static int demote_ok(const struct gfs2_glock *gl)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
- /* assert_spin_locked(&gl->gl_spin); */
-
if (gl->gl_state == LM_ST_UNLOCKED)
return 0;
- if (test_bit(GLF_LFLUSH, &gl->gl_flags))
- return 0;
- if ((gl->gl_name.ln_type != LM_TYPE_INODE) &&
- !list_empty(&gl->gl_holders))
+ if (!list_empty(&gl->gl_holders))
return 0;
if (glops->go_demote_ok)
return glops->go_demote_ok(gl);
@@ -158,6 +153,31 @@ static int demote_ok(const struct gfs2_glock *gl)
}
+void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
+{
+ spin_lock(&lru_lock);
+
+ if (!list_empty(&gl->gl_lru))
+ list_del_init(&gl->gl_lru);
+ else
+ atomic_inc(&lru_count);
+
+ list_add_tail(&gl->gl_lru, &lru_list);
+ set_bit(GLF_LRU, &gl->gl_flags);
+ spin_unlock(&lru_lock);
+}
+
+static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+{
+ spin_lock(&lru_lock);
+ if (!list_empty(&gl->gl_lru)) {
+ list_del_init(&gl->gl_lru);
+ atomic_dec(&lru_count);
+ clear_bit(GLF_LRU, &gl->gl_flags);
+ }
+ spin_unlock(&lru_lock);
+}
+
/**
* __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
* @gl: the glock
@@ -168,24 +188,8 @@ static int demote_ok(const struct gfs2_glock *gl)
static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
{
- if (demote_ok(gl)) {
- spin_lock(&lru_lock);
-
- if (!list_empty(&gl->gl_lru))
- list_del_init(&gl->gl_lru);
- else
- atomic_inc(&lru_count);
-
- list_add_tail(&gl->gl_lru, &lru_list);
- spin_unlock(&lru_lock);
- }
-}
-
-void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
-{
- spin_lock(&gl->gl_spin);
- __gfs2_glock_schedule_for_reclaim(gl);
- spin_unlock(&gl->gl_spin);
+ if (demote_ok(gl))
+ gfs2_glock_add_to_lru(gl);
}
/**
@@ -217,12 +221,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
spin_lock_bucket(gl->gl_hash);
hlist_bl_del_rcu(&gl->gl_list);
spin_unlock_bucket(gl->gl_hash);
- spin_lock(&lru_lock);
- if (!list_empty(&gl->gl_lru)) {
- list_del_init(&gl->gl_lru);
- atomic_dec(&lru_count);
- }
- spin_unlock(&lru_lock);
+ gfs2_glock_remove_from_lru(gl);
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
trace_gfs2_glock_put(gl);
@@ -542,11 +541,6 @@ __acquires(&gl->gl_spin)
clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
gfs2_glock_hold(gl);
- if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
- gl->gl_state == LM_ST_DEFERRED) &&
- !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
- lck_flags |= LM_FLAG_TRY_1CB;
-
if (sdp->sd_lockstruct.ls_ops->lm_lock) {
/* lock_dlm */
ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
@@ -648,7 +642,7 @@ static void delete_work_func(struct work_struct *work)
/* Note: Unsafe to dereference ip as we don't hold right refs/locks */
if (ip)
- inode = gfs2_ilookup(sdp->sd_vfs, no_addr);
+ inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1);
else
inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED);
if (inode && !IS_ERR(inode)) {
@@ -1025,6 +1019,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
return -EIO;
+ if (test_bit(GLF_LRU, &gl->gl_flags))
+ gfs2_glock_remove_from_lru(gl);
+
spin_lock(&gl->gl_spin);
add_to_queue(gh);
if ((LM_FLAG_NOEXP & gh->gh_flags) &&
@@ -1082,7 +1079,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
!test_bit(GLF_DEMOTE, &gl->gl_flags))
fast_path = 1;
}
- __gfs2_glock_schedule_for_reclaim(gl);
+ if (!test_bit(GLF_LFLUSH, &gl->gl_flags))
+ __gfs2_glock_schedule_for_reclaim(gl);
trace_gfs2_glock_queue(gh, 0);
spin_unlock(&gl->gl_spin);
if (likely(fast_path))
@@ -1348,11 +1346,14 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
}
-static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int gfs2_shrink_glock_memory(struct shrinker *shrink,
+ struct shrink_control *sc)
{
struct gfs2_glock *gl;
int may_demote;
int nr_skipped = 0;
+ int nr = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
LIST_HEAD(skipped);
if (nr == 0)
@@ -1365,6 +1366,7 @@ static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_m
while(nr && !list_empty(&lru_list)) {
gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
+ clear_bit(GLF_LRU, &gl->gl_flags);
atomic_dec(&lru_count);
/* Test for being demotable */
@@ -1387,6 +1389,7 @@ static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_m
}
nr_skipped++;
list_add(&gl->gl_lru, &skipped);
+ set_bit(GLF_LRU, &gl->gl_flags);
}
list_splice(&skipped, &lru_list);
atomic_add(nr_skipped, &lru_count);
@@ -1459,12 +1462,7 @@ static void thaw_glock(struct gfs2_glock *gl)
static void clear_glock(struct gfs2_glock *gl)
{
- spin_lock(&lru_lock);
- if (!list_empty(&gl->gl_lru)) {
- list_del_init(&gl->gl_lru);
- atomic_dec(&lru_count);
- }
- spin_unlock(&lru_lock);
+ gfs2_glock_remove_from_lru(gl);
spin_lock(&gl->gl_spin);
if (gl->gl_state != LM_ST_UNLOCKED)
@@ -1599,9 +1597,11 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
return 0;
}
-static const char *gflags2str(char *buf, const unsigned long *gflags)
+static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
{
+ const unsigned long *gflags = &gl->gl_flags;
char *p = buf;
+
if (test_bit(GLF_LOCK, gflags))
*p++ = 'l';
if (test_bit(GLF_DEMOTE, gflags))
@@ -1624,6 +1624,10 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
*p++ = 'F';
if (test_bit(GLF_QUEUED, gflags))
*p++ = 'q';
+ if (test_bit(GLF_LRU, gflags))
+ *p++ = 'L';
+ if (gl->gl_object)
+ *p++ = 'o';
*p = 0;
return buf;
}
@@ -1658,14 +1662,15 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
dtime *= 1000000/HZ; /* demote time in uSec */
if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
dtime = 0;
- gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n",
+ gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d\n",
state2str(gl->gl_state),
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number,
- gflags2str(gflags_buf, &gl->gl_flags),
+ gflags2str(gflags_buf, gl),
state2str(gl->gl_target),
state2str(gl->gl_demote_state), dtime,
atomic_read(&gl->gl_ail_count),
+ atomic_read(&gl->gl_revokes),
atomic_read(&gl->gl_ref));
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index aea160690e94..6b2f757b9281 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -225,11 +225,10 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
-extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
-extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
+extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl);
extern void gfs2_glock_free(struct gfs2_glock *gl);
extern int __init gfs2_glock_init(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 25eeb2bcee47..8ef70f464731 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,33 +28,18 @@
#include "trans.h"
/**
- * ail_empty_gl - remove all buffers for a given lock from the AIL
+ * __gfs2_ail_flush - remove all buffers for a given lock from the AIL
* @gl: the glock
*
* None of the buffers should be dirty, locked, or pinned.
*/
-static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
+static void __gfs2_ail_flush(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct list_head *head = &gl->gl_ail_list;
struct gfs2_bufdata *bd;
struct buffer_head *bh;
- struct gfs2_trans tr;
-
- memset(&tr, 0, sizeof(tr));
- tr.tr_revokes = atomic_read(&gl->gl_ail_count);
-
- if (!tr.tr_revokes)
- return;
-
- /* A shortened, inline version of gfs2_trans_begin() */
- tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
- tr.tr_ip = (unsigned long)__builtin_return_address(0);
- INIT_LIST_HEAD(&tr.tr_list_buf);
- gfs2_log_reserve(sdp, tr.tr_reserved);
- BUG_ON(current->journal_info);
- current->journal_info = &tr;
spin_lock(&sdp->sd_ail_lock);
while (!list_empty(head)) {
@@ -76,7 +61,47 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
}
gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
spin_unlock(&sdp->sd_ail_lock);
+}
+
+
+static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct gfs2_trans tr;
+
+ memset(&tr, 0, sizeof(tr));
+ tr.tr_revokes = atomic_read(&gl->gl_ail_count);
+
+ if (!tr.tr_revokes)
+ return;
+
+ /* A shortened, inline version of gfs2_trans_begin() */
+ tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
+ tr.tr_ip = (unsigned long)__builtin_return_address(0);
+ INIT_LIST_HEAD(&tr.tr_list_buf);
+ gfs2_log_reserve(sdp, tr.tr_reserved);
+ BUG_ON(current->journal_info);
+ current->journal_info = &tr;
+
+ __gfs2_ail_flush(gl);
+
+ gfs2_trans_end(sdp);
+ gfs2_log_flush(sdp, NULL);
+}
+
+void gfs2_ail_flush(struct gfs2_glock *gl)
+{
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ unsigned int revokes = atomic_read(&gl->gl_ail_count);
+ int ret;
+
+ if (!revokes)
+ return;
+ ret = gfs2_trans_begin(sdp, 0, revokes);
+ if (ret)
+ return;
+ __gfs2_ail_flush(gl);
gfs2_trans_end(sdp);
gfs2_log_flush(sdp, NULL);
}
@@ -227,6 +252,119 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl)
}
/**
+ * gfs2_set_nlink - Set the inode's link count based on on-disk info
+ * @inode: The inode in question
+ * @nlink: The link count
+ *
+ * If the link count has hit zero, it must never be raised, whatever the
+ * on-disk inode might say. When new struct inodes are created the link
+ * count is set to 1, so that we can safely use this test even when reading
+ * in on disk information for the first time.
+ */
+
+static void gfs2_set_nlink(struct inode *inode, u32 nlink)
+{
+ /*
+ * We will need to review setting the nlink count here in the
+ * light of the forthcoming ro bind mount work. This is a reminder
+ * to do that.
+ */
+ if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) {
+ if (nlink == 0)
+ clear_nlink(inode);
+ else
+ inode->i_nlink = nlink;
+ }
+}
+
+static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
+{
+ const struct gfs2_dinode *str = buf;
+ struct timespec atime;
+ u16 height, depth;
+
+ if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
+ goto corrupt;
+ ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
+ ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
+ ip->i_inode.i_rdev = 0;
+ switch (ip->i_inode.i_mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+ ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
+ be32_to_cpu(str->di_minor));
+ break;
+ };
+
+ ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
+ ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
+ gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
+ i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
+ gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
+ atime.tv_sec = be64_to_cpu(str->di_atime);
+ atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+ if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
+ ip->i_inode.i_atime = atime;
+ ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
+ ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
+ ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
+ ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
+
+ ip->i_goal = be64_to_cpu(str->di_goal_meta);
+ ip->i_generation = be64_to_cpu(str->di_generation);
+
+ ip->i_diskflags = be32_to_cpu(str->di_flags);
+ gfs2_set_inode_flags(&ip->i_inode);
+ height = be16_to_cpu(str->di_height);
+ if (unlikely(height > GFS2_MAX_META_HEIGHT))
+ goto corrupt;
+ ip->i_height = (u8)height;
+
+ depth = be16_to_cpu(str->di_depth);
+ if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
+ goto corrupt;
+ ip->i_depth = (u8)depth;
+ ip->i_entries = be32_to_cpu(str->di_entries);
+
+ ip->i_eattr = be64_to_cpu(str->di_eattr);
+ if (S_ISREG(ip->i_inode.i_mode))
+ gfs2_set_aops(&ip->i_inode);
+
+ return 0;
+corrupt:
+ gfs2_consist_inode(ip);
+ return -EIO;
+}
+
+/**
+ * gfs2_inode_refresh - Refresh the incore copy of the dinode
+ * @ip: The GFS2 inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_inode_refresh(struct gfs2_inode *ip)
+{
+ struct buffer_head *dibh;
+ int error;
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ return error;
+
+ if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
+ brelse(dibh);
+ return -EIO;
+ }
+
+ error = gfs2_dinode_in(ip, dibh->b_data);
+ brelse(dibh);
+ clear_bit(GIF_INVALID, &ip->i_flags);
+
+ return error;
+}
+
+/**
* inode_go_lock - operation done after an inode lock is locked by a process
* @gl: the glock
* @flags:
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index b3aa2e3210fd..6fce409b5a50 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -23,4 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops;
extern const struct gfs2_glock_operations gfs2_journal_glops;
extern const struct gfs2_glock_operations *gfs2_glops_list[];
+extern void gfs2_ail_flush(struct gfs2_glock *gl);
+
#endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 870a89d6d4dc..0a064e91ac70 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -20,7 +20,6 @@
#define DIO_WAIT 0x00000010
#define DIO_METADATA 0x00000020
-#define DIO_ALL 0x00000100
struct gfs2_log_operations;
struct gfs2_log_element;
@@ -200,6 +199,8 @@ enum {
GLF_INITIAL = 10,
GLF_FROZEN = 11,
GLF_QUEUED = 12,
+ GLF_LRU = 13,
+ GLF_OBJECT = 14, /* Used only for tracing */
};
struct gfs2_glock {
@@ -234,6 +235,7 @@ struct gfs2_glock {
struct list_head gl_ail_list;
atomic_t gl_ail_count;
+ atomic_t gl_revokes;
struct delayed_work gl_work;
struct work_struct gl_delete;
struct rcu_head gl_rcu;
@@ -374,8 +376,6 @@ struct gfs2_ail {
unsigned int ai_first;
struct list_head ai_ail1_list;
struct list_head ai_ail2_list;
-
- u64 ai_sync_gen;
};
struct gfs2_journal_extent {
@@ -488,7 +488,6 @@ struct gfs2_sb_host {
char sb_lockproto[GFS2_LOCKNAME_LEN];
char sb_locktable[GFS2_LOCKNAME_LEN];
- u8 sb_uuid[16];
};
/*
@@ -654,7 +653,6 @@ struct gfs2_sbd {
spinlock_t sd_ail_lock;
struct list_head sd_ail1_list;
struct list_head sd_ail2_list;
- u64 sd_ail_sync_gen;
/* Replay stuff */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9134dcb89479..03e0c529063e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1,23 +1,25 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU General Public License version 2.
*/
-#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/xattr.h>
#include <linux/posix_acl.h>
-#include <linux/sort.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
+#include <linux/fiemap.h>
#include <linux/security.h>
-#include <linux/time.h>
+#include <asm/uaccess.h>
#include "gfs2.h"
#include "incore.h"
@@ -26,19 +28,14 @@
#include "dir.h"
#include "xattr.h"
#include "glock.h"
-#include "glops.h"
#include "inode.h"
-#include "log.h"
#include "meta_io.h"
#include "quota.h"
#include "rgrp.h"
#include "trans.h"
#include "util.h"
-
-struct gfs2_inum_range_host {
- u64 ir_start;
- u64 ir_length;
-};
+#include "super.h"
+#include "glops.h"
struct gfs2_skip_data {
u64 no_addr;
@@ -74,14 +71,14 @@ static int iget_set(struct inode *inode, void *opaque)
return 0;
}
-struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr)
+struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block)
{
unsigned long hash = (unsigned long)no_addr;
struct gfs2_skip_data data;
data.no_addr = no_addr;
data.skipped = 0;
- data.non_block = 0;
+ data.non_block = non_block;
return ilookup5(sb, hash, iget_test, &data);
}
@@ -248,203 +245,6 @@ fail_iput:
goto fail;
}
-static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
-{
- const struct gfs2_dinode *str = buf;
- struct timespec atime;
- u16 height, depth;
-
- if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
- goto corrupt;
- ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
- ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
- ip->i_inode.i_rdev = 0;
- switch (ip->i_inode.i_mode & S_IFMT) {
- case S_IFBLK:
- case S_IFCHR:
- ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
- be32_to_cpu(str->di_minor));
- break;
- };
-
- ip->i_inode.i_uid = be32_to_cpu(str->di_uid);
- ip->i_inode.i_gid = be32_to_cpu(str->di_gid);
- /*
- * We will need to review setting the nlink count here in the
- * light of the forthcoming ro bind mount work. This is a reminder
- * to do that.
- */
- ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
- i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
- gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
- atime.tv_sec = be64_to_cpu(str->di_atime);
- atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
- if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
- ip->i_inode.i_atime = atime;
- ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
- ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
- ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
- ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
-
- ip->i_goal = be64_to_cpu(str->di_goal_meta);
- ip->i_generation = be64_to_cpu(str->di_generation);
-
- ip->i_diskflags = be32_to_cpu(str->di_flags);
- gfs2_set_inode_flags(&ip->i_inode);
- height = be16_to_cpu(str->di_height);
- if (unlikely(height > GFS2_MAX_META_HEIGHT))
- goto corrupt;
- ip->i_height = (u8)height;
-
- depth = be16_to_cpu(str->di_depth);
- if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
- goto corrupt;
- ip->i_depth = (u8)depth;
- ip->i_entries = be32_to_cpu(str->di_entries);
-
- ip->i_eattr = be64_to_cpu(str->di_eattr);
- if (S_ISREG(ip->i_inode.i_mode))
- gfs2_set_aops(&ip->i_inode);
-
- return 0;
-corrupt:
- if (gfs2_consist_inode(ip))
- gfs2_dinode_print(ip);
- return -EIO;
-}
-
-/**
- * gfs2_inode_refresh - Refresh the incore copy of the dinode
- * @ip: The GFS2 inode
- *
- * Returns: errno
- */
-
-int gfs2_inode_refresh(struct gfs2_inode *ip)
-{
- struct buffer_head *dibh;
- int error;
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- return error;
-
- if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) {
- brelse(dibh);
- return -EIO;
- }
-
- error = gfs2_dinode_in(ip, dibh->b_data);
- brelse(dibh);
- clear_bit(GIF_INVALID, &ip->i_flags);
-
- return error;
-}
-
-int gfs2_dinode_dealloc(struct gfs2_inode *ip)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_alloc *al;
- struct gfs2_rgrpd *rgd;
- int error;
-
- if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
- if (gfs2_consist_inode(ip))
- gfs2_dinode_print(ip);
- return -EIO;
- }
-
- al = gfs2_alloc_get(ip);
- if (!al)
- return -ENOMEM;
-
- error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
- if (error)
- goto out;
-
- error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
- if (error)
- goto out_qs;
-
- rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
- if (!rgd) {
- gfs2_consist_inode(ip);
- error = -EIO;
- goto out_rindex_relse;
- }
-
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
- &al->al_rgd_gh);
- if (error)
- goto out_rindex_relse;
-
- error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1);
- if (error)
- goto out_rg_gunlock;
-
- set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
- set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags);
-
- gfs2_free_di(rgd, ip);
-
- gfs2_trans_end(sdp);
-
-out_rg_gunlock:
- gfs2_glock_dq_uninit(&al->al_rgd_gh);
-out_rindex_relse:
- gfs2_glock_dq_uninit(&al->al_ri_gh);
-out_qs:
- gfs2_quota_unhold(ip);
-out:
- gfs2_alloc_put(ip);
- return error;
-}
-
-/**
- * gfs2_change_nlink - Change nlink count on inode
- * @ip: The GFS2 inode
- * @diff: The change in the nlink count required
- *
- * Returns: errno
- */
-int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
-{
- struct buffer_head *dibh;
- u32 nlink;
- int error;
-
- BUG_ON(diff != 1 && diff != -1);
- nlink = ip->i_inode.i_nlink + diff;
-
- /* If we are reducing the nlink count, but the new value ends up being
- bigger than the old one, we must have underflowed. */
- if (diff < 0 && nlink > ip->i_inode.i_nlink) {
- if (gfs2_consist_inode(ip))
- gfs2_dinode_print(ip);
- return -EIO;
- }
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- return error;
-
- if (diff > 0)
- inc_nlink(&ip->i_inode);
- else
- drop_nlink(&ip->i_inode);
-
- ip->i_inode.i_ctime = CURRENT_TIME;
-
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
- mark_inode_dirty(&ip->i_inode);
-
- if (ip->i_inode.i_nlink == 0)
- gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
-
- return error;
-}
struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
{
@@ -543,7 +343,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
/* Don't create entries in an unlinked directory */
if (!dip->i_inode.i_nlink)
- return -EPERM;
+ return -ENOENT;
error = gfs2_dir_check(&dip->i_inode, name, NULL);
switch (error) {
@@ -613,21 +413,44 @@ out:
return error;
}
+static void gfs2_init_dir(struct buffer_head *dibh,
+ const struct gfs2_inode *parent)
+{
+ struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
+ struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
+
+ gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
+ dent->de_inum = di->di_num; /* already GFS2 endian */
+ dent->de_type = cpu_to_be16(DT_DIR);
+
+ dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
+ gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
+ gfs2_inum_out(parent, dent);
+ dent->de_type = cpu_to_be16(DT_DIR);
+
+}
+
/**
* init_dinode - Fill in a new dinode structure
- * @dip: the directory this inode is being created in
+ * @dip: The directory this inode is being created in
* @gl: The glock covering the new inode
- * @inum: the inode number
- * @mode: the file permissions
- * @uid:
- * @gid:
+ * @inum: The inode number
+ * @mode: The file permissions
+ * @uid: The uid of the new inode
+ * @gid: The gid of the new inode
+ * @generation: The generation number of the new inode
+ * @dev: The device number (if a device node)
+ * @symname: The symlink destination (if a symlink)
+ * @size: The inode size (ignored for directories)
+ * @bhp: The buffer head (returned to caller)
*
*/
static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
const struct gfs2_inum_host *inum, unsigned int mode,
unsigned int uid, unsigned int gid,
- const u64 *generation, dev_t dev, struct buffer_head **bhp)
+ const u64 *generation, dev_t dev, const char *symname,
+ unsigned size, struct buffer_head **bhp)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_dinode *di;
@@ -646,7 +469,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
di->di_uid = cpu_to_be32(uid);
di->di_gid = cpu_to_be32(gid);
di->di_nlink = 0;
- di->di_size = 0;
+ di->di_size = cpu_to_be64(size);
di->di_blocks = cpu_to_be64(1);
di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec);
di->di_major = cpu_to_be32(MAJOR(dev));
@@ -654,16 +477,6 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr);
di->di_generation = cpu_to_be64(*generation);
di->di_flags = 0;
-
- if (S_ISREG(mode)) {
- if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
- gfs2_tune_get(sdp, gt_new_files_jdata))
- di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
- } else if (S_ISDIR(mode)) {
- di->di_flags |= cpu_to_be32(dip->i_diskflags &
- GFS2_DIF_INHERIT_JDATA);
- }
-
di->__pad1 = 0;
di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0);
di->di_height = 0;
@@ -677,7 +490,26 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec);
di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec);
memset(&di->di_reserved, 0, sizeof(di->di_reserved));
-
+
+ switch(mode & S_IFMT) {
+ case S_IFREG:
+ if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) ||
+ gfs2_tune_get(sdp, gt_new_files_jdata))
+ di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
+ break;
+ case S_IFDIR:
+ di->di_flags |= cpu_to_be32(dip->i_diskflags &
+ GFS2_DIF_INHERIT_JDATA);
+ di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
+ di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
+ di->di_entries = cpu_to_be32(2);
+ gfs2_init_dir(dibh, dip);
+ break;
+ case S_IFLNK:
+ memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size);
+ break;
+ }
+
set_buffer_uptodate(dibh);
*bhp = dibh;
@@ -685,7 +517,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
unsigned int mode, const struct gfs2_inum_host *inum,
- const u64 *generation, dev_t dev, struct buffer_head **bhp)
+ const u64 *generation, dev_t dev, const char *symname,
+ unsigned int size, struct buffer_head **bhp)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
unsigned int uid, gid;
@@ -707,7 +540,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
if (error)
goto out_quota;
- init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, bhp);
+ init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp);
gfs2_quota_change(dip, +1, uid, gid);
gfs2_trans_end(sdp);
@@ -761,14 +594,16 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
goto fail_quota_locks;
}
- error = gfs2_dir_add(&dip->i_inode, name, ip, IF2DT(ip->i_inode.i_mode));
+ error = gfs2_dir_add(&dip->i_inode, name, ip);
if (error)
goto fail_end_trans;
error = gfs2_meta_inode_buffer(ip, &dibh);
if (error)
goto fail_end_trans;
- ip->i_inode.i_nlink = 1;
+ inc_nlink(&ip->i_inode);
+ if (S_ISDIR(ip->i_inode.i_mode))
+ inc_nlink(&ip->i_inode);
gfs2_trans_add_bh(ip->i_gl, dibh, 1);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -815,27 +650,25 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
}
/**
- * gfs2_createi - Create a new inode
- * @ghs: An array of two holders
- * @name: The name of the new file
- * @mode: the permissions on the new inode
- *
- * @ghs[0] is an initialized holder for the directory
- * @ghs[1] is the holder for the inode lock
+ * gfs2_create_inode - Create a new inode
+ * @dir: The parent directory
+ * @dentry: The new dentry
+ * @mode: The permissions on the new inode
+ * @dev: For device nodes, this is the device number
+ * @symname: For symlinks, this is the link destination
+ * @size: The initial size of the inode (ignored for directories)
*
- * If the return value is not NULL, the glocks on both the directory and the new
- * file are held. A transaction has been started and an inplace reservation
- * is held, as well.
- *
- * Returns: An inode
+ * Returns: 0 on success, or error code
*/
-struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
- unsigned int mode, dev_t dev)
+static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
+ unsigned int mode, dev_t dev, const char *symname,
+ unsigned int size)
{
+ const struct qstr *name = &dentry->d_name;
+ struct gfs2_holder ghs[2];
struct inode *inode = NULL;
- struct gfs2_inode *dip = ghs->gh_gl->gl_object;
- struct inode *dir = &dip->i_inode;
+ struct gfs2_inode *dip = GFS2_I(dir);
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
int error;
@@ -843,10 +676,9 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
struct buffer_head *bh = NULL;
if (!name->len || name->len > GFS2_FNAMESIZE)
- return ERR_PTR(-ENAMETOOLONG);
+ return -ENAMETOOLONG;
- gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs);
- error = gfs2_glock_nq(ghs);
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
if (error)
goto fail;
@@ -864,7 +696,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (error)
goto fail_gunlock;
- error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, &bh);
+ error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh);
if (error)
goto fail_gunlock2;
@@ -891,18 +723,852 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (bh)
brelse(bh);
- return inode;
+
+ gfs2_trans_end(sdp);
+ if (dip->i_alloc->al_rgd)
+ gfs2_inplace_release(dip);
+ gfs2_quota_unlock(dip);
+ gfs2_alloc_put(dip);
+ gfs2_glock_dq_uninit_m(2, ghs);
+ mark_inode_dirty(inode);
+ d_instantiate(dentry, inode);
+ return 0;
fail_gunlock2:
gfs2_glock_dq_uninit(ghs + 1);
if (inode && !IS_ERR(inode))
iput(inode);
fail_gunlock:
- gfs2_glock_dq(ghs);
+ gfs2_glock_dq_uninit(ghs);
fail:
if (bh)
brelse(bh);
- return ERR_PTR(error);
+ return error;
+}
+
+/**
+ * gfs2_create - Create a file
+ * @dir: The directory in which to create the file
+ * @dentry: The dentry of the new file
+ * @mode: The mode of the new file
+ *
+ * Returns: errno
+ */
+
+static int gfs2_create(struct inode *dir, struct dentry *dentry,
+ int mode, struct nameidata *nd)
+{
+ struct inode *inode;
+ int ret;
+
+ for (;;) {
+ ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0);
+ if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL)))
+ return ret;
+
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+ if (inode) {
+ if (!IS_ERR(inode))
+ break;
+ return PTR_ERR(inode);
+ }
+ }
+
+ d_instantiate(dentry, inode);
+ return 0;
+}
+
+/**
+ * gfs2_lookup - Look up a filename in a directory and return its inode
+ * @dir: The directory inode
+ * @dentry: The dentry of the new inode
+ * @nd: passed from Linux VFS, ignored by us
+ *
+ * Called by the VFS layer. Lock dir and call gfs2_lookupi()
+ *
+ * Returns: errno
+ */
+
+static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct inode *inode = NULL;
+
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0);
+ if (inode && IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ if (inode) {
+ struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
+ struct gfs2_holder gh;
+ int error;
+ error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ if (error) {
+ iput(inode);
+ return ERR_PTR(error);
+ }
+ gfs2_glock_dq_uninit(&gh);
+ return d_splice_alias(inode, dentry);
+ }
+ d_add(dentry, inode);
+
+ return NULL;
+}
+
+/**
+ * gfs2_link - Link to a file
+ * @old_dentry: The inode to link
+ * @dir: Add link to this directory
+ * @dentry: The name of the link
+ *
+ * Link the inode in "old_dentry" into the directory "dir" with the
+ * name in "dentry".
+ *
+ * Returns: errno
+ */
+
+static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct gfs2_inode *dip = GFS2_I(dir);
+ struct gfs2_sbd *sdp = GFS2_SB(dir);
+ struct inode *inode = old_dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder ghs[2];
+ struct buffer_head *dibh;
+ int alloc_required;
+ int error;
+
+ if (S_ISDIR(inode->i_mode))
+ return -EPERM;
+
+ gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+ error = gfs2_glock_nq(ghs); /* parent */
+ if (error)
+ goto out_parent;
+
+ error = gfs2_glock_nq(ghs + 1); /* child */
+ if (error)
+ goto out_child;
+
+ error = -ENOENT;
+ if (inode->i_nlink == 0)
+ goto out_gunlock;
+
+ error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
+ if (error)
+ goto out_gunlock;
+
+ error = gfs2_dir_check(dir, &dentry->d_name, NULL);
+ switch (error) {
+ case -ENOENT:
+ break;
+ case 0:
+ error = -EEXIST;
+ default:
+ goto out_gunlock;
+ }
+
+ error = -EINVAL;
+ if (!dip->i_inode.i_nlink)
+ goto out_gunlock;
+ error = -EFBIG;
+ if (dip->i_entries == (u32)-1)
+ goto out_gunlock;
+ error = -EPERM;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ goto out_gunlock;
+ error = -EINVAL;
+ if (!ip->i_inode.i_nlink)
+ goto out_gunlock;
+ error = -EMLINK;
+ if (ip->i_inode.i_nlink == (u32)-1)
+ goto out_gunlock;
+
+ alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
+ if (error < 0)
+ goto out_gunlock;
+ error = 0;
+
+ if (alloc_required) {
+ struct gfs2_alloc *al = gfs2_alloc_get(dip);
+ if (!al) {
+ error = -ENOMEM;
+ goto out_gunlock;
+ }
+
+ error = gfs2_quota_lock_check(dip);
+ if (error)
+ goto out_alloc;
+
+ al->al_requested = sdp->sd_max_dirres;
+
+ error = gfs2_inplace_reserve(dip);
+ if (error)
+ goto out_gunlock_q;
+
+ error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+ gfs2_rg_blocks(al) +
+ 2 * RES_DINODE + RES_STATFS +
+ RES_QUOTA, 0);
+ if (error)
+ goto out_ipres;
+ } else {
+ error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
+ if (error)
+ goto out_ipres;
+ }
+
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out_end_trans;
+
+ error = gfs2_dir_add(dir, &dentry->d_name, ip);
+ if (error)
+ goto out_brelse;
+
+ gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+ inc_nlink(&ip->i_inode);
+ ip->i_inode.i_ctime = CURRENT_TIME;
+ gfs2_dinode_out(ip, dibh->b_data);
+ mark_inode_dirty(&ip->i_inode);
+
+out_brelse:
+ brelse(dibh);
+out_end_trans:
+ gfs2_trans_end(sdp);
+out_ipres:
+ if (alloc_required)
+ gfs2_inplace_release(dip);
+out_gunlock_q:
+ if (alloc_required)
+ gfs2_quota_unlock(dip);
+out_alloc:
+ if (alloc_required)
+ gfs2_alloc_put(dip);
+out_gunlock:
+ gfs2_glock_dq(ghs + 1);
+out_child:
+ gfs2_glock_dq(ghs);
+out_parent:
+ gfs2_holder_uninit(ghs);
+ gfs2_holder_uninit(ghs + 1);
+ if (!error) {
+ ihold(inode);
+ d_instantiate(dentry, inode);
+ mark_inode_dirty(inode);
+ }
+ return error;
+}
+
+/*
+ * gfs2_unlink_ok - check to see that a inode is still in a directory
+ * @dip: the directory
+ * @name: the name of the file
+ * @ip: the inode
+ *
+ * Assumes that the lock on (at least) @dip is held.
+ *
+ * Returns: 0 if the parent/child relationship is correct, errno if it isn't
+ */
+
+static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+ const struct gfs2_inode *ip)
+{
+ int error;
+
+ if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+ return -EPERM;
+
+ if ((dip->i_inode.i_mode & S_ISVTX) &&
+ dip->i_inode.i_uid != current_fsuid() &&
+ ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
+ return -EPERM;
+
+ if (IS_APPEND(&dip->i_inode))
+ return -EPERM;
+
+ error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
+ if (error)
+ return error;
+
+ error = gfs2_dir_check(&dip->i_inode, name, ip);
+ if (error)
+ return error;
+
+ return 0;
+}
+
+/**
+ * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it
+ * @dip: The parent directory
+ * @name: The name of the entry in the parent directory
+ * @bh: The inode buffer for the inode to be removed
+ * @inode: The inode to be removed
+ *
+ * Called with all the locks and in a transaction. This will only be
+ * called for a directory after it has been checked to ensure it is empty.
+ *
+ * Returns: 0 on success, or an error
+ */
+
+static int gfs2_unlink_inode(struct gfs2_inode *dip,
+ const struct dentry *dentry,
+ struct buffer_head *bh)
+{
+ struct inode *inode = dentry->d_inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ int error;
+
+ error = gfs2_dir_del(dip, dentry);
+ if (error)
+ return error;
+
+ ip->i_entries = 0;
+ inode->i_ctime = CURRENT_TIME;
+ if (S_ISDIR(inode->i_mode))
+ clear_nlink(inode);
+ else
+ drop_nlink(inode);
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_dinode_out(ip, bh->b_data);
+ mark_inode_dirty(inode);
+ if (inode->i_nlink == 0)
+ gfs2_unlink_di(inode);
+ return 0;
+}
+
+
+/**
+ * gfs2_unlink - Unlink an inode (this does rmdir as well)
+ * @dir: The inode of the directory containing the inode to unlink
+ * @dentry: The file itself
+ *
+ * This routine uses the type of the inode as a flag to figure out
+ * whether this is an unlink or an rmdir.
+ *
+ * Three glock holders are set up before anything is locked: the parent
+ * directory, the inode itself and the resource group that contains the
+ * inode's block.  The locks are taken in that order and dropped in reverse.
+ * Every holder that has been initialised is uninitialised again on every
+ * exit path, whether or not its lock was ever acquired.
+ *
+ * Returns: -ENOENT if the inode has no links left, -ENOTEMPTY for a
+ *          directory that still has entries or sub-directory links, -EROFS
+ *          if no resource group is found for the inode's block, otherwise
+ *          the result of the helpers called here (0 on success)
+ */
+
+static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct gfs2_inode *dip = GFS2_I(dir);
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *bh;
+	struct gfs2_holder ghs[3];
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder ri_gh;
+	int error;
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		return error;
+
+	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
+
+	/* The lookup can come back empty; do not dereference a NULL rgrp. */
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
+	if (!rgd) {
+		error = -EROFS;
+		goto out_inodes;
+	}
+	gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
+
+	error = gfs2_glock_nq(ghs); /* parent */
+	if (error)
+		goto out_parent;
+
+	error = gfs2_glock_nq(ghs + 1); /* child */
+	if (error)
+		goto out_child;
+
+	error = -ENOENT;
+	if (inode->i_nlink == 0)
+		goto out_rgrp;
+
+	if (S_ISDIR(inode->i_mode)) {
+		error = -ENOTEMPTY;
+		if (ip->i_entries > 2 || inode->i_nlink > 2)
+			goto out_rgrp;
+	}
+
+	error = gfs2_glock_nq(ghs + 2); /* rgrp */
+	if (error)
+		goto out_rgrp;
+
+	error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0);
+	if (error)
+		goto out_gunlock;
+
+	error = gfs2_meta_inode_buffer(ip, &bh);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_unlink_inode(dip, dentry, bh);
+	brelse(bh);
+
+	/*
+	 * Unwind: first drop whichever locks were acquired (newest first),
+	 * then release all initialised holders regardless of how far the
+	 * locking got.
+	 */
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_gunlock:
+	gfs2_glock_dq(ghs + 2);
+out_rgrp:
+	gfs2_glock_dq(ghs + 1);
+out_child:
+	gfs2_glock_dq(ghs);
+out_parent:
+	gfs2_holder_uninit(ghs + 2);
+out_inodes:
+	gfs2_holder_uninit(ghs + 1);
+	gfs2_holder_uninit(ghs);
+	gfs2_glock_dq_uninit(&ri_gh);
+	return error;
+}
+
+/**
+ * gfs2_symlink - Create a symlink
+ * @dir: The directory to create the symlink in
+ * @dentry: The dentry to put the symlink in
+ * @symname: The thing which the link points to
+ *
+ * The target is accepted only if its length is at most one block minus
+ * the size of a dinode minus one byte.
+ *
+ * Returns: -ENAMETOOLONG if @symname is too long, otherwise the result of
+ *          gfs2_create_inode()
+ */
+
+static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
+			const char *symname)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(dir);
+	unsigned int size = strlen(symname);
+
+	if (size <= sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
+		return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size);
+
+	return -ENAMETOOLONG;
+}
+
+/**
+ * gfs2_mkdir - Make a directory
+ * @dir: The parent directory of the new one
+ * @dentry: The dentry of the new directory
+ * @mode: The mode of the new directory (S_IFDIR is OR-ed in here)
+ *
+ * Returns: errno, as reported by gfs2_create_inode()
+ */
+
+static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	const int dir_mode = S_IFDIR | mode;
+
+	return gfs2_create_inode(dir, dentry, dir_mode, 0, NULL, 0);
+}
+
+/**
+ * gfs2_mknod - Make a special file
+ * @dir: The directory in which the special file will reside
+ * @dentry: The dentry of the special file
+ * @mode: The mode of the special file
+ * @dev: The device specification of the special file
+ *
+ * Returns: errno, as reported by gfs2_create_inode()
+ */
+
+static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
+		      dev_t dev)
+{
+	/* No symlink target is passed for a special file. */
+	const char *symname = NULL;
+
+	return gfs2_create_inode(dir, dentry, mode, dev, symname, 0);
+}
+
+/*
+ * gfs2_ok_to_move - check if it's ok to move a directory to another directory
+ * @this: move this
+ * @to: to here
+ *
+ * Walks ".." upwards from @to until either @this or the root of the
+ * filesystem is reached.  Meeting @this on the way means @to lies inside
+ * @this, so the move is refused.
+ * Assumes we already hold the rename lock.
+ *
+ * Returns: 0 if the root was reached, -EINVAL if @this was encountered,
+ *          or the error from looking up ".."
+ */
+
+static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
+{
+	struct inode *dir = &to->i_inode;
+	struct super_block *sb = dir->i_sb;
+	struct inode *parent;
+	int error = -EINVAL;
+
+	/* Hold a reference on whichever directory the walk currently stands on. */
+	igrab(dir);
+
+	while (dir != &this->i_inode) {
+		if (dir == sb->s_root->d_inode) {
+			error = 0;
+			break;
+		}
+
+		parent = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
+		if (IS_ERR(parent)) {
+			error = PTR_ERR(parent);
+			break;
+		}
+
+		/* Step up one level, swapping the held reference. */
+		iput(dir);
+		dir = parent;
+	}
+
+	iput(dir);
+
+	return error;
+}
+
+/**
+ * gfs2_rename - Rename a file
+ * @odir: Parent directory of old file name
+ * @odentry: The old dentry of the file
+ * @ndir: Parent directory of new file name
+ * @ndentry: The new dentry of the file
+ *
+ * Renaming an inode onto itself is a no-op.  When the two parent
+ * directories differ, the per-filesystem rename glock is taken first and,
+ * for a directory, gfs2_ok_to_move() is used to refuse a move into one of
+ * its own sub-directories.  If @ndentry already refers to an inode, that
+ * inode is unlinked from @ndir inside the same transaction before the
+ * entry for the renamed inode is moved.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_rename(struct inode *odir, struct dentry *odentry,
+		       struct inode *ndir, struct dentry *ndentry)
+{
+	struct gfs2_inode *odip = GFS2_I(odir);
+	struct gfs2_inode *ndip = GFS2_I(ndir);
+	struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
+	struct gfs2_inode *nip = NULL;
+	struct gfs2_sbd *sdp = GFS2_SB(odir);
+	struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
+	struct gfs2_rgrpd *nrgd;
+	unsigned int num_gh;
+	int dir_rename = 0;
+	int alloc_required = 0;
+	unsigned int x;
+	int error;
+
+	if (ndentry->d_inode) {
+		nip = GFS2_I(ndentry->d_inode);
+		if (ip == nip)
+			return 0;
+	}
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		return error;
+
+	if (odip != ndip) {
+		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
+					   0, &r_gh);
+		if (error)
+			goto out;
+
+		if (S_ISDIR(ip->i_inode.i_mode)) {
+			dir_rename = 1;
+			/* don't move a directory into its own subdir */
+			error = gfs2_ok_to_move(ip, ndip);
+			if (error)
+				goto out_gunlock_r;
+		}
+	}
+
+	/*
+	 * Set up the holders: old parent, new parent (if different), the
+	 * inode being renamed and, when the target exists, the target inode
+	 * plus its resource group.
+	 */
+	num_gh = 1;
+	gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+	if (odip != ndip) {
+		gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		num_gh++;
+	}
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+	num_gh++;
+
+	if (nip) {
+		gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
+		num_gh++;
+		/* grab the resource lock for unlink flag twiddling
+		 * this is the case of the target file already existing
+		 * so we unlink before doing the rename
+		 */
+		nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr);
+		if (nrgd)
+			gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
+	}
+
+	/* On failure x is the number of holders that were enqueued. */
+	for (x = 0; x < num_gh; x++) {
+		error = gfs2_glock_nq(ghs + x);
+		if (error)
+			goto out_gunlock;
+	}
+
+	error = -ENOENT;
+	if (ip->i_inode.i_nlink == 0)
+		goto out_gunlock;
+
+	/* Check out the old directory */
+
+	error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
+	if (error)
+		goto out_gunlock;
+
+	/* Check out the new directory */
+
+	if (nip) {
+		error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
+		if (error)
+			goto out_gunlock;
+
+		if (nip->i_inode.i_nlink == 0) {
+			error = -EAGAIN;
+			goto out_gunlock;
+		}
+
+		if (S_ISDIR(nip->i_inode.i_mode)) {
+			if (nip->i_entries < 2) {
+				gfs2_consist_inode(nip);
+				error = -EIO;
+				goto out_gunlock;
+			}
+			if (nip->i_entries > 2) {
+				error = -ENOTEMPTY;
+				goto out_gunlock;
+			}
+		}
+	} else {
+		error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
+		if (error)
+			goto out_gunlock;
+
+		error = gfs2_dir_check(ndir, &ndentry->d_name, NULL);
+		switch (error) {
+		case -ENOENT:
+			error = 0;
+			break;
+		case 0:
+			error = -EEXIST;
+			/* fall through */
+		default:
+			goto out_gunlock;
+		}
+
+		if (odip != ndip) {
+			if (!ndip->i_inode.i_nlink) {
+				error = -ENOENT;
+				goto out_gunlock;
+			}
+			if (ndip->i_entries == (u32)-1) {
+				error = -EFBIG;
+				goto out_gunlock;
+			}
+			if (S_ISDIR(ip->i_inode.i_mode) &&
+			    ndip->i_inode.i_nlink == (u32)-1) {
+				error = -EMLINK;
+				goto out_gunlock;
+			}
+		}
+	}
+
+	/* Check out the dir to be renamed */
+
+	if (dir_rename) {
+		error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
+		if (error)
+			goto out_gunlock;
+	}
+
+	/* A negative value from gfs2_diradd_alloc_required() is an error. */
+	if (nip == NULL)
+		alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
+	error = alloc_required;
+	if (error < 0)
+		goto out_gunlock;
+	error = 0;
+
+	if (alloc_required) {
+		struct gfs2_alloc *al = gfs2_alloc_get(ndip);
+		if (!al) {
+			error = -ENOMEM;
+			goto out_gunlock;
+		}
+
+		error = gfs2_quota_lock_check(ndip);
+		if (error)
+			goto out_alloc;
+
+		al->al_requested = sdp->sd_max_dirres;
+
+		error = gfs2_inplace_reserve_ri(ndip);
+		if (error)
+			goto out_gunlock_q;
+
+		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
+					 gfs2_rg_blocks(al) +
+					 4 * RES_DINODE + 4 * RES_LEAF +
+					 RES_STATFS + RES_QUOTA + 4, 0);
+		if (error)
+			goto out_ipreserv;
+	} else {
+		error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
+					 5 * RES_LEAF + 4, 0);
+		if (error)
+			goto out_gunlock;
+	}
+
+	/* Remove the target file, if it exists */
+
+	if (nip) {
+		struct buffer_head *bh;
+		error = gfs2_meta_inode_buffer(nip, &bh);
+		if (error)
+			goto out_end_trans;
+		error = gfs2_unlink_inode(ndip, ndentry, bh);
+		brelse(bh);
+		/* Do not go on to add the new entry if the old target is still there. */
+		if (error)
+			goto out_end_trans;
+	}
+
+	if (dir_rename) {
+		error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
+		if (error)
+			goto out_end_trans;
+	} else {
+		struct buffer_head *dibh;
+		error = gfs2_meta_inode_buffer(ip, &dibh);
+		if (error)
+			goto out_end_trans;
+		ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	error = gfs2_dir_del(odip, odentry);
+	if (error)
+		goto out_end_trans;
+
+	error = gfs2_dir_add(ndir, &ndentry->d_name, ip);
+	if (error)
+		goto out_end_trans;
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_ipreserv:
+	if (alloc_required)
+		gfs2_inplace_release(ndip);
+out_gunlock_q:
+	if (alloc_required)
+		gfs2_quota_unlock(ndip);
+out_alloc:
+	if (alloc_required)
+		gfs2_alloc_put(ndip);
+out_gunlock:
+	/* Drop the locks that were actually acquired, newest first ... */
+	while (x--)
+		gfs2_glock_dq(ghs + x);
+	/* ... then release every initialised holder, enqueued or not. */
+	for (x = 0; x < num_gh; x++)
+		gfs2_holder_uninit(ghs + x);
+out_gunlock_r:
+	if (r_gh.gh_gl)
+		gfs2_glock_dq_uninit(&r_gh);
+out:
+	gfs2_glock_dq_uninit(&ri_gh);
+	return error;
+}
+
+/**
+ * gfs2_follow_link - Follow a symbolic link
+ * @dentry: The dentry of the link
+ * @nd: Data that we pass to vfs_follow_link()
+ *
+ * Copies the link target, which is stored directly after the on-disk
+ * dinode in the inode's buffer, into a freshly allocated NUL-terminated
+ * string and hands it to @nd with nd_set_link().  On failure an ERR_PTR
+ * is handed over instead.
+ *
+ * Returns: always NULL; success or failure is carried by the pointer
+ *          stored in @nd
+ */
+
+static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
+	struct gfs2_holder i_gh;
+	struct buffer_head *dibh;
+	unsigned int size;
+	char *link;
+	int error;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+	error = gfs2_glock_nq(&i_gh);
+	if (error) {
+		gfs2_holder_uninit(&i_gh);
+		nd_set_link(nd, ERR_PTR(error));
+		return NULL;
+	}
+
+	/* A zero-length symlink is treated as an inconsistent inode. */
+	size = (unsigned int)i_size_read(&ip->i_inode);
+	if (!size) {
+		gfs2_consist_inode(ip);
+		link = ERR_PTR(-EIO);
+		goto unlock;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error) {
+		link = ERR_PTR(error);
+		goto unlock;
+	}
+
+	/* kzalloc() of size + 1 bytes leaves the terminating NUL in place. */
+	link = kzalloc(size + 1, GFP_NOFS);
+	if (link)
+		memcpy(link, dibh->b_data + sizeof(struct gfs2_dinode), size);
+	else
+		link = ERR_PTR(-ENOMEM);
+	brelse(dibh);
+unlock:
+	gfs2_glock_dq_uninit(&i_gh);
+	nd_set_link(nd, link);
+	return NULL;
+}
+
+/* Free the link string stored in @nd, unless it is an ERR_PTR. */
+static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+{
+	char *link = nd_get_link(nd);
+
+	if (IS_ERR(link))
+		return;
+
+	kfree(link);
+}
+
+/**
+ * gfs2_permission - Check access permissions on an inode
+ * @inode: The inode
+ * @mask: The mask to be tested
+ * @flags: Indicates whether this is an RCU path walk or not
+ *
+ * This may be called from the VFS directly, or from within GFS2 with the
+ * inode locked, so we look to see if the glock is already locked and only
+ * lock the glock if that has not already been done.  A glock taken here is
+ * dropped again before returning.
+ *
+ * Returns: -ECHILD if the glock is not already held and IPERM_FLAG_RCU is
+ *          set, the error from taking the glock, -EACCES for MAY_WRITE on
+ *          an immutable inode, otherwise the result of generic_permission()
+ */
+
+int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
+{
+ struct gfs2_inode *ip;
+ struct gfs2_holder i_gh;
+ int error;
+ int unlock = 0; /* set when the glock was taken in this call */
+
+
+ ip = GFS2_I(inode);
+ if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
+ if (flags & IPERM_FLAG_RCU)
+ return -ECHILD; /* the glock is not taken for an RCU walk */
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (error)
+ return error;
+ unlock = 1;
+ }
+
+ if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+ error = -EACCES;
+ else
+ error = generic_permission(inode, mask, flags, gfs2_check_acl);
+ if (unlock)
+ gfs2_glock_dq_uninit(&i_gh);
+
+ return error;
}
static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
@@ -928,8 +1594,6 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
* @ip:
* @attr:
*
- * Called with a reference on the vnode.
- *
* Returns: errno
*/
@@ -949,60 +1613,280 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
return error;
}
-void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
-{
- struct gfs2_dinode *str = buf;
-
- str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
- str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
- str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
- str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
- str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
- str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
- str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
- str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
- str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
- str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
- str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
- str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
- str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
- str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
-
- str->di_goal_meta = cpu_to_be64(ip->i_goal);
- str->di_goal_data = cpu_to_be64(ip->i_goal);
- str->di_generation = cpu_to_be64(ip->i_generation);
-
- str->di_flags = cpu_to_be32(ip->i_diskflags);
- str->di_height = cpu_to_be16(ip->i_height);
- str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
- !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
- GFS2_FORMAT_DE : 0);
- str->di_depth = cpu_to_be16(ip->i_depth);
- str->di_entries = cpu_to_be32(ip->i_entries);
-
- str->di_eattr = cpu_to_be64(ip->i_eattr);
- str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
- str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
- str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
-}
-
-void gfs2_dinode_print(const struct gfs2_inode *ip)
-{
- printk(KERN_INFO " no_formal_ino = %llu\n",
- (unsigned long long)ip->i_no_formal_ino);
- printk(KERN_INFO " no_addr = %llu\n",
- (unsigned long long)ip->i_no_addr);
- printk(KERN_INFO " i_size = %llu\n",
- (unsigned long long)i_size_read(&ip->i_inode));
- printk(KERN_INFO " blocks = %llu\n",
- (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
- printk(KERN_INFO " i_goal = %llu\n",
- (unsigned long long)ip->i_goal);
- printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags);
- printk(KERN_INFO " i_height = %u\n", ip->i_height);
- printk(KERN_INFO " i_depth = %u\n", ip->i_depth);
- printk(KERN_INFO " i_entries = %u\n", ip->i_entries);
- printk(KERN_INFO " i_eattr = %llu\n",
- (unsigned long long)ip->i_eattr);
+/*
+ * setattr_chown - change the owner and/or group of an inode
+ * @inode: The inode being changed
+ * @attr: The new attributes; ATTR_UID/ATTR_GID in ia_valid select which
+ *        of ia_uid/ia_gid are applied
+ *
+ * An id that is not being changed, or that is changed to its current
+ * value, is replaced by NO_QUOTA_CHANGE.  When at least one id really
+ * changes, the quota of the new ids is checked first and the inode's
+ * block count is moved from the old ids to the new ones afterwards.
+ *
+ * Returns: errno
+ */
+static int setattr_chown(struct inode *inode, struct iattr *attr)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	u32 ouid = inode->i_uid, ogid = inode->i_gid;
+	u32 nuid = attr->ia_uid, ngid = attr->ia_gid;
+	int quota_moves;
+	int error;
+
+	if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
+		ouid = nuid = NO_QUOTA_CHANGE;
+	if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
+		ogid = ngid = NO_QUOTA_CHANGE;
+
+	/* The ids are not modified below, so this can be decided once. */
+	quota_moves = (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE);
+
+	if (!gfs2_alloc_get(ip))
+		return -ENOMEM;
+
+	error = gfs2_quota_lock(ip, nuid, ngid);
+	if (error)
+		goto out_alloc;
+
+	if (quota_moves) {
+		error = gfs2_quota_check(ip, nuid, ngid);
+		if (error)
+			goto out_gunlock_q;
+	}
+
+	error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_setattr_simple(ip, attr);
+	if (error)
+		goto out_end_trans;
+
+	if (quota_moves) {
+		u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
+
+		gfs2_quota_change(ip, -blocks, ouid, ogid);
+		gfs2_quota_change(ip, blocks, nuid, ngid);
+	}
+
+out_end_trans:
+	gfs2_trans_end(sdp);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+out_alloc:
+	gfs2_alloc_put(ip);
+	return error;
+}
+
+/**
+ * gfs2_setattr - Change attributes on an inode
+ * @dentry: The dentry which is changing
+ * @attr: The structure describing the change
+ *
+ * The VFS layer wants to change one or more of an inode's attributes.
+ * With the inode's glock held exclusively, the request is handed to the
+ * first matching handler: size change, owner/group change, mode change on
+ * an inode with POSIX ACLs, or the simple path for everything else.
+ * Immutable and append-only inodes are refused with -EPERM.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder i_gh;
+	int ret;
+
+	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+	if (ret)
+		return ret;
+
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+
+	ret = inode_change_ok(inode, attr);
+	if (ret)
+		goto out_unlock;
+
+	if (attr->ia_valid & ATTR_SIZE)
+		ret = gfs2_setattr_size(inode, attr->ia_size);
+	else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
+		ret = setattr_chown(inode, attr);
+	else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
+		ret = gfs2_acl_chmod(ip, attr);
+	else
+		ret = gfs2_setattr_simple(ip, attr);
+
+out_unlock:
+	gfs2_glock_dq_uninit(&i_gh);
+	if (ret)
+		return ret;
+
+	/* The inode is only marked dirty once the glock has been dropped. */
+	mark_inode_dirty(inode);
+	return 0;
+}
+
+/**
+ * gfs2_getattr - Read out an inode's attributes
+ * @mnt: The vfsmount the inode is being accessed from
+ * @dentry: The dentry to stat
+ * @stat: The inode's stats
+ *
+ * This may be called from the VFS directly, or from within GFS2 with the
+ * inode locked, so the glock is only taken (shared) here when the caller
+ * does not hold it already.  Note that it is the NFS readdirplus operation
+ * which causes this to be called (from filldir) with the glock already
+ * held.
+ *
+ * Returns: 0, or the error from taking the glock
+ */
+
+static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int locked_here = 0;
+
+	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+		int error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+
+		if (error)
+			return error;
+		locked_here = 1;
+	}
+
+	generic_fillattr(inode, stat);
+
+	/* Only drop the glock if it was taken in this call. */
+	if (locked_here)
+		gfs2_glock_dq_uninit(&gh);
+
+	return 0;
+}
+
+/*
+ * gfs2_setxattr - set an extended attribute with the inode glock held
+ *
+ * Wraps generic_setxattr() in an exclusive hold of the inode's glock.
+ *
+ * Returns: the error from taking the glock, otherwise the result of
+ *          generic_setxattr()
+ */
+static int gfs2_setxattr(struct dentry *dentry, const char *name,
+			 const void *data, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret)
+		goto out_uninit;
+
+	ret = generic_setxattr(dentry, name, data, size, flags);
+	gfs2_glock_dq(&gh);
+
+out_uninit:
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+/*
+ * gfs2_getxattr - read an extended attribute with the inode glock held
+ *
+ * Wraps generic_getxattr() in a shared hold of the inode's glock.
+ *
+ * Returns: the error from taking the glock, otherwise the result of
+ *          generic_getxattr()
+ */
+static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
+			     void *data, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	ssize_t ret;	/* same type as the return value, so nothing is narrowed */
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret == 0) {
+		ret = generic_getxattr(dentry, name, data, size);
+		gfs2_glock_dq(&gh);
+	}
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+/*
+ * gfs2_removexattr - remove an extended attribute with the inode glock held
+ *
+ * Wraps generic_removexattr() in an exclusive hold of the inode's glock.
+ *
+ * Returns: the error from taking the glock, otherwise the result of
+ *          generic_removexattr()
+ */
+static int gfs2_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = dentry->d_inode;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret)
+		goto out_uninit;
+
+	ret = generic_removexattr(dentry, name);
+	gfs2_glock_dq(&gh);
+
+out_uninit:
+	gfs2_holder_uninit(&gh);
+	return ret;
+}
+
+static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{ /* Fill @fieinfo with the extent mapping of @inode for the range start..start+len */
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
+
+ ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ if (ret)
+ goto out;
+
+ if (gfs2_is_stuffed(ip)) { /* reported as one inline extent located just past the dinode header */
+ u64 phys = ip->i_no_addr << inode->i_blkbits;
+ u64 size = i_size_read(inode);
+ u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
+ FIEMAP_EXTENT_DATA_INLINE;
+ phys += sizeof(struct gfs2_dinode);
+ phys += start;
+ if (start + len > size)
+ len = size - start; /* clamp to i_size; the value is only used when start < size */
+ if (start < size)
+ ret = fiemap_fill_next_extent(fieinfo, start, phys,
+ len, flags);
+ if (ret == 1) /* a return of 1 from the fill call is not passed on as an error */
+ ret = 0;
+ } else {
+ ret = __generic_block_fiemap(inode, fieinfo, start, len,
+ gfs2_block_map);
+ }
+
+ gfs2_glock_dq_uninit(&gh);
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
+const struct inode_operations gfs2_file_iops = { /* attribute, xattr and fiemap ops only; presumably for regular files - confirm at the point of use */
+ .permission = gfs2_permission,
+ .setattr = gfs2_setattr,
+ .getattr = gfs2_getattr,
+ .setxattr = gfs2_setxattr,
+ .getxattr = gfs2_getxattr,
+ .listxattr = gfs2_listxattr,
+ .removexattr = gfs2_removexattr,
+ .fiemap = gfs2_fiemap,
+};
+
+const struct inode_operations gfs2_dir_iops = { /* directory ops: namespace operations plus the common attribute ops */
+ .create = gfs2_create,
+ .lookup = gfs2_lookup,
+ .link = gfs2_link,
+ .unlink = gfs2_unlink,
+ .symlink = gfs2_symlink,
+ .mkdir = gfs2_mkdir,
+ .rmdir = gfs2_unlink, /* gfs2_unlink() handles rmdir too, keyed on the inode type */
+ .mknod = gfs2_mknod,
+ .rename = gfs2_rename,
+ .permission = gfs2_permission,
+ .setattr = gfs2_setattr,
+ .getattr = gfs2_getattr,
+ .setxattr = gfs2_setxattr,
+ .getxattr = gfs2_getxattr,
+ .listxattr = gfs2_listxattr,
+ .removexattr = gfs2_removexattr,
+ .fiemap = gfs2_fiemap,
+};
+
+const struct inode_operations gfs2_symlink_iops = { /* symlink ops: link following plus the common attribute ops */
+ .readlink = generic_readlink,
+ .follow_link = gfs2_follow_link, /* allocates the link string ... */
+ .put_link = gfs2_put_link, /* ... which is freed again here */
+ .permission = gfs2_permission,
+ .setattr = gfs2_setattr,
+ .getattr = gfs2_getattr,
+ .setxattr = gfs2_setxattr,
+ .getxattr = gfs2_getxattr,
+ .listxattr = gfs2_listxattr,
+ .removexattr = gfs2_removexattr,
+ .fiemap = gfs2_fiemap,
+};
+
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 099ca305e518..31606076f701 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -102,22 +102,16 @@ extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
u64 *no_formal_ino,
unsigned int blktype);
-extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
+extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock);
extern int gfs2_inode_refresh(struct gfs2_inode *ip);
-extern int gfs2_dinode_dealloc(struct gfs2_inode *inode);
-extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
int is_root);
-extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
- const struct qstr *name,
- unsigned int mode, dev_t dev);
extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags);
extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
-extern void gfs2_dinode_print(const struct gfs2_inode *ip);
extern const struct inode_operations gfs2_file_iops;
extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 5b102c1887fd..903115f2bb34 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/bio.h>
+#include <linux/writeback.h>
#include "gfs2.h"
#include "incore.h"
@@ -83,55 +84,97 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
/**
* gfs2_ail1_start_one - Start I/O on a part of the AIL
* @sdp: the filesystem
- * @tr: the part of the AIL
+ * @wbc: The writeback control structure
+ * @ai: The ail structure
*
+ * Walks @ai's ail1 list in reverse.  Entries whose buffer is no longer busy
+ * are moved to the ail2 list.  For the first dirty buffer found on a glock
+ * different from the previously handled one, sd_ail_lock is dropped and
+ * generic_writepages() is called on the buffer's mapping.
+ *
+ * Returns: 1 if writepages was called and wbc->nr_to_write is still
+ *          positive (the list may have changed while the lock was dropped),
+ *          0 if the list was exhausted or the write budget is used up
*/
-static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+static int gfs2_ail1_start_one(struct gfs2_sbd *sdp,
+ struct writeback_control *wbc,
+ struct gfs2_ail *ai)
__releases(&sdp->sd_ail_lock)
__acquires(&sdp->sd_ail_lock)
{
+ struct gfs2_glock *gl = NULL;
+ struct address_space *mapping;
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
- int retry;
- do {
- retry = 0;
+ list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) {
+ bh = bd->bd_bh;
- list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
- bd_ail_st_list) {
- bh = bd->bd_bh;
+ gfs2_assert(sdp, bd->bd_ail == ai);
- gfs2_assert(sdp, bd->bd_ail == ai);
+ if (!buffer_busy(bh)) {
+ if (!buffer_uptodate(bh))
+ gfs2_io_error_bh(sdp, bh);
+ list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
+ continue;
+ }
- if (!buffer_busy(bh)) {
- if (!buffer_uptodate(bh))
- gfs2_io_error_bh(sdp, bh);
- list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
- continue;
- }
+ if (!buffer_dirty(bh))
+ continue;
+ if (gl == bd->bd_gl) /* same glock as the entry handled just before */
+ continue;
+ gl = bd->bd_gl;
+ list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
+ mapping = bh->b_page->mapping;
+ if (!mapping)
+ continue;
+ spin_unlock(&sdp->sd_ail_lock); /* lock is not held across the writepages call */
+ generic_writepages(mapping, wbc);
+ spin_lock(&sdp->sd_ail_lock);
+ if (wbc->nr_to_write <= 0)
+ break;
+ return 1;
+ }
- if (!buffer_dirty(bh))
- continue;
+ return 0;
+}
- list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
- get_bh(bh);
- spin_unlock(&sdp->sd_ail_lock);
- lock_buffer(bh);
- if (test_clear_buffer_dirty(bh)) {
- bh->b_end_io = end_buffer_write_sync;
- submit_bh(WRITE_SYNC, bh);
- } else {
- unlock_buffer(bh);
- brelse(bh);
- }
- spin_lock(&sdp->sd_ail_lock);
-
- retry = 1;
+/**
+ * gfs2_ail1_flush - start writeback of some ail1 entries
+ * @sdp: The super block
+ * @wbc: The writeback control structure
+ *
+ * Writes back some ail1 entries, according to the limits in the
+ * writeback control structure.  The walk over sd_ail1_list is restarted
+ * whenever gfs2_ail1_start_one() returns non-zero, because that helper
+ * drops sd_ail_lock around its writepages call, and it ends once
+ * wbc->nr_to_write is used up or the whole list has been walked.
+ */
+
+void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
+{
+ struct list_head *head = &sdp->sd_ail1_list;
+ struct gfs2_ail *ai;
+
+ trace_gfs2_ail_flush(sdp, wbc, 1);
+ spin_lock(&sdp->sd_ail_lock);
+restart:
+ list_for_each_entry_reverse(ai, head, ai_list) {
+ if (wbc->nr_to_write <= 0)
break;
- }
- } while (retry);
+ if (gfs2_ail1_start_one(sdp, wbc, ai))
+ goto restart;
+ }
+ spin_unlock(&sdp->sd_ail_lock);
+ trace_gfs2_ail_flush(sdp, wbc, 0);
+}
+
+/**
+ * gfs2_ail1_start - start writeback of all ail1 entries
+ * @sdp: The superblock
+ *
+ * Calls gfs2_ail1_flush() with a writeback control that places no
+ * practical limit on the number of pages (nr_to_write is LONG_MAX) and
+ * covers the whole range, using WB_SYNC_NONE.
+ */
+
+static void gfs2_ail1_start(struct gfs2_sbd *sdp)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_NONE,
+		.nr_to_write = LONG_MAX,
+		.range_start = 0,
+		.range_end = LLONG_MAX,
+	};
+
+	/* Both functions return void, so there is no value to pass on. */
+	gfs2_ail1_flush(sdp, &wbc);
}
/**
@@ -141,7 +184,7 @@ __acquires(&sdp->sd_ail_lock)
*
*/
-static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
+static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
@@ -149,76 +192,63 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl
list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
bd_ail_st_list) {
bh = bd->bd_bh;
-
gfs2_assert(sdp, bd->bd_ail == ai);
-
- if (buffer_busy(bh)) {
- if (flags & DIO_ALL)
- continue;
- else
- break;
- }
-
+ if (buffer_busy(bh))
+ continue;
if (!buffer_uptodate(bh))
gfs2_io_error_bh(sdp, bh);
-
list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
}
- return list_empty(&ai->ai_ail1_list);
}
-static void gfs2_ail1_start(struct gfs2_sbd *sdp)
-{
- struct list_head *head;
- u64 sync_gen;
- struct gfs2_ail *ai;
- int done = 0;
-
- spin_lock(&sdp->sd_ail_lock);
- head = &sdp->sd_ail1_list;
- if (list_empty(head)) {
- spin_unlock(&sdp->sd_ail_lock);
- return;
- }
- sync_gen = sdp->sd_ail_sync_gen++;
-
- while(!done) {
- done = 1;
- list_for_each_entry_reverse(ai, head, ai_list) {
- if (ai->ai_sync_gen >= sync_gen)
- continue;
- ai->ai_sync_gen = sync_gen;
- gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
- done = 0;
- break;
- }
- }
-
- spin_unlock(&sdp->sd_ail_lock);
-}
+/**
+ * gfs2_ail1_empty - Try to empty the ail1 lists
+ * @sdp: The superblock
+ *
+ * Tries to empty the ail1 lists, starting with the oldest first
+ *
+ * Returns: non-zero if sd_ail1_list is empty on return, 0 otherwise
+ */
-static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
+static int gfs2_ail1_empty(struct gfs2_sbd *sdp)
{
struct gfs2_ail *ai, *s;
int ret;
spin_lock(&sdp->sd_ail_lock);
-
list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
- if (gfs2_ail1_empty_one(sdp, ai, flags))
+ gfs2_ail1_empty_one(sdp, ai);
+ if (list_empty(&ai->ai_ail1_list))
list_move(&ai->ai_list, &sdp->sd_ail2_list);
- else if (!(flags & DIO_ALL))
+ else /* stop at the first ail that still has ail1 entries */
break;
}
-
ret = list_empty(&sdp->sd_ail1_list);
-
spin_unlock(&sdp->sd_ail_lock);
return ret;
}
+/*
+ * gfs2_ail1_wait - wait for one locked ail1 buffer
+ * @sdp: The superblock
+ *
+ * Searches the ail1 lists for the first buffer that is currently locked,
+ * takes a reference on it, drops sd_ail_lock and waits for that single
+ * buffer.  Returns without waiting if no locked buffer is found.
+ */
+static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
+{
+	struct gfs2_ail *ai;
+	struct gfs2_bufdata *bd;
+	struct buffer_head *bh;
+
+	spin_lock(&sdp->sd_ail_lock);
+	list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) {
+		list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) {
+			bh = bd->bd_bh;
+			if (buffer_locked(bh)) {
+				/* Pin the buffer before the lock goes away. */
+				get_bh(bh);
+				goto out_wait;
+			}
+		}
+	}
+	spin_unlock(&sdp->sd_ail_lock);
+	return;
+
+out_wait:
+	spin_unlock(&sdp->sd_ail_lock);
+	wait_on_buffer(bh);
+	brelse(bh);
+}
/**
* gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced
@@ -574,7 +604,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
- gfs2_ail1_empty(sdp, 0);
+ gfs2_ail1_empty(sdp);
tail = current_tail(sdp);
lh = (struct gfs2_log_header *)bh->b_data;
@@ -869,9 +899,9 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
gfs2_log_flush(sdp, NULL);
for (;;) {
gfs2_ail1_start(sdp);
- if (gfs2_ail1_empty(sdp, DIO_ALL))
+ gfs2_ail1_wait(sdp);
+ if (gfs2_ail1_empty(sdp))
break;
- msleep(10);
}
}
@@ -905,20 +935,20 @@ int gfs2_logd(void *data)
preflush = atomic_read(&sdp->sd_log_pinned);
if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
- gfs2_ail1_empty(sdp, DIO_ALL);
+ gfs2_ail1_empty(sdp);
gfs2_log_flush(sdp, NULL);
- gfs2_ail1_empty(sdp, DIO_ALL);
}
if (gfs2_ail_flush_reqd(sdp)) {
gfs2_ail1_start(sdp);
- io_schedule();
- gfs2_ail1_empty(sdp, 0);
+ gfs2_ail1_wait(sdp);
+ gfs2_ail1_empty(sdp);
gfs2_log_flush(sdp, NULL);
- gfs2_ail1_empty(sdp, DIO_ALL);
}
- wake_up(&sdp->sd_log_waitq);
+ if (!gfs2_ail_flush_reqd(sdp))
+ wake_up(&sdp->sd_log_waitq);
+
t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
if (freezing(current))
refrigerator();
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 0d007f920234..ab0621698b73 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -12,6 +12,7 @@
#include <linux/list.h>
#include <linux/spinlock.h>
+#include <linux/writeback.h>
#include "incore.h"
/**
@@ -59,6 +60,7 @@ extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
+extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 51d27f00ebb4..05bbb124699f 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -40,7 +40,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
{
struct gfs2_bufdata *bd;
- gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
+ BUG_ON(!current->journal_info);
clear_buffer_dirty(bh);
if (test_set_buffer_pinned(bh))
@@ -65,6 +65,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
* @sdp: the filesystem the buffer belongs to
* @bh: The buffer to unpin
* @ai:
+ * @flags: The inode dirty flags
*
*/
@@ -73,10 +74,8 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
{
struct gfs2_bufdata *bd = bh->b_private;
- gfs2_assert_withdraw(sdp, buffer_uptodate(bh));
-
- if (!buffer_pinned(bh))
- gfs2_assert_withdraw(sdp, 0);
+ BUG_ON(!buffer_uptodate(bh));
+ BUG_ON(!buffer_pinned(bh));
lock_buffer(bh);
mark_buffer_dirty(bh);
@@ -95,8 +94,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
- if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags))
- gfs2_glock_schedule_for_reclaim(bd->bd_gl);
+ clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
trace_gfs2_pin(bd, 0);
unlock_buffer(bh);
atomic_dec(&sdp->sd_log_pinned);
@@ -322,12 +320,16 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
{
+ struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+ struct gfs2_glock *gl = bd->bd_gl;
struct gfs2_trans *tr;
tr = current->journal_info;
tr->tr_touched = 1;
tr->tr_num_revoke++;
sdp->sd_log_num_revoke++;
+ atomic_inc(&gl->gl_revokes); /* decremented again in revoke_lo_after_commit() */
+ set_bit(GLF_LFLUSH, &gl->gl_flags); /* cleared again in revoke_lo_after_commit() */
list_add(&le->le_list, &sdp->sd_log_le_revoke);
}
@@ -350,9 +352,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke);
offset = sizeof(struct gfs2_log_descriptor);
- while (!list_empty(head)) {
- bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
- list_del_init(&bd->bd_le.le_list);
+ list_for_each_entry(bd, head, bd_le.le_list) {
sdp->sd_log_num_revoke--;
if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
@@ -367,8 +367,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
}
*(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno);
- kmem_cache_free(gfs2_bufdata_cachep, bd);
-
offset += sizeof(u64);
}
gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
@@ -376,6 +374,22 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
submit_bh(WRITE_SYNC, bh);
}
+static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+ struct list_head *head = &sdp->sd_log_le_revoke;
+ struct gfs2_bufdata *bd;
+ struct gfs2_glock *gl;
+
+ while (!list_empty(head)) {
+ bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
+ list_del_init(&bd->bd_le.le_list);
+ gl = bd->bd_gl;
+ atomic_dec(&gl->gl_revokes);
+ clear_bit(GLF_LFLUSH, &gl->gl_flags);
+ kmem_cache_free(gfs2_bufdata_cachep, bd);
+ }
+}
+
static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, int pass)
{
@@ -749,6 +763,7 @@ const struct gfs2_log_operations gfs2_buf_lops = {
const struct gfs2_log_operations gfs2_revoke_lops = {
.lo_add = revoke_lo_add,
.lo_before_commit = revoke_lo_before_commit,
+ .lo_after_commit = revoke_lo_after_commit,
.lo_before_scan = revoke_lo_before_scan,
.lo_scan_elements = revoke_lo_scan_elements,
.lo_after_scan = revoke_lo_after_scan,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 888a5f5a1a58..cfa327d33194 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -53,6 +53,7 @@ static void gfs2_init_glock_once(void *foo)
INIT_LIST_HEAD(&gl->gl_lru);
INIT_LIST_HEAD(&gl->gl_ail_list);
atomic_set(&gl->gl_ail_count, 0);
+ atomic_set(&gl->gl_revokes, 0);
}
static void gfs2_init_gl_aspace_once(void *foo)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 675349b5a133..747238cd9f96 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -31,6 +31,7 @@
#include "rgrp.h"
#include "trans.h"
#include "util.h"
+#include "trace_gfs2.h"
static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
{
@@ -310,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
struct gfs2_bufdata *bd = bh->b_private;
if (test_clear_buffer_pinned(bh)) {
+ trace_gfs2_pin(bd, 0);
atomic_dec(&sdp->sd_log_pinned);
list_del_init(&bd->bd_le.le_list);
if (meta) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 6a1d9ba16411..22c526593131 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -77,8 +77,6 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen);
#define buffer_busy(bh) \
((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned)))
-#define buffer_in_io(bh) \
-((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock)))
#endif /* __DIO_DOT_H__ */
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index d3c69eb91c74..8ac9ae189b53 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -126,8 +126,10 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
* changed.
*/
-static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
+static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
{
+ struct gfs2_sb_host *sb = &sdp->sd_sb;
+
if (sb->sb_magic != GFS2_MAGIC ||
sb->sb_type != GFS2_METATYPE_SB) {
if (!silent)
@@ -157,8 +159,10 @@ static void end_bio_io_page(struct bio *bio, int error)
unlock_page(page);
}
-static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
+static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf)
{
+ struct gfs2_sb_host *sb = &sdp->sd_sb;
+ struct super_block *s = sdp->sd_vfs;
const struct gfs2_sb *str = buf;
sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
@@ -175,7 +179,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
- memcpy(sb->sb_uuid, str->sb_uuid, 16);
+ memcpy(s->s_uuid, str->sb_uuid, 16);
}
/**
@@ -197,7 +201,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
* Returns: 0 on success or error
*/
-static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
+static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
{
struct super_block *sb = sdp->sd_vfs;
struct gfs2_sb *p;
@@ -227,10 +231,10 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
return -EIO;
}
p = kmap(page);
- gfs2_sb_in(&sdp->sd_sb, p);
+ gfs2_sb_in(sdp, p);
kunmap(page);
__free_page(page);
- return 0;
+ return gfs2_check_sb(sdp, silent);
}
/**
@@ -247,17 +251,13 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
unsigned int x;
int error;
- error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+ error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent);
if (error) {
if (!silent)
fs_err(sdp, "can't read superblock\n");
return error;
}
- error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
- if (error)
- return error;
-
sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
GFS2_BASIC_BLOCK_SHIFT;
sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
@@ -340,14 +340,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
/* Try to autodetect */
if (!proto[0] || !table[0]) {
- error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+ error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent);
if (error)
return error;
- error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
- if (error)
- goto out;
-
if (!proto[0])
proto = sdp->sd_sb.sb_lockproto;
if (!table[0])
@@ -364,7 +360,6 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
while ((table = strchr(table, '/')))
*table = '_';
-out:
return error;
}
@@ -1119,8 +1114,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
if (sdp->sd_args.ar_statfs_quantum) {
sdp->sd_tune.gt_statfs_slow = 0;
sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum;
- }
- else {
+ } else {
sdp->sd_tune.gt_statfs_slow = 1;
sdp->sd_tune.gt_statfs_quantum = 30;
}
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
deleted file mode 100644
index 09e436a50723..000000000000
--- a/fs/gfs2/ops_inode.c
+++ /dev/null
@@ -1,1344 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/namei.h>
-#include <linux/mm.h>
-#include <linux/xattr.h>
-#include <linux/posix_acl.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/crc32.h>
-#include <linux/fiemap.h>
-#include <asm/uaccess.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "acl.h"
-#include "bmap.h"
-#include "dir.h"
-#include "xattr.h"
-#include "glock.h"
-#include "inode.h"
-#include "meta_io.h"
-#include "quota.h"
-#include "rgrp.h"
-#include "trans.h"
-#include "util.h"
-#include "super.h"
-
-/**
- * gfs2_create - Create a file
- * @dir: The directory in which to create the file
- * @dentry: The dentry of the new file
- * @mode: The mode of the new file
- *
- * Returns: errno
- */
-
-static int gfs2_create(struct inode *dir, struct dentry *dentry,
- int mode, struct nameidata *nd)
-{
- struct gfs2_inode *dip = GFS2_I(dir);
- struct gfs2_sbd *sdp = GFS2_SB(dir);
- struct gfs2_holder ghs[2];
- struct inode *inode;
-
- gfs2_holder_init(dip->i_gl, 0, 0, ghs);
-
- for (;;) {
- inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0);
- if (!IS_ERR(inode)) {
- gfs2_trans_end(sdp);
- if (dip->i_alloc->al_rgd)
- gfs2_inplace_release(dip);
- gfs2_quota_unlock(dip);
- gfs2_alloc_put(dip);
- gfs2_glock_dq_uninit_m(2, ghs);
- mark_inode_dirty(inode);
- break;
- } else if (PTR_ERR(inode) != -EEXIST ||
- (nd && nd->flags & LOOKUP_EXCL)) {
- gfs2_holder_uninit(ghs);
- return PTR_ERR(inode);
- }
-
- inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- if (inode) {
- if (!IS_ERR(inode)) {
- gfs2_holder_uninit(ghs);
- break;
- } else {
- gfs2_holder_uninit(ghs);
- return PTR_ERR(inode);
- }
- }
- }
-
- d_instantiate(dentry, inode);
-
- return 0;
-}
-
-/**
- * gfs2_lookup - Look up a filename in a directory and return its inode
- * @dir: The directory inode
- * @dentry: The dentry of the new inode
- * @nd: passed from Linux VFS, ignored by us
- *
- * Called by the VFS layer. Lock dir and call gfs2_lookupi()
- *
- * Returns: errno
- */
-
-static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
-{
- struct inode *inode = NULL;
-
- inode = gfs2_lookupi(dir, &dentry->d_name, 0);
- if (inode && IS_ERR(inode))
- return ERR_CAST(inode);
-
- if (inode) {
- struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
- struct gfs2_holder gh;
- int error;
- error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
- if (error) {
- iput(inode);
- return ERR_PTR(error);
- }
- gfs2_glock_dq_uninit(&gh);
- return d_splice_alias(inode, dentry);
- }
- d_add(dentry, inode);
-
- return NULL;
-}
-
-/**
- * gfs2_link - Link to a file
- * @old_dentry: The inode to link
- * @dir: Add link to this directory
- * @dentry: The name of the link
- *
- * Link the inode in "old_dentry" into the directory "dir" with the
- * name in "dentry".
- *
- * Returns: errno
- */
-
-static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *dentry)
-{
- struct gfs2_inode *dip = GFS2_I(dir);
- struct gfs2_sbd *sdp = GFS2_SB(dir);
- struct inode *inode = old_dentry->d_inode;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder ghs[2];
- int alloc_required;
- int error;
-
- if (S_ISDIR(inode->i_mode))
- return -EPERM;
-
- gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
-
- error = gfs2_glock_nq(ghs); /* parent */
- if (error)
- goto out_parent;
-
- error = gfs2_glock_nq(ghs + 1); /* child */
- if (error)
- goto out_child;
-
- error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
- if (error)
- goto out_gunlock;
-
- error = gfs2_dir_check(dir, &dentry->d_name, NULL);
- switch (error) {
- case -ENOENT:
- break;
- case 0:
- error = -EEXIST;
- default:
- goto out_gunlock;
- }
-
- error = -EINVAL;
- if (!dip->i_inode.i_nlink)
- goto out_gunlock;
- error = -EFBIG;
- if (dip->i_entries == (u32)-1)
- goto out_gunlock;
- error = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out_gunlock;
- error = -EINVAL;
- if (!ip->i_inode.i_nlink)
- goto out_gunlock;
- error = -EMLINK;
- if (ip->i_inode.i_nlink == (u32)-1)
- goto out_gunlock;
-
- alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
- if (error < 0)
- goto out_gunlock;
- error = 0;
-
- if (alloc_required) {
- struct gfs2_alloc *al = gfs2_alloc_get(dip);
- if (!al) {
- error = -ENOMEM;
- goto out_gunlock;
- }
-
- error = gfs2_quota_lock_check(dip);
- if (error)
- goto out_alloc;
-
- al->al_requested = sdp->sd_max_dirres;
-
- error = gfs2_inplace_reserve(dip);
- if (error)
- goto out_gunlock_q;
-
- error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- gfs2_rg_blocks(al) +
- 2 * RES_DINODE + RES_STATFS +
- RES_QUOTA, 0);
- if (error)
- goto out_ipres;
- } else {
- error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0);
- if (error)
- goto out_ipres;
- }
-
- error = gfs2_dir_add(dir, &dentry->d_name, ip, IF2DT(inode->i_mode));
- if (error)
- goto out_end_trans;
-
- error = gfs2_change_nlink(ip, +1);
-
-out_end_trans:
- gfs2_trans_end(sdp);
-out_ipres:
- if (alloc_required)
- gfs2_inplace_release(dip);
-out_gunlock_q:
- if (alloc_required)
- gfs2_quota_unlock(dip);
-out_alloc:
- if (alloc_required)
- gfs2_alloc_put(dip);
-out_gunlock:
- gfs2_glock_dq(ghs + 1);
-out_child:
- gfs2_glock_dq(ghs);
-out_parent:
- gfs2_holder_uninit(ghs);
- gfs2_holder_uninit(ghs + 1);
- if (!error) {
- ihold(inode);
- d_instantiate(dentry, inode);
- mark_inode_dirty(inode);
- }
- return error;
-}
-
-/*
- * gfs2_unlink_ok - check to see that a inode is still in a directory
- * @dip: the directory
- * @name: the name of the file
- * @ip: the inode
- *
- * Assumes that the lock on (at least) @dip is held.
- *
- * Returns: 0 if the parent/child relationship is correct, errno if it isn't
- */
-
-static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
- const struct gfs2_inode *ip)
-{
- int error;
-
- if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
- return -EPERM;
-
- if ((dip->i_inode.i_mode & S_ISVTX) &&
- dip->i_inode.i_uid != current_fsuid() &&
- ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
- return -EPERM;
-
- if (IS_APPEND(&dip->i_inode))
- return -EPERM;
-
- error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
- if (error)
- return error;
-
- error = gfs2_dir_check(&dip->i_inode, name, ip);
- if (error)
- return error;
-
- return 0;
-}
-
-/**
- * gfs2_unlink - Unlink a file
- * @dir: The inode of the directory containing the file to unlink
- * @dentry: The file itself
- *
- * Unlink a file. Call gfs2_unlinki()
- *
- * Returns: errno
- */
-
-static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
-{
- struct gfs2_inode *dip = GFS2_I(dir);
- struct gfs2_sbd *sdp = GFS2_SB(dir);
- struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
- struct gfs2_holder ghs[3];
- struct gfs2_rgrpd *rgd;
- struct gfs2_holder ri_gh;
- int error;
-
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- return error;
-
- gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
-
- rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
- gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
-
-
- error = gfs2_glock_nq(ghs); /* parent */
- if (error)
- goto out_parent;
-
- error = gfs2_glock_nq(ghs + 1); /* child */
- if (error)
- goto out_child;
-
- error = gfs2_glock_nq(ghs + 2); /* rgrp */
- if (error)
- goto out_rgrp;
-
- error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
- if (error)
- goto out_gunlock;
-
- error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
- if (error)
- goto out_gunlock;
-
- error = gfs2_dir_del(dip, &dentry->d_name);
- if (error)
- goto out_end_trans;
-
- error = gfs2_change_nlink(ip, -1);
-
-out_end_trans:
- gfs2_trans_end(sdp);
-out_gunlock:
- gfs2_glock_dq(ghs + 2);
-out_rgrp:
- gfs2_holder_uninit(ghs + 2);
- gfs2_glock_dq(ghs + 1);
-out_child:
- gfs2_holder_uninit(ghs + 1);
- gfs2_glock_dq(ghs);
-out_parent:
- gfs2_holder_uninit(ghs);
- gfs2_glock_dq_uninit(&ri_gh);
- return error;
-}
-
-/**
- * gfs2_symlink - Create a symlink
- * @dir: The directory to create the symlink in
- * @dentry: The dentry to put the symlink in
- * @symname: The thing which the link points to
- *
- * Returns: errno
- */
-
-static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
- const char *symname)
-{
- struct gfs2_inode *dip = GFS2_I(dir), *ip;
- struct gfs2_sbd *sdp = GFS2_SB(dir);
- struct gfs2_holder ghs[2];
- struct inode *inode;
- struct buffer_head *dibh;
- int size;
- int error;
-
- /* Must be stuffed with a null terminator for gfs2_follow_link() */
- size = strlen(symname);
- if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1)
- return -ENAMETOOLONG;
-
- gfs2_holder_init(dip->i_gl, 0, 0, ghs);
-
- inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO, 0);
- if (IS_ERR(inode)) {
- gfs2_holder_uninit(ghs);
- return PTR_ERR(inode);
- }
-
- ip = ghs[1].gh_gl->gl_object;
-
- i_size_write(inode, size);
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
-
- if (!gfs2_assert_withdraw(sdp, !error)) {
- gfs2_dinode_out(ip, dibh->b_data);
- memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname,
- size);
- brelse(dibh);
- }
-
- gfs2_trans_end(sdp);
- if (dip->i_alloc->al_rgd)
- gfs2_inplace_release(dip);
- gfs2_quota_unlock(dip);
- gfs2_alloc_put(dip);
-
- gfs2_glock_dq_uninit_m(2, ghs);
-
- d_instantiate(dentry, inode);
- mark_inode_dirty(inode);
-
- return 0;
-}
-
-/**
- * gfs2_mkdir - Make a directory
- * @dir: The parent directory of the new one
- * @dentry: The dentry of the new directory
- * @mode: The mode of the new directory
- *
- * Returns: errno
- */
-
-static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
-{
- struct gfs2_inode *dip = GFS2_I(dir), *ip;
- struct gfs2_sbd *sdp = GFS2_SB(dir);
- struct gfs2_holder ghs[2];
- struct inode *inode;
- struct buffer_head *dibh;
- int error;
-
- gfs2_holder_init(dip->i_gl, 0, 0, ghs);
-
- inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode, 0);
- if (IS_ERR(inode)) {
- gfs2_holder_uninit(ghs);
- return PTR_ERR(inode);
- }
-
- ip = ghs[1].gh_gl->gl_object;
-
- ip->i_inode.i_nlink = 2;
- i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
- ip->i_diskflags |= GFS2_DIF_JDATA;
- ip->i_entries = 2;
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
-
- if (!gfs2_assert_withdraw(sdp, !error)) {
- struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
- struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
-
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
- dent->de_inum = di->di_num; /* already GFS2 endian */
- dent->de_type = cpu_to_be16(DT_DIR);
- di->di_entries = cpu_to_be32(1);
-
- dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
- gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
-
- gfs2_inum_out(dip, dent);
- dent->de_type = cpu_to_be16(DT_DIR);
-
- gfs2_dinode_out(ip, di);
-
- brelse(dibh);
- }
-
- error = gfs2_change_nlink(dip, +1);
- gfs2_assert_withdraw(sdp, !error); /* dip already pinned */
-
- gfs2_trans_end(sdp);
- if (dip->i_alloc->al_rgd)
- gfs2_inplace_release(dip);
- gfs2_quota_unlock(dip);
- gfs2_alloc_put(dip);
-
- gfs2_glock_dq_uninit_m(2, ghs);
-
- d_instantiate(dentry, inode);
- mark_inode_dirty(inode);
-
- return 0;
-}
-
-/**
- * gfs2_rmdiri - Remove a directory
- * @dip: The parent directory of the directory to be removed
- * @name: The name of the directory to be removed
- * @ip: The GFS2 inode of the directory to be removed
- *
- * Assumes Glocks on dip and ip are held
- *
- * Returns: errno
- */
-
-static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
- struct gfs2_inode *ip)
-{
- int error;
-
- if (ip->i_entries != 2) {
- if (gfs2_consist_inode(ip))
- gfs2_dinode_print(ip);
- return -EIO;
- }
-
- error = gfs2_dir_del(dip, name);
- if (error)
- return error;
-
- error = gfs2_change_nlink(dip, -1);
- if (error)
- return error;
-
- error = gfs2_dir_del(ip, &gfs2_qdot);
- if (error)
- return error;
-
- error = gfs2_dir_del(ip, &gfs2_qdotdot);
- if (error)
- return error;
-
- /* It looks odd, but it really should be done twice */
- error = gfs2_change_nlink(ip, -1);
- if (error)
- return error;
-
- error = gfs2_change_nlink(ip, -1);
- if (error)
- return error;
-
- return error;
-}
-
-/**
- * gfs2_rmdir - Remove a directory
- * @dir: The parent directory of the directory to be removed
- * @dentry: The dentry of the directory to remove
- *
- * Remove a directory. Call gfs2_rmdiri()
- *
- * Returns: errno
- */
-
-static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
-{
- struct gfs2_inode *dip = GFS2_I(dir);
- struct gfs2_sbd *sdp = GFS2_SB(dir);
- struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
- struct gfs2_holder ghs[3];
- struct gfs2_rgrpd *rgd;
- struct gfs2_holder ri_gh;
- int error;
-
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- return error;
- gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
-
- rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
- gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
-
- error = gfs2_glock_nq(ghs); /* parent */
- if (error)
- goto out_parent;
-
- error = gfs2_glock_nq(ghs + 1); /* child */
- if (error)
- goto out_child;
-
- error = gfs2_glock_nq(ghs + 2); /* rgrp */
- if (error)
- goto out_rgrp;
-
- error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
- if (error)
- goto out_gunlock;
-
- if (ip->i_entries < 2) {
- if (gfs2_consist_inode(ip))
- gfs2_dinode_print(ip);
- error = -EIO;
- goto out_gunlock;
- }
- if (ip->i_entries > 2) {
- error = -ENOTEMPTY;
- goto out_gunlock;
- }
-
- error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0);
- if (error)
- goto out_gunlock;
-
- error = gfs2_rmdiri(dip, &dentry->d_name, ip);
-
- gfs2_trans_end(sdp);
-
-out_gunlock:
- gfs2_glock_dq(ghs + 2);
-out_rgrp:
- gfs2_holder_uninit(ghs + 2);
- gfs2_glock_dq(ghs + 1);
-out_child:
- gfs2_holder_uninit(ghs + 1);
- gfs2_glock_dq(ghs);
-out_parent:
- gfs2_holder_uninit(ghs);
- gfs2_glock_dq_uninit(&ri_gh);
- return error;
-}
-
-/**
- * gfs2_mknod - Make a special file
- * @dir: The directory in which the special file will reside
- * @dentry: The dentry of the special file
- * @mode: The mode of the special file
- * @rdev: The device specification of the special file
- *
- */
-
-static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
- dev_t dev)
-{
- struct gfs2_inode *dip = GFS2_I(dir);
- struct gfs2_sbd *sdp = GFS2_SB(dir);
- struct gfs2_holder ghs[2];
- struct inode *inode;
-
- gfs2_holder_init(dip->i_gl, 0, 0, ghs);
-
- inode = gfs2_createi(ghs, &dentry->d_name, mode, dev);
- if (IS_ERR(inode)) {
- gfs2_holder_uninit(ghs);
- return PTR_ERR(inode);
- }
-
- gfs2_trans_end(sdp);
- if (dip->i_alloc->al_rgd)
- gfs2_inplace_release(dip);
- gfs2_quota_unlock(dip);
- gfs2_alloc_put(dip);
-
- gfs2_glock_dq_uninit_m(2, ghs);
-
- d_instantiate(dentry, inode);
- mark_inode_dirty(inode);
-
- return 0;
-}
-
-/*
- * gfs2_ok_to_move - check if it's ok to move a directory to another directory
- * @this: move this
- * @to: to here
- *
- * Follow @to back to the root and make sure we don't encounter @this
- * Assumes we already hold the rename lock.
- *
- * Returns: errno
- */
-
-static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
-{
- struct inode *dir = &to->i_inode;
- struct super_block *sb = dir->i_sb;
- struct inode *tmp;
- int error = 0;
-
- igrab(dir);
-
- for (;;) {
- if (dir == &this->i_inode) {
- error = -EINVAL;
- break;
- }
- if (dir == sb->s_root->d_inode) {
- error = 0;
- break;
- }
-
- tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
- if (IS_ERR(tmp)) {
- error = PTR_ERR(tmp);
- break;
- }
-
- iput(dir);
- dir = tmp;
- }
-
- iput(dir);
-
- return error;
-}
-
-/**
- * gfs2_rename - Rename a file
- * @odir: Parent directory of old file name
- * @odentry: The old dentry of the file
- * @ndir: Parent directory of new file name
- * @ndentry: The new dentry of the file
- *
- * Returns: errno
- */
-
-static int gfs2_rename(struct inode *odir, struct dentry *odentry,
- struct inode *ndir, struct dentry *ndentry)
-{
- struct gfs2_inode *odip = GFS2_I(odir);
- struct gfs2_inode *ndip = GFS2_I(ndir);
- struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
- struct gfs2_inode *nip = NULL;
- struct gfs2_sbd *sdp = GFS2_SB(odir);
- struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
- struct gfs2_rgrpd *nrgd;
- unsigned int num_gh;
- int dir_rename = 0;
- int alloc_required = 0;
- unsigned int x;
- int error;
-
- if (ndentry->d_inode) {
- nip = GFS2_I(ndentry->d_inode);
- if (ip == nip)
- return 0;
- }
-
- error = gfs2_rindex_hold(sdp, &ri_gh);
- if (error)
- return error;
-
- if (odip != ndip) {
- error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
- 0, &r_gh);
- if (error)
- goto out;
-
- if (S_ISDIR(ip->i_inode.i_mode)) {
- dir_rename = 1;
- /* don't move a dirctory into it's subdir */
- error = gfs2_ok_to_move(ip, ndip);
- if (error)
- goto out_gunlock_r;
- }
- }
-
- num_gh = 1;
- gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
- if (odip != ndip) {
- gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
- num_gh++;
- }
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
- num_gh++;
-
- if (nip) {
- gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh);
- num_gh++;
- /* grab the resource lock for unlink flag twiddling
- * this is the case of the target file already existing
- * so we unlink before doing the rename
- */
- nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr);
- if (nrgd)
- gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
- }
-
- for (x = 0; x < num_gh; x++) {
- error = gfs2_glock_nq(ghs + x);
- if (error)
- goto out_gunlock;
- }
-
- /* Check out the old directory */
-
- error = gfs2_unlink_ok(odip, &odentry->d_name, ip);
- if (error)
- goto out_gunlock;
-
- /* Check out the new directory */
-
- if (nip) {
- error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip);
- if (error)
- goto out_gunlock;
-
- if (S_ISDIR(nip->i_inode.i_mode)) {
- if (nip->i_entries < 2) {
- if (gfs2_consist_inode(nip))
- gfs2_dinode_print(nip);
- error = -EIO;
- goto out_gunlock;
- }
- if (nip->i_entries > 2) {
- error = -ENOTEMPTY;
- goto out_gunlock;
- }
- }
- } else {
- error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
- if (error)
- goto out_gunlock;
-
- error = gfs2_dir_check(ndir, &ndentry->d_name, NULL);
- switch (error) {
- case -ENOENT:
- error = 0;
- break;
- case 0:
- error = -EEXIST;
- default:
- goto out_gunlock;
- };
-
- if (odip != ndip) {
- if (!ndip->i_inode.i_nlink) {
- error = -EINVAL;
- goto out_gunlock;
- }
- if (ndip->i_entries == (u32)-1) {
- error = -EFBIG;
- goto out_gunlock;
- }
- if (S_ISDIR(ip->i_inode.i_mode) &&
- ndip->i_inode.i_nlink == (u32)-1) {
- error = -EMLINK;
- goto out_gunlock;
- }
- }
- }
-
- /* Check out the dir to be renamed */
-
- if (dir_rename) {
- error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
- if (error)
- goto out_gunlock;
- }
-
- if (nip == NULL)
- alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
- error = alloc_required;
- if (error < 0)
- goto out_gunlock;
- error = 0;
-
- if (alloc_required) {
- struct gfs2_alloc *al = gfs2_alloc_get(ndip);
- if (!al) {
- error = -ENOMEM;
- goto out_gunlock;
- }
-
- error = gfs2_quota_lock_check(ndip);
- if (error)
- goto out_alloc;
-
- al->al_requested = sdp->sd_max_dirres;
-
- error = gfs2_inplace_reserve_ri(ndip);
- if (error)
- goto out_gunlock_q;
-
- error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- gfs2_rg_blocks(al) +
- 4 * RES_DINODE + 4 * RES_LEAF +
- RES_STATFS + RES_QUOTA + 4, 0);
- if (error)
- goto out_ipreserv;
- } else {
- error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
- 5 * RES_LEAF + 4, 0);
- if (error)
- goto out_gunlock;
- }
-
- /* Remove the target file, if it exists */
-
- if (nip) {
- if (S_ISDIR(nip->i_inode.i_mode))
- error = gfs2_rmdiri(ndip, &ndentry->d_name, nip);
- else {
- error = gfs2_dir_del(ndip, &ndentry->d_name);
- if (error)
- goto out_end_trans;
- error = gfs2_change_nlink(nip, -1);
- }
- if (error)
- goto out_end_trans;
- }
-
- if (dir_rename) {
- error = gfs2_change_nlink(ndip, +1);
- if (error)
- goto out_end_trans;
- error = gfs2_change_nlink(odip, -1);
- if (error)
- goto out_end_trans;
-
- error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
- if (error)
- goto out_end_trans;
- } else {
- struct buffer_head *dibh;
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- goto out_end_trans;
- ip->i_inode.i_ctime = CURRENT_TIME;
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
- }
-
- error = gfs2_dir_del(odip, &odentry->d_name);
- if (error)
- goto out_end_trans;
-
- error = gfs2_dir_add(ndir, &ndentry->d_name, ip, IF2DT(ip->i_inode.i_mode));
- if (error)
- goto out_end_trans;
-
-out_end_trans:
- gfs2_trans_end(sdp);
-out_ipreserv:
- if (alloc_required)
- gfs2_inplace_release(ndip);
-out_gunlock_q:
- if (alloc_required)
- gfs2_quota_unlock(ndip);
-out_alloc:
- if (alloc_required)
- gfs2_alloc_put(ndip);
-out_gunlock:
- while (x--) {
- gfs2_glock_dq(ghs + x);
- gfs2_holder_uninit(ghs + x);
- }
-out_gunlock_r:
- if (r_gh.gh_gl)
- gfs2_glock_dq_uninit(&r_gh);
-out:
- gfs2_glock_dq_uninit(&ri_gh);
- return error;
-}
-
-/**
- * gfs2_follow_link - Follow a symbolic link
- * @dentry: The dentry of the link
- * @nd: Data that we pass to vfs_follow_link()
- *
- * This can handle symlinks of any size.
- *
- * Returns: 0 on success or error code
- */
-
-static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
- struct gfs2_holder i_gh;
- struct buffer_head *dibh;
- unsigned int x, size;
- char *buf;
- int error;
-
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
- error = gfs2_glock_nq(&i_gh);
- if (error) {
- gfs2_holder_uninit(&i_gh);
- nd_set_link(nd, ERR_PTR(error));
- return NULL;
- }
-
- size = (unsigned int)i_size_read(&ip->i_inode);
- if (size == 0) {
- gfs2_consist_inode(ip);
- buf = ERR_PTR(-EIO);
- goto out;
- }
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error) {
- buf = ERR_PTR(error);
- goto out;
- }
-
- x = size + 1;
- buf = kmalloc(x, GFP_NOFS);
- if (!buf)
- buf = ERR_PTR(-ENOMEM);
- else
- memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
- brelse(dibh);
-out:
- gfs2_glock_dq_uninit(&i_gh);
- nd_set_link(nd, buf);
- return NULL;
-}
-
-static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
-{
- char *s = nd_get_link(nd);
- if (!IS_ERR(s))
- kfree(s);
-}
-
-/**
- * gfs2_permission -
- * @inode: The inode
- * @mask: The mask to be tested
- * @flags: Indicates whether this is an RCU path walk or not
- *
- * This may be called from the VFS directly, or from within GFS2 with the
- * inode locked, so we look to see if the glock is already locked and only
- * lock the glock if its not already been done.
- *
- * Returns: errno
- */
-
-int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
-{
- struct gfs2_inode *ip;
- struct gfs2_holder i_gh;
- int error;
- int unlock = 0;
-
-
- ip = GFS2_I(inode);
- if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- if (flags & IPERM_FLAG_RCU)
- return -ECHILD;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
- if (error)
- return error;
- unlock = 1;
- }
-
- if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
- error = -EACCES;
- else
- error = generic_permission(inode, mask, flags, gfs2_check_acl);
- if (unlock)
- gfs2_glock_dq_uninit(&i_gh);
-
- return error;
-}
-
-static int setattr_chown(struct inode *inode, struct iattr *attr)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- u32 ouid, ogid, nuid, ngid;
- int error;
-
- ouid = inode->i_uid;
- ogid = inode->i_gid;
- nuid = attr->ia_uid;
- ngid = attr->ia_gid;
-
- if (!(attr->ia_valid & ATTR_UID) || ouid == nuid)
- ouid = nuid = NO_QUOTA_CHANGE;
- if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
- ogid = ngid = NO_QUOTA_CHANGE;
-
- if (!gfs2_alloc_get(ip))
- return -ENOMEM;
-
- error = gfs2_quota_lock(ip, nuid, ngid);
- if (error)
- goto out_alloc;
-
- if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
- error = gfs2_quota_check(ip, nuid, ngid);
- if (error)
- goto out_gunlock_q;
- }
-
- error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0);
- if (error)
- goto out_gunlock_q;
-
- error = gfs2_setattr_simple(ip, attr);
- if (error)
- goto out_end_trans;
-
- if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
- u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
- gfs2_quota_change(ip, -blocks, ouid, ogid);
- gfs2_quota_change(ip, blocks, nuid, ngid);
- }
-
-out_end_trans:
- gfs2_trans_end(sdp);
-out_gunlock_q:
- gfs2_quota_unlock(ip);
-out_alloc:
- gfs2_alloc_put(ip);
- return error;
-}
-
-/**
- * gfs2_setattr - Change attributes on an inode
- * @dentry: The dentry which is changing
- * @attr: The structure describing the change
- *
- * The VFS layer wants to change one or more of an inodes attributes. Write
- * that change out to disk.
- *
- * Returns: errno
- */
-
-static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
-{
- struct inode *inode = dentry->d_inode;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder i_gh;
- int error;
-
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
- if (error)
- return error;
-
- error = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out;
-
- error = inode_change_ok(inode, attr);
- if (error)
- goto out;
-
- if (attr->ia_valid & ATTR_SIZE)
- error = gfs2_setattr_size(inode, attr->ia_size);
- else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
- error = setattr_chown(inode, attr);
- else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
- error = gfs2_acl_chmod(ip, attr);
- else
- error = gfs2_setattr_simple(ip, attr);
-
-out:
- gfs2_glock_dq_uninit(&i_gh);
- if (!error)
- mark_inode_dirty(inode);
- return error;
-}
-
-/**
- * gfs2_getattr - Read out an inode's attributes
- * @mnt: The vfsmount the inode is being accessed from
- * @dentry: The dentry to stat
- * @stat: The inode's stats
- *
- * This may be called from the VFS directly, or from within GFS2 with the
- * inode locked, so we look to see if the glock is already locked and only
- * lock the glock if its not already been done. Note that its the NFS
- * readdirplus operation which causes this to be called (from filldir)
- * with the glock already held.
- *
- * Returns: errno
- */
-
-static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
-{
- struct inode *inode = dentry->d_inode;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- int error;
- int unlock = 0;
-
- if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
- if (error)
- return error;
- unlock = 1;
- }
-
- generic_fillattr(inode, stat);
- if (unlock)
- gfs2_glock_dq_uninit(&gh);
-
- return 0;
-}
-
-static int gfs2_setxattr(struct dentry *dentry, const char *name,
- const void *data, size_t size, int flags)
-{
- struct inode *inode = dentry->d_inode;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- int ret;
-
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- ret = gfs2_glock_nq(&gh);
- if (ret == 0) {
- ret = generic_setxattr(dentry, name, data, size, flags);
- gfs2_glock_dq(&gh);
- }
- gfs2_holder_uninit(&gh);
- return ret;
-}
-
-static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
- void *data, size_t size)
-{
- struct inode *inode = dentry->d_inode;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- int ret;
-
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
- ret = gfs2_glock_nq(&gh);
- if (ret == 0) {
- ret = generic_getxattr(dentry, name, data, size);
- gfs2_glock_dq(&gh);
- }
- gfs2_holder_uninit(&gh);
- return ret;
-}
-
-static int gfs2_removexattr(struct dentry *dentry, const char *name)
-{
- struct inode *inode = dentry->d_inode;
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- int ret;
-
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
- ret = gfs2_glock_nq(&gh);
- if (ret == 0) {
- ret = generic_removexattr(dentry, name);
- gfs2_glock_dq(&gh);
- }
- gfs2_holder_uninit(&gh);
- return ret;
-}
-
-static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
- u64 start, u64 len)
-{
- struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_holder gh;
- int ret;
-
- ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
- if (ret)
- return ret;
-
- mutex_lock(&inode->i_mutex);
-
- ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
- if (ret)
- goto out;
-
- if (gfs2_is_stuffed(ip)) {
- u64 phys = ip->i_no_addr << inode->i_blkbits;
- u64 size = i_size_read(inode);
- u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED|
- FIEMAP_EXTENT_DATA_INLINE;
- phys += sizeof(struct gfs2_dinode);
- phys += start;
- if (start + len > size)
- len = size - start;
- if (start < size)
- ret = fiemap_fill_next_extent(fieinfo, start, phys,
- len, flags);
- if (ret == 1)
- ret = 0;
- } else {
- ret = __generic_block_fiemap(inode, fieinfo, start, len,
- gfs2_block_map);
- }
-
- gfs2_glock_dq_uninit(&gh);
-out:
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-
-const struct inode_operations gfs2_file_iops = {
- .permission = gfs2_permission,
- .setattr = gfs2_setattr,
- .getattr = gfs2_getattr,
- .setxattr = gfs2_setxattr,
- .getxattr = gfs2_getxattr,
- .listxattr = gfs2_listxattr,
- .removexattr = gfs2_removexattr,
- .fiemap = gfs2_fiemap,
-};
-
-const struct inode_operations gfs2_dir_iops = {
- .create = gfs2_create,
- .lookup = gfs2_lookup,
- .link = gfs2_link,
- .unlink = gfs2_unlink,
- .symlink = gfs2_symlink,
- .mkdir = gfs2_mkdir,
- .rmdir = gfs2_rmdir,
- .mknod = gfs2_mknod,
- .rename = gfs2_rename,
- .permission = gfs2_permission,
- .setattr = gfs2_setattr,
- .getattr = gfs2_getattr,
- .setxattr = gfs2_setxattr,
- .getxattr = gfs2_getxattr,
- .listxattr = gfs2_listxattr,
- .removexattr = gfs2_removexattr,
- .fiemap = gfs2_fiemap,
-};
-
-const struct inode_operations gfs2_symlink_iops = {
- .readlink = generic_readlink,
- .follow_link = gfs2_follow_link,
- .put_link = gfs2_put_link,
- .permission = gfs2_permission,
- .setattr = gfs2_setattr,
- .getattr = gfs2_getattr,
- .setxattr = gfs2_setxattr,
- .getxattr = gfs2_getxattr,
- .listxattr = gfs2_listxattr,
- .removexattr = gfs2_removexattr,
- .fiemap = gfs2_fiemap,
-};
-
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index e23d9864c418..42e8d23bc047 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -38,6 +38,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
@@ -77,19 +78,20 @@ static LIST_HEAD(qd_lru_list);
static atomic_t qd_lru_count = ATOMIC_INIT(0);
static DEFINE_SPINLOCK(qd_lru_lock);
-int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
{
struct gfs2_quota_data *qd;
struct gfs2_sbd *sdp;
+ int nr_to_scan = sc->nr_to_scan;
- if (nr == 0)
+ if (nr_to_scan == 0)
goto out;
- if (!(gfp_mask & __GFP_FS))
+ if (!(sc->gfp_mask & __GFP_FS))
return -1;
spin_lock(&qd_lru_lock);
- while (nr && !list_empty(&qd_lru_list)) {
+ while (nr_to_scan && !list_empty(&qd_lru_list)) {
qd = list_entry(qd_lru_list.next,
struct gfs2_quota_data, qd_reclaim);
sdp = qd->qd_gl->gl_sbd;
@@ -110,7 +112,7 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
spin_unlock(&qd_lru_lock);
kmem_cache_free(gfs2_quotad_cachep, qd);
spin_lock(&qd_lru_lock);
- nr--;
+ nr_to_scan--;
}
spin_unlock(&qd_lru_lock);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index e7d236ca48bd..90bf1c302a98 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -12,6 +12,7 @@
struct gfs2_inode;
struct gfs2_sbd;
+struct shrink_control;
#define NO_QUOTA_CHANGE ((u32)-1)
@@ -51,7 +52,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
return ret;
}
-extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink,
+ struct shrink_control *sc);
extern const struct quotactl_ops gfs2_quotactl_ops;
#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 6fcae8469f6d..9b780df3fd54 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -78,10 +78,11 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
unsigned char *buf2, unsigned int offset,
- unsigned int buflen, u32 block,
+ struct gfs2_bitmap *bi, u32 block,
unsigned char new_state)
{
unsigned char *byte1, *byte2, *end, cur_state;
+ unsigned int buflen = bi->bi_len;
const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
byte1 = buf1 + offset + (block / GFS2_NBBY);
@@ -92,6 +93,16 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
if (unlikely(!valid_change[new_state * 4 + cur_state])) {
+ printk(KERN_WARNING "GFS2: buf_blk = 0x%llx old_state=%d, "
+ "new_state=%d\n",
+ (unsigned long long)block, cur_state, new_state);
+ printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%lx\n",
+ (unsigned long long)rgd->rd_addr,
+ (unsigned long)bi->bi_start);
+ printk(KERN_WARNING "GFS2: bi_offset=0x%lx bi_len=0x%lx\n",
+ (unsigned long)bi->bi_offset,
+ (unsigned long)bi->bi_len);
+ dump_stack();
gfs2_consist_rgrpd(rgd);
return;
}
@@ -381,6 +392,7 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
if (gl) {
gl->gl_object = NULL;
+ gfs2_glock_add_to_lru(gl);
gfs2_glock_put(gl);
}
@@ -1365,7 +1377,7 @@ skip:
gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
- bi->bi_len, blk, new_state);
+ bi, blk, new_state);
goal = blk;
while (*n < elen) {
goal++;
@@ -1375,7 +1387,7 @@ skip:
GFS2_BLKST_FREE)
break;
gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
- bi->bi_len, goal, new_state);
+ bi, goal, new_state);
(*n)++;
}
out:
@@ -1432,7 +1444,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
}
gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset,
- bi->bi_len, buf_blk, new_state);
+ bi, buf_blk, new_state);
}
return rgd;
@@ -1617,6 +1629,10 @@ void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
gfs2_trans_add_rg(rgd);
+
+ /* Directories keep their data in the metadata address space */
+ if (ip->i_depth)
+ gfs2_meta_wipe(ip, bstart, blen);
}
/**
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b9f28e66dad1..ed540e7018be 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -23,6 +23,7 @@
#include <linux/time.h>
#include <linux/wait.h>
#include <linux/writeback.h>
+#include <linux/backing-dev.h>
#include "gfs2.h"
#include "incore.h"
@@ -700,11 +701,47 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
mutex_unlock(&sdp->sd_freeze_lock);
}
+void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
+{
+ struct gfs2_dinode *str = buf;
+
+ str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+ str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI);
+ str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
+ str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
+ str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
+ str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
+ str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
+ str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
+ str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
+ str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
+ str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
+ str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
+ str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
+ str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
+
+ str->di_goal_meta = cpu_to_be64(ip->i_goal);
+ str->di_goal_data = cpu_to_be64(ip->i_goal);
+ str->di_generation = cpu_to_be64(ip->i_generation);
+
+ str->di_flags = cpu_to_be32(ip->i_diskflags);
+ str->di_height = cpu_to_be16(ip->i_height);
+ str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
+ !(ip->i_diskflags & GFS2_DIF_EXHASH) ?
+ GFS2_FORMAT_DE : 0);
+ str->di_depth = cpu_to_be16(ip->i_depth);
+ str->di_entries = cpu_to_be32(ip->i_entries);
+
+ str->di_eattr = cpu_to_be64(ip->i_eattr);
+ str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
+ str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
+ str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
+}
/**
* gfs2_write_inode - Make sure the inode is stable on the disk
* @inode: The inode
- * @sync: synchronous write flag
+ * @wbc: The writeback control structure
*
* Returns: errno
*/
@@ -713,15 +750,17 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
+ struct backing_dev_info *bdi = metamapping->backing_dev_info;
struct gfs2_holder gh;
struct buffer_head *bh;
struct timespec atime;
struct gfs2_dinode *di;
- int ret = 0;
+ int ret = -EAGAIN;
- /* Check this is a "normal" inode, etc */
+ /* Skip timestamp update, if this is from a memalloc */
if (current->flags & PF_MEMALLOC)
- return 0;
+ goto do_flush;
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
if (ret)
goto do_flush;
@@ -745,6 +784,13 @@ do_unlock:
do_flush:
if (wbc->sync_mode == WB_SYNC_ALL)
gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+ filemap_fdatawrite(metamapping);
+ if (bdi->dirty_exceeded)
+ gfs2_ail1_flush(sdp, wbc);
+ if (!ret && (wbc->sync_mode == WB_SYNC_ALL))
+ ret = filemap_fdatawait(metamapping);
+ if (ret)
+ mark_inode_dirty_sync(inode);
return ret;
}
@@ -874,8 +920,9 @@ restart:
static int gfs2_sync_fs(struct super_block *sb, int wait)
{
- if (wait && sb->s_fs_info)
- gfs2_log_flush(sb->s_fs_info, NULL);
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ if (wait && sdp)
+ gfs2_log_flush(sdp, NULL);
return 0;
}
@@ -1308,6 +1355,78 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
return 0;
}
+static void gfs2_final_release_pages(struct gfs2_inode *ip)
+{
+ struct inode *inode = &ip->i_inode;
+ struct gfs2_glock *gl = ip->i_gl;
+
+ truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0);
+ truncate_inode_pages(&inode->i_data, 0);
+
+ if (atomic_read(&gl->gl_revokes) == 0) {
+ clear_bit(GLF_LFLUSH, &gl->gl_flags);
+ clear_bit(GLF_DIRTY, &gl->gl_flags);
+ }
+}
+
+static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc *al;
+ struct gfs2_rgrpd *rgd;
+ int error;
+
+ if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+
+ al = gfs2_alloc_get(ip);
+ if (!al)
+ return -ENOMEM;
+
+ error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
+ if (error)
+ goto out;
+
+ error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
+ if (error)
+ goto out_qs;
+
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
+ if (!rgd) {
+ gfs2_consist_inode(ip);
+ error = -EIO;
+ goto out_rindex_relse;
+ }
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0,
+ &al->al_rgd_gh);
+ if (error)
+ goto out_rindex_relse;
+
+ error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA,
+ sdp->sd_jdesc->jd_blocks);
+ if (error)
+ goto out_rg_gunlock;
+
+ gfs2_free_di(rgd, ip);
+
+ gfs2_final_release_pages(ip);
+
+ gfs2_trans_end(sdp);
+
+out_rg_gunlock:
+ gfs2_glock_dq_uninit(&al->al_rgd_gh);
+out_rindex_relse:
+ gfs2_glock_dq_uninit(&al->al_ri_gh);
+out_qs:
+ gfs2_quota_unhold(ip);
+out:
+ gfs2_alloc_put(ip);
+ return error;
+}
+
/*
* We have to (at the moment) hold the inodes main lock to cover
* the gap between unlocking the shared lock on the iopen lock and
@@ -1371,15 +1490,13 @@ static void gfs2_evict_inode(struct inode *inode)
}
error = gfs2_dinode_dealloc(ip);
- if (error)
- goto out_unlock;
+ goto out_unlock;
out_truncate:
error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
if (error)
goto out_unlock;
- /* Needs to be done before glock release & also in a transaction */
- truncate_inode_pages(&inode->i_data, 0);
+ gfs2_final_release_pages(ip);
gfs2_trans_end(sdp);
out_unlock:
@@ -1394,6 +1511,7 @@ out:
end_writeback(inode);
ip->i_gl->gl_object = NULL;
+ gfs2_glock_add_to_lru(ip->i_gl);
gfs2_glock_put(ip->i_gl);
ip->i_gl = NULL;
if (ip->i_iopen_gh.gh_gl) {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 748ccb557c18..e20eab37bc80 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -81,7 +81,8 @@ static int gfs2_uuid_valid(const u8 *uuid)
static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
{
- const u8 *uuid = sdp->sd_sb.sb_uuid;
+ struct super_block *s = sdp->sd_vfs;
+ const u8 *uuid = s->s_uuid;
buf[0] = '\0';
if (!gfs2_uuid_valid(uuid))
return 0;
@@ -616,7 +617,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
struct kobj_uevent_env *env)
{
struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
- const u8 *uuid = sdp->sd_sb.sb_uuid;
+ struct super_block *s = sdp->sd_vfs;
+ const u8 *uuid = s->s_uuid;
add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index cedb0bb96d96..5d07609ec57d 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -10,6 +10,7 @@
#include <linux/buffer_head.h>
#include <linux/dlmconstants.h>
#include <linux/gfs2_ondisk.h>
+#include <linux/writeback.h>
#include "incore.h"
#include "glock.h"
@@ -40,7 +41,9 @@
{(1UL << GLF_REPLY_PENDING), "r" }, \
{(1UL << GLF_INITIAL), "I" }, \
{(1UL << GLF_FROZEN), "F" }, \
- {(1UL << GLF_QUEUED), "q" })
+ {(1UL << GLF_QUEUED), "q" }, \
+ {(1UL << GLF_LRU), "L" }, \
+ {(1UL << GLF_OBJECT), "o" })
#ifndef NUMPTY
#define NUMPTY
@@ -94,7 +97,7 @@ TRACE_EVENT(gfs2_glock_state_change,
__entry->new_state = glock_trace_state(new_state);
__entry->tgt_state = glock_trace_state(gl->gl_target);
__entry->dmt_state = glock_trace_state(gl->gl_demote_state);
- __entry->flags = gl->gl_flags;
+ __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
),
TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s",
@@ -127,7 +130,7 @@ TRACE_EVENT(gfs2_glock_put,
__entry->gltype = gl->gl_name.ln_type;
__entry->glnum = gl->gl_name.ln_number;
__entry->cur_state = glock_trace_state(gl->gl_state);
- __entry->flags = gl->gl_flags;
+ __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
),
TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s",
@@ -161,7 +164,7 @@ TRACE_EVENT(gfs2_demote_rq,
__entry->glnum = gl->gl_name.ln_number;
__entry->cur_state = glock_trace_state(gl->gl_state);
__entry->dmt_state = glock_trace_state(gl->gl_demote_state);
- __entry->flags = gl->gl_flags;
+ __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0);
),
TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s",
@@ -318,6 +321,33 @@ TRACE_EVENT(gfs2_log_blocks,
MINOR(__entry->dev), __entry->blocks)
);
+/* Writing back the AIL */
+TRACE_EVENT(gfs2_ail_flush,
+
+ TP_PROTO(const struct gfs2_sbd *sdp, const struct writeback_control *wbc, int start),
+
+ TP_ARGS(sdp, wbc, start),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( int, start )
+ __field( int, sync_mode )
+ __field( long, nr_to_write )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sdp->sd_vfs->s_dev;
+ __entry->start = start;
+ __entry->sync_mode = wbc->sync_mode;
+ __entry->nr_to_write = wbc->nr_to_write;
+ ),
+
+ TP_printk("%u,%u ail flush %s %s %ld", MAJOR(__entry->dev),
+ MINOR(__entry->dev), __entry->start ? "start" : "end",
+ __entry->sync_mode == WB_SYNC_ALL ? "all" : "none",
+ __entry->nr_to_write)
+);
+
/* Section 3 - bmap
*
* Objectives:
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b4d70b13be92..1cb70cdba2c1 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,6 +253,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int res;
+ if (S_ISDIR(inode->i_mode))
+ dentry_unhash(dentry);
+
if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
return -ENOTEMPTY;
res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -283,6 +286,9 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* Unlink destination if it already exists */
if (new_dentry->d_inode) {
+ if (S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
res = hfs_remove(new_dir, new_dentry);
if (res)
return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4df5059c25da..b28835091dd0 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,6 +370,8 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int res;
+ dentry_unhash(dentry);
+
if (inode->i_size != 2)
return -ENOTEMPTY;
@@ -467,10 +469,12 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
/* Unlink destination if it already exists */
if (new_dentry->d_inode) {
- if (S_ISDIR(new_dentry->d_inode->i_mode))
+ if (S_ISDIR(new_dentry->d_inode->i_mode)) {
+ dentry_unhash(new_dentry);
res = hfsplus_rmdir(new_dir, new_dentry);
- else
+ } else {
res = hfsplus_unlink(new_dir, new_dentry);
+ }
if (res)
return res;
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2638c834ed28..e6816b9e6903 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,6 +683,8 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
char *file;
int err;
+ dentry_unhash(dentry);
+
if ((file = dentry_name(dentry)) == NULL)
return -ENOMEM;
err = do_rmdir(file);
@@ -736,6 +738,9 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
char *from_name, *to_name;
int err;
+ if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
+ dentry_unhash(to);
+
if ((from_name = dentry_name(from)) == NULL)
return -ENOMEM;
if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 1f05839c27a7..ff0ce21c0867 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -395,7 +395,6 @@ again:
dentry_unhash(dentry);
if (!d_unhashed(dentry)) {
- dput(dentry);
hpfs_unlock(dir->i_sb);
return -ENOSPC;
}
@@ -403,7 +402,6 @@ again:
!S_ISREG(inode->i_mode) ||
get_write_access(inode)) {
d_rehash(dentry);
- dput(dentry);
} else {
struct iattr newattrs;
/*printk("HPFS: truncating file before delete.\n");*/
@@ -411,7 +409,6 @@ again:
newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
err = notify_change(dentry, &newattrs);
put_write_access(inode);
- dput(dentry);
if (!err)
goto again;
}
@@ -442,6 +439,8 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
int err;
int r;
+ dentry_unhash(dentry);
+
hpfs_adjust_length(name, &len);
hpfs_lock(dir->i_sb);
err = -ENOENT;
@@ -535,6 +534,10 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh;
struct fnode *fnode;
int err;
+
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
if ((err = hpfs_chk_name(new_name, &new_len))) return err;
err = 0;
hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9eeb1cd03ff..7aafeb8fa300 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
pgoff = offset >> PAGE_SHIFT;
i_size_write(inode, offset);
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
if (!prio_tree_empty(&mapping->i_mmap))
hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
truncate_hugepages(inode, offset);
return 0;
}
@@ -921,7 +921,8 @@ static int can_do_hugetlb_shm(void)
return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
}
-struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
+struct file *hugetlb_file_setup(const char *name, size_t size,
+ vm_flags_t acctflag,
struct user_struct **user, int creat_flags)
{
int error = -ENOMEM;
diff --git a/fs/inode.c b/fs/inode.c
index 33c963d08ab4..990d284877a1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -24,6 +24,7 @@
#include <linux/mount.h>
#include <linux/async.h>
#include <linux/posix_acl.h>
+#include <linux/prefetch.h>
#include <linux/ima.h>
#include <linux/cred.h>
#include "internal.h"
@@ -325,12 +326,11 @@ void address_space_init_once(struct address_space *mapping)
memset(mapping, 0, sizeof(*mapping));
INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
spin_lock_init(&mapping->tree_lock);
- spin_lock_init(&mapping->i_mmap_lock);
+ mutex_init(&mapping->i_mmap_mutex);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
- mutex_init(&mapping->unmap_mutex);
}
EXPORT_SYMBOL(address_space_init_once);
@@ -751,8 +751,12 @@ static void prune_icache(int nr_to_scan)
* This function is passed the number of inodes to scan, and it returns the
* total number of remaining possibly-reclaimable inodes.
*/
-static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink,
+ struct shrink_control *sc)
{
+ int nr = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
+
if (nr) {
/*
* Nasty deadlock avoidance. We may hold various FS locks,
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 69b180459463..72ffa974b0b8 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal)
* all outstanding updates to complete.
*/
-#ifdef COMMIT_STATS
- spin_lock(&journal->j_list_lock);
- summarise_journal_usage(journal);
- spin_unlock(&journal->j_list_lock);
-#endif
-
/* Do we need to erase the effects of a prior journal_flush? */
if (journal->j_flags & JFS_FLUSHED) {
jbd_debug(3, "super block updated\n");
@@ -722,8 +716,13 @@ wait_for_iobuf:
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
journal_file_buffer(jh, commit_transaction, BJ_Forget);
- /* Wake up any transactions which were waiting for this
- IO to complete */
+ /*
+ * Wake up any transactions which were waiting for this
+ * IO to complete. The barrier must be here so that changes
+ * by journal_file_buffer() take effect before wake_up_bit()
+ * does the waitqueue check.
+ */
+ smp_mb();
wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b3713afaaa9e..e2d4285fbe90 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal)
int __log_start_commit(journal_t *journal, tid_t target)
{
/*
- * Are we already doing a recent enough commit?
+ * The only transaction we can possibly wait upon is the
+ * currently running transaction (if it exists). Otherwise,
+ * the target tid must be an old one.
*/
- if (!tid_geq(journal->j_commit_request, target)) {
+ if (journal->j_running_transaction &&
+ journal->j_running_transaction->t_tid == target) {
/*
* We want a new commit: OK, mark the request and wakeup the
* commit thread. We do _not_ do the commit ourselves.
@@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target)
journal->j_commit_sequence);
wake_up(&journal->j_wait_commit);
return 1;
- }
+ } else if (!tid_geq(journal->j_commit_request, target))
+ /* This should never happen, but if it does, preserve
+ the evidence before kjournald goes into a loop and
+ increments j_commit_sequence beyond all recognition. */
+ WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+ journal->j_commit_request, journal->j_commit_sequence,
+ target, journal->j_running_transaction ?
+ journal->j_running_transaction->t_tid : 0);
return 0;
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 60d2319651b2..f7ee81a065da 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks)
* This function is visible to journal users (like ext3fs), so is not
* called with the journal already locked.
*
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
*/
handle_t *journal_start(journal_t *journal, int nblocks)
{
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6e28000a4b21..7f21cf3aaf92 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -219,7 +219,6 @@ static int journal_submit_data_buffers(journal_t *journal,
ret = err;
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
- commit_transaction->t_flushed_data_blocks = 1;
clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
smp_mb__after_clear_bit();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
@@ -338,12 +337,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* all outstanding updates to complete.
*/
-#ifdef COMMIT_STATS
- spin_lock(&journal->j_list_lock);
- summarise_journal_usage(journal);
- spin_unlock(&journal->j_list_lock);
-#endif
-
/* Do we need to erase the effects of a prior jbd2_journal_flush? */
if (journal->j_flags & JBD2_FLUSHED) {
jbd_debug(3, "super block updated\n");
@@ -678,12 +671,16 @@ start_journal_io:
err = 0;
}
+ write_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_COMMIT);
+ commit_transaction->t_state = T_COMMIT_DFLUSH;
+ write_unlock(&journal->j_state_lock);
/*
* If the journal is not located on the file system device,
* then we must flush the file system device before we issue
* the commit record
*/
- if (commit_transaction->t_flushed_data_blocks &&
+ if (commit_transaction->t_need_data_flush &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
@@ -760,8 +757,13 @@ wait_for_iobuf:
required. */
JBUFFER_TRACE(jh, "file as BJ_Forget");
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
- /* Wake up any transactions which were waiting for this
- IO to complete */
+ /*
+ * Wake up any transactions which were waiting for this IO to
+ * complete. The barrier must be here so that changes by
+ * jbd2_journal_file_buffer() take effect before wake_up_bit()
+ * does the waitqueue check.
+ */
+ smp_mb();
wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh);
@@ -800,6 +802,10 @@ wait_for_iobuf:
jbd2_journal_abort(journal, err);
jbd_debug(3, "JBD: commit phase 5\n");
+ write_lock(&journal->j_state_lock);
+ J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
+ commit_transaction->t_state = T_COMMIT_JFLUSH;
+ write_unlock(&journal->j_state_lock);
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -955,7 +961,7 @@ restart_loop:
jbd_debug(3, "JBD: commit phase 7\n");
- J_ASSERT(commit_transaction->t_state == T_COMMIT);
+ J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
commit_transaction->t_start = jiffies;
stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e0ec3db1c395..9a7826990304 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -479,9 +479,12 @@ int __jbd2_log_space_left(journal_t *journal)
int __jbd2_log_start_commit(journal_t *journal, tid_t target)
{
/*
- * Are we already doing a recent enough commit?
+ * The only transaction we can possibly wait upon is the
+ * currently running transaction (if it exists). Otherwise,
+ * the target tid must be an old one.
*/
- if (!tid_geq(journal->j_commit_request, target)) {
+ if (journal->j_running_transaction &&
+ journal->j_running_transaction->t_tid == target) {
/*
* We want a new commit: OK, mark the request and wakeup the
* commit thread. We do _not_ do the commit ourselves.
@@ -493,7 +496,15 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target)
journal->j_commit_sequence);
wake_up(&journal->j_wait_commit);
return 1;
- }
+ } else if (!tid_geq(journal->j_commit_request, target))
+ /* This should never happen, but if it does, preserve
+ the evidence before kjournald goes into a loop and
+ increments j_commit_sequence beyond all recognition. */
+ WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n",
+ journal->j_commit_request,
+ journal->j_commit_sequence,
+ target, journal->j_running_transaction ?
+ journal->j_running_transaction->t_tid : 0);
return 0;
}
@@ -577,6 +588,47 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
}
/*
+ * Return 1 if a given transaction has not yet sent barrier request
+ * connected with a transaction commit. If 0 is returned, transaction
+ * may or may not have sent the barrier. Used to avoid sending barrier
+ * twice in common cases.
+ */
+int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
+{
+ int ret = 0;
+ transaction_t *commit_trans;
+
+ if (!(journal->j_flags & JBD2_BARRIER))
+ return 0;
+ read_lock(&journal->j_state_lock);
+ /* Transaction already committed? */
+ if (tid_geq(journal->j_commit_sequence, tid))
+ goto out;
+ commit_trans = journal->j_committing_transaction;
+ if (!commit_trans || commit_trans->t_tid != tid) {
+ ret = 1;
+ goto out;
+ }
+ /*
+ * Transaction is being committed and we already proceeded to
+ * submitting a flush to fs partition?
+ */
+ if (journal->j_fs_dev != journal->j_dev) {
+ if (!commit_trans->t_need_data_flush ||
+ commit_trans->t_state >= T_COMMIT_DFLUSH)
+ goto out;
+ } else {
+ if (commit_trans->t_state >= T_COMMIT_JFLUSH)
+ goto out;
+ }
+ ret = 1;
+out:
+ read_unlock(&journal->j_state_lock);
+ return ret;
+}
+EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
+
+/*
* Wait for a specified commit to complete.
* The caller may not hold the journal lock.
*/
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 05fa77a23711..3eec82d32fd4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -82,7 +82,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
*/
/*
- * Update transiaction's maximum wait time, if debugging is enabled.
+ * Update transaction's maximum wait time, if debugging is enabled.
*
* In order for t_max_wait to be reliable, it must be protected by a
* lock. But doing so will mean that start_this_handle() can not be
@@ -91,11 +91,10 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
* means that maximum wait time reported by the jbd2_run_stats
* tracepoint will always be zero.
*/
-static inline void update_t_max_wait(transaction_t *transaction)
+static inline void update_t_max_wait(transaction_t *transaction,
+ unsigned long ts)
{
#ifdef CONFIG_JBD2_DEBUG
- unsigned long ts = jiffies;
-
if (jbd2_journal_enable_debug &&
time_after(transaction->t_start, ts)) {
ts = jbd2_time_diff(ts, transaction->t_start);
@@ -121,6 +120,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
tid_t tid;
int needed, need_to_start;
int nblocks = handle->h_buffer_credits;
+ unsigned long ts = jiffies;
if (nblocks > journal->j_max_transaction_buffers) {
printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -271,7 +271,7 @@ repeat:
/* OK, account for the buffers that this operation expects to
* use and add the handle to the running transaction.
*/
- update_t_max_wait(transaction);
+ update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
atomic_inc(&transaction->t_updates);
atomic_inc(&transaction->t_handle_count);
@@ -316,7 +316,8 @@ static handle_t *new_handle(int nblocks)
* This function is visible to journal users (like ext3fs), so is not
* called with the journal already locked.
*
- * Return a pointer to a newly allocated handle, or NULL on failure
+ * Return a pointer to a newly allocated handle, or an ERR_PTR() value
+ * on failure.
*/
handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
{
@@ -921,8 +922,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
*/
JBUFFER_TRACE(jh, "cancelling revoke");
jbd2_journal_cancel_revoke(handle, jh);
- jbd2_journal_put_journal_head(jh);
out:
+ jbd2_journal_put_journal_head(jh);
return err;
}
@@ -2147,6 +2148,13 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
jinode->i_next_transaction == transaction)
goto done;
+ /*
+ * We only ever set this variable to 1 so the test is safe. Since
+ * t_need_data_flush is likely to be set, we do the test to save some
+ * cacheline bouncing
+ */
+ if (!transaction->t_need_data_flush)
+ transaction->t_need_data_flush = 1;
/* On some different transaction's list - should be
* the committing one */
if (jinode->i_transaction) {
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 82faddd1f321..05f73328b28b 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -609,6 +609,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
int ret;
uint32_t now = get_seconds();
+ dentry_unhash(dentry);
+
for (fd = f->dents ; fd; fd = fd->next) {
if (fd->ino)
return -ENOTEMPTY;
@@ -784,6 +786,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
uint8_t type;
uint32_t now;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
/* The VFS will check for us and prevent trying to rename a
* file over a directory and vice versa, but if it's a directory,
* the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index eaaf2b511e89..865df16a6cf3 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,6 +360,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
+ dentry_unhash(dentry);
+
/* Init inode for quota operations. */
dquot_initialize(dip);
dquot_initialize(ip);
@@ -1095,6 +1097,9 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
new_dentry->d_name.name);
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
dquot_initialize(old_dir);
dquot_initialize(new_dir);
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 1adc8d455f0e..df0de27c2733 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -10,6 +10,7 @@
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/gfp.h>
+#include <linux/prefetch.h>
#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 9ed89d1663f8..f34c9cde9e94 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,6 +273,8 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ dentry_unhash(dentry);
+
if (!logfs_empty_dir(inode))
return -ENOTEMPTY;
@@ -622,6 +624,9 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
loff_t pos;
int err;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
/* 1. locate source dd */
err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
if (err)
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 9e22085231b3..d8d09380c7de 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -481,7 +481,7 @@ static int inode_write_alias(struct super_block *sb,
val = inode_val0(inode);
break;
case INODE_USED_OFS:
- val = cpu_to_be64(li->li_used_bytes);;
+ val = cpu_to_be64(li->li_used_bytes);
break;
case INODE_SIZE_OFS:
val = cpu_to_be64(i_size_read(inode));
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 2f174be06555..8c32ef3ba88e 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -90,7 +90,8 @@ static DEFINE_SPINLOCK(mb_cache_spinlock);
* What the mbcache registers as to get shrunk dynamically.
*/
-static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink,
+ struct shrink_control *sc);
static struct shrinker mb_cache_shrinker = {
.shrink = mb_cache_shrink_fn,
@@ -156,18 +157,19 @@ forget:
* gets low.
*
* @shrink: (ignored)
- * @nr_to_scan: Number of objects to scan
- * @gfp_mask: (ignored)
+ * @sc: shrink_control passed from reclaim
*
* Returns the number of objects which are present in the cache.
*/
static int
-mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
{
LIST_HEAD(free_list);
struct mb_cache *cache;
struct mb_cache_entry *entry, *tmp;
int count = 0;
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
mb_debug("trying to free %d entries", nr_to_scan);
spin_lock(&mb_cache_spinlock);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 6e6777f1b4b2..f60aed8db9c4 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,6 +168,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
struct inode * inode = dentry->d_inode;
int err = -ENOTEMPTY;
+ dentry_unhash(dentry);
+
if (minix_empty_dir(inode)) {
err = minix_unlink(dir, dentry);
if (!err) {
@@ -190,6 +192,9 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
struct minix_dir_entry * old_de;
int err = -ENOENT;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
old_de = minix_find_entry(old_dentry, &old_page);
if (!old_de)
goto out;
diff --git a/fs/namei.c b/fs/namei.c
index e3c4f112ebf7..2358b326b221 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -391,79 +391,28 @@ void path_put(struct path *path)
}
EXPORT_SYMBOL(path_put);
-/**
- * nameidata_drop_rcu - drop this nameidata out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
+/*
* Path walking has 2 modes, rcu-walk and ref-walk (see
- * Documentation/filesystems/path-lookup.txt). __drop_rcu* functions attempt
- * to drop out of rcu-walk mode and take normal reference counts on dentries
- * and vfsmounts to transition to rcu-walk mode. __drop_rcu* functions take
- * refcounts at the last known good point before rcu-walk got stuck, so
- * ref-walk may continue from there. If this is not successful (eg. a seqcount
- * has changed), then failure is returned and path walk restarts from the
- * beginning in ref-walk mode.
- *
- * nameidata_drop_rcu attempts to drop the current nd->path and nd->root into
- * ref-walk. Must be called from rcu-walk context.
+ * Documentation/filesystems/path-lookup.txt). In situations when we can't
+ * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
+ * normal reference counts on dentries and vfsmounts to transition to ref-walk
+ * mode. Refcounts are grabbed at the last known good point before rcu-walk
+ * got stuck, so ref-walk may continue from there. If this is not successful
+ * (eg. a seqcount has changed), then failure is returned and it's up to caller
+ * to restart the path walk from the beginning in ref-walk mode.
*/
-static int nameidata_drop_rcu(struct nameidata *nd)
-{
- struct fs_struct *fs = current->fs;
- struct dentry *dentry = nd->path.dentry;
- int want_root = 0;
-
- BUG_ON(!(nd->flags & LOOKUP_RCU));
- if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
- want_root = 1;
- spin_lock(&fs->lock);
- if (nd->root.mnt != fs->root.mnt ||
- nd->root.dentry != fs->root.dentry)
- goto err_root;
- }
- spin_lock(&dentry->d_lock);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err;
- BUG_ON(nd->inode != dentry->d_inode);
- spin_unlock(&dentry->d_lock);
- if (want_root) {
- path_get(&nd->root);
- spin_unlock(&fs->lock);
- }
- mntget(nd->path.mnt);
-
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
- nd->flags &= ~LOOKUP_RCU;
- return 0;
-err:
- spin_unlock(&dentry->d_lock);
-err_root:
- if (want_root)
- spin_unlock(&fs->lock);
- return -ECHILD;
-}
-
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
-static inline int nameidata_drop_rcu_maybe(struct nameidata *nd)
-{
- if (nd->flags & LOOKUP_RCU)
- return nameidata_drop_rcu(nd);
- return 0;
-}
/**
- * nameidata_dentry_drop_rcu - drop nameidata and dentry out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * @dentry: dentry to drop
+ * unlazy_walk - try to switch to ref-walk mode.
+ * @nd: nameidata pathwalk data
+ * @dentry: child of nd->path.dentry or NULL
* Returns: 0 on success, -ECHILD on failure
*
- * nameidata_dentry_drop_rcu attempts to drop the current nd->path and nd->root,
- * and dentry into ref-walk. @dentry must be a path found by a do_lookup call on
- * @nd. Must be called from rcu-walk context.
+ * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
+ * for ref-walk mode. @dentry must be a path found by a do_lookup call on
+ * @nd or NULL. Must be called from rcu-walk context.
*/
-static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry)
+static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
{
struct fs_struct *fs = current->fs;
struct dentry *parent = nd->path.dentry;
@@ -478,18 +427,25 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
goto err_root;
}
spin_lock(&parent->d_lock);
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err;
- /*
- * If the sequence check on the child dentry passed, then the child has
- * not been removed from its parent. This means the parent dentry must
- * be valid and able to take a reference at this point.
- */
- BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
- BUG_ON(!parent->d_count);
- parent->d_count++;
- spin_unlock(&dentry->d_lock);
+ if (!dentry) {
+ if (!__d_rcu_to_refcount(parent, nd->seq))
+ goto err_parent;
+ BUG_ON(nd->inode != parent->d_inode);
+ } else {
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ if (!__d_rcu_to_refcount(dentry, nd->seq))
+ goto err_child;
+ /*
+ * If the sequence check on the child dentry passed, then
+ * the child has not been removed from its parent. This
+ * means the parent dentry must be valid and able to take
+ * a reference at this point.
+ */
+ BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
+ BUG_ON(!parent->d_count);
+ parent->d_count++;
+ spin_unlock(&dentry->d_lock);
+ }
spin_unlock(&parent->d_lock);
if (want_root) {
path_get(&nd->root);
@@ -501,8 +457,10 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
br_read_unlock(vfsmount_lock);
nd->flags &= ~LOOKUP_RCU;
return 0;
-err:
+
+err_child:
spin_unlock(&dentry->d_lock);
+err_parent:
spin_unlock(&parent->d_lock);
err_root:
if (want_root)
@@ -510,59 +468,6 @@ err_root:
return -ECHILD;
}
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
-static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
-{
- if (nd->flags & LOOKUP_RCU) {
- if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
- nd->flags &= ~LOOKUP_RCU;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
- return -ECHILD;
- }
- }
- return 0;
-}
-
-/**
- * nameidata_drop_rcu_last - drop nameidata ending path walk out of rcu-walk
- * @nd: nameidata pathwalk data to drop
- * Returns: 0 on success, -ECHILD on failure
- *
- * nameidata_drop_rcu_last attempts to drop the current nd->path into ref-walk.
- * nd->path should be the final element of the lookup, so nd->root is discarded.
- * Must be called from rcu-walk context.
- */
-static int nameidata_drop_rcu_last(struct nameidata *nd)
-{
- struct dentry *dentry = nd->path.dentry;
-
- BUG_ON(!(nd->flags & LOOKUP_RCU));
- nd->flags &= ~LOOKUP_RCU;
- if (!(nd->flags & LOOKUP_ROOT))
- nd->root.mnt = NULL;
- spin_lock(&dentry->d_lock);
- if (!__d_rcu_to_refcount(dentry, nd->seq))
- goto err_unlock;
- BUG_ON(nd->inode != dentry->d_inode);
- spin_unlock(&dentry->d_lock);
-
- mntget(nd->path.mnt);
-
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
-
- return 0;
-
-err_unlock:
- spin_unlock(&dentry->d_lock);
- rcu_read_unlock();
- br_read_unlock(vfsmount_lock);
- return -ECHILD;
-}
-
/**
* release_open_intent - free up open intent resources
* @nd: pointer to nameidata
@@ -606,26 +511,39 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
return dentry;
}
-/*
- * handle_reval_path - force revalidation of a dentry
- *
- * In some situations the path walking code will trust dentries without
- * revalidating them. This causes problems for filesystems that depend on
- * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
- * (which indicates that it's possible for the dentry to go stale), force
- * a d_revalidate call before proceeding.
+/**
+ * complete_walk - successful completion of path walk
+ * @nd: pointer to nameidata
*
- * Returns 0 if the revalidation was successful. If the revalidation fails,
- * either return the error returned by d_revalidate or -ESTALE if the
- * revalidation it just returned 0. If d_revalidate returns 0, we attempt to
- * invalidate the dentry. It's up to the caller to handle putting references
- * to the path if necessary.
+ * If we had been in RCU mode, drop out of it and legitimize nd->path.
+ * Revalidate the final result, unless we'd already done that during
+ * the path walk or the filesystem doesn't ask for it. Return 0 on
+ * success, -error on failure. In case of failure caller does not
+ * need to drop nd->path.
*/
-static inline int handle_reval_path(struct nameidata *nd)
+static int complete_walk(struct nameidata *nd)
{
struct dentry *dentry = nd->path.dentry;
int status;
+ if (nd->flags & LOOKUP_RCU) {
+ nd->flags &= ~LOOKUP_RCU;
+ if (!(nd->flags & LOOKUP_ROOT))
+ nd->root.mnt = NULL;
+ spin_lock(&dentry->d_lock);
+ if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
+ spin_unlock(&dentry->d_lock);
+ rcu_read_unlock();
+ br_read_unlock(vfsmount_lock);
+ return -ECHILD;
+ }
+ BUG_ON(nd->inode != dentry->d_inode);
+ spin_unlock(&dentry->d_lock);
+ mntget(nd->path.mnt);
+ rcu_read_unlock();
+ br_read_unlock(vfsmount_lock);
+ }
+
if (likely(!(nd->flags & LOOKUP_JUMPED)))
return 0;
@@ -643,6 +561,7 @@ static inline int handle_reval_path(struct nameidata *nd)
if (!status)
status = -ESTALE;
+ path_put(&nd->path);
return status;
}
@@ -1241,13 +1160,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
if (likely(__follow_mount_rcu(nd, path, inode, false)))
return 0;
unlazy:
- if (dentry) {
- if (nameidata_dentry_drop_rcu(nd, dentry))
- return -ECHILD;
- } else {
- if (nameidata_drop_rcu(nd))
- return -ECHILD;
- }
+ if (unlazy_walk(nd, dentry))
+ return -ECHILD;
} else {
dentry = __d_lookup(parent, name);
}
@@ -1303,7 +1217,7 @@ static inline int may_lookup(struct nameidata *nd)
int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
if (err != -ECHILD)
return err;
- if (nameidata_drop_rcu(nd))
+ if (unlazy_walk(nd, NULL))
return -ECHILD;
}
return exec_permission(nd->inode, 0);
@@ -1357,8 +1271,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
return -ENOENT;
}
if (unlikely(inode->i_op->follow_link) && follow) {
- if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
- return -ECHILD;
+ if (nd->flags & LOOKUP_RCU) {
+ if (unlikely(unlazy_walk(nd, path->dentry))) {
+ terminate_walk(nd);
+ return -ECHILD;
+ }
+ }
BUG_ON(inode != path->dentry->d_inode);
return 1;
}
@@ -1378,12 +1296,12 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
{
int res;
- BUG_ON(nd->depth >= MAX_NESTED_LINKS);
if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
path_put_conditional(path, nd);
path_put(&nd->path);
return -ELOOP;
}
+ BUG_ON(nd->depth >= MAX_NESTED_LINKS);
nd->depth++;
current->link_count++;
@@ -1657,18 +1575,8 @@ static int path_lookupat(int dfd, const char *name,
}
}
- if (nd->flags & LOOKUP_RCU) {
- /* went all way through without dropping RCU */
- BUG_ON(err);
- if (nameidata_drop_rcu_last(nd))
- err = -ECHILD;
- }
-
- if (!err) {
- err = handle_reval_path(nd);
- if (err)
- path_put(&nd->path);
- }
+ if (!err)
+ err = complete_walk(nd);
if (!err && nd->flags & LOOKUP_DIRECTORY) {
if (!nd->inode->i_op->lookup) {
@@ -2134,13 +2042,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
return ERR_PTR(error);
/* fallthrough */
case LAST_ROOT:
- if (nd->flags & LOOKUP_RCU) {
- if (nameidata_drop_rcu_last(nd))
- return ERR_PTR(-ECHILD);
- }
- error = handle_reval_path(nd);
+ error = complete_walk(nd);
if (error)
- goto exit;
+ return ERR_PTR(error);
audit_inode(pathname, nd->path.dentry);
if (open_flag & O_CREAT) {
error = -EISDIR;
@@ -2148,10 +2052,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
}
goto ok;
case LAST_BIND:
- /* can't be RCU mode here */
- error = handle_reval_path(nd);
+ error = complete_walk(nd);
if (error)
- goto exit;
+ return ERR_PTR(error);
audit_inode(pathname, dir);
goto ok;
}
@@ -2170,10 +2073,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
if (error) /* symlink */
return NULL;
/* sayonara */
- if (nd->flags & LOOKUP_RCU) {
- if (nameidata_drop_rcu_last(nd))
- return ERR_PTR(-ECHILD);
- }
+ error = complete_walk(nd);
+ if (error)
+ return ERR_PTR(error);
error = -ENOTDIR;
if (nd->flags & LOOKUP_DIRECTORY) {
@@ -2185,11 +2087,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
}
/* create side of things */
-
- if (nd->flags & LOOKUP_RCU) {
- if (nameidata_drop_rcu_last(nd))
- return ERR_PTR(-ECHILD);
- }
+ error = complete_walk(nd);
+ if (error)
+ return ERR_PTR(error);
audit_inode(pathname, dir);
error = -EISDIR;
@@ -2629,10 +2529,10 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
}
/*
- * We try to drop the dentry early: we should have
- * a usage count of 2 if we're the only user of this
- * dentry, and if that is true (possibly after pruning
- * the dcache), then we drop the dentry now.
+ * The dentry_unhash() helper will try to drop the dentry early: we
+ * should have a usage count of 2 if we're the only user of this
+ * dentry, and if that is true (possibly after pruning the dcache),
+ * then we drop the dentry now.
*
* A low-level filesystem can, if it choses, legally
* do a
@@ -2645,10 +2545,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, int, mode)
*/
void dentry_unhash(struct dentry *dentry)
{
- dget(dentry);
shrink_dcache_parent(dentry);
spin_lock(&dentry->d_lock);
- if (dentry->d_count == 2)
+ if (dentry->d_count == 1)
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
}
@@ -2664,25 +2563,26 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
return -EPERM;
mutex_lock(&dentry->d_inode->i_mutex);
- dentry_unhash(dentry);
+
+ error = -EBUSY;
if (d_mountpoint(dentry))
- error = -EBUSY;
- else {
- error = security_inode_rmdir(dir, dentry);
- if (!error) {
- error = dir->i_op->rmdir(dir, dentry);
- if (!error) {
- dentry->d_inode->i_flags |= S_DEAD;
- dont_mount(dentry);
- }
- }
- }
+ goto out;
+
+ error = security_inode_rmdir(dir, dentry);
+ if (error)
+ goto out;
+
+ error = dir->i_op->rmdir(dir, dentry);
+ if (error)
+ goto out;
+
+ dentry->d_inode->i_flags |= S_DEAD;
+ dont_mount(dentry);
+
+out:
mutex_unlock(&dentry->d_inode->i_mutex);
- if (!error) {
+ if (!error)
d_delete(dentry);
- }
- dput(dentry);
-
return error;
}
@@ -3053,12 +2953,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* HOWEVER, it relies on the assumption that any object with ->lookup()
* has no more than 1 dentry. If "hybrid" objects will ever appear,
* we'd better make sure that there's no link(2) for them.
- * d) some filesystems don't support opened-but-unlinked directories,
- * either because of layout or because they are not ready to deal with
- * all cases correctly. The latter will be fixed (taking this sort of
- * stuff into VFS), but the former is not going away. Solution: the same
- * trick as in rmdir().
- * e) conversion from fhandle to dentry may come in the wrong moment - when
+ * d) conversion from fhandle to dentry may come in the wrong moment - when
* we are removing the target. Solution: we will have to grab ->i_mutex
* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
* ->i_mutex on parents, which works but leads to some truly excessive
@@ -3068,7 +2963,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
int error = 0;
- struct inode *target;
+ struct inode *target = new_dentry->d_inode;
/*
* If we are going to change the parent - check write permissions,
@@ -3084,26 +2979,24 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
if (error)
return error;
- target = new_dentry->d_inode;
if (target)
mutex_lock(&target->i_mutex);
- if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
- error = -EBUSY;
- else {
- if (target)
- dentry_unhash(new_dentry);
- error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
- }
+
+ error = -EBUSY;
+ if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
+ goto out;
+
+ error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (error)
+ goto out;
+
if (target) {
- if (!error) {
- target->i_flags |= S_DEAD;
- dont_mount(new_dentry);
- }
- mutex_unlock(&target->i_mutex);
- if (d_unhashed(new_dentry))
- d_rehash(new_dentry);
- dput(new_dentry);
+ target->i_flags |= S_DEAD;
+ dont_mount(new_dentry);
}
+out:
+ if (target)
+ mutex_unlock(&target->i_mutex);
if (!error)
if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
d_move(old_dentry,new_dentry);
@@ -3113,7 +3006,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
- struct inode *target;
+ struct inode *target = new_dentry->d_inode;
int error;
error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -3121,19 +3014,22 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
return error;
dget(new_dentry);
- target = new_dentry->d_inode;
if (target)
mutex_lock(&target->i_mutex);
+
+ error = -EBUSY;
if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
- error = -EBUSY;
- else
- error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
- if (!error) {
- if (target)
- dont_mount(new_dentry);
- if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
- d_move(old_dentry, new_dentry);
- }
+ goto out;
+
+ error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (error)
+ goto out;
+
+ if (target)
+ dont_mount(new_dentry);
+ if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
+ d_move(old_dentry, new_dentry);
+out:
if (target)
mutex_unlock(&target->i_mutex);
dput(new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index d99bcf59e4c2..fe59bd145d21 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1695,7 +1695,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
static int flags_to_propagation_type(int flags)
{
- int type = flags & ~MS_REC;
+ int type = flags & ~(MS_REC | MS_SILENT);
/* Fail if any non-propagation flags are set */
if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f6946bb5cb55..e3e646b06404 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,6 +1033,8 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
DPRINTK("ncp_rmdir: removing %s/%s\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
+ dentry_unhash(dentry);
+
error = -EBUSY;
if (!d_unhashed(dentry))
goto out;
@@ -1139,6 +1141,9 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
ncp_age_dentry(server, old_dentry);
ncp_age_dentry(server, new_dentry);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 0250e4ce4893..202f370526a7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -461,7 +461,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
#endif
struct ncp_entry_info finfo;
- data.wdog_pid = NULL;
+ memset(&data, 0, sizeof(data));
server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL);
if (!server)
return -ENOMEM;
@@ -496,7 +496,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data;
data.flags = md->flags;
- data.int_flags = 0;
data.mounted_uid = md->mounted_uid;
data.wdog_pid = find_get_pid(md->wdog_pid);
data.ncp_fd = md->ncp_fd;
@@ -507,7 +506,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
data.file_mode = md->file_mode;
data.dir_mode = md->dir_mode;
data.info_fd = -1;
- data.mounted_vol[0] = 0;
}
break;
default:
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 7237672216c8..424e47773a84 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2042,11 +2042,14 @@ static void nfs_access_free_list(struct list_head *head)
}
}
-int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+int nfs_access_cache_shrinker(struct shrinker *shrink,
+ struct shrink_control *sc)
{
LIST_HEAD(head);
struct nfs_inode *nfsi, *next;
struct nfs_access_entry *cache;
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
return (nr_to_scan == 0) ? 0 : -1;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index ce118ce885dd..2df6ca7b5898 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -234,7 +234,7 @@ extern int nfs_init_client(struct nfs_client *clp,
/* dir.c */
extern int nfs_access_cache_shrinker(struct shrinker *shrink,
- int nr_to_scan, gfp_t gfp_mask);
+ struct shrink_control *sc);
/* inode.c */
extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 5232d3e8fb2f..a2e2402b2afb 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -8,7 +8,7 @@
* Statistsics for the reply cache
* fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache>
* statistics for filehandle lookup
- * io <bytes-read> <bytes-writtten>
+ * io <bytes-read> <bytes-written>
* statistics for IO throughput
* th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%>
* time (seconds) when nfsd thread usage above thresholds
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index f7684483785e..eed4d7b26249 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -489,8 +489,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
void nilfs_palloc_commit_alloc_entry(struct inode *inode,
struct nilfs_palloc_req *req)
{
- nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
- nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
+ mark_buffer_dirty(req->pr_bitmap_bh);
+ mark_buffer_dirty(req->pr_desc_bh);
nilfs_mdt_mark_dirty(inode);
brelse(req->pr_bitmap_bh);
@@ -527,8 +527,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
- nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
- nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
+ mark_buffer_dirty(req->pr_desc_bh);
+ mark_buffer_dirty(req->pr_bitmap_bh);
nilfs_mdt_mark_dirty(inode);
brelse(req->pr_bitmap_bh);
@@ -683,8 +683,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
kunmap(bitmap_bh->b_page);
kunmap(desc_bh->b_page);
- nilfs_mdt_mark_buffer_dirty(desc_bh);
- nilfs_mdt_mark_buffer_dirty(bitmap_bh);
+ mark_buffer_dirty(desc_bh);
+ mark_buffer_dirty(bitmap_bh);
nilfs_mdt_mark_dirty(inode);
brelse(bitmap_bh);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 4723f04e9b12..aadbd0b5e3e8 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -34,7 +34,9 @@
struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
{
- return NILFS_I_NILFS(bmap->b_inode)->ns_dat;
+ struct the_nilfs *nilfs = bmap->b_inode->i_sb->s_fs_info;
+
+ return nilfs->ns_dat;
}
static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 609cd223eea8..a35ae35e6932 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -34,12 +34,6 @@
#include "page.h"
#include "btnode.h"
-void nilfs_btnode_cache_init(struct address_space *btnc,
- struct backing_dev_info *bdi)
-{
- nilfs_mapping_init(btnc, bdi);
-}
-
void nilfs_btnode_cache_clear(struct address_space *btnc)
{
invalidate_mapping_pages(btnc, 0, -1);
@@ -62,7 +56,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
BUG();
}
memset(bh->b_data, 0, 1 << inode->i_blkbits);
- bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+ bh->b_bdev = inode->i_sb->s_bdev;
bh->b_blocknr = blocknr;
set_buffer_mapped(bh);
set_buffer_uptodate(bh);
@@ -94,10 +88,11 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
if (pblocknr == 0) {
pblocknr = blocknr;
if (inode->i_ino != NILFS_DAT_INO) {
- struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
/* blocknr is a virtual block number */
- err = nilfs_dat_translate(dat, blocknr, &pblocknr);
+ err = nilfs_dat_translate(nilfs->ns_dat, blocknr,
+ &pblocknr);
if (unlikely(err)) {
brelse(bh);
goto out_locked;
@@ -120,7 +115,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
goto found;
}
set_buffer_mapped(bh);
- bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+ bh->b_bdev = inode->i_sb->s_bdev;
bh->b_blocknr = pblocknr; /* set block address for read */
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
@@ -259,7 +254,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
"invalid oldkey %lld (newkey=%lld)",
(unsigned long long)oldkey,
(unsigned long long)newkey);
- nilfs_btnode_mark_dirty(obh);
+ mark_buffer_dirty(obh);
spin_lock_irq(&btnc->tree_lock);
radix_tree_delete(&btnc->page_tree, oldkey);
@@ -271,7 +266,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
unlock_page(opage);
} else {
nilfs_copy_buffer(nbh, obh);
- nilfs_btnode_mark_dirty(nbh);
+ mark_buffer_dirty(nbh);
nbh->b_blocknr = newkey;
ctxt->bh = nbh;
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 1b8ebd888c28..3a4dd2d8d3fc 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
struct buffer_head *newbh;
};
-void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
void nilfs_btnode_cache_clear(struct address_space *);
struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
__u64 blocknr);
@@ -51,7 +50,4 @@ void nilfs_btnode_commit_change_key(struct address_space *,
void nilfs_btnode_abort_change_key(struct address_space *,
struct nilfs_btnode_chkey_ctxt *);
-#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh)
-
-
#endif /* _NILFS_BTNODE_H */
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index d451ae0e0bf3..7eafe468a29c 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -714,7 +714,7 @@ static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
nilfs_btree_get_nonroot_node(path, level),
path[level].bp_index, key);
if (!buffer_dirty(path[level].bp_bh))
- nilfs_btnode_mark_dirty(path[level].bp_bh);
+ mark_buffer_dirty(path[level].bp_bh);
} while ((path[level].bp_index == 0) &&
(++level < nilfs_btree_height(btree) - 1));
}
@@ -739,7 +739,7 @@ static void nilfs_btree_do_insert(struct nilfs_bmap *btree,
nilfs_btree_node_insert(node, path[level].bp_index,
*keyp, *ptrp, ncblk);
if (!buffer_dirty(path[level].bp_bh))
- nilfs_btnode_mark_dirty(path[level].bp_bh);
+ mark_buffer_dirty(path[level].bp_bh);
if (path[level].bp_index == 0)
nilfs_btree_promote_key(btree, path, level + 1,
@@ -777,9 +777,9 @@ static void nilfs_btree_carry_left(struct nilfs_bmap *btree,
nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
if (!buffer_dirty(path[level].bp_bh))
- nilfs_btnode_mark_dirty(path[level].bp_bh);
+ mark_buffer_dirty(path[level].bp_bh);
if (!buffer_dirty(path[level].bp_sib_bh))
- nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+ mark_buffer_dirty(path[level].bp_sib_bh);
nilfs_btree_promote_key(btree, path, level + 1,
nilfs_btree_node_get_key(node, 0));
@@ -823,9 +823,9 @@ static void nilfs_btree_carry_right(struct nilfs_bmap *btree,
nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
if (!buffer_dirty(path[level].bp_bh))
- nilfs_btnode_mark_dirty(path[level].bp_bh);
+ mark_buffer_dirty(path[level].bp_bh);
if (!buffer_dirty(path[level].bp_sib_bh))
- nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+ mark_buffer_dirty(path[level].bp_sib_bh);
path[level + 1].bp_index++;
nilfs_btree_promote_key(btree, path, level + 1,
@@ -870,9 +870,9 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
nilfs_btree_node_move_right(node, right, n, ncblk, ncblk);
if (!buffer_dirty(path[level].bp_bh))
- nilfs_btnode_mark_dirty(path[level].bp_bh);
+ mark_buffer_dirty(path[level].bp_bh);
if (!buffer_dirty(path[level].bp_sib_bh))
- nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+ mark_buffer_dirty(path[level].bp_sib_bh);
newkey = nilfs_btree_node_get_key(right, 0);
newptr = path[level].bp_newreq.bpr_ptr;
@@ -919,7 +919,7 @@ static void nilfs_btree_grow(struct nilfs_bmap *btree,
nilfs_btree_node_set_level(root, level + 1);
if (!buffer_dirty(path[level].bp_sib_bh))
- nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+ mark_buffer_dirty(path[level].bp_sib_bh);
path[level].bp_bh = path[level].bp_sib_bh;
path[level].bp_sib_bh = NULL;
@@ -1194,7 +1194,7 @@ static void nilfs_btree_do_delete(struct nilfs_bmap *btree,
nilfs_btree_node_delete(node, path[level].bp_index,
keyp, ptrp, ncblk);
if (!buffer_dirty(path[level].bp_bh))
- nilfs_btnode_mark_dirty(path[level].bp_bh);
+ mark_buffer_dirty(path[level].bp_bh);
if (path[level].bp_index == 0)
nilfs_btree_promote_key(btree, path, level + 1,
nilfs_btree_node_get_key(node, 0));
@@ -1226,9 +1226,9 @@ static void nilfs_btree_borrow_left(struct nilfs_bmap *btree,
nilfs_btree_node_move_right(left, node, n, ncblk, ncblk);
if (!buffer_dirty(path[level].bp_bh))
- nilfs_btnode_mark_dirty(path[level].bp_bh);
+ mark_buffer_dirty(path[level].bp_bh);
if (!buffer_dirty(path[level].bp_sib_bh))
- nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+ mark_buffer_dirty(path[level].bp_sib_bh);
nilfs_btree_promote_key(btree, path, level + 1,
nilfs_btree_node_get_key(node, 0));
@@ -1258,9 +1258,9 @@ static void nilfs_btree_borrow_right(struct nilfs_bmap *btree,
nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
if (!buffer_dirty(path[level].bp_bh))
- nilfs_btnode_mark_dirty(path[level].bp_bh);
+ mark_buffer_dirty(path[level].bp_bh);
if (!buffer_dirty(path[level].bp_sib_bh))
- nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+ mark_buffer_dirty(path[level].bp_sib_bh);
path[level + 1].bp_index++;
nilfs_btree_promote_key(btree, path, level + 1,
@@ -1289,7 +1289,7 @@ static void nilfs_btree_concat_left(struct nilfs_bmap *btree,
nilfs_btree_node_move_left(left, node, n, ncblk, ncblk);
if (!buffer_dirty(path[level].bp_sib_bh))
- nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
+ mark_buffer_dirty(path[level].bp_sib_bh);
nilfs_btnode_delete(path[level].bp_bh);
path[level].bp_bh = path[level].bp_sib_bh;
@@ -1315,7 +1315,7 @@ static void nilfs_btree_concat_right(struct nilfs_bmap *btree,
nilfs_btree_node_move_left(node, right, n, ncblk, ncblk);
if (!buffer_dirty(path[level].bp_bh))
- nilfs_btnode_mark_dirty(path[level].bp_bh);
+ mark_buffer_dirty(path[level].bp_bh);
nilfs_btnode_delete(path[level].bp_sib_bh);
path[level].bp_sib_bh = NULL;
@@ -1709,7 +1709,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs);
nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk);
if (!buffer_dirty(bh))
- nilfs_btnode_mark_dirty(bh);
+ mark_buffer_dirty(bh);
if (!nilfs_bmap_dirty(btree))
nilfs_bmap_set_dirty(btree);
@@ -1787,7 +1787,7 @@ static int nilfs_btree_propagate_p(struct nilfs_bmap *btree,
{
while ((++level < nilfs_btree_height(btree) - 1) &&
!buffer_dirty(path[level].bp_bh))
- nilfs_btnode_mark_dirty(path[level].bp_bh);
+ mark_buffer_dirty(path[level].bp_bh);
return 0;
}
@@ -2229,7 +2229,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level)
}
if (!buffer_dirty(bh))
- nilfs_btnode_mark_dirty(bh);
+ mark_buffer_dirty(bh);
brelse(bh);
if (!nilfs_bmap_dirty(btree))
nilfs_bmap_set_dirty(btree);
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 5ff15a8a1024..c9b342c8b503 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -216,14 +216,14 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
if (!nilfs_cpfile_is_in_first(cpfile, cno))
nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
kaddr, 1);
- nilfs_mdt_mark_buffer_dirty(cp_bh);
+ mark_buffer_dirty(cp_bh);
kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
header = nilfs_cpfile_block_get_header(cpfile, header_bh,
kaddr);
le64_add_cpu(&header->ch_ncheckpoints, 1);
kunmap_atomic(kaddr, KM_USER0);
- nilfs_mdt_mark_buffer_dirty(header_bh);
+ mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(cpfile);
}
@@ -326,7 +326,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
}
if (nicps > 0) {
tnicps += nicps;
- nilfs_mdt_mark_buffer_dirty(cp_bh);
+ mark_buffer_dirty(cp_bh);
nilfs_mdt_mark_dirty(cpfile);
if (!nilfs_cpfile_is_in_first(cpfile, cno)) {
count =
@@ -358,7 +358,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
header = nilfs_cpfile_block_get_header(cpfile, header_bh,
kaddr);
le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
- nilfs_mdt_mark_buffer_dirty(header_bh);
+ mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(cpfile);
kunmap_atomic(kaddr, KM_USER0);
}
@@ -671,10 +671,10 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
le64_add_cpu(&header->ch_nsnapshots, 1);
kunmap_atomic(kaddr, KM_USER0);
- nilfs_mdt_mark_buffer_dirty(prev_bh);
- nilfs_mdt_mark_buffer_dirty(curr_bh);
- nilfs_mdt_mark_buffer_dirty(cp_bh);
- nilfs_mdt_mark_buffer_dirty(header_bh);
+ mark_buffer_dirty(prev_bh);
+ mark_buffer_dirty(curr_bh);
+ mark_buffer_dirty(cp_bh);
+ mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(cpfile);
brelse(prev_bh);
@@ -774,10 +774,10 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
le64_add_cpu(&header->ch_nsnapshots, -1);
kunmap_atomic(kaddr, KM_USER0);
- nilfs_mdt_mark_buffer_dirty(next_bh);
- nilfs_mdt_mark_buffer_dirty(prev_bh);
- nilfs_mdt_mark_buffer_dirty(cp_bh);
- nilfs_mdt_mark_buffer_dirty(header_bh);
+ mark_buffer_dirty(next_bh);
+ mark_buffer_dirty(prev_bh);
+ mark_buffer_dirty(cp_bh);
+ mark_buffer_dirty(header_bh);
nilfs_mdt_mark_dirty(cpfile);
brelse(prev_bh);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 59e5fe742f7b..fcc2f869af16 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -54,7 +54,7 @@ static int nilfs_dat_prepare_entry(struct inode *dat,
static void nilfs_dat_commit_entry(struct inode *dat,
struct nilfs_palloc_req *req)
{
- nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh);
+ mark_buffer_dirty(req->pr_entry_bh);
nilfs_mdt_mark_dirty(dat);
brelse(req->pr_entry_bh);
}
@@ -361,7 +361,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
entry->de_blocknr = cpu_to_le64(blocknr);
kunmap_atomic(kaddr, KM_USER0);
- nilfs_mdt_mark_buffer_dirty(entry_bh);
+ mark_buffer_dirty(entry_bh);
nilfs_mdt_mark_dirty(dat);
brelse(entry_bh);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 397e73258631..d7eeca62febd 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -111,7 +111,6 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
nilfs_transaction_commit(inode->i_sb);
mapped:
- SetPageChecked(page);
wait_on_page_writeback(page);
return VM_FAULT_LOCKED;
}
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1c2a3e23f8b2..08a07a218d26 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -48,9 +48,6 @@
#include "dat.h"
#include "ifile.h"
-static const struct address_space_operations def_gcinode_aops = {
-};
-
/*
* nilfs_gccache_submit_read_data() - add data buffer and submit read request
* @inode - gc inode
@@ -87,9 +84,9 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
goto out;
if (pbn == 0) {
- struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat;
- /* use original dat, not gc dat. */
- err = nilfs_dat_translate(dat_inode, vbn, &pbn);
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+ err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn);
if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
brelse(bh);
goto failed;
@@ -103,7 +100,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
}
if (!buffer_mapped(bh)) {
- bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
+ bh->b_bdev = inode->i_sb->s_bdev;
set_buffer_mapped(bh);
}
bh->b_blocknr = pbn;
@@ -160,15 +157,11 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
if (buffer_dirty(bh))
return -EEXIST;
- if (buffer_nilfs_node(bh)) {
- if (nilfs_btree_broken_node_block(bh)) {
- clear_buffer_uptodate(bh);
- return -EIO;
- }
- nilfs_btnode_mark_dirty(bh);
- } else {
- nilfs_mark_buffer_dirty(bh);
+ if (buffer_nilfs_node(bh) && nilfs_btree_broken_node_block(bh)) {
+ clear_buffer_uptodate(bh);
+ return -EIO;
}
+ mark_buffer_dirty(bh);
return 0;
}
@@ -178,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode)
inode->i_mode = S_IFREG;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- inode->i_mapping->a_ops = &def_gcinode_aops;
+ inode->i_mapping->a_ops = &empty_aops;
inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
ii->i_flags = 0;
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index bfc73d3a30ed..684d76300a80 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -80,7 +80,7 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
return ret;
}
nilfs_palloc_commit_alloc_entry(ifile, &req);
- nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
+ mark_buffer_dirty(req.pr_entry_bh);
nilfs_mdt_mark_dirty(ifile);
*out_ino = (ino_t)req.pr_entry_nr;
*out_bh = req.pr_entry_bh;
@@ -128,7 +128,7 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
raw_inode->i_flags = 0;
kunmap_atomic(kaddr, KM_USER0);
- nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
+ mark_buffer_dirty(req.pr_entry_bh);
brelse(req.pr_entry_bh);
nilfs_palloc_commit_free_entry(ifile, &req);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index c0aa27490c02..587f18432832 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -74,14 +74,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
struct buffer_head *bh_result, int create)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
__u64 blknum = 0;
int err = 0, ret;
- struct inode *dat = NILFS_I_NILFS(inode)->ns_dat;
unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
- down_read(&NILFS_MDT(dat)->mi_sem);
+ down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
- up_read(&NILFS_MDT(dat)->mi_sem);
+ up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
if (ret >= 0) { /* found */
map_bh(bh_result, inode->i_sb, blknum);
if (ret > 0)
@@ -596,6 +596,16 @@ void nilfs_write_inode_common(struct inode *inode,
raw_inode->i_flags = cpu_to_le32(ii->i_flags);
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+ if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+ /* zero-fill unused portion in the case of super root block */
+ raw_inode->i_xattr = 0;
+ raw_inode->i_pad = 0;
+ memset((void *)raw_inode + sizeof(*raw_inode), 0,
+ nilfs->ns_inode_size - sizeof(*raw_inode));
+ }
+
if (has_bmap)
nilfs_bmap_write(ii->i_bmap, raw_inode);
else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
@@ -872,8 +882,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
return -EINVAL; /* NILFS_I_DIRTY may remain for
freeing inode */
}
- list_del(&ii->i_dirty);
- list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
+ list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
set_bit(NILFS_I_QUEUED, &ii->i_state);
}
spin_unlock(&nilfs->ns_inode_lock);
@@ -892,7 +901,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
return err;
}
nilfs_update_inode(inode, ibh);
- nilfs_mdt_mark_buffer_dirty(ibh);
+ mark_buffer_dirty(ibh);
nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
brelse(ibh);
return 0;
@@ -931,7 +940,7 @@ void nilfs_dirty_inode(struct inode *inode)
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
- struct the_nilfs *nilfs = NILFS_I_NILFS(inode);
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
__u64 logical = 0, phys = 0, size = 0;
__u32 flags = 0;
loff_t isize;
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f2469ba6246b..41d6743d303c 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -698,6 +698,63 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
return 0;
}
+static int nilfs_ioctl_resize(struct inode *inode, struct file *filp,
+ void __user *argp)
+{
+ __u64 newsize;
+ int ret = -EPERM;
+
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ ret = mnt_want_write(filp->f_path.mnt);
+ if (ret)
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_from_user(&newsize, argp, sizeof(newsize)))
+ goto out_drop_write;
+
+ ret = nilfs_resize_fs(inode->i_sb, newsize);
+
+out_drop_write:
+ mnt_drop_write(filp->f_path.mnt);
+out:
+ return ret;
+}
+
+static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
+{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+ __u64 range[2];
+ __u64 minseg, maxseg;
+ unsigned long segbytes;
+ int ret = -EPERM;
+
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_from_user(range, argp, sizeof(__u64[2])))
+ goto out;
+
+ ret = -ERANGE;
+ if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode))
+ goto out;
+
+ segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
+
+ minseg = range[0] + segbytes - 1;
+ do_div(minseg, segbytes);
+ maxseg = NILFS_SB2_OFFSET_BYTES(range[1]);
+ do_div(maxseg, segbytes);
+ maxseg--;
+
+ ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg);
+out:
+ return ret;
+}
+
static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp,
size_t membsz,
@@ -763,6 +820,10 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
case NILFS_IOCTL_SYNC:
return nilfs_ioctl_sync(inode, filp, cmd, argp);
+ case NILFS_IOCTL_RESIZE:
+ return nilfs_ioctl_resize(inode, filp, argp);
+ case NILFS_IOCTL_SET_ALLOC_RANGE:
+ return nilfs_ioctl_set_alloc_range(inode, argp);
default:
return -ENOTTY;
}
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index a649b05f7069..800e8d78a83b 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -66,7 +66,7 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
kunmap_atomic(kaddr, KM_USER0);
set_buffer_uptodate(bh);
- nilfs_mark_buffer_dirty(bh);
+ mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(inode);
return 0;
}
@@ -355,7 +355,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
err = nilfs_mdt_read_block(inode, block, 0, &bh);
if (unlikely(err))
return err;
- nilfs_mark_buffer_dirty(bh);
+ mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(inode);
brelse(bh);
return 0;
@@ -450,9 +450,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
INIT_LIST_HEAD(&shadow->frozen_buffers);
address_space_init_once(&shadow->frozen_data);
- nilfs_mapping_init(&shadow->frozen_data, bdi);
+ nilfs_mapping_init(&shadow->frozen_data, inode, bdi);
address_space_init_once(&shadow->frozen_btnodes);
- nilfs_mapping_init(&shadow->frozen_btnodes, bdi);
+ nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi);
mi->mi_shadow = shadow;
return 0;
}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index ed68563ec708..ab20a4baa50f 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -64,11 +64,6 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
return inode->i_private;
}
-static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
-{
- return inode->i_sb->s_fs_info;
-}
-
/* Default GFP flags using highmem */
#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
@@ -93,8 +88,6 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh);
struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode,
struct buffer_head *bh);
-#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
-
static inline void nilfs_mdt_mark_dirty(struct inode *inode)
{
if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state))
@@ -108,7 +101,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode)
static inline __u64 nilfs_mdt_cno(struct inode *inode)
{
- return NILFS_I_NILFS(inode)->ns_cno;
+ return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno;
}
#define nilfs_mdt_bgl_lock(inode, bg) \
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 546849b3e88f..1102a5fbb744 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,6 +334,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
struct nilfs_transaction_info ti;
int err;
+ dentry_unhash(dentry);
+
err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
if (err)
return err;
@@ -369,6 +371,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct nilfs_transaction_info ti;
int err;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
if (unlikely(err))
return err;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a8dd344303cb..a9c6a531f80c 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -80,12 +80,6 @@ static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
return &ii->vfs_inode;
}
-static inline struct inode *NILFS_AS_I(struct address_space *mapping)
-{
- return (mapping->host) ? :
- container_of(mapping, struct inode, i_data);
-}
-
/*
* Dynamic state flags of NILFS on-memory inode (i_state)
*/
@@ -298,6 +292,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
int flip);
int nilfs_commit_super(struct super_block *sb, int flag);
int nilfs_cleanup_super(struct super_block *sb);
+int nilfs_resize_fs(struct super_block *sb, __u64 newsize);
int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
struct nilfs_root **root);
int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 1168059c7efd..65221a04c6f0 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -37,8 +37,7 @@
#define NILFS_BUFFER_INHERENT_BITS \
((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
- (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \
- (1UL << BH_NILFS_Checked))
+ (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked))
static struct buffer_head *
__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -59,19 +58,6 @@ __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
return bh;
}
-/*
- * Since the page cache of B-tree node pages or data page cache of pseudo
- * inodes does not have a valid mapping->host pointer, calling
- * mark_buffer_dirty() for their buffers causes a NULL pointer dereference;
- * it calls __mark_inode_dirty(NULL) through __set_page_dirty().
- * To avoid this problem, the old style mark_buffer_dirty() is used instead.
- */
-void nilfs_mark_buffer_dirty(struct buffer_head *bh)
-{
- if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
- __set_page_dirty_nobuffers(bh->b_page);
-}
-
struct buffer_head *nilfs_grab_buffer(struct inode *inode,
struct address_space *mapping,
unsigned long blkoff,
@@ -183,7 +169,7 @@ int nilfs_page_buffers_clean(struct page *page)
void nilfs_page_bug(struct page *page)
{
struct address_space *m;
- unsigned long ino = 0;
+ unsigned long ino;
if (unlikely(!page)) {
printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
@@ -191,11 +177,8 @@ void nilfs_page_bug(struct page *page)
}
m = page->mapping;
- if (m) {
- struct inode *inode = NILFS_AS_I(m);
- if (inode != NULL)
- ino = inode->i_ino;
- }
+ ino = m ? m->host->i_ino : 0;
+
printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
"mapping=%p ino=%lu\n",
page, atomic_read(&page->_count),
@@ -217,56 +200,6 @@ void nilfs_page_bug(struct page *page)
}
/**
- * nilfs_alloc_private_page - allocate a private page with buffer heads
- *
- * Return Value: On success, a pointer to the allocated page is returned.
- * On error, NULL is returned.
- */
-struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
- unsigned long state)
-{
- struct buffer_head *bh, *head, *tail;
- struct page *page;
-
- page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
- if (unlikely(!page))
- return NULL;
-
- lock_page(page);
- head = alloc_page_buffers(page, size, 0);
- if (unlikely(!head)) {
- unlock_page(page);
- __free_page(page);
- return NULL;
- }
-
- bh = head;
- do {
- bh->b_state = (1UL << BH_NILFS_Allocated) | state;
- tail = bh;
- bh->b_bdev = bdev;
- bh = bh->b_this_page;
- } while (bh);
-
- tail->b_this_page = head;
- attach_page_buffers(page, head);
-
- return page;
-}
-
-void nilfs_free_private_page(struct page *page)
-{
- BUG_ON(!PageLocked(page));
- BUG_ON(page->mapping);
-
- if (page_has_buffers(page) && !try_to_free_buffers(page))
- NILFS_PAGE_BUG(page, "failed to free page");
-
- unlock_page(page);
- __free_page(page);
-}
-
-/**
* nilfs_copy_page -- copy the page with buffers
* @dst: destination page
* @src: source page
@@ -492,10 +425,10 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
return nc;
}
-void nilfs_mapping_init(struct address_space *mapping,
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
struct backing_dev_info *bdi)
{
- mapping->host = NULL;
+ mapping->host = inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->assoc_mapping = NULL;
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index f06b79ad7493..fb7de71605a0 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -38,14 +38,12 @@ enum {
BH_NILFS_Redirected,
};
-BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */
BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */
-void nilfs_mark_buffer_dirty(struct buffer_head *bh);
int __nilfs_clear_page_dirty(struct page *);
struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
@@ -54,14 +52,11 @@ void nilfs_forget_buffer(struct buffer_head *);
void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
int nilfs_page_buffers_clean(struct page *);
void nilfs_page_bug(struct page *);
-struct page *nilfs_alloc_private_page(struct block_device *, int,
- unsigned long);
-void nilfs_free_private_page(struct page *);
int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
void nilfs_copy_back_pages(struct address_space *, struct address_space *);
void nilfs_clear_dirty_pages(struct address_space *);
-void nilfs_mapping_init(struct address_space *mapping,
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
struct backing_dev_info *bdi);
unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ba4a64518f38..a604ac0331b2 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -387,9 +387,9 @@ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr,
static void dispose_recovery_list(struct list_head *head)
{
while (!list_empty(head)) {
- struct nilfs_recovery_block *rb
- = list_entry(head->next,
- struct nilfs_recovery_block, list);
+ struct nilfs_recovery_block *rb;
+
+ rb = list_first_entry(head, struct nilfs_recovery_block, list);
list_del(&rb->list);
kfree(rb);
}
@@ -416,9 +416,9 @@ static int nilfs_segment_list_add(struct list_head *head, __u64 segnum)
void nilfs_dispose_segment_list(struct list_head *head)
{
while (!list_empty(head)) {
- struct nilfs_segment_entry *ent
- = list_entry(head->next,
- struct nilfs_segment_entry, list);
+ struct nilfs_segment_entry *ent;
+
+ ent = list_first_entry(head, struct nilfs_segment_entry, list);
list_del(&ent->list);
kfree(ent);
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 2853ff20f85a..850a7c0228fb 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -239,12 +239,15 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf,
u32 seed)
{
struct nilfs_super_root *raw_sr;
+ struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info;
+ unsigned srsize;
u32 crc;
raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data;
+ srsize = NILFS_SR_BYTES(nilfs->ns_inode_size);
crc = crc32_le(seed,
(unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
- NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
+ srsize - sizeof(raw_sr->sr_sum));
raw_sr->sr_sum = cpu_to_le32(crc);
}
@@ -254,18 +257,6 @@ static void nilfs_release_buffers(struct list_head *list)
list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
list_del_init(&bh->b_assoc_buffers);
- if (buffer_nilfs_allocated(bh)) {
- struct page *clone_page = bh->b_page;
-
- /* remove clone page */
- brelse(bh);
- page_cache_release(clone_page); /* for each bh */
- if (page_count(clone_page) <= 2) {
- lock_page(clone_page);
- nilfs_free_private_page(clone_page);
- }
- continue;
- }
brelse(bh);
}
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index afe4f2183454..141646e88fb5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -655,13 +655,10 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
if (unlikely(page->index > last))
break;
- if (mapping->host) {
- lock_page(page);
- if (!page_has_buffers(page))
- create_empty_buffers(page,
- 1 << inode->i_blkbits, 0);
- unlock_page(page);
- }
+ lock_page(page);
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+ unlock_page(page);
bh = head = page_buffers(page);
do {
@@ -809,7 +806,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
/* The following code is duplicated with cpfile. But, it is
needed to collect the checkpoint even if it was not newly
created */
- nilfs_mdt_mark_buffer_dirty(bh_cp);
+ mark_buffer_dirty(bh_cp);
nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
nilfs_cpfile_put_checkpoint(
nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
@@ -889,12 +886,14 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
{
struct buffer_head *bh_sr;
struct nilfs_super_root *raw_sr;
- unsigned isz = nilfs->ns_inode_size;
+ unsigned isz, srsz;
bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+ isz = nilfs->ns_inode_size;
+ srsz = NILFS_SR_BYTES(isz);
- raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
+ raw_sr->sr_bytes = cpu_to_le16(srsz);
raw_sr->sr_nongc_ctime
= cpu_to_le64(nilfs_doing_gc() ?
nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
@@ -906,6 +905,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
NILFS_SR_CPFILE_OFFSET(isz), 1);
nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
NILFS_SR_SUFILE_OFFSET(isz), 1);
+ memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
}
static void nilfs_redirty_inodes(struct list_head *head)
@@ -954,8 +954,8 @@ static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
dispose_buffers:
while (!list_empty(listp)) {
- bh = list_entry(listp->next, struct buffer_head,
- b_assoc_buffers);
+ bh = list_first_entry(listp, struct buffer_head,
+ b_assoc_buffers);
list_del_init(&bh->b_assoc_buffers);
brelse(bh);
}
@@ -1500,10 +1500,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
nblocks = le32_to_cpu(finfo->fi_nblocks);
ndatablk = le32_to_cpu(finfo->fi_ndatablk);
- if (buffer_nilfs_node(bh))
- inode = NILFS_BTNC_I(bh->b_page->mapping);
- else
- inode = NILFS_AS_I(bh->b_page->mapping);
+ inode = bh->b_page->mapping->host;
if (mode == SC_LSEG_DSYNC)
sc_op = &nilfs_sc_dsync_ops;
@@ -1556,83 +1553,24 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
return 0;
}
-static int
-nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
-{
- struct page *clone_page;
- struct buffer_head *bh, *head, *bh2;
- void *kaddr;
-
- bh = head = page_buffers(page);
-
- clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
- if (unlikely(!clone_page))
- return -ENOMEM;
-
- bh2 = page_buffers(clone_page);
- kaddr = kmap_atomic(page, KM_USER0);
- do {
- if (list_empty(&bh->b_assoc_buffers))
- continue;
- get_bh(bh2);
- page_cache_get(clone_page); /* for each bh */
- memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
- bh2->b_blocknr = bh->b_blocknr;
- list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
- list_add_tail(&bh->b_assoc_buffers, out);
- } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
- kunmap_atomic(kaddr, KM_USER0);
-
- if (!TestSetPageWriteback(clone_page))
- account_page_writeback(clone_page);
- unlock_page(clone_page);
-
- return 0;
-}
-
-static int nilfs_test_page_to_be_frozen(struct page *page)
-{
- struct address_space *mapping = page->mapping;
-
- if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode))
- return 0;
-
- if (page_mapped(page)) {
- ClearPageChecked(page);
- return 1;
- }
- return PageChecked(page);
-}
-
-static int nilfs_begin_page_io(struct page *page, struct list_head *out)
+static void nilfs_begin_page_io(struct page *page)
{
if (!page || PageWriteback(page))
/* For split b-tree node pages, this function may be called
twice. We ignore the 2nd or later calls by this check. */
- return 0;
+ return;
lock_page(page);
clear_page_dirty_for_io(page);
set_page_writeback(page);
unlock_page(page);
-
- if (nilfs_test_page_to_be_frozen(page)) {
- int err = nilfs_copy_replace_page_buffers(page, out);
- if (unlikely(err))
- return err;
- }
- return 0;
}
-static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
- struct page **failed_page)
+static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
{
struct nilfs_segment_buffer *segbuf;
struct page *bd_page = NULL, *fs_page = NULL;
- struct list_head *list = &sci->sc_copied_buffers;
- int err;
- *failed_page = NULL;
list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
struct buffer_head *bh;
@@ -1662,11 +1600,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
break;
}
if (bh->b_page != fs_page) {
- err = nilfs_begin_page_io(fs_page, list);
- if (unlikely(err)) {
- *failed_page = fs_page;
- goto out;
- }
+ nilfs_begin_page_io(fs_page);
fs_page = bh->b_page;
}
}
@@ -1677,11 +1611,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
set_page_writeback(bd_page);
unlock_page(bd_page);
}
- err = nilfs_begin_page_io(fs_page, list);
- if (unlikely(err))
- *failed_page = fs_page;
- out:
- return err;
+ nilfs_begin_page_io(fs_page);
}
static int nilfs_segctor_write(struct nilfs_sc_info *sci,
@@ -1694,24 +1624,6 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci,
return ret;
}
-static void __nilfs_end_page_io(struct page *page, int err)
-{
- if (!err) {
- if (!nilfs_page_buffers_clean(page))
- __set_page_dirty_nobuffers(page);
- ClearPageError(page);
- } else {
- __set_page_dirty_nobuffers(page);
- SetPageError(page);
- }
-
- if (buffer_nilfs_allocated(page_buffers(page))) {
- if (TestClearPageWriteback(page))
- dec_zone_page_state(page, NR_WRITEBACK);
- } else
- end_page_writeback(page);
-}
-
static void nilfs_end_page_io(struct page *page, int err)
{
if (!page)
@@ -1738,40 +1650,19 @@ static void nilfs_end_page_io(struct page *page, int err)
return;
}
- __nilfs_end_page_io(page, err);
-}
-
-static void nilfs_clear_copied_buffers(struct list_head *list, int err)
-{
- struct buffer_head *bh, *head;
- struct page *page;
-
- while (!list_empty(list)) {
- bh = list_entry(list->next, struct buffer_head,
- b_assoc_buffers);
- page = bh->b_page;
- page_cache_get(page);
- head = bh = page_buffers(page);
- do {
- if (!list_empty(&bh->b_assoc_buffers)) {
- list_del_init(&bh->b_assoc_buffers);
- if (!err) {
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
- clear_buffer_delay(bh);
- clear_buffer_nilfs_volatile(bh);
- }
- brelse(bh); /* for b_assoc_buffers */
- }
- } while ((bh = bh->b_this_page) != head);
-
- __nilfs_end_page_io(page, err);
- page_cache_release(page);
+ if (!err) {
+ if (!nilfs_page_buffers_clean(page))
+ __set_page_dirty_nobuffers(page);
+ ClearPageError(page);
+ } else {
+ __set_page_dirty_nobuffers(page);
+ SetPageError(page);
}
+
+ end_page_writeback(page);
}
-static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
- int err)
+static void nilfs_abort_logs(struct list_head *logs, int err)
{
struct nilfs_segment_buffer *segbuf;
struct page *bd_page = NULL, *fs_page = NULL;
@@ -1801,8 +1692,6 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page,
}
if (bh->b_page != fs_page) {
nilfs_end_page_io(fs_page, err);
- if (fs_page && fs_page == failed_page)
- return;
fs_page = bh->b_page;
}
}
@@ -1821,12 +1710,11 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
list_splice_tail_init(&sci->sc_write_logs, &logs);
ret = nilfs_wait_on_logs(&logs);
- nilfs_abort_logs(&logs, NULL, ret ? : err);
+ nilfs_abort_logs(&logs, ret ? : err);
list_splice_tail_init(&sci->sc_segbufs, &logs);
nilfs_cancel_segusage(&logs, nilfs->ns_sufile);
nilfs_free_incomplete_logs(&logs, nilfs);
- nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
@@ -1920,8 +1808,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
nilfs_end_page_io(fs_page, 0);
- nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
-
nilfs_drop_collected_inodes(&sci->sc_dirty_files);
if (nilfs_doing_gc())
@@ -1979,7 +1865,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
"failed to get inode block.\n");
return err;
}
- nilfs_mdt_mark_buffer_dirty(ibh);
+ mark_buffer_dirty(ibh);
nilfs_mdt_mark_dirty(ifile);
spin_lock(&nilfs->ns_inode_lock);
if (likely(!ii->i_bh))
@@ -1991,8 +1877,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
clear_bit(NILFS_I_QUEUED, &ii->i_state);
set_bit(NILFS_I_BUSY, &ii->i_state);
- list_del(&ii->i_dirty);
- list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
+ list_move_tail(&ii->i_dirty, &sci->sc_dirty_files);
}
spin_unlock(&nilfs->ns_inode_lock);
@@ -2014,8 +1899,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
clear_bit(NILFS_I_BUSY, &ii->i_state);
brelse(ii->i_bh);
ii->i_bh = NULL;
- list_del(&ii->i_dirty);
- list_add_tail(&ii->i_dirty, &ti->ti_garbage);
+ list_move_tail(&ii->i_dirty, &ti->ti_garbage);
}
spin_unlock(&nilfs->ns_inode_lock);
}
@@ -2026,7 +1910,6 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
{
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
- struct page *failed_page;
int err;
sci->sc_stage.scnt = NILFS_ST_INIT;
@@ -2081,11 +1964,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
/* Write partial segments */
- err = nilfs_segctor_prepare_write(sci, &failed_page);
- if (err) {
- nilfs_abort_logs(&sci->sc_segbufs, failed_page, err);
- goto failed_to_write;
- }
+ nilfs_segctor_prepare_write(sci);
nilfs_add_checksums_on_logs(&sci->sc_segbufs,
nilfs->ns_crc_seed);
@@ -2687,7 +2566,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
INIT_LIST_HEAD(&sci->sc_segbufs);
INIT_LIST_HEAD(&sci->sc_write_logs);
INIT_LIST_HEAD(&sci->sc_gc_inodes);
- INIT_LIST_HEAD(&sci->sc_copied_buffers);
init_timer(&sci->sc_timer);
sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2741,8 +2619,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
if (flag || !nilfs_segctor_confirm(sci))
nilfs_segctor_write_out(sci);
- WARN_ON(!list_empty(&sci->sc_copied_buffers));
-
if (!list_empty(&sci->sc_dirty_files)) {
nilfs_warning(sci->sc_super, __func__,
"dirty file(s) after the final construction\n");
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 6c02a86745fb..38a1d0013314 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -92,7 +92,6 @@ struct nilfs_segsum_pointer {
* @sc_nblk_inc: Block count of current generation
* @sc_dirty_files: List of files to be written
* @sc_gc_inodes: List of GC inodes having blocks to be written
- * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
* @sc_freesegs: array of segment numbers to be freed
* @sc_nfreesegs: number of segments on @sc_freesegs
* @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -136,7 +135,6 @@ struct nilfs_sc_info {
struct list_head sc_dirty_files;
struct list_head sc_gc_inodes;
- struct list_head sc_copied_buffers;
__u64 *sc_freesegs;
size_t sc_nfreesegs;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 1d6f488ccae8..0a0aba617d8a 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -33,7 +33,9 @@
struct nilfs_sufile_info {
struct nilfs_mdt_info mi;
- unsigned long ncleansegs;
+ unsigned long ncleansegs;/* number of clean segments */
+ __u64 allocmin; /* lower limit of allocatable segment range */
+ __u64 allocmax; /* upper limit of allocatable segment range */
};
static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile)
@@ -96,6 +98,13 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
create, NULL, bhp);
}
+static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile,
+ __u64 segnum)
+{
+ return nilfs_mdt_delete_block(sufile,
+ nilfs_sufile_get_blkoff(sufile, segnum));
+}
+
static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
u64 ncleanadd, u64 ndirtyadd)
{
@@ -108,7 +117,7 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
kunmap_atomic(kaddr, KM_USER0);
- nilfs_mdt_mark_buffer_dirty(header_bh);
+ mark_buffer_dirty(header_bh);
}
/**
@@ -248,6 +257,35 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
}
/**
+ * nilfs_sufile_set_alloc_range - limit range of segment to be allocated
+ * @sufile: inode of segment usage file
+ * @start: minimum segment number of allocatable region (inclusive)
+ * @end: maximum segment number of allocatable region (inclusive)
+ *
+ * Return Value: On success, 0 is returned. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-ERANGE - invalid segment region
+ */
+int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end)
+{
+ struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+ __u64 nsegs;
+ int ret = -ERANGE;
+
+ down_write(&NILFS_MDT(sufile)->mi_sem);
+ nsegs = nilfs_sufile_get_nsegments(sufile);
+
+ if (start <= end && end < nsegs) {
+ sui->allocmin = start;
+ sui->allocmax = end;
+ ret = 0;
+ }
+ up_write(&NILFS_MDT(sufile)->mi_sem);
+ return ret;
+}
+
+/**
* nilfs_sufile_alloc - allocate a segment
* @sufile: inode of segment usage file
* @segnump: pointer to segment number
@@ -269,11 +307,12 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
struct buffer_head *header_bh, *su_bh;
struct nilfs_sufile_header *header;
struct nilfs_segment_usage *su;
+ struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
size_t susz = NILFS_MDT(sufile)->mi_entry_size;
__u64 segnum, maxsegnum, last_alloc;
void *kaddr;
- unsigned long nsegments, ncleansegs, nsus;
- int ret, i, j;
+ unsigned long nsegments, ncleansegs, nsus, cnt;
+ int ret, j;
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -287,13 +326,31 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
kunmap_atomic(kaddr, KM_USER0);
nsegments = nilfs_sufile_get_nsegments(sufile);
+ maxsegnum = sui->allocmax;
segnum = last_alloc + 1;
- maxsegnum = nsegments - 1;
- for (i = 0; i < nsegments; i += nsus) {
- if (segnum >= nsegments) {
- /* wrap around */
- segnum = 0;
- maxsegnum = last_alloc;
+ if (segnum < sui->allocmin || segnum > sui->allocmax)
+ segnum = sui->allocmin;
+
+ for (cnt = 0; cnt < nsegments; cnt += nsus) {
+ if (segnum > maxsegnum) {
+ if (cnt < sui->allocmax - sui->allocmin + 1) {
+ /*
+ * wrap around in the limited region.
+ * if allocation started from
+ * sui->allocmin, this never happens.
+ */
+ segnum = sui->allocmin;
+ maxsegnum = last_alloc;
+ } else if (segnum > sui->allocmin &&
+ sui->allocmax + 1 < nsegments) {
+ segnum = sui->allocmax + 1;
+ maxsegnum = nsegments - 1;
+ } else if (sui->allocmin > 0) {
+ segnum = 0;
+ maxsegnum = sui->allocmin - 1;
+ } else {
+ break; /* never happens */
+ }
}
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
&su_bh);
@@ -319,9 +376,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
header->sh_last_alloc = cpu_to_le64(segnum);
kunmap_atomic(kaddr, KM_USER0);
- NILFS_SUI(sufile)->ncleansegs--;
- nilfs_mdt_mark_buffer_dirty(header_bh);
- nilfs_mdt_mark_buffer_dirty(su_bh);
+ sui->ncleansegs--;
+ mark_buffer_dirty(header_bh);
+ mark_buffer_dirty(su_bh);
nilfs_mdt_mark_dirty(sufile);
brelse(su_bh);
*segnump = segnum;
@@ -364,7 +421,7 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
nilfs_sufile_mod_counter(header_bh, -1, 1);
NILFS_SUI(sufile)->ncleansegs--;
- nilfs_mdt_mark_buffer_dirty(su_bh);
+ mark_buffer_dirty(su_bh);
nilfs_mdt_mark_dirty(sufile);
}
@@ -395,7 +452,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
NILFS_SUI(sufile)->ncleansegs -= clean;
- nilfs_mdt_mark_buffer_dirty(su_bh);
+ mark_buffer_dirty(su_bh);
nilfs_mdt_mark_dirty(sufile);
}
@@ -421,7 +478,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
sudirty = nilfs_segment_usage_dirty(su);
nilfs_segment_usage_set_clean(su);
kunmap_atomic(kaddr, KM_USER0);
- nilfs_mdt_mark_buffer_dirty(su_bh);
+ mark_buffer_dirty(su_bh);
nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
NILFS_SUI(sufile)->ncleansegs++;
@@ -441,7 +498,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
if (!ret) {
- nilfs_mdt_mark_buffer_dirty(bh);
+ mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
brelse(bh);
}
@@ -476,7 +533,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
su->su_nblocks = cpu_to_le32(nblocks);
kunmap_atomic(kaddr, KM_USER0);
- nilfs_mdt_mark_buffer_dirty(bh);
+ mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
brelse(bh);
@@ -505,7 +562,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
{
struct buffer_head *header_bh;
struct nilfs_sufile_header *header;
- struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
void *kaddr;
int ret;
@@ -555,11 +612,183 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
nilfs_sufile_mod_counter(header_bh, -1, 0);
NILFS_SUI(sufile)->ncleansegs--;
}
- nilfs_mdt_mark_buffer_dirty(su_bh);
+ mark_buffer_dirty(su_bh);
nilfs_mdt_mark_dirty(sufile);
}
/**
+ * nilfs_sufile_truncate_range - truncate range of segment array
+ * @sufile: inode of segment usage file
+ * @start: start segment number (inclusive)
+ * @end: end segment number (inclusive)
+ *
+ * Return Value: On success, 0 is returned. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - Invalid number of segments specified
+ *
+ * %-EBUSY - Dirty or active segments are present in the range
+ */
+static int nilfs_sufile_truncate_range(struct inode *sufile,
+ __u64 start, __u64 end)
+{
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+ struct buffer_head *header_bh;
+ struct buffer_head *su_bh;
+ struct nilfs_segment_usage *su, *su2;
+ size_t susz = NILFS_MDT(sufile)->mi_entry_size;
+ unsigned long segusages_per_block;
+ unsigned long nsegs, ncleaned;
+ __u64 segnum;
+ void *kaddr;
+ ssize_t n, nc;
+ int ret;
+ int j;
+
+ nsegs = nilfs_sufile_get_nsegments(sufile);
+
+ ret = -EINVAL;
+ if (start > end || start >= nsegs)
+ goto out;
+
+ ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+ if (ret < 0)
+ goto out;
+
+ segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
+ ncleaned = 0;
+
+ for (segnum = start; segnum <= end; segnum += n) {
+ n = min_t(unsigned long,
+ segusages_per_block -
+ nilfs_sufile_get_offset(sufile, segnum),
+ end - segnum + 1);
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+ &su_bh);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ goto out_header;
+ /* hole */
+ continue;
+ }
+ kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
+ su = nilfs_sufile_block_get_segment_usage(
+ sufile, segnum, su_bh, kaddr);
+ su2 = su;
+ for (j = 0; j < n; j++, su = (void *)su + susz) {
+ if ((le32_to_cpu(su->su_flags) &
+ ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) ||
+ nilfs_segment_is_active(nilfs, segnum + j)) {
+ ret = -EBUSY;
+ kunmap_atomic(kaddr, KM_USER0);
+ brelse(su_bh);
+ goto out_header;
+ }
+ }
+ nc = 0;
+ for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) {
+ if (nilfs_segment_usage_error(su)) {
+ nilfs_segment_usage_set_clean(su);
+ nc++;
+ }
+ }
+ kunmap_atomic(kaddr, KM_USER0);
+ if (nc > 0) {
+ mark_buffer_dirty(su_bh);
+ ncleaned += nc;
+ }
+ brelse(su_bh);
+
+ if (n == segusages_per_block) {
+ /* make hole */
+ nilfs_sufile_delete_segment_usage_block(sufile, segnum);
+ }
+ }
+ ret = 0;
+
+out_header:
+ if (ncleaned > 0) {
+ NILFS_SUI(sufile)->ncleansegs += ncleaned;
+ nilfs_sufile_mod_counter(header_bh, ncleaned, 0);
+ nilfs_mdt_mark_dirty(sufile);
+ }
+ brelse(header_bh);
+out:
+ return ret;
+}
+
+/**
+ * nilfs_sufile_resize - resize segment array
+ * @sufile: inode of segment usage file
+ * @newnsegs: new number of segments
+ *
+ * Return Value: On success, 0 is returned. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOSPC - Enough free space is not left for shrinking
+ *
+ * %-EBUSY - Dirty or active segments exist in the region to be truncated
+ */
+int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
+{
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+ struct buffer_head *header_bh;
+ struct nilfs_sufile_header *header;
+ struct nilfs_sufile_info *sui = NILFS_SUI(sufile);
+ void *kaddr;
+ unsigned long nsegs, nrsvsegs;
+ int ret = 0;
+
+ down_write(&NILFS_MDT(sufile)->mi_sem);
+
+ nsegs = nilfs_sufile_get_nsegments(sufile);
+ if (nsegs == newnsegs)
+ goto out;
+
+ ret = -ENOSPC;
+ nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs);
+ if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs)
+ goto out;
+
+ ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+ if (ret < 0)
+ goto out;
+
+ if (newnsegs > nsegs) {
+ sui->ncleansegs += newnsegs - nsegs;
+ } else /* newnsegs < nsegs */ {
+ ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1);
+ if (ret < 0)
+ goto out_header;
+
+ sui->ncleansegs -= nsegs - newnsegs;
+ }
+
+ kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
+ header = kaddr + bh_offset(header_bh);
+ header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ mark_buffer_dirty(header_bh);
+ nilfs_mdt_mark_dirty(sufile);
+ nilfs_set_nsegments(nilfs, newnsegs);
+
+out_header:
+ brelse(header_bh);
+out:
+ up_write(&NILFS_MDT(sufile)->mi_sem);
+ return ret;
+}
+
+/**
* nilfs_sufile_get_suinfo -
* @sufile: inode of segment usage file
* @segnum: segment number to start looking
@@ -583,7 +812,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
struct nilfs_segment_usage *su;
struct nilfs_suinfo *si = buf;
size_t susz = NILFS_MDT(sufile)->mi_entry_size;
- struct the_nilfs *nilfs = NILFS_I_NILFS(sufile);
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
void *kaddr;
unsigned long nsegs, segusages_per_block;
ssize_t n;
@@ -679,6 +908,9 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
kunmap_atomic(kaddr, KM_USER0);
brelse(header_bh);
+ sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;
+ sui->allocmin = 0;
+
unlock_new_inode(sufile);
out:
*inodep = sufile;
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a943fbacb45b..e84bc5b51fc1 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -31,11 +31,12 @@
static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
{
- return NILFS_I_NILFS(sufile)->ns_nsegments;
+ return ((struct the_nilfs *)sufile->i_sb->s_fs_info)->ns_nsegments;
}
unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile);
+int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end);
int nilfs_sufile_alloc(struct inode *, __u64 *);
int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum);
int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
@@ -61,6 +62,7 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *,
void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
struct buffer_head *);
+int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
int nilfs_sufile_read(struct super_block *sb, size_t susize,
struct nilfs_inode *raw_inode, struct inode **inodep);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 062cca065195..8351c44a7320 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -56,6 +56,7 @@
#include "btnode.h"
#include "page.h"
#include "cpfile.h"
+#include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */
#include "ifile.h"
#include "dat.h"
#include "segment.h"
@@ -165,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
ii->i_state = 0;
ii->i_cno = 0;
ii->vfs_inode.i_version = 1;
- nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi);
+ nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi);
return &ii->vfs_inode;
}
@@ -347,6 +348,134 @@ int nilfs_cleanup_super(struct super_block *sb)
return ret;
}
+/**
+ * nilfs_move_2nd_super - relocate secondary super block
+ * @sb: super block instance
+ * @sb2off: new offset of the secondary super block (in bytes)
+ */
+static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
+{
+ struct the_nilfs *nilfs = sb->s_fs_info;
+ struct buffer_head *nsbh;
+ struct nilfs_super_block *nsbp;
+ sector_t blocknr, newblocknr;
+ unsigned long offset;
+ int sb2i = -1; /* array index of the secondary superblock */
+ int ret = 0;
+
+ /* nilfs->ns_sem must be locked by the caller. */
+ if (nilfs->ns_sbh[1] &&
+ nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) {
+ sb2i = 1;
+ blocknr = nilfs->ns_sbh[1]->b_blocknr;
+ } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
+ sb2i = 0;
+ blocknr = nilfs->ns_sbh[0]->b_blocknr;
+ }
+ if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
+ goto out; /* super block location is unchanged */
+
+ /* Get new super block buffer */
+ newblocknr = sb2off >> nilfs->ns_blocksize_bits;
+ offset = sb2off & (nilfs->ns_blocksize - 1);
+ nsbh = sb_getblk(sb, newblocknr);
+ if (!nsbh) {
+ printk(KERN_WARNING
+ "NILFS warning: unable to move secondary superblock "
+ "to block %llu\n", (unsigned long long)newblocknr);
+ ret = -EIO;
+ goto out;
+ }
+ nsbp = (void *)nsbh->b_data + offset;
+ memset(nsbp, 0, nilfs->ns_blocksize);
+
+ if (sb2i >= 0) {
+ memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize);
+ brelse(nilfs->ns_sbh[sb2i]);
+ nilfs->ns_sbh[sb2i] = nsbh;
+ nilfs->ns_sbp[sb2i] = nsbp;
+ } else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) {
+ /* secondary super block will be restored to index 1 */
+ nilfs->ns_sbh[1] = nsbh;
+ nilfs->ns_sbp[1] = nsbp;
+ } else {
+ brelse(nsbh);
+ }
+out:
+ return ret;
+}
+
+/**
+ * nilfs_resize_fs - resize the filesystem
+ * @sb: super block instance
+ * @newsize: new size of the filesystem (in bytes)
+ */
+int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
+{
+ struct the_nilfs *nilfs = sb->s_fs_info;
+ struct nilfs_super_block **sbp;
+ __u64 devsize, newnsegs;
+ loff_t sb2off;
+ int ret;
+
+ ret = -ERANGE;
+ devsize = i_size_read(sb->s_bdev->bd_inode);
+ if (newsize > devsize)
+ goto out;
+
+ /*
+ * Write lock is required to protect some functions depending
+ * on the number of segments, the number of reserved segments,
+ * and so forth.
+ */
+ down_write(&nilfs->ns_segctor_sem);
+
+ sb2off = NILFS_SB2_OFFSET_BYTES(newsize);
+ newnsegs = sb2off >> nilfs->ns_blocksize_bits;
+ do_div(newnsegs, nilfs->ns_blocks_per_segment);
+
+ ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs);
+ up_write(&nilfs->ns_segctor_sem);
+ if (ret < 0)
+ goto out;
+
+ ret = nilfs_construct_segment(sb);
+ if (ret < 0)
+ goto out;
+
+ down_write(&nilfs->ns_sem);
+ nilfs_move_2nd_super(sb, sb2off);
+ ret = -EIO;
+ sbp = nilfs_prepare_super(sb, 0);
+ if (likely(sbp)) {
+ nilfs_set_log_cursor(sbp[0], nilfs);
+ /*
+ * Drop NILFS_RESIZE_FS flag for compatibility with
+ * mount-time resize which may be implemented in a
+ * future release.
+ */
+ sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) &
+ ~NILFS_RESIZE_FS);
+ sbp[0]->s_dev_size = cpu_to_le64(newsize);
+ sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments);
+ if (sbp[1])
+ memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
+ ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
+ }
+ up_write(&nilfs->ns_sem);
+
+ /*
+ * Reset the range of allocatable segments last. This order
+ * is important in the case of expansion because the secondary
+ * superblock must be protected from log write until migration
+ * completes.
+ */
+ if (!ret)
+ nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1);
+out:
+ return ret;
+}
+
static void nilfs_put_super(struct super_block *sb)
{
struct the_nilfs *nilfs = sb->s_fs_info;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index d2acd1a651f3..d32714094375 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -363,6 +363,24 @@ static unsigned long long nilfs_max_size(unsigned int blkbits)
return res;
}
+/**
+ * nilfs_nrsvsegs - calculate the number of reserved segments
+ * @nilfs: nilfs object
+ * @nsegs: total number of segments
+ */
+unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs)
+{
+ return max_t(unsigned long, NILFS_MIN_NRSVSEGS,
+ DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage,
+ 100));
+}
+
+void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
+{
+ nilfs->ns_nsegments = nsegs;
+ nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs);
+}
+
static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
struct nilfs_super_block *sbp)
{
@@ -389,13 +407,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
}
nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
- nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments);
nilfs->ns_r_segments_percentage =
le32_to_cpu(sbp->s_r_segments_percentage);
- nilfs->ns_nrsvsegs =
- max_t(unsigned long, NILFS_MIN_NRSVSEGS,
- DIV_ROUND_UP(nilfs->ns_nsegments *
- nilfs->ns_r_segments_percentage, 100));
+ nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));
nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
return 0;
}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index f4968145c2a3..9992b11312ff 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -268,6 +268,8 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev);
void destroy_nilfs(struct the_nilfs *nilfs);
int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
+unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs);
+void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs);
int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 5d32749c896d..3c7606cff1ab 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3706,7 +3706,7 @@ int ocfs2_refcount_cow_xattr(struct inode *inode,
context->cow_start = cow_start;
context->cow_len = cow_len;
context->ref_tree = ref_tree;
- context->ref_root_bh = ref_root_bh;;
+ context->ref_root_bh = ref_root_bh;
context->cow_object = xv;
context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index de4ff29f1e05..c368360c35a1 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -240,8 +240,12 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
int ret;
- if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode))
- return -ENOTEMPTY;
+
+ if (S_ISDIR(inode->i_mode)) {
+ dentry_unhash(dentry);
+ if (!omfs_dir_is_empty(inode))
+ return -ENOTEMPTY;
+ }
ret = omfs_delete_entry(dentry);
if (ret)
@@ -378,6 +382,9 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int err;
if (new_inode) {
+ if (S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
/* overwriting existing file/dir */
err = omfs_remove(new_dir, new_dentry);
if (err)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index d545e97d99c3..8ed4d3433199 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%u\n", p->discard_alignment);
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%u\n",
+ queue_limit_discard_alignment(&disk->queue->limits,
+ p->start_sect));
}
ssize_t part_stat_show(struct device *dev,
@@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
p->start_sect = start;
p->alignment_offset =
queue_limit_alignment_offset(&disk->queue->limits, start);
- p->discard_alignment =
- queue_limit_discard_alignment(&disk->queue->limits, start);
p->nr_sects = len;
p->partno = partno;
p->policy = get_disk_ro(disk);
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index ce4f62440425..af9fdf046769 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -565,7 +565,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state)
data = read_part_sector(state, 0, &sect);
if (!data) {
- ldm_crit ("Disk read failed.");
+ ldm_info ("Disk read failed.");
return false;
}
@@ -1335,6 +1335,11 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
list_add_tail (&f->list, frags);
found:
+ if (rec >= f->num) {
+ ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num);
+ return false;
+ }
+
if (f->map & (1 << rec)) {
ldm_error ("Duplicate VBLK, part %d.", rec);
f->map &= 0x7F; /* Mark the group as broken */
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index df434c5f28fb..c1c729335924 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -20,6 +20,7 @@ proc-y += stat.o
proc-y += uptime.o
proc-y += version.o
proc-y += softirqs.o
+proc-y += namespaces.o
proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
proc-$(CONFIG_NET) += proc_net.o
proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dfa532730e55..dc8bca72b002 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -600,7 +600,7 @@ static int proc_fd_access_allowed(struct inode *inode)
return allowed;
}
-static int proc_setattr(struct dentry *dentry, struct iattr *attr)
+int proc_setattr(struct dentry *dentry, struct iattr *attr)
{
int error;
struct inode *inode = dentry->d_inode;
@@ -1736,8 +1736,7 @@ static int task_dumpable(struct task_struct *task)
return 0;
}
-
-static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
+struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
{
struct inode * inode;
struct proc_inode *ei;
@@ -1779,7 +1778,7 @@ out_unlock:
return NULL;
}
-static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
struct task_struct *task;
@@ -1820,7 +1819,7 @@ static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat
* made this apply to all per process world readable and executable
* directories.
*/
-static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
+int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode;
struct task_struct *task;
@@ -1862,7 +1861,7 @@ static int pid_delete_dentry(const struct dentry * dentry)
return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
}
-static const struct dentry_operations pid_dentry_operations =
+const struct dentry_operations pid_dentry_operations =
{
.d_revalidate = pid_revalidate,
.d_delete = pid_delete_dentry,
@@ -1870,9 +1869,6 @@ static const struct dentry_operations pid_dentry_operations =
/* Lookups */
-typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
- struct task_struct *, const void *);
-
/*
* Fill a directory entry.
*
@@ -1885,8 +1881,8 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
* reported by readdir in sync with the inode numbers reported
* by stat.
*/
-static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
- char *name, int len,
+int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+ const char *name, int len,
instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
struct dentry *child, *dir = filp->f_path.dentry;
@@ -2820,6 +2816,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
@@ -3168,6 +3165,7 @@ out_no_task:
static const struct pid_entry tid_base_stuff[] = {
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
+ DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
REG("environ", S_IRUSR, proc_environ_operations),
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index f1281339b6fa..f1637f17c37c 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -674,6 +674,7 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
}
return ent;
}
+EXPORT_SYMBOL(proc_mkdir_mode);
struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name,
struct proc_dir_entry *parent)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d15aa1b1cc8f..74b48cfa1bb2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -28,6 +28,7 @@ static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
+ const struct proc_ns_operations *ns_ops;
truncate_inode_pages(&inode->i_data, 0);
end_writeback(inode);
@@ -44,6 +45,10 @@ static void proc_evict_inode(struct inode *inode)
rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
sysctl_head_put(head);
}
+ /* Release any associated namespace */
+ ns_ops = PROC_I(inode)->ns_ops;
+ if (ns_ops && ns_ops->put)
+ ns_ops->put(PROC_I(inode)->ns);
}
static struct kmem_cache * proc_inode_cachep;
@@ -62,6 +67,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->pde = NULL;
ei->sysctl = NULL;
ei->sysctl_entry = NULL;
+ ei->ns = NULL;
+ ei->ns_ops = NULL;
inode = &ei->vfs_inode;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
return inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index c03e8d3a3a5b..7838e5cfec14 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,6 +61,14 @@ extern const struct file_operations proc_pagemap_operations;
extern const struct file_operations proc_net_operations;
extern const struct inode_operations proc_net_inode_operations;
+struct proc_maps_private {
+ struct pid *pid;
+ struct task_struct *task;
+#ifdef CONFIG_MMU
+ struct vm_area_struct *tail_vma;
+#endif
+};
+
void proc_init_inodecache(void);
static inline struct pid *proc_pid(struct inode *inode)
@@ -119,3 +127,21 @@ struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
*/
int proc_readdir(struct file *, void *, filldir_t);
struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
+
+
+
+/* Lookups */
+typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
+ struct task_struct *, const void *);
+int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
+ const char *name, int len,
+ instantiate_t instantiate, struct task_struct *task, const void *ptr);
+int pid_revalidate(struct dentry *dentry, struct nameidata *nd);
+struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task);
+extern const struct dentry_operations pid_dentry_operations;
+int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
+int proc_setattr(struct dentry *dentry, struct iattr *attr);
+
+extern const struct inode_operations proc_ns_dir_inode_operations;
+extern const struct file_operations proc_ns_dir_operations;
+
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
new file mode 100644
index 000000000000..781dec5bd682
--- /dev/null
+++ b/fs/proc/namespaces.c
@@ -0,0 +1,198 @@
+#include <linux/proc_fs.h>
+#include <linux/nsproxy.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/fs_struct.h>
+#include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <net/net_namespace.h>
+#include <linux/mnt_namespace.h>
+#include <linux/ipc_namespace.h>
+#include <linux/pid_namespace.h>
+#include "internal.h"
+
+
+static const struct proc_ns_operations *ns_entries[] = {
+#ifdef CONFIG_NET_NS
+ &netns_operations,
+#endif
+#ifdef CONFIG_UTS_NS
+ &utsns_operations,
+#endif
+#ifdef CONFIG_IPC_NS
+ &ipcns_operations,
+#endif
+};
+
+static const struct file_operations ns_file_operations = {
+ .llseek = no_llseek,
+};
+
+/*
+ * Create the inode and dentry for one /proc/<pid>/ns/<name> file.
+ *
+ * @dir:    the "ns" directory inode (supplies the superblock)
+ * @dentry: the dentry being instantiated
+ * @task:   task whose namespace is being exposed
+ * @ptr:    the struct proc_ns_operations describing the namespace type
+ *
+ * Returns NULL once the dentry has been added and revalidated, or
+ * ERR_PTR(-ENOENT) if the inode could not be made, the namespace
+ * reference could not be taken, or revalidation failed.
+ */
+static struct dentry *proc_ns_instantiate(struct inode *dir,
+	struct dentry *dentry, struct task_struct *task, const void *ptr)
+{
+	const struct proc_ns_operations *ns_ops = ptr;
+	struct inode *inode;
+	struct proc_inode *ei;
+	struct dentry *error = ERR_PTR(-ENOENT);
+	void *ns;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	/*
+	 * Take the namespace reference before publishing ns_ops in the
+	 * inode.  proc_evict_inode() calls ns_ops->put(ei->ns) whenever
+	 * ns_ops is set, so if get() fails the inode must still have
+	 * ns_ops == NULL when iput() tears it down; otherwise put() would
+	 * be handed a NULL namespace.
+	 */
+	ns = ns_ops->get(task);
+	if (!ns)
+		goto out_iput;
+
+	ei = PROC_I(inode);
+	inode->i_mode = S_IFREG|S_IRUSR;
+	inode->i_fop = &ns_file_operations;
+	ei->ns_ops = ns_ops;
+	ei->ns = ns;
+
+	dentry->d_op = &pid_dentry_operations;
+	d_add(dentry, inode);
+	/* Close the race of the process dying before we return the dentry */
+	if (pid_revalidate(dentry, NULL))
+		error = NULL;
+out:
+	return error;
+out_iput:
+	iput(inode);
+	goto out;
+}
+
+/*
+ * Hand one namespace entry to proc_fill_cache(): the directory entry is
+ * named after ops->name and is instantiated by proc_ns_instantiate()
+ * with @ops as its private pointer.
+ */
+static int proc_ns_fill_cache(struct file *filp, void *dirent,
+	filldir_t filldir, struct task_struct *task,
+	const struct proc_ns_operations *ops)
+{
+	const char *name = ops->name;
+	int len = strlen(name);
+
+	return proc_fill_cache(filp, dirent, filldir, name, len,
+			       proc_ns_instantiate, task, ops);
+}
+
+/*
+ * readdir for the per-task "ns" directory: emits ".", ".." and then one
+ * entry per element of ns_entries[], resuming from filp->f_pos.
+ *
+ * Returns -ENOENT if the task cannot be found, -EPERM if the caller
+ * fails the PTRACE_MODE_READ check, 0 if filling stopped early, and 1
+ * once every entry from the current position has been emitted.
+ */
+static int proc_ns_dir_readdir(struct file *filp, void *dirent,
+	filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task = get_proc_task(inode);
+	size_t idx;
+	ino_t ino;
+	int pos;
+	int ret;
+
+	if (!task)
+		return -ENOENT;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	pos = filp->f_pos;
+
+	/* Position 0 is "." ... */
+	if (pos == 0) {
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0)
+			goto out;
+		pos++;
+		filp->f_pos++;
+	}
+
+	/* ... position 1 is ".." ... */
+	if (pos == 1) {
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0)
+			goto out;
+		pos++;
+		filp->f_pos++;
+	}
+
+	/*
+	 * ... and everything else indexes ns_entries[].  A position that
+	 * is negative after the adjustment converts to a huge unsigned
+	 * index, so like a position past the table it emits nothing.
+	 */
+	pos -= 2;
+	for (idx = pos; idx < ARRAY_SIZE(ns_entries); idx++) {
+		if (proc_ns_fill_cache(filp, dirent, filldir,
+				       task, ns_entries[idx]) < 0)
+			goto out;
+		filp->f_pos++;
+	}
+
+	ret = 1;
+out:
+	put_task_struct(task);
+	return ret;
+}
+
+const struct file_operations proc_ns_dir_operations = {
+ .read = generic_read_dir,
+ .readdir = proc_ns_dir_readdir,
+};
+
+/*
+ * Lookup in the per-task "ns" directory: match the requested name
+ * against ns_entries[] and instantiate the matching namespace file.
+ *
+ * Returns the result of proc_ns_instantiate() on a match,
+ * ERR_PTR(-ENOENT) if the task cannot be found or no entry has that
+ * name, and ERR_PTR(-EPERM) if the caller fails the PTRACE_MODE_READ
+ * check.
+ */
+static struct dentry *proc_ns_dir_lookup(struct inode *dir,
+	struct dentry *dentry, struct nameidata *nd)
+{
+	struct dentry *error;
+	struct task_struct *task = get_proc_task(dir);
+	const struct proc_ns_operations **entry, **end;
+	unsigned int len = dentry->d_name.len;
+
+	error = ERR_PTR(-ENOENT);
+
+	if (!task)
+		goto out_no_task;
+
+	error = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	/*
+	 * Use a one-past-the-end bound rather than a pointer to the last
+	 * element: ns_entries[] is empty when no namespace option is
+	 * configured, and "last element" would then point before the
+	 * array.
+	 */
+	end = ns_entries + ARRAY_SIZE(ns_entries);
+	for (entry = ns_entries; entry < end; entry++) {
+		if (strlen((*entry)->name) != len)
+			continue;
+		if (!memcmp(dentry->d_name.name, (*entry)->name, len))
+			break;
+	}
+	error = ERR_PTR(-ENOENT);
+	if (entry == end)
+		goto out;
+
+	error = proc_ns_instantiate(dir, dentry, task, *entry);
+out:
+	put_task_struct(task);
+out_no_task:
+	return error;
+}
+
+const struct inode_operations proc_ns_dir_inode_operations = {
+ .lookup = proc_ns_dir_lookup,
+ .getattr = pid_getattr,
+ .setattr = proc_setattr,
+};
+
+/*
+ * Resolve @fd to its struct file, accepting it only when its f_op is
+ * ns_file_operations (i.e. it is one of the namespace files created in
+ * this file).
+ *
+ * Returns the file with the fget() reference held, ERR_PTR(-EBADF) if
+ * fget() fails, or ERR_PTR(-EINVAL) after dropping the reference if the
+ * file is of any other kind.
+ */
+struct file *proc_ns_fget(int fd)
+{
+	struct file *file = fget(fd);
+
+	if (!file)
+		return ERR_PTR(-EBADF);
+
+	if (file->f_op == &ns_file_operations)
+		return file;
+
+	fput(file);
+	return ERR_PTR(-EINVAL);
+}
+
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 318d8654989b..db15935fa757 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -211,7 +211,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
- int flags = vma->vm_flags;
+ vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
@@ -858,7 +858,192 @@ const struct file_operations proc_pagemap_operations = {
#endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA
-extern int show_numa_map(struct seq_file *m, void *v);
+
+struct numa_maps {
+ struct vm_area_struct *vma;
+ unsigned long pages;
+ unsigned long anon;
+ unsigned long active;
+ unsigned long writeback;
+ unsigned long mapcount_max;
+ unsigned long dirty;
+ unsigned long swapcache;
+ unsigned long node[MAX_NUMNODES];
+};
+
+struct numa_maps_private {
+ struct proc_maps_private proc_maps;
+ struct numa_maps md;
+};
+
+/*
+ * Fold one page into the statistics accumulated in @md.
+ *
+ * @pte_dirty is non-zero when the mapping entry is dirty; the page is
+ * counted as dirty if either that or PageDirty() holds.
+ */
+static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty)
+{
+	int mapcount = page_mapcount(page);
+
+	/* Totals: overall and per node. */
+	md->pages++;
+	md->node[page_to_nid(page)]++;
+
+	/* Track the highest map count seen so far. */
+	if (mapcount > md->mapcount_max)
+		md->mapcount_max = mapcount;
+
+	/* Per-state counters. */
+	if (PageAnon(page))
+		md->anon++;
+
+	if (pte_dirty || PageDirty(page))
+		md->dirty++;
+
+	if (PageSwapCache(page))
+		md->swapcache++;
+
+	if (PageActive(page) || PageUnevictable(page))
+		md->active++;
+
+	if (PageWriteback(page))
+		md->writeback++;
+}
+
+/*
+ * pmd_entry callback for the page walk: with the PTE lock held, step
+ * through the PTEs covering [addr, end) one PAGE_SIZE at a time and
+ * account every page that is present, is a normal page of md->vma, is
+ * not reserved, and lives on a node set in node_states[N_HIGH_MEMORY].
+ *
+ * Always returns 0.
+ */
+static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
+	unsigned long end, struct mm_walk *walk)
+{
+	struct numa_maps *md = walk->private;
+	spinlock_t *ptl;
+	pte_t *first_pte;
+	pte_t *pte;
+
+	first_pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	pte = first_pte;
+	do {
+		struct page *page;
+
+		if (!pte_present(*pte))
+			continue;
+
+		page = vm_normal_page(md->vma, addr, *pte);
+		if (!page || PageReserved(page))
+			continue;
+
+		if (!node_isset(page_to_nid(page), node_states[N_HIGH_MEMORY]))
+			continue;
+
+		gather_stats(page, md, pte_dirty(*pte));
+
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	/* Unlock using the pointer returned by the map/lock call. */
+	pte_unmap_unlock(first_pte, ptl);
+	return 0;
+}
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * hugetlb_entry callback for the page walk: if @pte is populated and
+ * yields a page, account that page in the numa_maps passed via
+ * walk->private.  Always returns 0.
+ */
+static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+	unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+	struct page *page;
+
+	if (pte_none(*pte))
+		return 0;
+
+	page = pte_page(*pte);
+	if (page) {
+		struct numa_maps *md = walk->private;
+
+		gather_stats(page, md, pte_dirty(*pte));
+	}
+
+	return 0;
+}
+
+#else
+/* Stub used when CONFIG_HUGETLB_PAGE is off: gathers nothing. */
+static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+	unsigned long addr, unsigned long end, struct mm_walk *walk)
+{
+	return 0;
+}
+#endif
+
+/*
+ * seq_file ->show for numa_maps: print one line for the VMA @v giving
+ * its start address, memory policy string, what it maps (file path,
+ * "heap" or "stack" when recognised) and the page statistics gathered
+ * by walking the VMA, including per-node page counts.
+ *
+ * Always returns 0; a VMA without an mm prints nothing.
+ */
+static int show_numa_map(struct seq_file *m, void *v)
+{
+	struct numa_maps_private *numa_priv = m->private;
+	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
+	struct numa_maps *md = &numa_priv->md;
+	struct vm_area_struct *vma = v;
+	struct file *file = vma->vm_file;
+	struct mm_struct *mm = vma->vm_mm;
+	struct mm_walk walk = {
+		.hugetlb_entry	= gather_hugetbl_stats,
+		.pmd_entry	= gather_pte_stats,
+		.private	= md,
+		.mm		= mm,
+	};
+	struct mempolicy *pol;
+	char buffer[50];
+	int nid;
+
+	if (!mm)
+		return 0;
+
+	/* Start from an empty set of statistics for this VMA. */
+	memset(md, 0, sizeof(*md));
+	md->vma = vma;
+
+	pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
+	mpol_to_str(buffer, sizeof(buffer), pol, 0);
+	mpol_cond_put(pol);
+
+	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+
+	if (file) {
+		seq_printf(m, " file=");
+		seq_path(m, &file->f_path, "\n\t= ");
+	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+		seq_printf(m, " heap");
+	} else if (vma->vm_start <= mm->start_stack &&
+		   vma->vm_end >= mm->start_stack) {
+		seq_printf(m, " stack");
+	}
+
+	walk_page_range(vma->vm_start, vma->vm_end, &walk);
+
+	/* Only print counters when the walk found at least one page. */
+	if (md->pages) {
+		if (md->anon)
+			seq_printf(m, " anon=%lu", md->anon);
+
+		if (md->dirty)
+			seq_printf(m, " dirty=%lu", md->dirty);
+
+		if (md->pages != md->anon && md->pages != md->dirty)
+			seq_printf(m, " mapped=%lu", md->pages);
+
+		if (md->mapcount_max > 1)
+			seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+		if (md->swapcache)
+			seq_printf(m, " swapcache=%lu", md->swapcache);
+
+		if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+			seq_printf(m, " active=%lu", md->active);
+
+		if (md->writeback)
+			seq_printf(m, " writeback=%lu", md->writeback);
+
+		for_each_node_state(nid, N_HIGH_MEMORY) {
+			if (md->node[nid])
+				seq_printf(m, " N%d=%lu", nid, md->node[nid]);
+		}
+	}
+
+	seq_putc(m, '\n');
+
+	/* Record the resume point only if the buffer has not filled up. */
+	if (m->count < m->size) {
+		if (vma != proc_priv->tail_vma)
+			m->version = vma->vm_start;
+		else
+			m->version = 0;
+	}
+	return 0;
+}
static const struct seq_operations proc_pid_numa_maps_op = {
.start = m_start,
@@ -869,7 +1054,20 @@ static const struct seq_operations proc_pid_numa_maps_op = {
static int numa_maps_open(struct inode *inode, struct file *file)
{
- return do_maps_open(inode, file, &proc_pid_numa_maps_op);
+ struct numa_maps_private *priv;
+ int ret = -ENOMEM;
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (priv) {
+ priv->proc_maps.pid = proc_pid(inode);
+ ret = seq_open(file, &proc_pid_numa_maps_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = priv;
+ } else {
+ kfree(priv);
+ }
+ }
+ return ret;
}
const struct file_operations proc_numa_maps_operations = {
@@ -878,4 +1076,4 @@ const struct file_operations proc_numa_maps_operations = {
.llseek = seq_lseek,
.release = seq_release_private,
};
-#endif
+#endif /* CONFIG_NUMA */
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f835a25625ff..f2c3ff20ea68 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -152,21 +152,27 @@ EXPORT_SYMBOL_GPL(pstore_register);
void pstore_get_records(void)
{
struct pstore_info *psi = psinfo;
- size_t size;
+ ssize_t size;
u64 id;
enum pstore_type_id type;
struct timespec time;
- int failed = 0;
+ int failed = 0, rc;
if (!psi)
return;
mutex_lock(&psinfo->buf_mutex);
+ rc = psi->open(psi);
+ if (rc)
+ goto out;
+
while ((size = psi->read(&id, &type, &time)) > 0) {
- if (pstore_mkfile(type, psi->name, id, psi->buf, size,
+ if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size,
time, psi->erase))
failed++;
}
+ psi->close(psi);
+out:
mutex_unlock(&psinfo->buf_mutex);
if (failed)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index d3c032f5fa0a..5b572c89e6c4 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -691,8 +691,11 @@ static void prune_dqcache(int count)
* This is called from kswapd when we think we need some
* more memory
*/
-static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink,
+ struct shrink_control *sc)
{
+ int nr = sc->nr_to_scan;
+
if (nr) {
spin_lock(&dq_list_lock);
prune_dqcache(nr);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 118662690cdf..76c8164d5651 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,6 +831,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
INITIALIZE_PATH(path);
struct reiserfs_dir_entry de;
+ dentry_unhash(dentry);
+
/* we will be doing 2 balancings and update 2 stat data, we change quotas
* of the owner of the directory and of the owner of the parent directory.
* The quota structure is possibly deleted only on last iput => outside
@@ -1225,6 +1227,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
unsigned long savelink = 1;
struct timespec ctime;
+ if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+ dentry_unhash(new_dentry);
+
/* three balancings: (1) old name removal, (2) new name insertion
and (3) maybe "save" link insertion
stat data updates: (1) old directory,
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 47d2a4498b03..50f1abccd1cd 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -105,7 +105,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
mutex_unlock(&dentry->d_inode->i_mutex);
if (!error)
d_delete(dentry);
- dput(dentry);
return error;
}
diff --git a/fs/splice.c b/fs/splice.c
index 50a5d978da16..aa866d309695 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -162,6 +162,14 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = {
.get = generic_pipe_buf_get,
};
+/*
+ * Tell readers of @pipe that there is something to read: wake anyone
+ * sleeping on the pipe's wait queue and signal SIGIO/POLL_IN to the
+ * fasync readers.
+ */
+static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
+{
+	/* The barrier is issued before the waitqueue_active() check. */
+	smp_mb();
+	if (waitqueue_active(&pipe->wait)) {
+		wake_up_interruptible(&pipe->wait);
+	}
+
+	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+}
+
/**
* splice_to_pipe - fill passed data into a pipe
* @pipe: pipe to fill
@@ -247,12 +255,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
pipe_unlock(pipe);
- if (do_wakeup) {
- smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible(&pipe->wait);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- }
+ if (do_wakeup)
+ wakeup_pipe_readers(pipe);
while (page_nr < spd_pages)
spd->spd_release(spd, page_nr++);
@@ -1892,12 +1896,9 @@ retry:
/*
* If we put data in the output pipe, wakeup any potential readers.
*/
- if (ret > 0) {
- smp_mb();
- if (waitqueue_active(&opipe->wait))
- wake_up_interruptible(&opipe->wait);
- kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
- }
+ if (ret > 0)
+ wakeup_pipe_readers(opipe);
+
if (input_wakeup)
wakeup_pipe_writers(ipipe);
@@ -1976,12 +1977,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
/*
* If we put data in the output pipe, wakeup any potential readers.
*/
- if (ret > 0) {
- smp_mb();
- if (waitqueue_active(&opipe->wait))
- wake_up_interruptible(&opipe->wait);
- kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
- }
+ if (ret > 0)
+ wakeup_pipe_readers(opipe);
return ret;
}
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index efc309fa3035..7797218d0b30 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -42,7 +42,7 @@ config SQUASHFS_LZO
select LZO_DECOMPRESS
help
Saying Y here includes support for reading Squashfs file systems
- compressed with LZO compresssion. LZO compression is mainly
+ compressed with LZO compression. LZO compression is mainly
aimed at embedded systems with slower CPUs where the overheads
of zlib are too high.
@@ -57,7 +57,7 @@ config SQUASHFS_XZ
select XZ_DEC
help
Saying Y here includes support for reading Squashfs file systems
- compressed with XZ compresssion. XZ gives better compression than
+ compressed with XZ compression. XZ gives better compression than
the default zlib compression, at the expense of greater CPU and
memory overhead.
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index c37b520132ff..4b5a3fbb1f1f 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -29,7 +29,7 @@
* plus functions layered ontop of the generic cache implementation to
* access the metadata and fragment caches.
*
- * To avoid out of memory and fragmentation isssues with vmalloc the cache
+ * To avoid out of memory and fragmentation issues with vmalloc the cache
* uses sequences of kmalloced PAGE_CACHE_SIZE buffers.
*
* It should be noted that the cache is not used for file datablocks, these
diff --git a/fs/super.c b/fs/super.c
index b383fa407740..c75593953c52 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -951,8 +951,7 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
* filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
* but s_maxbytes was an unsigned long long for many releases. Throw
* this warning for a little while to try and catch filesystems that
- * violate this rule. This warning should be either removed or
- * converted to a BUG() in 2.6.34.
+ * violate this rule.
*/
WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
"negative value (%lld)\n", type->name, sb->s_maxbytes);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index da3fefe91a8f..1ad8c93c1b85 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -24,13 +24,6 @@
#include "sysfs.h"
-/* used in crash dumps to help with debugging */
-static char last_sysfs_file[PATH_MAX];
-void sysfs_printk_last_file(void)
-{
- printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file);
-}
-
/*
* There's one sysfs_buffer for each open file and one
* sysfs_open_dirent for each sysfs_dirent with one or more open
@@ -337,11 +330,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
struct sysfs_buffer *buffer;
const struct sysfs_ops *ops;
int error = -EACCES;
- char *p;
-
- p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
- if (!IS_ERR(p))
- memmove(last_sysfs_file, p, strlen(p) + 1);
/* need attr_sd for attr and ops, its parent for kobj */
if (!sysfs_get_active(attr_sd))
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index c8769dc222d8..194414f8298c 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -101,9 +101,9 @@ int sysfs_create_group(struct kobject *kobj,
}
/**
- * sysfs_update_group - given a directory kobject, create an attribute group
- * @kobj: The kobject to create the group on
- * @grp: The attribute group to create
+ * sysfs_update_group - given a directory kobject, update an attribute group
+ * @kobj: The kobject to update the group on
+ * @grp: The attribute group to update
*
* This function updates an attribute group. Unlike
* sysfs_create_group(), it will explicitly not warn or error if any
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e474fbcf8bde..e2cc6756f3b1 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,6 +196,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
struct inode *inode = dentry->d_inode;
int err = -ENOTEMPTY;
+ dentry_unhash(dentry);
+
if (sysv_empty_dir(inode)) {
err = sysv_unlink(dir, dentry);
if (!err) {
@@ -222,6 +224,9 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
struct sysv_dir_entry * old_de;
int err = -ENOENT;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
old_de = sysv_find_entry(old_dentry, &old_page);
if (!old_de)
goto out;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 8c4fc1425b3e..f67acbdda5e8 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -22,16 +22,24 @@
#include <linux/anon_inodes.h>
#include <linux/timerfd.h>
#include <linux/syscalls.h>
+#include <linux/rcupdate.h>
struct timerfd_ctx {
struct hrtimer tmr;
ktime_t tintv;
+ ktime_t moffs;
wait_queue_head_t wqh;
u64 ticks;
int expired;
int clockid;
+ struct rcu_head rcu;
+ struct list_head clist;
+ bool might_cancel;
};
+static LIST_HEAD(cancel_list);
+static DEFINE_SPINLOCK(cancel_lock);
+
/*
* This gets called when the timer event triggers. We set the "expired"
* flag, but we do not re-arm the timer (in case it's necessary,
@@ -51,6 +59,63 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr)
return HRTIMER_NORESTART;
}
+/*
+ * Called when the clock was set.  Walks the cancel list under
+ * rcu_read_lock() and, for each context that still has might_cancel set
+ * and whose stored monotonic offset differs from the current one, marks
+ * it as cancelled (moffs = KTIME_MAX) and wakes its waiters.  The check
+ * and update are done with the context's wait-queue lock held.
+ */
+void timerfd_clock_was_set(void)
+{
+	ktime_t cur_offs = ktime_get_monotonic_offset();
+	struct timerfd_ctx *ctx;
+	unsigned long flags;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ctx, &cancel_list, clist) {
+		if (ctx->might_cancel) {
+			spin_lock_irqsave(&ctx->wqh.lock, flags);
+			if (ctx->moffs.tv64 != cur_offs.tv64) {
+				ctx->moffs.tv64 = KTIME_MAX;
+				wake_up_locked(&ctx->wqh);
+			}
+			spin_unlock_irqrestore(&ctx->wqh.lock, flags);
+		}
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Take @ctx off the cancel list if it is on it: clear might_cancel and
+ * unlink the context under cancel_lock.  Does nothing otherwise.
+ */
+static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
+{
+	if (!ctx->might_cancel)
+		return;
+
+	ctx->might_cancel = false;
+	spin_lock(&cancel_lock);
+	list_del_rcu(&ctx->clist);
+	spin_unlock(&cancel_lock);
+}
+
+/*
+ * Report whether @ctx has been marked cancelled (might_cancel set and
+ * moffs == KTIME_MAX).  When it has, the mark is consumed by reloading
+ * moffs with the current monotonic offset before returning true.
+ */
+static bool timerfd_canceled(struct timerfd_ctx *ctx)
+{
+	if (ctx->might_cancel && ctx->moffs.tv64 == KTIME_MAX) {
+		ctx->moffs = ktime_get_monotonic_offset();
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Bring @ctx's cancel-list membership in line with @flags.  A context
+ * belongs on the list only for a CLOCK_REALTIME timer armed with both
+ * TFD_TIMER_ABSTIME and TFD_TIMER_CANCEL_ON_SET; it is added if needed
+ * and removed if it no longer qualifies.
+ */
+static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
+{
+	bool wanted = ctx->clockid == CLOCK_REALTIME &&
+		      (flags & TFD_TIMER_ABSTIME) &&
+		      (flags & TFD_TIMER_CANCEL_ON_SET);
+
+	if (!wanted) {
+		if (ctx->might_cancel)
+			timerfd_remove_cancel(ctx);
+		return;
+	}
+
+	/* Already on the list: nothing to do. */
+	if (ctx->might_cancel)
+		return;
+
+	ctx->might_cancel = true;
+	spin_lock(&cancel_lock);
+	list_add_rcu(&ctx->clist, &cancel_list);
+	spin_unlock(&cancel_lock);
+}
+
static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
{
ktime_t remaining;
@@ -59,11 +124,12 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
return remaining.tv64 < 0 ? ktime_set(0, 0): remaining;
}
-static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
- const struct itimerspec *ktmr)
+static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
+ const struct itimerspec *ktmr)
{
enum hrtimer_mode htmode;
ktime_t texp;
+ int clockid = ctx->clockid;
htmode = (flags & TFD_TIMER_ABSTIME) ?
HRTIMER_MODE_ABS: HRTIMER_MODE_REL;
@@ -72,19 +138,24 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags,
ctx->expired = 0;
ctx->ticks = 0;
ctx->tintv = timespec_to_ktime(ktmr->it_interval);
- hrtimer_init(&ctx->tmr, ctx->clockid, htmode);
+ hrtimer_init(&ctx->tmr, clockid, htmode);
hrtimer_set_expires(&ctx->tmr, texp);
ctx->tmr.function = timerfd_tmrproc;
- if (texp.tv64 != 0)
+ if (texp.tv64 != 0) {
hrtimer_start(&ctx->tmr, texp, htmode);
+ if (timerfd_canceled(ctx))
+ return -ECANCELED;
+ }
+ return 0;
}
static int timerfd_release(struct inode *inode, struct file *file)
{
struct timerfd_ctx *ctx = file->private_data;
+ timerfd_remove_cancel(ctx);
hrtimer_cancel(&ctx->tmr);
- kfree(ctx);
+ kfree_rcu(ctx, rcu);
return 0;
}
@@ -118,8 +189,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
res = -EAGAIN;
else
res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks);
+
+ /*
+ * If clock has changed, we do not care about the
+ * ticks and we do not rearm the timer. Userspace must
+ * reevaluate anyway.
+ */
+ if (timerfd_canceled(ctx)) {
+ ctx->ticks = 0;
+ ctx->expired = 0;
+ res = -ECANCELED;
+ }
+
if (ctx->ticks) {
ticks = ctx->ticks;
+
if (ctx->expired && ctx->tintv.tv64) {
/*
* If tintv.tv64 != 0, this is a periodic timer that
@@ -183,6 +267,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
init_waitqueue_head(&ctx->wqh);
ctx->clockid = clockid;
hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
+ ctx->moffs = ktime_get_monotonic_offset();
ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
@@ -199,6 +284,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
struct file *file;
struct timerfd_ctx *ctx;
struct itimerspec ktmr, kotmr;
+ int ret;
if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
return -EFAULT;
@@ -213,6 +299,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
return PTR_ERR(file);
ctx = file->private_data;
+ timerfd_setup_cancel(ctx, flags);
+
/*
* We need to stop the existing timer before reprogramming
* it to the new values.
@@ -240,14 +328,14 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
/*
* Re-program the timer to the new value ...
*/
- timerfd_setup(ctx, flags, &ktmr);
+ ret = timerfd_setup(ctx, flags, &ktmr);
spin_unlock_irq(&ctx->wqh.lock);
fput(file);
if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
return -EFAULT;
- return 0;
+ return ret;
}
SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 8b3a7da531eb..315de66e52b2 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -106,7 +106,7 @@ static long long get_liability(struct ubifs_info *c)
long long liab;
spin_lock(&c->space_lock);
- liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
+ liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth;
spin_unlock(&c->space_lock);
return liab;
}
@@ -180,7 +180,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
int idx_lebs;
long long idx_size;
- idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
+ idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx;
/* And make sure we have thrice the index size of space reserved */
idx_size += idx_size << 1;
/*
@@ -292,13 +292,13 @@ static int can_use_rp(struct ubifs_info *c)
* budgeted index space to the size of the current index, multiplies this by 3,
* and makes sure this does not exceed the amount of free LEBs.
*
- * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
+ * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables:
* o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
* be large, because UBIFS does not do any index consolidation as long as
* there is free space. IOW, the index may take a lot of LEBs, but the LEBs
* will contain a lot of dirt.
- * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW,
- * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
+ * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW,
+ * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs.
*
* This function returns zero in case of success, and %-ENOSPC in case of
* failure.
@@ -343,13 +343,13 @@ static int do_budget_space(struct ubifs_info *c)
c->lst.taken_empty_lebs;
if (unlikely(rsvd_idx_lebs > lebs)) {
dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
- "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs,
+ "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs,
rsvd_idx_lebs);
return -ENOSPC;
}
available = ubifs_calc_available(c, min_idx_lebs);
- outstanding = c->budg_data_growth + c->budg_dd_growth;
+ outstanding = c->bi.data_growth + c->bi.dd_growth;
if (unlikely(available < outstanding)) {
dbg_budg("out of data space: available %lld, outstanding %lld",
@@ -360,7 +360,7 @@ static int do_budget_space(struct ubifs_info *c)
if (available - outstanding <= c->rp_size && !can_use_rp(c))
return -ENOSPC;
- c->min_idx_lebs = min_idx_lebs;
+ c->bi.min_idx_lebs = min_idx_lebs;
return 0;
}
@@ -393,11 +393,11 @@ static int calc_data_growth(const struct ubifs_info *c,
{
int data_growth;
- data_growth = req->new_ino ? c->inode_budget : 0;
+ data_growth = req->new_ino ? c->bi.inode_budget : 0;
if (req->new_page)
- data_growth += c->page_budget;
+ data_growth += c->bi.page_budget;
if (req->new_dent)
- data_growth += c->dent_budget;
+ data_growth += c->bi.dent_budget;
data_growth += req->new_ino_d;
return data_growth;
}
@@ -413,12 +413,12 @@ static int calc_dd_growth(const struct ubifs_info *c,
{
int dd_growth;
- dd_growth = req->dirtied_page ? c->page_budget : 0;
+ dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
if (req->dirtied_ino)
- dd_growth += c->inode_budget << (req->dirtied_ino - 1);
+ dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1);
if (req->mod_dent)
- dd_growth += c->dent_budget;
+ dd_growth += c->bi.dent_budget;
dd_growth += req->dirtied_ino_d;
return dd_growth;
}
@@ -460,19 +460,19 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
again:
spin_lock(&c->space_lock);
- ubifs_assert(c->budg_idx_growth >= 0);
- ubifs_assert(c->budg_data_growth >= 0);
- ubifs_assert(c->budg_dd_growth >= 0);
+ ubifs_assert(c->bi.idx_growth >= 0);
+ ubifs_assert(c->bi.data_growth >= 0);
+ ubifs_assert(c->bi.dd_growth >= 0);
- if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) {
+ if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) {
dbg_budg("no space");
spin_unlock(&c->space_lock);
return -ENOSPC;
}
- c->budg_idx_growth += idx_growth;
- c->budg_data_growth += data_growth;
- c->budg_dd_growth += dd_growth;
+ c->bi.idx_growth += idx_growth;
+ c->bi.data_growth += data_growth;
+ c->bi.dd_growth += dd_growth;
err = do_budget_space(c);
if (likely(!err)) {
@@ -484,9 +484,9 @@ again:
}
/* Restore the old values */
- c->budg_idx_growth -= idx_growth;
- c->budg_data_growth -= data_growth;
- c->budg_dd_growth -= dd_growth;
+ c->bi.idx_growth -= idx_growth;
+ c->bi.data_growth -= data_growth;
+ c->bi.dd_growth -= dd_growth;
spin_unlock(&c->space_lock);
if (req->fast) {
@@ -506,9 +506,9 @@ again:
goto again;
}
dbg_budg("FS is full, -ENOSPC");
- c->nospace = 1;
+ c->bi.nospace = 1;
if (can_use_rp(c) || c->rp_size == 0)
- c->nospace_rp = 1;
+ c->bi.nospace_rp = 1;
smp_wmb();
} else
ubifs_err("cannot budget space, error %d", err);
@@ -523,8 +523,8 @@ again:
* This function releases the space budgeted by 'ubifs_budget_space()'. Note,
* since the index changes (which were budgeted for in @req->idx_growth) will
* only be written to the media on commit, this function moves the index budget
- * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be
- * zeroed by the commit operation.
+ * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed
+ * by the commit operation.
*/
void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
{
@@ -553,23 +553,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
if (!req->data_growth && !req->dd_growth)
return;
- c->nospace = c->nospace_rp = 0;
+ c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
spin_lock(&c->space_lock);
- c->budg_idx_growth -= req->idx_growth;
- c->budg_uncommitted_idx += req->idx_growth;
- c->budg_data_growth -= req->data_growth;
- c->budg_dd_growth -= req->dd_growth;
- c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
-
- ubifs_assert(c->budg_idx_growth >= 0);
- ubifs_assert(c->budg_data_growth >= 0);
- ubifs_assert(c->budg_dd_growth >= 0);
- ubifs_assert(c->min_idx_lebs < c->main_lebs);
- ubifs_assert(!(c->budg_idx_growth & 7));
- ubifs_assert(!(c->budg_data_growth & 7));
- ubifs_assert(!(c->budg_dd_growth & 7));
+ c->bi.idx_growth -= req->idx_growth;
+ c->bi.uncommitted_idx += req->idx_growth;
+ c->bi.data_growth -= req->data_growth;
+ c->bi.dd_growth -= req->dd_growth;
+ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+ ubifs_assert(c->bi.idx_growth >= 0);
+ ubifs_assert(c->bi.data_growth >= 0);
+ ubifs_assert(c->bi.dd_growth >= 0);
+ ubifs_assert(c->bi.min_idx_lebs < c->main_lebs);
+ ubifs_assert(!(c->bi.idx_growth & 7));
+ ubifs_assert(!(c->bi.data_growth & 7));
+ ubifs_assert(!(c->bi.dd_growth & 7));
spin_unlock(&c->space_lock);
}
@@ -586,13 +586,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c)
{
spin_lock(&c->space_lock);
/* Release the index growth reservation */
- c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
/* Release the data growth reservation */
- c->budg_data_growth -= c->page_budget;
+ c->bi.data_growth -= c->bi.page_budget;
/* Increase the dirty data growth reservation instead */
- c->budg_dd_growth += c->page_budget;
+ c->bi.dd_growth += c->bi.page_budget;
/* And re-calculate the indexing space reservation */
- c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
spin_unlock(&c->space_lock);
}
@@ -612,7 +612,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
memset(&req, 0, sizeof(struct ubifs_budget_req));
/* The "no space" flags will be cleared because dd_growth is > 0 */
- req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
+ req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8);
ubifs_release_budget(c, &req);
}
@@ -682,9 +682,9 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
int rsvd_idx_lebs, lebs;
long long available, outstanding, free;
- ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
- outstanding = c->budg_data_growth + c->budg_dd_growth;
- available = ubifs_calc_available(c, c->min_idx_lebs);
+ ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+ outstanding = c->bi.data_growth + c->bi.dd_growth;
+ available = ubifs_calc_available(c, c->bi.min_idx_lebs);
/*
* When reporting free space to user-space, UBIFS guarantees that it is
@@ -697,8 +697,8 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c)
* Note, the calculations below are similar to what we have in
* 'do_budget_space()', so refer there for comments.
*/
- if (c->min_idx_lebs > c->lst.idx_lebs)
- rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+ if (c->bi.min_idx_lebs > c->lst.idx_lebs)
+ rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
else
rsvd_idx_lebs = 0;
lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 1bd01ded7123..87cd0ead8633 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -182,7 +182,7 @@ static int do_commit(struct ubifs_info *c)
c->mst_node->root_len = cpu_to_le32(zroot.len);
c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum);
c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs);
- c->mst_node->index_size = cpu_to_le64(c->old_idx_sz);
+ c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz);
c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum);
c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs);
c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 004d3745dc45..0bb2bcef0de9 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -34,7 +34,6 @@
#include <linux/moduleparam.h>
#include <linux/debugfs.h>
#include <linux/math64.h>
-#include <linux/slab.h>
#ifdef CONFIG_UBIFS_FS_DEBUG
@@ -43,15 +42,12 @@ DEFINE_SPINLOCK(dbg_lock);
static char dbg_key_buf0[128];
static char dbg_key_buf1[128];
-unsigned int ubifs_msg_flags;
unsigned int ubifs_chk_flags;
unsigned int ubifs_tst_flags;
-module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
MODULE_PARM_DESC(debug_chks, "Debug check flags");
MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
@@ -317,6 +313,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
printk(KERN_DEBUG "\tflags %#x\n", sup_flags);
printk(KERN_DEBUG "\t big_lpt %u\n",
!!(sup_flags & UBIFS_FLG_BIGLPT));
+ printk(KERN_DEBUG "\t space_fixup %u\n",
+ !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
printk(KERN_DEBUG "\tmin_io_size %u\n",
le32_to_cpu(sup->min_io_size));
printk(KERN_DEBUG "\tleb_size %u\n",
@@ -602,7 +600,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
spin_unlock(&dbg_lock);
}
-void dbg_dump_budg(struct ubifs_info *c)
+void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
{
int i;
struct rb_node *rb;
@@ -610,26 +608,42 @@ void dbg_dump_budg(struct ubifs_info *c)
struct ubifs_gced_idx_leb *idx_gc;
long long available, outstanding, free;
- ubifs_assert(spin_is_locked(&c->space_lock));
+ spin_lock(&c->space_lock);
spin_lock(&dbg_lock);
- printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
- "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
- c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
- printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
- "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
- c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth,
- c->freeable_cnt);
- printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, "
- "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs,
- c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt);
+ printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, "
+ "total budget sum %lld\n", current->pid,
+ bi->data_growth + bi->dd_growth,
+ bi->data_growth + bi->dd_growth + bi->idx_growth);
+ printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, "
+ "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
+ bi->idx_growth);
+ printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, "
+ "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
+ bi->uncommitted_idx);
+ printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
+ bi->page_budget, bi->inode_budget, bi->dent_budget);
+ printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n",
+ bi->nospace, bi->nospace_rp);
+ printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
+ c->dark_wm, c->dead_wm, c->max_idx_node_sz);
+
+ if (bi != &c->bi)
+ /*
+ * If we are dumping saved budgeting data, do not print
+ * additional information which is about the current state, not
+ * the old one which corresponded to the saved budgeting data.
+ */
+ goto out_unlock;
+
+ printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
+ c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
"clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
atomic_long_read(&c->dirty_zn_cnt),
atomic_long_read(&c->clean_zn_cnt));
- printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
- c->dark_wm, c->dead_wm, c->max_idx_node_sz);
printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
c->gc_lnum, c->ihead_lnum);
+
/* If we are in R/O mode, journal heads do not exist */
if (c->jheads)
for (i = 0; i < c->jhead_cnt; i++)
@@ -648,13 +662,15 @@ void dbg_dump_budg(struct ubifs_info *c)
printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
/* Print budgeting predictions */
- available = ubifs_calc_available(c, c->min_idx_lebs);
- outstanding = c->budg_data_growth + c->budg_dd_growth;
+ available = ubifs_calc_available(c, c->bi.min_idx_lebs);
+ outstanding = c->bi.data_growth + c->bi.dd_growth;
free = ubifs_get_free_space_nolock(c);
printk(KERN_DEBUG "Budgeting predictions:\n");
printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
available, outstanding, free);
+out_unlock:
spin_unlock(&dbg_lock);
+ spin_unlock(&c->space_lock);
}
void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
@@ -729,7 +745,13 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
if (bud->lnum == lp->lnum) {
int head = 0;
for (i = 0; i < c->jhead_cnt; i++) {
- if (lp->lnum == c->jheads[i].wbuf.lnum) {
+ /*
+ * Note, if we are in R/O mode or in the middle
+ * of mounting/re-mounting, the write-buffers do
+ * not exist.
+ */
+ if (c->jheads &&
+ lp->lnum == c->jheads[i].wbuf.lnum) {
printk(KERN_CONT ", jhead %s",
dbg_jhead(i));
head = 1;
@@ -976,6 +998,8 @@ void dbg_save_space_info(struct ubifs_info *c)
spin_lock(&c->space_lock);
memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats));
+ memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info));
+ d->saved_idx_gc_cnt = c->idx_gc_cnt;
/*
* We use a dirty hack here and zero out @c->freeable_cnt, because it
@@ -1042,14 +1066,14 @@ int dbg_check_space_info(struct ubifs_info *c)
out:
ubifs_msg("saved lprops statistics dump");
dbg_dump_lstats(&d->saved_lst);
- ubifs_get_lp_stats(c, &lst);
-
+ ubifs_msg("saved budgeting info dump");
+ dbg_dump_budg(c, &d->saved_bi);
+ ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
ubifs_msg("current lprops statistics dump");
+ ubifs_get_lp_stats(c, &lst);
dbg_dump_lstats(&lst);
-
- spin_lock(&c->space_lock);
- dbg_dump_budg(c);
- spin_unlock(&c->space_lock);
+ ubifs_msg("current budgeting info dump");
+ dbg_dump_budg(c, &c->bi);
dump_stack();
return -EINVAL;
}
@@ -1793,6 +1817,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
struct rb_node **p, *parent = NULL;
struct fsck_inode *fscki;
ino_t inum = key_inum_flash(c, &ino->key);
+ struct inode *inode;
+ struct ubifs_inode *ui;
p = &fsckd->inodes.rb_node;
while (*p) {
@@ -1816,19 +1842,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c,
if (!fscki)
return ERR_PTR(-ENOMEM);
+ inode = ilookup(c->vfs_sb, inum);
+
fscki->inum = inum;
- fscki->nlink = le32_to_cpu(ino->nlink);
- fscki->size = le64_to_cpu(ino->size);
- fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
- fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
- fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
- fscki->mode = le32_to_cpu(ino->mode);
+ /*
+ * If the inode is present in the VFS inode cache, use it instead of
+ * the on-flash inode which might be out-of-date. E.g., the size might
+ * be out-of-date. If we do not do this, the following may happen, for
+ * example:
+ * 1. A power cut happens
+ * 2. We mount the file-system R/O, the replay process fixes up the
+ * inode size in the VFS cache, but not on-flash.
+ * 3. 'check_leaf()' fails because it hits a data node beyond inode
+ * size.
+ */
+ if (!inode) {
+ fscki->nlink = le32_to_cpu(ino->nlink);
+ fscki->size = le64_to_cpu(ino->size);
+ fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
+ fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
+ fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
+ fscki->mode = le32_to_cpu(ino->mode);
+ } else {
+ ui = ubifs_inode(inode);
+ fscki->nlink = inode->i_nlink;
+ fscki->size = inode->i_size;
+ fscki->xattr_cnt = ui->xattr_cnt;
+ fscki->xattr_sz = ui->xattr_size;
+ fscki->xattr_nms = ui->xattr_names;
+ fscki->mode = inode->i_mode;
+ iput(inode);
+ }
+
if (S_ISDIR(fscki->mode)) {
fscki->calc_sz = UBIFS_INO_NODE_SZ;
fscki->calc_cnt = 2;
}
+
rb_link_node(&fscki->rb, parent, p);
rb_insert_color(&fscki->rb, &fsckd->inodes);
+
return fscki;
}
@@ -2421,7 +2474,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head)
hashb = key_block(c, &sb->key);
if (hasha > hashb) {
- ubifs_err("larger hash %u goes before %u", hasha, hashb);
+ ubifs_err("larger hash %u goes before %u",
+ hasha, hashb);
goto error_dump;
}
}
@@ -2437,14 +2491,12 @@ error_dump:
return 0;
}
-static int invocation_cnt;
-
int dbg_force_in_the_gaps(void)
{
- if (!dbg_force_in_the_gaps_enabled)
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
return 0;
- /* Force in-the-gaps every 8th commit */
- return !((invocation_cnt++) & 0x7);
+
+ return !(random32() & 7);
}
/* Failure mode for recovery testing */
@@ -2632,7 +2684,7 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
int len, int check)
{
if (in_failure_mode(desc))
- return -EIO;
+ return -EROFS;
return ubi_leb_read(desc, lnum, buf, offset, len, check);
}
@@ -2642,7 +2694,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
int err, failing;
if (in_failure_mode(desc))
- return -EIO;
+ return -EROFS;
failing = do_fail(desc, lnum, 1);
if (failing)
cut_data(buf, len);
@@ -2650,7 +2702,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
if (err)
return err;
if (failing)
- return -EIO;
+ return -EROFS;
return 0;
}
@@ -2660,12 +2712,12 @@ int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
int err;
if (do_fail(desc, lnum, 1))
- return -EIO;
+ return -EROFS;
err = ubi_leb_change(desc, lnum, buf, len, dtype);
if (err)
return err;
if (do_fail(desc, lnum, 1))
- return -EIO;
+ return -EROFS;
return 0;
}
@@ -2674,12 +2726,12 @@ int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
int err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
err = ubi_leb_erase(desc, lnum);
if (err)
return err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
return 0;
}
@@ -2688,19 +2740,19 @@ int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
int err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
err = ubi_leb_unmap(desc, lnum);
if (err)
return err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
return 0;
}
int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
{
if (in_failure_mode(desc))
- return -EIO;
+ return -EROFS;
return ubi_is_mapped(desc, lnum);
}
@@ -2709,12 +2761,12 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
int err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
err = ubi_leb_map(desc, lnum, dtype);
if (err)
return err;
if (do_fail(desc, lnum, 0))
- return -EIO;
+ return -EROFS;
return 0;
}
@@ -2784,7 +2836,7 @@ void dbg_debugfs_exit(void)
static int open_debugfs_file(struct inode *inode, struct file *file)
{
file->private_data = inode->i_private;
- return 0;
+ return nonseekable_open(inode, file);
}
static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
@@ -2795,18 +2847,15 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
if (file->f_path.dentry == d->dfs_dump_lprops)
dbg_dump_lprops(c);
- else if (file->f_path.dentry == d->dfs_dump_budg) {
- spin_lock(&c->space_lock);
- dbg_dump_budg(c);
- spin_unlock(&c->space_lock);
- } else if (file->f_path.dentry == d->dfs_dump_tnc) {
+ else if (file->f_path.dentry == d->dfs_dump_budg)
+ dbg_dump_budg(c, &c->bi);
+ else if (file->f_path.dentry == d->dfs_dump_tnc) {
mutex_lock(&c->tnc_mutex);
dbg_dump_tnc(c);
mutex_unlock(&c->tnc_mutex);
} else
return -EINVAL;
- *ppos += count;
return count;
}
@@ -2814,7 +2863,7 @@ static const struct file_operations dfs_fops = {
.open = open_debugfs_file,
.write = write_debugfs_file,
.owner = THIS_MODULE,
- .llseek = default_llseek,
+ .llseek = no_llseek,
};
/**
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index e6493cac193d..a811ac4a26bb 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -31,6 +31,8 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
#ifdef CONFIG_UBIFS_FS_DEBUG
+#include <linux/random.h>
+
/**
* ubifs_debug_info - per-FS debugging information.
* @old_zroot: old index root - used by 'dbg_check_old_index()'
@@ -50,13 +52,15 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c,
* @new_ihead_offs: used by debugging to check @c->ihead_offs
*
* @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
- * @saved_free: saved free space (used by 'dbg_save_space_info()')
+ * @saved_bi: saved budgeting information
+ * @saved_free: saved amount of free space
+ * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt
*
- * dfs_dir_name: name of debugfs directory containing this file-system's files
- * dfs_dir: direntry object of the file-system debugfs directory
- * dfs_dump_lprops: "dump lprops" debugfs knob
- * dfs_dump_budg: "dump budgeting information" debugfs knob
- * dfs_dump_tnc: "dump TNC" debugfs knob
+ * @dfs_dir_name: name of debugfs directory containing this file-system's files
+ * @dfs_dir: direntry object of the file-system debugfs directory
+ * @dfs_dump_lprops: "dump lprops" debugfs knob
+ * @dfs_dump_budg: "dump budgeting information" debugfs knob
+ * @dfs_dump_tnc: "dump TNC" debugfs knob
*/
struct ubifs_debug_info {
struct ubifs_zbranch old_zroot;
@@ -76,7 +80,9 @@ struct ubifs_debug_info {
int new_ihead_offs;
struct ubifs_lp_stats saved_lst;
+ struct ubifs_budg_info saved_bi;
long long saved_free;
+ int saved_idx_gc_cnt;
char dfs_dir_name[100];
struct dentry *dfs_dir;
@@ -101,23 +107,7 @@ struct ubifs_debug_info {
} \
} while (0)
-#define dbg_dump_stack() do { \
- if (!dbg_failure_mode) \
- dump_stack(); \
-} while (0)
-
-/* Generic debugging messages */
-#define dbg_msg(fmt, ...) do { \
- spin_lock(&dbg_lock); \
- printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
- __func__, ##__VA_ARGS__); \
- spin_unlock(&dbg_lock); \
-} while (0)
-
-#define dbg_do_msg(typ, fmt, ...) do { \
- if (ubifs_msg_flags & typ) \
- dbg_msg(fmt, ##__VA_ARGS__); \
-} while (0)
+#define dbg_dump_stack() dump_stack()
#define dbg_err(fmt, ...) do { \
spin_lock(&dbg_lock); \
@@ -137,77 +127,40 @@ const char *dbg_key_str1(const struct ubifs_info *c,
#define DBGKEY(key) dbg_key_str0(c, (key))
#define DBGKEY1(key) dbg_key_str1(c, (key))
-/* General messages */
-#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
+#define ubifs_dbg_msg(type, fmt, ...) do { \
+ spin_lock(&dbg_lock); \
+ pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
+ spin_unlock(&dbg_lock); \
+} while (0)
+/* Just debugging messages not related to any specific UBIFS subsystem */
+#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__)
+/* General messages */
+#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
/* Additional journal messages */
-#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
-
+#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
/* Additional TNC messages */
-#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
-
+#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
/* Additional lprops messages */
-#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
-
+#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
/* Additional LEB find messages */
-#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
-
+#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
/* Additional mount messages */
-#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
-
+#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
/* Additional I/O messages */
-#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
-
+#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
/* Additional commit messages */
-#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
-
+#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__)
/* Additional budgeting messages */
-#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
-
+#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__)
/* Additional log messages */
-#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
-
+#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__)
/* Additional gc messages */
-#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
-
+#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__)
/* Additional scan messages */
-#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
-
+#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__)
/* Additional recovery messages */
-#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
-
-/*
- * Debugging message type flags.
- *
- * UBIFS_MSG_GEN: general messages
- * UBIFS_MSG_JNL: journal messages
- * UBIFS_MSG_MNT: mount messages
- * UBIFS_MSG_CMT: commit messages
- * UBIFS_MSG_FIND: LEB find messages
- * UBIFS_MSG_BUDG: budgeting messages
- * UBIFS_MSG_GC: garbage collection messages
- * UBIFS_MSG_TNC: TNC messages
- * UBIFS_MSG_LP: lprops messages
- * UBIFS_MSG_IO: I/O messages
- * UBIFS_MSG_LOG: log messages
- * UBIFS_MSG_SCAN: scan messages
- * UBIFS_MSG_RCVRY: recovery messages
- */
-enum {
- UBIFS_MSG_GEN = 0x1,
- UBIFS_MSG_JNL = 0x2,
- UBIFS_MSG_MNT = 0x4,
- UBIFS_MSG_CMT = 0x8,
- UBIFS_MSG_FIND = 0x10,
- UBIFS_MSG_BUDG = 0x20,
- UBIFS_MSG_GC = 0x40,
- UBIFS_MSG_TNC = 0x80,
- UBIFS_MSG_LP = 0x100,
- UBIFS_MSG_IO = 0x200,
- UBIFS_MSG_LOG = 0x400,
- UBIFS_MSG_SCAN = 0x800,
- UBIFS_MSG_RCVRY = 0x1000,
-};
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
/*
* Debugging check flags.
@@ -233,11 +186,9 @@ enum {
/*
* Special testing flags.
*
- * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
* UBIFS_TST_RCVRY: failure mode for recovery testing
*/
enum {
- UBIFS_TST_FORCE_IN_THE_GAPS = 0x2,
UBIFS_TST_RCVRY = 0x4,
};
@@ -262,7 +213,7 @@ void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
int offs);
void dbg_dump_budget_req(const struct ubifs_budget_req *req);
void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
-void dbg_dump_budg(struct ubifs_info *c);
+void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
void dbg_dump_lprops(struct ubifs_info *c);
void dbg_dump_lpt_info(struct ubifs_info *c);
@@ -304,18 +255,16 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head);
int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
/* Force the use of in-the-gaps method for testing */
-
-#define dbg_force_in_the_gaps_enabled \
- (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS)
-
+static inline int dbg_force_in_the_gaps_enabled(void)
+{
+ return ubifs_chk_flags & UBIFS_CHK_GEN;
+}
int dbg_force_in_the_gaps(void);
/* Failure mode for recovery testing */
-
#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
#ifndef UBIFS_DBG_PRESERVE_UBI
-
#define ubi_leb_read dbg_leb_read
#define ubi_leb_write dbg_leb_write
#define ubi_leb_change dbg_leb_change
@@ -323,7 +272,6 @@ int dbg_force_in_the_gaps(void);
#define ubi_leb_unmap dbg_leb_unmap
#define ubi_is_mapped dbg_is_mapped
#define ubi_leb_map dbg_leb_map
-
#endif
int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
@@ -370,33 +318,33 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
__func__, __LINE__, current->pid); \
} while (0)
-#define dbg_err(fmt, ...) do { \
- if (0) \
- ubifs_err(fmt, ##__VA_ARGS__); \
+#define dbg_err(fmt, ...) do { \
+ if (0) \
+ ubifs_err(fmt, ##__VA_ARGS__); \
} while (0)
-#define dbg_msg(fmt, ...) do { \
- if (0) \
- printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \
- current->pid, __func__, ##__VA_ARGS__); \
+#define ubifs_dbg_msg(fmt, ...) do { \
+ if (0) \
+ pr_debug(fmt "\n", ##__VA_ARGS__); \
} while (0)
#define dbg_dump_stack()
#define ubifs_assert_cmt_locked(c)
-#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
#define DBGKEY(key) ((char *)(key))
#define DBGKEY1(key) ((char *)(key))
@@ -420,7 +368,9 @@ static inline void
dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; }
static inline void
dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; }
-static inline void dbg_dump_budg(struct ubifs_info *c) { return; }
+static inline void
+dbg_dump_budg(struct ubifs_info *c,
+ const struct ubifs_budg_info *bi) { return; }
static inline void dbg_dump_lprop(const struct ubifs_info *c,
const struct ubifs_lprops *lp) { return; }
static inline void dbg_dump_lprops(struct ubifs_info *c) { return; }
@@ -482,8 +432,8 @@ dbg_check_nondata_nodes_order(struct ubifs_info *c,
struct list_head *head) { return 0; }
static inline int dbg_force_in_the_gaps(void) { return 0; }
-#define dbg_force_in_the_gaps_enabled 0
-#define dbg_failure_mode 0
+#define dbg_force_in_the_gaps_enabled() 0
+#define dbg_failure_mode 0
static inline int dbg_debugfs_init(void) { return 0; }
static inline void dbg_debugfs_exit(void) { return; }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 7217d67a80a6..c2b80943560d 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -603,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
ubifs_release_budget(c, &req);
else {
/* We've deleted something - clean the "no space" flags */
- c->nospace = c->nospace_rp = 0;
+ c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
return 0;
@@ -656,6 +656,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+ dentry_unhash(dentry);
+
/*
* Budget request settings: deletion direntry, deletion inode and
* changing the parent inode. If budgeting fails, go ahead anyway
@@ -693,7 +695,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
ubifs_release_budget(c, &req);
else {
/* We've deleted something - clean the "no space" flags */
- c->nospace = c->nospace_rp = 0;
+ c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
return 0;
@@ -976,6 +978,9 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct timespec time;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
/*
* Budget request settings: deletion direntry, new direntry, removing
* the old inode, and changing old and new parent directory inodes.
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b286db79c686..5e7fccfc4b29 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -212,7 +212,7 @@ static void release_new_page_budget(struct ubifs_info *c)
*/
static void release_existing_page_budget(struct ubifs_info *c)
{
- struct ubifs_budget_req req = { .dd_growth = c->page_budget};
+ struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget};
ubifs_release_budget(c, &req);
}
@@ -971,11 +971,11 @@ static int do_writepage(struct page *page, int len)
* the page locked, and it locks @ui_mutex. However, write-back does take inode
* @i_mutex, which means other VFS operations may be run on this inode at the
* same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'truncate_setsize()', which first changes @inode->i_size, then
- * drops the truncated pages. And while dropping the pages, it takes the page
- * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with
- * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
- * means that @inode->i_size is changed while @ui_mutex is unlocked.
+ * we have to call 'truncate_setsize()', which first changes @inode->i_size,
+ * then drops the truncated pages. And while dropping the pages, it takes the
+ * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()'
+ * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'.
+ * This means that @inode->i_size is changed while @ui_mutex is unlocked.
*
* XXX(truncate): with the new truncate sequence this is not true anymore,
* and the calls to truncate_setsize can be move around freely. They should
@@ -1189,7 +1189,7 @@ out_budg:
if (budgeted)
ubifs_release_budget(c, &req);
else {
- c->nospace = c->nospace_rp = 0;
+ c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
return err;
@@ -1312,7 +1312,11 @@ int ubifs_fsync(struct file *file, int datasync)
dbg_gen("syncing inode %lu", inode->i_ino);
- if (inode->i_sb->s_flags & MS_RDONLY)
+ if (c->ro_mount)
+ /*
+ * For some really strange reasons VFS does not filter out
+ * 'fsync()' for R/O mounted file-systems as per 2.6.39.
+ */
return 0;
/*
@@ -1432,10 +1436,11 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
}
/*
- * mmap()d file has taken write protection fault and is being made
- * writable. UBIFS must ensure page is budgeted for.
+ * mmap()d file has taken write protection fault and is being made writable.
+ * UBIFS must ensure page is budgeted for.
*/
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -1536,7 +1541,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
int err;
- /* 'generic_file_mmap()' takes care of NOMMU case */
err = generic_file_mmap(file, vma);
if (err)
return err;
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 1d54383d1269..2559d174e004 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -252,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
* But if the index takes fewer LEBs than it is reserved for it,
* this function must avoid picking those reserved LEBs.
*/
- if (c->min_idx_lebs >= c->lst.idx_lebs) {
- rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+ if (c->bi.min_idx_lebs >= c->lst.idx_lebs) {
+ rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
exclude_index = 1;
}
spin_unlock(&c->space_lock);
@@ -276,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
pick_free = 0;
} else {
spin_lock(&c->space_lock);
- exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs);
+ exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs);
spin_unlock(&c->space_lock);
}
@@ -501,8 +501,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
/* Check if there are enough empty LEBs for commit */
spin_lock(&c->space_lock);
- if (c->min_idx_lebs > c->lst.idx_lebs)
- rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+ if (c->bi.min_idx_lebs > c->lst.idx_lebs)
+ rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs;
else
rsvd_idx_lebs = 0;
lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 151f10882820..ded29f6224c2 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -100,6 +100,10 @@ static int switch_gc_head(struct ubifs_info *c)
if (err)
return err;
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ if (err)
+ return err;
+
err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
if (err)
return err;
@@ -118,7 +122,7 @@ static int switch_gc_head(struct ubifs_info *c)
* This function compares data nodes @a and @b. Returns %1 if @a has greater
* inode or block number, and %-1 otherwise.
*/
-int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
{
ino_t inuma, inumb;
struct ubifs_info *c = priv;
@@ -161,7 +165,8 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
* first and sorted by length in descending order. Directory entry nodes go
* after inode nodes and are sorted in ascending hash valuer order.
*/
-int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int nondata_nodes_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
{
ino_t inuma, inumb;
struct ubifs_info *c = priv;
@@ -473,6 +478,37 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
ubifs_assert(c->gc_lnum != lnum);
ubifs_assert(wbuf->lnum != lnum);
+ if (lp->free + lp->dirty == c->leb_size) {
+ /* Special case - a free LEB */
+ dbg_gc("LEB %d is free, return it", lp->lnum);
+ ubifs_assert(!(lp->flags & LPROPS_INDEX));
+
+ if (lp->free != c->leb_size) {
+ /*
+ * Write buffers must be sync'd before unmapping
+ * freeable LEBs, because one of them may contain data
+ * which obsoletes something in 'lp->lnum'.
+ */
+ err = gc_sync_wbufs(c);
+ if (err)
+ return err;
+ err = ubifs_change_one_lp(c, lp->lnum, c->leb_size,
+ 0, 0, 0, 0);
+ if (err)
+ return err;
+ }
+ err = ubifs_leb_unmap(c, lp->lnum);
+ if (err)
+ return err;
+
+ if (c->gc_lnum == -1) {
+ c->gc_lnum = lnum;
+ return LEB_RETAINED;
+ }
+
+ return LEB_FREED;
+ }
+
/*
* We scan the entire LEB even though we only really need to scan up to
* (c->leb_size - lp->free).
@@ -682,37 +718,6 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
"(min. space %d)", lp.lnum, lp.free, lp.dirty,
lp.free + lp.dirty, min_space);
- if (lp.free + lp.dirty == c->leb_size) {
- /* An empty LEB was returned */
- dbg_gc("LEB %d is free, return it", lp.lnum);
- /*
- * ubifs_find_dirty_leb() doesn't return freeable index
- * LEBs.
- */
- ubifs_assert(!(lp.flags & LPROPS_INDEX));
- if (lp.free != c->leb_size) {
- /*
- * Write buffers must be sync'd before
- * unmapping freeable LEBs, because one of them
- * may contain data which obsoletes something
- * in 'lp.pnum'.
- */
- ret = gc_sync_wbufs(c);
- if (ret)
- goto out;
- ret = ubifs_change_one_lp(c, lp.lnum,
- c->leb_size, 0, 0, 0,
- 0);
- if (ret)
- goto out;
- }
- ret = ubifs_leb_unmap(c, lp.lnum);
- if (ret)
- goto out;
- ret = lp.lnum;
- break;
- }
-
space_before = c->leb_size - wbuf->offs - wbuf->used;
if (wbuf->lnum == -1)
space_before = 0;
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index dfd168b7807e..166951e0dcd3 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -393,7 +393,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
ubifs_assert(wbuf->size % c->min_io_size == 0);
ubifs_assert(!c->ro_media && !c->ro_mount);
if (c->leb_size - wbuf->offs >= c->max_write_size)
- ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
+ ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
if (c->ro_error)
return -EROFS;
@@ -452,8 +452,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
* @dtype: data type
*
* This function targets the write-buffer to logical eraseblock @lnum:@offs.
- * The write-buffer is synchronized if it is not empty. Returns zero in case of
- * success and a negative error code in case of failure.
+ * The write-buffer has to be empty. Returns zero in case of success and a
+ * negative error code in case of failure.
*/
int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
int dtype)
@@ -465,13 +465,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
ubifs_assert(offs >= 0 && offs <= c->leb_size);
ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
ubifs_assert(lnum != wbuf->lnum);
-
- if (wbuf->used > 0) {
- int err = ubifs_wbuf_sync_nolock(wbuf);
-
- if (err)
- return err;
- }
+ ubifs_assert(wbuf->used == 0);
spin_lock(&wbuf->lock);
wbuf->lnum = lnum;
@@ -573,7 +567,7 @@ out_timers:
int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
{
struct ubifs_info *c = wbuf->c;
- int err, written, n, aligned_len = ALIGN(len, 8), offs;
+ int err, written, n, aligned_len = ALIGN(len, 8);
dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len,
dbg_ntype(((struct ubifs_ch *)buf)->node_type),
@@ -588,7 +582,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
ubifs_assert(!c->ro_media && !c->ro_mount);
if (c->leb_size - wbuf->offs >= c->max_write_size)
- ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
+ ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
err = -ENOSPC;
@@ -636,7 +630,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
goto exit;
}
- offs = wbuf->offs;
written = 0;
if (wbuf->used) {
@@ -653,7 +646,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
if (err)
goto out;
- offs += wbuf->size;
+ wbuf->offs += wbuf->size;
len -= wbuf->avail;
aligned_len -= wbuf->avail;
written += wbuf->avail;
@@ -672,7 +665,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
if (err)
goto out;
- offs += wbuf->size;
+ wbuf->offs += wbuf->size;
len -= wbuf->size;
aligned_len -= wbuf->size;
written += wbuf->size;
@@ -687,12 +680,13 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
n = aligned_len >> c->max_write_shift;
if (n) {
n <<= c->max_write_shift;
- dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
- err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
- wbuf->dtype);
+ dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
+ wbuf->offs);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written,
+ wbuf->offs, n, wbuf->dtype);
if (err)
goto out;
- offs += n;
+ wbuf->offs += n;
aligned_len -= n;
len -= n;
written += n;
@@ -707,7 +701,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
*/
memcpy(wbuf->buf, buf + written, len);
- wbuf->offs = offs;
if (c->leb_size - wbuf->offs >= c->max_write_size)
wbuf->size = c->max_write_size;
else
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index aed25e864227..34b1679e6e3a 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -141,14 +141,8 @@ again:
* LEB with some empty space.
*/
lnum = ubifs_find_free_space(c, len, &offs, squeeze);
- if (lnum >= 0) {
- /* Found an LEB, add it to the journal head */
- err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
- if (err)
- goto out_return;
- /* A new bud was successfully allocated and added to the log */
+ if (lnum >= 0)
goto out;
- }
err = lnum;
if (err != -ENOSPC)
@@ -203,12 +197,23 @@ again:
return 0;
}
- err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
- if (err)
- goto out_return;
offs = 0;
out:
+ /*
+ * Make sure we synchronize the write-buffer before we add the new bud
+ * to the log. Otherwise we may have a power cut after the log
+ * reference node for the last bud (@lnum) is written but before the
+ * write-buffer data are written to the next-to-last bud
+ * (@wbuf->lnum). And the effect would be that the recovery would see
+ * that there is corruption in the next-to-last bud.
+ */
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ if (err)
+ goto out_return;
+ err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
+ if (err)
+ goto out_return;
err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
if (err)
goto out_unlock;
@@ -380,10 +385,8 @@ out:
if (err == -ENOSPC) {
/* This are some budgeting problems, print useful information */
down_write(&c->commit_sem);
- spin_lock(&c->space_lock);
dbg_dump_stack();
- dbg_dump_budg(c);
- spin_unlock(&c->space_lock);
+ dbg_dump_budg(c, &c->bi);
dbg_dump_lprops(c);
cmt_retries = dbg_check_lprops(c);
up_write(&c->commit_sem);
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 40fa780ebea7..affea9494ae2 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -100,20 +100,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
}
/**
- * next_log_lnum - switch to the next log LEB.
- * @c: UBIFS file-system description object
- * @lnum: current log LEB
- */
-static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
-{
- lnum += 1;
- if (lnum > c->log_last)
- lnum = UBIFS_LOG_LNUM;
-
- return lnum;
-}
-
-/**
* empty_log_bytes - calculate amount of empty space in the log.
* @c: UBIFS file-system description object
*/
@@ -257,7 +243,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
ref->jhead = cpu_to_le32(jhead);
if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
- c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
c->lhead_offs = 0;
}
@@ -425,7 +411,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
/* Switch to the next log LEB */
if (c->lhead_offs) {
- c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
c->lhead_offs = 0;
}
@@ -446,7 +432,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
c->lhead_offs += len;
if (c->lhead_offs == c->leb_size) {
- c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
c->lhead_offs = 0;
}
@@ -533,7 +519,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
}
mutex_lock(&c->log_mutex);
for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
- lnum = next_log_lnum(c, lnum)) {
+ lnum = ubifs_next_log_lnum(c, lnum)) {
dbg_log("unmap log LEB %d", lnum);
err = ubifs_leb_unmap(c, lnum);
if (err)
@@ -642,7 +628,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
if (err)
return err;
- *lnum = next_log_lnum(c, *lnum);
+ *lnum = ubifs_next_log_lnum(c, *lnum);
*offs = 0;
}
memcpy(buf + *offs, node, len);
@@ -712,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
ubifs_scan_destroy(sleb);
if (lnum == c->lhead_lnum)
break;
- lnum = next_log_lnum(c, lnum);
+ lnum = ubifs_next_log_lnum(c, lnum);
}
if (offs) {
int sz = ALIGN(offs, c->min_io_size);
@@ -732,7 +718,7 @@ int ubifs_consolidate_log(struct ubifs_info *c)
/* Unmap remaining LEBs */
lnum = write_lnum;
do {
- lnum = next_log_lnum(c, lnum);
+ lnum = ubifs_next_log_lnum(c, lnum);
err = ubifs_leb_unmap(c, lnum);
if (err)
return err;
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 0ee0847f2421..667884f4a615 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1007,21 +1007,11 @@ out:
}
/**
- * struct scan_check_data - data provided to scan callback function.
- * @lst: LEB properties statistics
- * @err: error code
- */
-struct scan_check_data {
- struct ubifs_lp_stats lst;
- int err;
-};
-
-/**
* scan_check_cb - scan callback.
* @c: the UBIFS file-system description object
* @lp: LEB properties to scan
* @in_tree: whether the LEB properties are in main memory
- * @data: information passed to and from the caller of the scan
+ * @lst: lprops statistics to update
*
* This function returns a code that indicates whether the scan should continue
* (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
@@ -1030,11 +1020,10 @@ struct scan_check_data {
*/
static int scan_check_cb(struct ubifs_info *c,
const struct ubifs_lprops *lp, int in_tree,
- struct scan_check_data *data)
+ struct ubifs_lp_stats *lst)
{
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
- struct ubifs_lp_stats *lst = &data->lst;
int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
void *buf = NULL;
@@ -1044,7 +1033,7 @@ static int scan_check_cb(struct ubifs_info *c,
if (cat != (lp->flags & LPROPS_CAT_MASK)) {
ubifs_err("bad LEB category %d expected %d",
(lp->flags & LPROPS_CAT_MASK), cat);
- goto out;
+ return -EINVAL;
}
}
@@ -1078,7 +1067,7 @@ static int scan_check_cb(struct ubifs_info *c,
}
if (!found) {
ubifs_err("bad LPT list (category %d)", cat);
- goto out;
+ return -EINVAL;
}
}
}
@@ -1090,45 +1079,40 @@ static int scan_check_cb(struct ubifs_info *c,
if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
lp != heap->arr[lp->hpos]) {
ubifs_err("bad LPT heap (category %d)", cat);
- goto out;
+ return -EINVAL;
}
}
buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
- if (!buf) {
- ubifs_err("cannot allocate memory to scan LEB %d", lnum);
- goto out;
+ if (!buf)
+ return -ENOMEM;
+
+ /*
+ * After an unclean unmount, empty and freeable LEBs
+ * may contain garbage - do not scan them.
+ */
+ if (lp->free == c->leb_size) {
+ lst->empty_lebs += 1;
+ lst->total_free += c->leb_size;
+ lst->total_dark += ubifs_calc_dark(c, c->leb_size);
+ return LPT_SCAN_CONTINUE;
+ }
+ if (lp->free + lp->dirty == c->leb_size &&
+ !(lp->flags & LPROPS_INDEX)) {
+ lst->total_free += lp->free;
+ lst->total_dirty += lp->dirty;
+ lst->total_dark += ubifs_calc_dark(c, c->leb_size);
+ return LPT_SCAN_CONTINUE;
}
sleb = ubifs_scan(c, lnum, 0, buf, 0);
if (IS_ERR(sleb)) {
- /*
- * After an unclean unmount, empty and freeable LEBs
- * may contain garbage.
- */
- if (lp->free == c->leb_size) {
- ubifs_err("scan errors were in empty LEB "
- "- continuing checking");
- lst->empty_lebs += 1;
- lst->total_free += c->leb_size;
- lst->total_dark += ubifs_calc_dark(c, c->leb_size);
- ret = LPT_SCAN_CONTINUE;
- goto exit;
- }
-
- if (lp->free + lp->dirty == c->leb_size &&
- !(lp->flags & LPROPS_INDEX)) {
- ubifs_err("scan errors were in freeable LEB "
- "- continuing checking");
- lst->total_free += lp->free;
- lst->total_dirty += lp->dirty;
- lst->total_dark += ubifs_calc_dark(c, c->leb_size);
- ret = LPT_SCAN_CONTINUE;
- goto exit;
+ ret = PTR_ERR(sleb);
+ if (ret == -EUCLEAN) {
+ dbg_dump_lprops(c);
+ dbg_dump_budg(c, &c->bi);
}
- data->err = PTR_ERR(sleb);
- ret = LPT_SCAN_STOP;
- goto exit;
+ goto out;
}
is_idx = -1;
@@ -1246,10 +1230,8 @@ static int scan_check_cb(struct ubifs_info *c,
}
ubifs_scan_destroy(sleb);
- ret = LPT_SCAN_CONTINUE;
-exit:
vfree(buf);
- return ret;
+ return LPT_SCAN_CONTINUE;
out_print:
ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1258,10 +1240,10 @@ out_print:
dbg_dump_leb(c, lnum);
out_destroy:
ubifs_scan_destroy(sleb);
+ ret = -EINVAL;
out:
vfree(buf);
- data->err = -EINVAL;
- return LPT_SCAN_STOP;
+ return ret;
}
/**
@@ -1278,8 +1260,7 @@ out:
int dbg_check_lprops(struct ubifs_info *c)
{
int i, err;
- struct scan_check_data data;
- struct ubifs_lp_stats *lst = &data.lst;
+ struct ubifs_lp_stats lst;
if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
return 0;
@@ -1294,29 +1275,23 @@ int dbg_check_lprops(struct ubifs_info *c)
return err;
}
- memset(lst, 0, sizeof(struct ubifs_lp_stats));
-
- data.err = 0;
+ memset(&lst, 0, sizeof(struct ubifs_lp_stats));
err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
(ubifs_lpt_scan_callback)scan_check_cb,
- &data);
+ &lst);
if (err && err != -ENOSPC)
goto out;
- if (data.err) {
- err = data.err;
- goto out;
- }
- if (lst->empty_lebs != c->lst.empty_lebs ||
- lst->idx_lebs != c->lst.idx_lebs ||
- lst->total_free != c->lst.total_free ||
- lst->total_dirty != c->lst.total_dirty ||
- lst->total_used != c->lst.total_used) {
+ if (lst.empty_lebs != c->lst.empty_lebs ||
+ lst.idx_lebs != c->lst.idx_lebs ||
+ lst.total_free != c->lst.total_free ||
+ lst.total_dirty != c->lst.total_dirty ||
+ lst.total_used != c->lst.total_used) {
ubifs_err("bad overall accounting");
ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
"total_free %lld, total_dirty %lld, total_used %lld",
- lst->empty_lebs, lst->idx_lebs, lst->total_free,
- lst->total_dirty, lst->total_used);
+ lst.empty_lebs, lst.idx_lebs, lst.total_free,
+ lst.total_dirty, lst.total_used);
ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
"total_free %lld, total_dirty %lld, total_used %lld",
c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
@@ -1325,11 +1300,11 @@ int dbg_check_lprops(struct ubifs_info *c)
goto out;
}
- if (lst->total_dead != c->lst.total_dead ||
- lst->total_dark != c->lst.total_dark) {
+ if (lst.total_dead != c->lst.total_dead ||
+ lst.total_dark != c->lst.total_dark) {
ubifs_err("bad dead/dark space accounting");
ubifs_err("calculated: total_dead %lld, total_dark %lld",
- lst->total_dead, lst->total_dark);
+ lst.total_dead, lst.total_dark);
ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
c->lst.total_dead, c->lst.total_dark);
err = -EINVAL;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 0c9c69bd983a..dfcb5748a7dc 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -29,6 +29,12 @@
#include <linux/slab.h>
#include "ubifs.h"
+#ifdef CONFIG_UBIFS_FS_DEBUG
+static int dbg_populate_lsave(struct ubifs_info *c);
+#else
+#define dbg_populate_lsave(c) 0
+#endif
+
/**
* first_dirty_cnode - find first dirty cnode.
* @c: UBIFS file-system description object
@@ -586,7 +592,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
if (nnode->nbranch[iip].lnum)
break;
}
- } while (iip >= UBIFS_LPT_FANOUT);
+ } while (iip >= UBIFS_LPT_FANOUT);
/* Go right */
nnode = ubifs_get_nnode(c, nnode, iip);
@@ -815,6 +821,10 @@ static void populate_lsave(struct ubifs_info *c)
c->lpt_drty_flgs |= LSAVE_DIRTY;
ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
}
+
+ if (dbg_populate_lsave(c))
+ return;
+
list_for_each_entry(lprops, &c->empty_list, list) {
c->lsave[cnt++] = lprops->lnum;
if (cnt >= c->lsave_cnt)
@@ -1994,4 +2004,47 @@ void dbg_dump_lpt_lebs(const struct ubifs_info *c)
current->pid);
}
+/**
+ * dbg_populate_lsave - debugging version of 'populate_lsave()'
+ * @c: UBIFS file-system description object
+ *
+ * This is a debugging version for 'populate_lsave()' which populates lsave
+ * with random LEBs instead of useful LEBs, which is good for test coverage.
+ * Returns zero if lsave has not been populated (this debugging feature is
+ * disabled) and non-zero if lsave has been populated.
+ */
+static int dbg_populate_lsave(struct ubifs_info *c)
+{
+ struct ubifs_lprops *lprops;
+ struct ubifs_lpt_heap *heap;
+ int i;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+ if (random32() & 3)
+ return 0;
+
+ for (i = 0; i < c->lsave_cnt; i++)
+ c->lsave[i] = c->main_first;
+
+ list_for_each_entry(lprops, &c->empty_list, list)
+ c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+ list_for_each_entry(lprops, &c->freeable_list, list)
+ c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+ list_for_each_entry(lprops, &c->frdi_idx_list, list)
+ c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+
+ heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+ for (i = 0; i < heap->cnt; i++)
+ c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+ for (i = 0; i < heap->cnt; i++)
+ c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+ heap = &c->lpt_heap[LPROPS_FREE - 1];
+ for (i = 0; i < heap->cnt; i++)
+ c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+
+ return 1;
+}
+
#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 21f47afdacff..278c2382e8c2 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -148,7 +148,7 @@ static int validate_master(const struct ubifs_info *c)
}
main_sz = (long long)c->main_lebs * c->leb_size;
- if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) {
+ if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) {
err = 9;
goto out;
}
@@ -218,7 +218,7 @@ static int validate_master(const struct ubifs_info *c)
}
if (c->lst.total_dead + c->lst.total_dark +
- c->lst.total_used + c->old_idx_sz > main_sz) {
+ c->lst.total_used + c->bi.old_idx_sz > main_sz) {
err = 21;
goto out;
}
@@ -286,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c)
c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum);
c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum);
c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs);
- c->old_idx_sz = le64_to_cpu(c->mst_node->index_size);
+ c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size);
c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum);
c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs);
c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum);
@@ -305,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c)
c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);
c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);
- c->calc_idx_sz = c->old_idx_sz;
+ c->calc_idx_sz = c->bi.old_idx_sz;
if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
c->no_orphs = 1;
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index c3de04dc952a..0b5296a9a4c5 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -340,4 +340,21 @@ static inline void ubifs_release_lprops(struct ubifs_info *c)
mutex_unlock(&c->lp_mutex);
}
+/**
+ * ubifs_next_log_lnum - switch to the next log LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: current log LEB
+ *
+ * This helper function returns the log LEB number which goes next after LEB
+ * 'lnum'.
+ */
+static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum)
+{
+ lnum += 1;
+ if (lnum > c->log_last)
+ lnum = UBIFS_LOG_LNUM;
+
+ return lnum;
+}
+
#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 09df318e368f..bd644bf587a8 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -673,7 +673,8 @@ static int kill_orphans(struct ubifs_info *c)
sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1);
if (IS_ERR(sleb)) {
if (PTR_ERR(sleb) == -EUCLEAN)
- sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
+ sleb = ubifs_recover_leb(c, lnum, 0,
+ c->sbuf, 0);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 3dbad6fbd1eb..731d9e2e7b50 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
}
/**
- * drop_incomplete_group - drop nodes from an incomplete group.
+ * drop_last_node - drop the last node or group of nodes.
* @sleb: scanned LEB information
* @offs: offset of dropped nodes is returned here
+ * @grouped: non-zero if the whole group of nodes has to be dropped
*
- * This function returns %1 if nodes are dropped and %0 otherwise.
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB or the last group of nodes if @grouped is not zero.
+ * This function returns %1 if a node was dropped and %0 otherwise.
*/
-static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
+static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
{
int dropped = 0;
@@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
kfree(snod);
sleb->nodes_cnt -= 1;
dropped = 1;
+ if (!grouped)
+ break;
}
return dropped;
}
@@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
int offs, void *sbuf, int grouped)
{
- int err, len = c->leb_size - offs, need_clean = 0, quiet = 1;
- int empty_chkd = 0, start = offs;
+ int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
struct ubifs_scan_leb *sleb;
void *buf = sbuf + offs;
@@ -620,12 +624,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
if (IS_ERR(sleb))
return sleb;
- if (sleb->ecc)
- need_clean = 1;
-
+ ubifs_assert(len >= 8);
while (len >= 8) {
- int ret;
-
dbg_scan("look at LEB %d:%d (%d bytes left)",
lnum, offs, len);
@@ -635,8 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
* Scan quietly until there is an error from which we cannot
* recover
*/
- ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
-
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
if (ret == SCANNED_A_NODE) {
/* A valid node, and not a padding node */
struct ubifs_ch *ch = buf;
@@ -649,70 +648,32 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
offs += node_len;
buf += node_len;
len -= node_len;
- continue;
- }
-
- if (ret > 0) {
+ } else if (ret > 0) {
/* Padding bytes or a valid padding node */
offs += ret;
buf += ret;
len -= ret;
- continue;
- }
-
- if (ret == SCANNED_EMPTY_SPACE) {
- if (!is_empty(buf, len)) {
- if (!is_last_write(c, buf, offs))
- break;
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- }
- empty_chkd = 1;
+ } else if (ret == SCANNED_EMPTY_SPACE ||
+ ret == SCANNED_GARBAGE ||
+ ret == SCANNED_A_BAD_PAD_NODE ||
+ ret == SCANNED_A_CORRUPT_NODE) {
+ dbg_rcvry("found corruption - %d", ret);
break;
- }
-
- if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
- if (is_last_write(c, buf, offs)) {
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- empty_chkd = 1;
- break;
- }
-
- if (ret == SCANNED_A_CORRUPT_NODE)
- if (no_more_nodes(c, buf, len, lnum, offs)) {
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- empty_chkd = 1;
- break;
- }
-
- if (quiet) {
- /* Redo the last scan but noisily */
- quiet = 0;
- continue;
- }
-
- switch (ret) {
- case SCANNED_GARBAGE:
- dbg_err("garbage");
- goto corrupted;
- case SCANNED_A_CORRUPT_NODE:
- case SCANNED_A_BAD_PAD_NODE:
- dbg_err("bad node");
- goto corrupted;
- default:
- dbg_err("unknown");
+ } else {
+ dbg_err("unexpected return value %d", ret);
err = -EINVAL;
goto error;
}
}
- if (!empty_chkd && !is_empty(buf, len)) {
- if (is_last_write(c, buf, offs)) {
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- } else {
+ if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) {
+ if (!is_last_write(c, buf, offs))
+ goto corrupted_rescan;
+ } else if (ret == SCANNED_A_CORRUPT_NODE) {
+ if (!no_more_nodes(c, buf, len, lnum, offs))
+ goto corrupted_rescan;
+ } else if (!is_empty(buf, len)) {
+ if (!is_last_write(c, buf, offs)) {
int corruption = first_non_ff(buf, len);
/*
@@ -728,29 +689,82 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
}
}
- /* Drop nodes from incomplete group */
- if (grouped && drop_incomplete_group(sleb, &offs)) {
- buf = sbuf + offs;
- len = c->leb_size - offs;
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- }
+ min_io_unit = round_down(offs, c->min_io_size);
+ if (grouped)
+ /*
+ * If nodes are grouped, always drop the incomplete group at
+ * the end.
+ */
+ drop_last_node(sleb, &offs, 1);
- if (offs % c->min_io_size) {
- clean_buf(c, &buf, lnum, &offs, &len);
- need_clean = 1;
- }
+ /*
+ * While we are in the middle of the same min. I/O unit keep dropping
+ * nodes. So basically, what we want is to make sure that the last min.
+ * I/O unit where we saw the corruption is dropped completely with all
+ * the uncorrupted nodes which may possibly sit there.
+ *
+ * In other words, let's name the min. I/O unit where the corruption
+ * starts B, and the previous min. I/O unit A. The below code tries to
+ * deal with a situation when half of B contains valid nodes or the end
+ * of a valid node, and the second half of B contains corrupted data or
+ * garbage. This means that UBIFS had been writing to B just before the
+ * power cut happened. I do not know how realistic is this scenario
+ * that half of the min. I/O unit had been written successfully and the
+ * other half not, but this is possible in our 'failure mode emulation'
+ * infrastructure at least.
+ *
+ * So what is the problem, why do we need to drop those nodes? Why can't
+ * we just clean-up the second half of B by putting a padding node
+ * there? We can, and this works fine with one exception which was
+ * reproduced with power cut emulation testing and happens extremely
+ * rarely. The description follows, but it is worth noting that that is
+ * only about the GC head, so we could do this trick only if the bud
+ * belongs to the GC head, but it does not seem to be worth an
+ * additional "if" statement.
+ *
+ * So, imagine the file-system is full, we run GC which is moving valid
+ * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
+ * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
+ * and will try to continue. Imagine that LEB X is currently the
+ * dirtiest LEB, and the amount of used space in LEB Y is exactly the
+ * same as amount of free space in LEB X.
+ *
+ * And a power cut happens when nodes are moved from LEB X to LEB Y. We
+ * are here trying to recover LEB Y which is the GC head LEB. We find
+ * the min. I/O unit B as described above. Then we clean-up LEB Y by
+ * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
+ * fails, because it cannot find a dirty LEB which could be GC'd into
+ * LEB Y! Even LEB X does not match because the amount of valid nodes
+ * there does not fit the free space in LEB Y any more! And this is
+ * because of the padding node which we added to LEB Y. The
+ * user-visible effect of this which I once observed and analysed is
+ * that we cannot mount the file-system with -ENOSPC error.
+ *
+ * So obviously, to make sure that situation does not happen we should
+ * free min. I/O unit B in LEB Y completely and the last used min. I/O
+ * unit in LEB Y should be A. This is basically what the below code
+ * tries to do.
+ */
+ while (min_io_unit == round_down(offs, c->min_io_size) &&
+ min_io_unit != offs &&
+ drop_last_node(sleb, &offs, grouped));
+
+ buf = sbuf + offs;
+ len = c->leb_size - offs;
+ clean_buf(c, &buf, lnum, &offs, &len);
ubifs_end_scan(c, sleb, lnum, offs);
- if (need_clean) {
- err = fix_unclean_leb(c, sleb, start);
- if (err)
- goto error;
- }
+ err = fix_unclean_leb(c, sleb, start);
+ if (err)
+ goto error;
return sleb;
+corrupted_rescan:
+ /* Re-scan the corrupted data with verbose messages */
+ dbg_err("corruptio %d", ret);
+ ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
corrupted:
ubifs_scanned_corruption(c, lnum, offs, buf);
err = -EUCLEAN;
@@ -1070,6 +1084,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
}
/**
+ * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty
+ * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+static int grab_empty_leb(struct ubifs_info *c)
+{
+ int lnum, err;
+
+ /*
+ * Note, it is very important to first search for an empty LEB and then
+ * run the commit, not vice-versa. The reason is that there might be
+ * only one empty LEB at the moment, the one which has been the
+ * @c->gc_lnum just before the power cut happened. During the regular
+ * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no
+ * one but GC can grab it. But at this moment this single empty LEB is
+ * not marked as taken, so if we run commit - what happens? Right, the
+ * commit will grab it and write the index there. Remember that the
+ * index always expands as long as there is free space, and it only
+ * starts consolidating when we run out of space.
+ *
+ * IOW, if we run commit now, we might not be able to find a free LEB
+ * after this.
+ */
+ lnum = ubifs_find_free_leb_for_idx(c);
+ if (lnum < 0) {
+ dbg_err("could not find an empty LEB");
+ dbg_dump_lprops(c);
+ dbg_dump_budg(c, &c->bi);
+ return lnum;
+ }
+
+ /* Reset the index flag */
+ err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+ LPROPS_INDEX, 0);
+ if (err)
+ return err;
+
+ c->gc_lnum = lnum;
+ dbg_rcvry("found empty LEB %d, run commit", lnum);
+
+ return ubifs_run_commit(c);
+}
+
+/**
* ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
* @c: UBIFS file-system description object
*
@@ -1091,71 +1152,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
{
struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
struct ubifs_lprops lp;
- int lnum, err;
+ int err;
+
+ dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs);
c->gc_lnum = -1;
- if (wbuf->lnum == -1) {
- dbg_rcvry("no GC head LEB");
- goto find_free;
- }
- /*
- * See whether the used space in the dirtiest LEB fits in the GC head
- * LEB.
- */
- if (wbuf->offs == c->leb_size) {
- dbg_rcvry("no room in GC head LEB");
- goto find_free;
- }
+ if (wbuf->lnum == -1 || wbuf->offs == c->leb_size)
+ return grab_empty_leb(c);
+
err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
if (err) {
- /*
- * There are no dirty or empty LEBs subject to here being
- * enough for the index. Try to use
- * 'ubifs_find_free_leb_for_idx()', which will return any empty
- * LEBs (ignoring index requirements). If the index then
- * doesn't have enough LEBs the recovery commit will fail -
- * which is the same result anyway i.e. recovery fails. So
- * there is no problem ignoring index requirements and just
- * grabbing a free LEB since we have already established there
- * is not a dirty LEB we could have used instead.
- */
- if (err == -ENOSPC) {
- dbg_rcvry("could not find a dirty LEB");
- goto find_free;
- }
- return err;
- }
- ubifs_assert(!(lp.flags & LPROPS_INDEX));
- lnum = lp.lnum;
- if (lp.free + lp.dirty == c->leb_size) {
- /* An empty LEB was returned */
- if (lp.free != c->leb_size) {
- err = ubifs_change_one_lp(c, lnum, c->leb_size,
- 0, 0, 0, 0);
- if (err)
- return err;
- }
- err = ubifs_leb_unmap(c, lnum);
- if (err)
+ if (err != -ENOSPC)
return err;
- c->gc_lnum = lnum;
- dbg_rcvry("allocated LEB %d for GC", lnum);
- /* Run the commit */
- dbg_rcvry("committing");
- return ubifs_run_commit(c);
- }
- /*
- * There was no empty LEB so the used space in the dirtiest LEB must fit
- * in the GC head LEB.
- */
- if (lp.free + lp.dirty < wbuf->offs) {
- dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
- lnum, wbuf->lnum, wbuf->offs);
- err = ubifs_return_leb(c, lnum);
- if (err)
- return err;
- goto find_free;
+
+ dbg_rcvry("could not find a dirty LEB");
+ return grab_empty_leb(c);
}
+
+ ubifs_assert(!(lp.flags & LPROPS_INDEX));
+ ubifs_assert(lp.free + lp.dirty >= wbuf->offs);
+
/*
* We run the commit before garbage collection otherwise subsequent
* mounts will see the GC and orphan deletion in a different order.
@@ -1164,11 +1180,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
err = ubifs_run_commit(c);
if (err)
return err;
- /*
- * The data in the dirtiest LEB fits in the GC head LEB, so do the GC
- * - use locking to keep 'ubifs_assert()' happy.
- */
- dbg_rcvry("GC'ing LEB %d", lnum);
+
+ dbg_rcvry("GC'ing LEB %d", lp.lnum);
mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
err = ubifs_garbage_collect_leb(c, &lp);
if (err >= 0) {
@@ -1184,37 +1197,17 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c)
err = -EINVAL;
return err;
}
- if (err != LEB_RETAINED) {
- dbg_err("GC returned %d", err);
+
+ ubifs_assert(err == LEB_RETAINED);
+ if (err != LEB_RETAINED)
return -EINVAL;
- }
+
err = ubifs_leb_unmap(c, c->gc_lnum);
if (err)
return err;
- dbg_rcvry("allocated LEB %d for GC", lnum);
- return 0;
-find_free:
- /*
- * There is no GC head LEB or the free space in the GC head LEB is too
- * small, or there are not dirty LEBs. Allocate gc_lnum by calling
- * 'ubifs_find_free_leb_for_idx()' so GC is not run.
- */
- lnum = ubifs_find_free_leb_for_idx(c);
- if (lnum < 0) {
- dbg_err("could not find an empty LEB");
- return lnum;
- }
- /* And reset the index flag */
- err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
- LPROPS_INDEX, 0);
- if (err)
- return err;
- c->gc_lnum = lnum;
- dbg_rcvry("allocated LEB %d for GC", lnum);
- /* Run the commit */
- dbg_rcvry("committing");
- return ubifs_run_commit(c);
+ dbg_rcvry("allocated LEB %d for GC", lp.lnum);
+ return 0;
}
/**
@@ -1456,7 +1449,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
if (err)
goto out;
- dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ",
+ dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
(unsigned long)e->inum, lnum, offs, i_size, e->d_size);
return 0;
@@ -1505,20 +1498,27 @@ int ubifs_recover_size(struct ubifs_info *c)
e->i_size = le64_to_cpu(ino->size);
}
}
+
if (e->exists && e->i_size < e->d_size) {
- if (!e->inode && c->ro_mount) {
+ if (c->ro_mount) {
/* Fix the inode size and pin it in memory */
struct inode *inode;
+ struct ubifs_inode *ui;
+
+ ubifs_assert(!e->inode);
inode = ubifs_iget(c->vfs_sb, e->inum);
if (IS_ERR(inode))
return PTR_ERR(inode);
+
+ ui = ubifs_inode(inode);
if (inode->i_size < e->d_size) {
dbg_rcvry("ino %lu size %lld -> %lld",
(unsigned long)e->inum,
- e->d_size, inode->i_size);
+ inode->i_size, e->d_size);
inode->i_size = e->d_size;
- ubifs_inode(inode)->ui_size = e->d_size;
+ ui->ui_size = e->d_size;
+ ui->synced_i_size = e->d_size;
e->inode = inode;
this = rb_next(this);
continue;
@@ -1533,9 +1533,11 @@ int ubifs_recover_size(struct ubifs_info *c)
iput(e->inode);
}
}
+
this = rb_next(this);
rb_erase(&e->rb, &c->size_tree);
kfree(e);
}
+
return 0;
}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index d3d6d365bfc1..6617280d1679 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -33,44 +33,32 @@
*/
#include "ubifs.h"
-
-/*
- * Replay flags.
- *
- * REPLAY_DELETION: node was deleted
- * REPLAY_REF: node is a reference node
- */
-enum {
- REPLAY_DELETION = 1,
- REPLAY_REF = 2,
-};
+#include <linux/list_sort.h>
/**
- * struct replay_entry - replay tree entry.
+ * struct replay_entry - replay list entry.
* @lnum: logical eraseblock number of the node
* @offs: node offset
* @len: node length
+ * @deletion: non-zero if this entry corresponds to a node deletion
* @sqnum: node sequence number
- * @flags: replay flags
- * @rb: links the replay tree
+ * @list: links the replay list
* @key: node key
* @nm: directory entry name
* @old_size: truncation old size
* @new_size: truncation new size
- * @free: amount of free space in a bud
- * @dirty: amount of dirty space in a bud from padding and deletion nodes
- * @jhead: journal head number of the bud
*
- * UBIFS journal replay must compare node sequence numbers, which means it must
- * build a tree of node information to insert into the TNC.
+ * The replay process first scans all buds and builds the replay list, then
+ * sorts the replay list in node sequence number order, and then inserts all
+ * the replay entries to the TNC.
*/
struct replay_entry {
int lnum;
int offs;
int len;
+ unsigned int deletion:1;
unsigned long long sqnum;
- int flags;
- struct rb_node rb;
+ struct list_head list;
union ubifs_key key;
union {
struct qstr nm;
@@ -78,11 +66,6 @@ struct replay_entry {
loff_t old_size;
loff_t new_size;
};
- struct {
- int free;
- int dirty;
- int jhead;
- };
};
};
@@ -90,57 +73,64 @@ struct replay_entry {
* struct bud_entry - entry in the list of buds to replay.
* @list: next bud in the list
* @bud: bud description object
- * @free: free bytes in the bud
* @sqnum: reference node sequence number
+ * @free: free bytes in the bud
+ * @dirty: dirty bytes in the bud
*/
struct bud_entry {
struct list_head list;
struct ubifs_bud *bud;
- int free;
unsigned long long sqnum;
+ int free;
+ int dirty;
};
/**
* set_bud_lprops - set free and dirty space used by a bud.
* @c: UBIFS file-system description object
- * @r: replay entry of bud
+ * @b: bud entry which describes the bud
+ *
+ * This function makes sure the LEB properties of bud @b are set correctly
+ * after the replay. Returns zero in case of success and a negative error code
+ * in case of failure.
*/
-static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
+static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b)
{
const struct ubifs_lprops *lp;
int err = 0, dirty;
ubifs_get_lprops(c);
- lp = ubifs_lpt_lookup_dirty(c, r->lnum);
+ lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum);
if (IS_ERR(lp)) {
err = PTR_ERR(lp);
goto out;
}
dirty = lp->dirty;
- if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
+ if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
/*
* The LEB was added to the journal with a starting offset of
* zero which means the LEB must have been empty. The LEB
- * property values should be lp->free == c->leb_size and
- * lp->dirty == 0, but that is not the case. The reason is that
- * the LEB was garbage collected. The garbage collector resets
- * the free and dirty space without recording it anywhere except
- * lprops, so if there is not a commit then lprops does not have
- * that information next time the file system is mounted.
+ * property values should be @lp->free == @c->leb_size and
+ * @lp->dirty == 0, but that is not the case. The reason is that
+ * the LEB had been garbage collected before it became the bud,
+ * and there was no commit in between. The garbage collector
+ * resets the free and dirty space without recording it
+ * anywhere except lprops, so if there was no commit then
+ * lprops does not have that information.
*
* We do not need to adjust free space because the scan has told
* us the exact value which is recorded in the replay entry as
- * r->free.
+ * @b->free.
*
* However we do need to subtract from the dirty space the
* amount of space that the garbage collector reclaimed, which
* is the whole LEB minus the amount of space that was free.
*/
- dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+ dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
lp->free, lp->dirty);
- dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+ dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum,
lp->free, lp->dirty);
dirty -= c->leb_size - lp->free;
/*
@@ -152,10 +142,10 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
*/
if (dirty != 0)
dbg_msg("LEB %d lp: %d free %d dirty "
- "replay: %d free %d dirty", r->lnum, lp->free,
- lp->dirty, r->free, r->dirty);
+ "replay: %d free %d dirty", b->bud->lnum,
+ lp->free, lp->dirty, b->free, b->dirty);
}
- lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
+ lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty,
lp->flags | LPROPS_TAKEN, 0);
if (IS_ERR(lp)) {
err = PTR_ERR(lp);
@@ -163,8 +153,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
}
/* Make sure the journal head points to the latest bud */
- err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum,
- c->leb_size - r->free, UBI_SHORTTERM);
+ err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf,
+ b->bud->lnum, c->leb_size - b->free,
+ UBI_SHORTTERM);
out:
ubifs_release_lprops(c);
@@ -172,6 +163,27 @@ out:
}
/**
+ * set_buds_lprops - set free and dirty space for all replayed buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function sets LEB properties for all replayed buds. Returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static int set_buds_lprops(struct ubifs_info *c)
+{
+ struct bud_entry *b;
+ int err;
+
+ list_for_each_entry(b, &c->replay_buds, list) {
+ err = set_bud_lprops(c, b);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/**
* trun_remove_range - apply a replay entry for a truncation to the TNC.
* @c: UBIFS file-system description object
* @r: replay entry of truncation
@@ -207,24 +219,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
*/
static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
{
- int err, deletion = ((r->flags & REPLAY_DELETION) != 0);
+ int err;
- dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
- r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));
+ dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
+ r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
/* Set c->replay_sqnum to help deal with dangling branches. */
c->replay_sqnum = r->sqnum;
- if (r->flags & REPLAY_REF)
- err = set_bud_lprops(c, r);
- else if (is_hash_key(c, &r->key)) {
- if (deletion)
+ if (is_hash_key(c, &r->key)) {
+ if (r->deletion)
err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
else
err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
r->len, &r->nm);
} else {
- if (deletion)
+ if (r->deletion)
switch (key_type(c, &r->key)) {
case UBIFS_INO_KEY:
{
@@ -247,7 +257,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
return err;
if (c->need_recovery)
- err = ubifs_recover_size_accum(c, &r->key, deletion,
+ err = ubifs_recover_size_accum(c, &r->key, r->deletion,
r->new_size);
}
@@ -255,68 +265,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
}
/**
- * destroy_replay_tree - destroy the replay.
- * @c: UBIFS file-system description object
+ * replay_entries_cmp - compare 2 replay entries.
+ * @priv: UBIFS file-system description object
+ * @a: first replay entry
+ * @b: second replay entry
*
- * Destroy the replay tree.
+ * This is a comparison function for 'list_sort()' which compares 2 replay
+ * entries @a and @b by comparing their sequence number. Returns %1 if @a has
+ * greater sequence number and %-1 otherwise.
*/
-static void destroy_replay_tree(struct ubifs_info *c)
+static int replay_entries_cmp(void *priv, struct list_head *a,
+ struct list_head *b)
{
- struct rb_node *this = c->replay_tree.rb_node;
- struct replay_entry *r;
-
- while (this) {
- if (this->rb_left) {
- this = this->rb_left;
- continue;
- } else if (this->rb_right) {
- this = this->rb_right;
- continue;
- }
- r = rb_entry(this, struct replay_entry, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &r->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
- if (is_hash_key(c, &r->key))
- kfree(r->nm.name);
- kfree(r);
- }
- c->replay_tree = RB_ROOT;
+ struct replay_entry *ra, *rb;
+
+ cond_resched();
+ if (a == b)
+ return 0;
+
+ ra = list_entry(a, struct replay_entry, list);
+ rb = list_entry(b, struct replay_entry, list);
+ ubifs_assert(ra->sqnum != rb->sqnum);
+ if (ra->sqnum > rb->sqnum)
+ return 1;
+ return -1;
}
/**
- * apply_replay_tree - apply the replay tree to the TNC.
+ * apply_replay_list - apply the replay list to the TNC.
* @c: UBIFS file-system description object
*
- * Apply the replay tree.
- * Returns zero in case of success and a negative error code in case of
- * failure.
+ * Apply all entries in the replay list to the TNC. Returns zero in case of
+ * success and a negative error code in case of failure.
*/
-static int apply_replay_tree(struct ubifs_info *c)
+static int apply_replay_list(struct ubifs_info *c)
{
- struct rb_node *this = rb_first(&c->replay_tree);
+ struct replay_entry *r;
+ int err;
- while (this) {
- struct replay_entry *r;
- int err;
+ list_sort(c, &c->replay_list, &replay_entries_cmp);
+ list_for_each_entry(r, &c->replay_list, list) {
cond_resched();
- r = rb_entry(this, struct replay_entry, rb);
err = apply_replay_entry(c, r);
if (err)
return err;
- this = rb_next(this);
}
+
return 0;
}
/**
- * insert_node - insert a node to the replay tree.
+ * destroy_replay_list - destroy the replay list.
+ * @c: UBIFS file-system description object
+ *
+ * Destroy the replay list.
+ */
+static void destroy_replay_list(struct ubifs_info *c)
+{
+ struct replay_entry *r, *tmp;
+
+ list_for_each_entry_safe(r, tmp, &c->replay_list, list) {
+ if (is_hash_key(c, &r->key))
+ kfree(r->nm.name);
+ list_del(&r->list);
+ kfree(r);
+ }
+}
+
+/**
+ * insert_node - insert a node to the replay list
* @c: UBIFS file-system description object
* @lnum: node logical eraseblock number
* @offs: node offset
@@ -328,39 +347,25 @@ static int apply_replay_tree(struct ubifs_info *c)
* @old_size: truncation old size
* @new_size: truncation new size
*
- * This function inserts a scanned non-direntry node to the replay tree. The
- * replay tree is an RB-tree containing @struct replay_entry elements which are
- * indexed by the sequence number. The replay tree is applied at the very end
- * of the replay process. Since the tree is sorted in sequence number order,
- * the older modifications are applied first. This function returns zero in
- * case of success and a negative error code in case of failure.
+ * This function inserts a scanned non-direntry node to the replay list. The
+ * replay list contains @struct replay_entry elements, and we sort this list in
+ * sequence number order before applying it. The replay list is applied at the
+ * very end of the replay process. Since the list is sorted in sequence number
+ * order, the older modifications are applied first. This function returns zero
+ * in case of success and a negative error code in case of failure.
*/
static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
union ubifs_key *key, unsigned long long sqnum,
int deletion, int *used, loff_t old_size,
loff_t new_size)
{
- struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
struct replay_entry *r;
+ dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+
if (key_inum(c, key) >= c->highest_inum)
c->highest_inum = key_inum(c, key);
- dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
- while (*p) {
- parent = *p;
- r = rb_entry(parent, struct replay_entry, rb);
- if (sqnum < r->sqnum) {
- p = &(*p)->rb_left;
- continue;
- } else if (sqnum > r->sqnum) {
- p = &(*p)->rb_right;
- continue;
- }
- ubifs_err("duplicate sqnum in replay");
- return -EINVAL;
- }
-
r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
if (!r)
return -ENOMEM;
@@ -370,19 +375,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
r->lnum = lnum;
r->offs = offs;
r->len = len;
+ r->deletion = !!deletion;
r->sqnum = sqnum;
- r->flags = (deletion ? REPLAY_DELETION : 0);
+ key_copy(c, key, &r->key);
r->old_size = old_size;
r->new_size = new_size;
- key_copy(c, key, &r->key);
- rb_link_node(&r->rb, parent, p);
- rb_insert_color(&r->rb, &c->replay_tree);
+ list_add_tail(&r->list, &c->replay_list);
return 0;
}
/**
- * insert_dent - insert a directory entry node into the replay tree.
+ * insert_dent - insert a directory entry node into the replay list.
* @c: UBIFS file-system description object
* @lnum: node logical eraseblock number
* @offs: node offset
@@ -394,43 +398,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
* @deletion: non-zero if this is a deletion
* @used: number of bytes in use in a LEB
*
- * This function inserts a scanned directory entry node to the replay tree.
- * Returns zero in case of success and a negative error code in case of
- * failure.
- *
- * This function is also used for extended attribute entries because they are
- * implemented as directory entry nodes.
+ * This function inserts a scanned directory entry node or an extended
+ * attribute entry to the replay list. Returns zero in case of success and a
+ * negative error code in case of failure.
*/
static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
union ubifs_key *key, const char *name, int nlen,
unsigned long long sqnum, int deletion, int *used)
{
- struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
struct replay_entry *r;
char *nbuf;
+ dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
if (key_inum(c, key) >= c->highest_inum)
c->highest_inum = key_inum(c, key);
- dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
- while (*p) {
- parent = *p;
- r = rb_entry(parent, struct replay_entry, rb);
- if (sqnum < r->sqnum) {
- p = &(*p)->rb_left;
- continue;
- }
- if (sqnum > r->sqnum) {
- p = &(*p)->rb_right;
- continue;
- }
- ubifs_err("duplicate sqnum in replay");
- return -EINVAL;
- }
-
r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
if (!r)
return -ENOMEM;
+
nbuf = kmalloc(nlen + 1, GFP_KERNEL);
if (!nbuf) {
kfree(r);
@@ -442,17 +428,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
r->lnum = lnum;
r->offs = offs;
r->len = len;
+ r->deletion = !!deletion;
r->sqnum = sqnum;
+ key_copy(c, key, &r->key);
r->nm.len = nlen;
memcpy(nbuf, name, nlen);
nbuf[nlen] = '\0';
r->nm.name = nbuf;
- r->flags = (deletion ? REPLAY_DELETION : 0);
- key_copy(c, key, &r->key);
- ubifs_assert(!*p);
- rb_link_node(&r->rb, parent, p);
- rb_insert_color(&r->rb, &c->replay_tree);
+ list_add_tail(&r->list, &c->replay_list);
return 0;
}
@@ -489,29 +473,92 @@ int ubifs_validate_entry(struct ubifs_info *c,
}
/**
+ * is_last_bud - check if the bud is the last in the journal head.
+ * @c: UBIFS file-system description object
+ * @bud: bud description object
+ *
+ * This function checks if bud @bud is the last bud in its journal head. This
+ * information is then used by 'replay_bud()' to decide whether the bud can
+ * have corruptions or not. Indeed, only last buds can be corrupted by power
+ * cuts. Returns %1 if this is the last bud, and %0 if not.
+ */
+static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud)
+{
+ struct ubifs_jhead *jh = &c->jheads[bud->jhead];
+ struct ubifs_bud *next;
+ uint32_t data;
+ int err;
+
+ if (list_is_last(&bud->list, &jh->buds_list))
+ return 1;
+
+ /*
+ * The following is a quirk to make sure we work correctly with UBIFS
+ * images used with older UBIFS.
+ *
+ * Normally, the last bud will be the last in the journal head's list
+ * of buds. However, there is one exception if the UBIFS image belongs
+ * to older UBIFS. This is fairly unlikely: one would need to use old
+ * UBIFS, then have a power cut exactly at the right point, and then
+ * try to mount this image with new UBIFS.
+ *
+ * The exception is: it is possible to have 2 buds A and B, A goes
+ * before B, and B is the last, bud B contains no data, and bud A is
+ * corrupted at the end. The reason is that in older versions when the
+ * journal code switched the next bud (from A to B), it first added a
+ * log reference node for the new bud (B), and only after this it
+ * synchronized the write-buffer of current bud (A). But later this was
+ * changed and UBIFS started to always synchronize the write-buffer of
+ * the bud (A) before writing the log reference for the new bud (B).
+ *
+ * But because older UBIFS always synchronized A's write-buffer before
+ * writing to B, we can recognize this exceptional situation by
+ * checking the contents of bud B - if it is empty, then A can be
+ * treated as the last and we can recover it.
+ *
+ * TODO: remove this piece of code in a couple of years (today it is
+ * 16.05.2011).
+ */
+ next = list_entry(bud->list.next, struct ubifs_bud, list);
+ if (!list_is_last(&next->list, &jh->buds_list))
+ return 0;
+
+ err = ubi_read(c->ubi, next->lnum, (char *)&data,
+ next->start, 4);
+ if (err)
+ return 0;
+
+ return data == 0xFFFFFFFF;
+}
+
+/**
* replay_bud - replay a bud logical eraseblock.
* @c: UBIFS file-system description object
- * @lnum: bud logical eraseblock number to replay
- * @offs: bud start offset
- * @jhead: journal head to which this bud belongs
- * @free: amount of free space in the bud is returned here
- * @dirty: amount of dirty space from padding and deletion nodes is returned
- * here
+ * @b: bud entry which describes the bud
*
- * This function returns zero in case of success and a negative error code in
- * case of failure.
+ * This function replays bud @b, recovers it if needed, and adds all nodes
+ * from this bud to the replay list. Returns zero in case of success and a
+ * negative error code in case of failure.
*/
-static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
- int *free, int *dirty)
+static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
{
- int err = 0, used = 0;
+ int is_last = is_last_bud(c, b->bud);
+ int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start;
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
- struct ubifs_bud *bud;
- dbg_mnt("replay bud LEB %d, head %d", lnum, jhead);
- if (c->need_recovery)
- sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
+ dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d",
+ lnum, b->bud->jhead, offs, is_last);
+
+ if (c->need_recovery && is_last)
+ /*
+ * Recover only last LEBs in the journal heads, because power
+ * cuts may cause corruptions only in these LEBs, because only
+ * these LEBs could possibly be written to at the power cut
+ * time.
+ */
+ sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
+ b->bud->jhead != GCHD);
else
sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
if (IS_ERR(sleb))
@@ -627,15 +674,13 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
goto out;
}
- bud = ubifs_search_bud(c, lnum);
- if (!bud)
- BUG();
-
+ ubifs_assert(ubifs_search_bud(c, lnum));
ubifs_assert(sleb->endpt - offs >= used);
ubifs_assert(sleb->endpt % c->min_io_size == 0);
- *dirty = sleb->endpt - offs - used;
- *free = c->leb_size - sleb->endpt;
+ b->dirty = sleb->endpt - offs - used;
+ b->free = c->leb_size - sleb->endpt;
+ dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free);
out:
ubifs_scan_destroy(sleb);
@@ -649,58 +694,6 @@ out_dump:
}
/**
- * insert_ref_node - insert a reference node to the replay tree.
- * @c: UBIFS file-system description object
- * @lnum: node logical eraseblock number
- * @offs: node offset
- * @sqnum: sequence number
- * @free: amount of free space in bud
- * @dirty: amount of dirty space from padding and deletion nodes
- * @jhead: journal head number for the bud
- *
- * This function inserts a reference node to the replay tree and returns zero
- * in case of success or a negative error code in case of failure.
- */
-static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
- unsigned long long sqnum, int free, int dirty,
- int jhead)
-{
- struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
- struct replay_entry *r;
-
- dbg_mnt("add ref LEB %d:%d", lnum, offs);
- while (*p) {
- parent = *p;
- r = rb_entry(parent, struct replay_entry, rb);
- if (sqnum < r->sqnum) {
- p = &(*p)->rb_left;
- continue;
- } else if (sqnum > r->sqnum) {
- p = &(*p)->rb_right;
- continue;
- }
- ubifs_err("duplicate sqnum in replay tree");
- return -EINVAL;
- }
-
- r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- r->lnum = lnum;
- r->offs = offs;
- r->sqnum = sqnum;
- r->flags = REPLAY_REF;
- r->free = free;
- r->dirty = dirty;
- r->jhead = jhead;
-
- rb_link_node(&r->rb, parent, p);
- rb_insert_color(&r->rb, &c->replay_tree);
- return 0;
-}
-
-/**
* replay_buds - replay all buds.
* @c: UBIFS file-system description object
*
@@ -710,17 +703,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
static int replay_buds(struct ubifs_info *c)
{
struct bud_entry *b;
- int err, uninitialized_var(free), uninitialized_var(dirty);
+ int err;
+ unsigned long long prev_sqnum = 0;
list_for_each_entry(b, &c->replay_buds, list) {
- err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead,
- &free, &dirty);
- if (err)
- return err;
- err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
- free, dirty, b->bud->jhead);
+ err = replay_bud(c, b);
if (err)
return err;
+
+ ubifs_assert(b->sqnum > prev_sqnum);
+ prev_sqnum = b->sqnum;
}
return 0;
@@ -1060,25 +1052,29 @@ int ubifs_replay_journal(struct ubifs_info *c)
if (err)
goto out;
- err = apply_replay_tree(c);
+ err = apply_replay_list(c);
+ if (err)
+ goto out;
+
+ err = set_buds_lprops(c);
if (err)
goto out;
/*
- * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
- * to roughly estimate index growth. Things like @c->min_idx_lebs
+ * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable
+ * to roughly estimate index growth. Things like @c->bi.min_idx_lebs
* depend on it. This means we have to initialize it to make sure
* budgeting works properly.
*/
- c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
- c->budg_uncommitted_idx *= c->max_idx_node_sz;
+ c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
+ c->bi.uncommitted_idx *= c->max_idx_node_sz;
ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
"highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
(unsigned long)c->highest_inum);
out:
- destroy_replay_tree(c);
+ destroy_replay_list(c);
destroy_bud_list(c);
c->replaying = 0;
return err;
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index bf31b4729e51..c606f010e8df 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -475,7 +475,8 @@ failed:
* @c: UBIFS file-system description object
*
* This function returns a pointer to the superblock node or a negative error
- * code.
+ * code. Note, the user of this function is responsible for kfree()'ing the
+ * returned superblock buffer.
*/
struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
{
@@ -616,6 +617,7 @@ int ubifs_read_superblock(struct ubifs_info *c)
c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
memcpy(&c->uuid, &sup->uuid, 16);
c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
+ c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP);
/* Automatically increase file system size to the maximum size */
c->old_leb_cnt = c->leb_cnt;
@@ -650,3 +652,152 @@ out:
kfree(sup);
return err;
}
+
+/**
+ * fixup_leb - fixup/unmap an LEB containing free space.
+ * @c: UBIFS file-system description object
+ * @lnum: the LEB number to fix up
+ * @len: number of used bytes in LEB (starting at offset 0)
+ *
+ * This function reads the contents of the given LEB number @lnum, then fixes
+ * it up, so that empty min. I/O units at the end of LEB are actually erased on
+ * flash (rather than being just all-0xff real data). If the LEB is completely
+ * empty, it is simply unmapped.
+ */
+static int fixup_leb(struct ubifs_info *c, int lnum, int len)
+{
+ int err;
+
+ ubifs_assert(len >= 0);
+ ubifs_assert(len % c->min_io_size == 0);
+ ubifs_assert(len < c->leb_size);
+
+ if (len == 0) {
+ dbg_mnt("unmap empty LEB %d", lnum);
+ return ubi_leb_unmap(c->ubi, lnum);
+ }
+
+ dbg_mnt("fixup LEB %d, data len %d", lnum, len);
+ err = ubi_read(c->ubi, lnum, c->sbuf, 0, len);
+ if (err)
+ return err;
+
+ return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
+}
+
+/**
+ * fixup_free_space - find & remap all LEBs containing free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function walks through all LEBs in the filesystem and fixes up those
+ * containing free/empty space.
+ */
+static int fixup_free_space(struct ubifs_info *c)
+{
+ int lnum, err = 0;
+ struct ubifs_lprops *lprops;
+
+ ubifs_get_lprops(c);
+
+ /* Fixup LEBs in the master area */
+ for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) {
+ err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz);
+ if (err)
+ goto out;
+ }
+
+ /* Unmap unused log LEBs */
+ lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+ while (lnum != c->ltail_lnum) {
+ err = fixup_leb(c, lnum, 0);
+ if (err)
+ goto out;
+ lnum = ubifs_next_log_lnum(c, lnum);
+ }
+
+ /* Fixup the current log head */
+ err = fixup_leb(c, c->lhead_lnum, c->lhead_offs);
+ if (err)
+ goto out;
+
+ /* Fixup LEBs in the LPT area */
+ for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
+ int free = c->ltab[lnum - c->lpt_first].free;
+
+ if (free > 0) {
+ err = fixup_leb(c, lnum, c->leb_size - free);
+ if (err)
+ goto out;
+ }
+ }
+
+ /* Unmap LEBs in the orphans area */
+ for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+ err = fixup_leb(c, lnum, 0);
+ if (err)
+ goto out;
+ }
+
+ /* Fixup LEBs in the main area */
+ for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
+ lprops = ubifs_lpt_lookup(c, lnum);
+ if (IS_ERR(lprops)) {
+ err = PTR_ERR(lprops);
+ goto out;
+ }
+
+ if (lprops->free > 0) {
+ err = fixup_leb(c, lnum, c->leb_size - lprops->free);
+ if (err)
+ goto out;
+ }
+ }
+
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * ubifs_fixup_free_space - find & fix all LEBs with free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function fixes up LEBs containing free space on first mount, if the
+ * appropriate flag was set when the FS was created. Each LEB with one or more
+ * empty min. I/O units (i.e. free-space-count > 0) is re-written, to make sure
+ * the free space is actually erased. E.g., this is necessary for some NAND
+ * chips, since the free space may have been programmed like real "0xff" data
+ * (generating a non-0xff ECC), causing future writes to the not-really-erased
+ * NAND pages to behave badly. After the space is fixed up, the superblock flag
+ * is cleared, so that this is skipped for all future mounts.
+ */
+int ubifs_fixup_free_space(struct ubifs_info *c)
+{
+ int err;
+ struct ubifs_sb_node *sup;
+
+ ubifs_assert(c->space_fixup);
+ ubifs_assert(!c->ro_mount);
+
+ ubifs_msg("start fixing up free space");
+
+ err = fixup_free_space(c);
+ if (err)
+ return err;
+
+ sup = ubifs_read_sb_node(c);
+ if (IS_ERR(sup))
+ return PTR_ERR(sup);
+
+ /* Free-space fixup is no longer required */
+ c->space_fixup = 0;
+ sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP);
+
+ err = ubifs_write_sb_node(c, sup);
+ kfree(sup);
+ if (err)
+ return err;
+
+ ubifs_msg("free space fixup complete");
+ return err;
+}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 04ad07f4fcc3..6db0bdaa9f74 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -375,7 +375,7 @@ out:
ubifs_release_dirty_inode_budget(c, ui);
else {
/* We've deleted something - clean the "no space" flags */
- c->nospace = c->nospace_rp = 0;
+ c->bi.nospace = c->bi.nospace_rp = 0;
smp_wmb();
}
done:
@@ -694,11 +694,11 @@ static int init_constants_sb(struct ubifs_info *c)
* be compressed and direntries are of the maximum size.
*
* Note, data, which may be stored in inodes is budgeted separately, so
- * it is not included into 'c->inode_budget'.
+ * it is not included into 'c->bi.inode_budget'.
*/
- c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
- c->inode_budget = UBIFS_INO_NODE_SZ;
- c->dent_budget = UBIFS_MAX_DENT_NODE_SZ;
+ c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
+ c->bi.inode_budget = UBIFS_INO_NODE_SZ;
+ c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ;
/*
* When the amount of flash space used by buds becomes
@@ -742,7 +742,7 @@ static void init_constants_master(struct ubifs_info *c)
{
long long tmp64;
- c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
c->report_rp_size = ubifs_reported_space(c, c->rp_size);
/*
@@ -1144,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c)
{
ubifs_assert(c->dark_wm > 0);
if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
- ubifs_err("insufficient free space to mount in read/write mode");
- dbg_dump_budg(c);
+ ubifs_err("insufficient free space to mount in R/W mode");
+ dbg_dump_budg(c, &c->bi);
dbg_dump_lprops(c);
return -ENOSPC;
}
@@ -1304,7 +1304,7 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
goto out_lpt;
- err = dbg_check_idx_size(c, c->old_idx_sz);
+ err = dbg_check_idx_size(c, c->bi.old_idx_sz);
if (err)
goto out_lpt;
@@ -1313,7 +1313,7 @@ static int mount_ubifs(struct ubifs_info *c)
goto out_journal;
/* Calculate 'min_idx_lebs' after journal replay */
- c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount);
if (err)
@@ -1396,6 +1396,12 @@ static int mount_ubifs(struct ubifs_info *c)
} else
ubifs_assert(c->lst.taken_empty_lebs > 0);
+ if (!c->ro_mount && c->space_fixup) {
+ err = ubifs_fixup_free_space(c);
+ if (err)
+ goto out_infos;
+ }
+
err = dbg_check_filesystem(c);
if (err)
goto out_infos;
@@ -1442,7 +1448,8 @@ static int mount_ubifs(struct ubifs_info *c)
c->main_lebs, c->main_first, c->leb_cnt - 1);
dbg_msg("index LEBs: %d", c->lst.idx_lebs);
dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)",
- c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20);
+ c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
+ c->bi.old_idx_sz >> 20);
dbg_msg("key hash type: %d", c->key_hash_type);
dbg_msg("tree fanout: %d", c->fanout);
dbg_msg("reserved GC LEB: %d", c->gc_lnum);
@@ -1456,7 +1463,7 @@ static int mount_ubifs(struct ubifs_info *c)
dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
- UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
+ UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
dbg_msg("dead watermark: %d", c->dead_wm);
dbg_msg("dark watermark: %d", c->dark_wm);
@@ -1584,6 +1591,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
}
sup->leb_cnt = cpu_to_le32(c->leb_cnt);
err = ubifs_write_sb_node(c, sup);
+ kfree(sup);
if (err)
goto out;
}
@@ -1684,6 +1692,13 @@ static int ubifs_remount_rw(struct ubifs_info *c)
*/
err = dbg_check_space_info(c);
}
+
+ if (c->space_fixup) {
+ err = ubifs_fixup_free_space(c);
+ if (err)
+ goto out;
+ }
+
mutex_unlock(&c->umount_mutex);
return err;
@@ -1766,10 +1781,9 @@ static void ubifs_put_super(struct super_block *sb)
* to write them back because of I/O errors.
*/
if (!c->ro_error) {
- ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
- ubifs_assert(c->budg_idx_growth == 0);
- ubifs_assert(c->budg_dd_growth == 0);
- ubifs_assert(c->budg_data_growth == 0);
+ ubifs_assert(c->bi.idx_growth == 0);
+ ubifs_assert(c->bi.dd_growth == 0);
+ ubifs_assert(c->bi.data_growth == 0);
}
/*
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index de485979ca39..8119b1fd8d94 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2557,11 +2557,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
if (err) {
/* Ensure the znode is dirtied */
if (znode->cnext || !ubifs_zn_dirty(znode)) {
- znode = dirty_cow_bottom_up(c, znode);
- if (IS_ERR(znode)) {
- err = PTR_ERR(znode);
- goto out_unlock;
- }
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
}
err = tnc_delete(c, znode, n);
}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 53288e5d604e..41920f357bbf 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -377,15 +377,13 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
c->gap_lebs = NULL;
return err;
}
- if (!dbg_force_in_the_gaps_enabled) {
+ if (dbg_force_in_the_gaps_enabled()) {
/*
* Do not print scary warnings if the debugging
* option which forces in-the-gaps is enabled.
*/
- ubifs_err("out of space");
- spin_lock(&c->space_lock);
- dbg_dump_budg(c);
- spin_unlock(&c->space_lock);
+ ubifs_warn("out of space");
+ dbg_dump_budg(c, &c->bi);
dbg_dump_lprops(c);
}
/* Try to commit anyway */
@@ -796,16 +794,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
spin_lock(&c->space_lock);
/*
* Although we have not finished committing yet, update size of the
- * committed index ('c->old_idx_sz') and zero out the index growth
+ * committed index ('c->bi.old_idx_sz') and zero out the index growth
* budget. It is OK to do this now, because we've reserved all the
* space which is needed to commit the index, and it is save for the
* budgeting subsystem to assume the index is already committed,
* even though it is not.
*/
- ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
- c->old_idx_sz = c->calc_idx_sz;
- c->budg_uncommitted_idx = 0;
- c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+ c->bi.old_idx_sz = c->calc_idx_sz;
+ c->bi.uncommitted_idx = 0;
+ c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c);
spin_unlock(&c->space_lock);
mutex_unlock(&c->tnc_mutex);
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 191ca7863fe7..e24380cf46ed 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -408,9 +408,11 @@ enum {
* Superblock flags.
*
* UBIFS_FLG_BIGLPT: if "big" LPT model is used if set
+ * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed
*/
enum {
UBIFS_FLG_BIGLPT = 0x02,
+ UBIFS_FLG_SPACE_FIXUP = 0x04,
};
/**
@@ -434,7 +436,7 @@ struct ubifs_ch {
__u8 node_type;
__u8 group_type;
__u8 padding[2];
-} __attribute__ ((packed));
+} __packed;
/**
* union ubifs_dev_desc - device node descriptor.
@@ -448,7 +450,7 @@ struct ubifs_ch {
union ubifs_dev_desc {
__le32 new;
__le64 huge;
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_ino_node - inode node.
@@ -509,7 +511,7 @@ struct ubifs_ino_node {
__le16 compr_type;
__u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
__u8 data[];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_dent_node - directory entry node.
@@ -534,7 +536,7 @@ struct ubifs_dent_node {
__le16 nlen;
__u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
__u8 name[];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_data_node - data node.
@@ -555,7 +557,7 @@ struct ubifs_data_node {
__le16 compr_type;
__u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
__u8 data[];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_trun_node - truncation node.
@@ -575,7 +577,7 @@ struct ubifs_trun_node {
__u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
__le64 old_size;
__le64 new_size;
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_pad_node - padding node.
@@ -586,7 +588,7 @@ struct ubifs_trun_node {
struct ubifs_pad_node {
struct ubifs_ch ch;
__le32 pad_len;
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_sb_node - superblock node.
@@ -644,7 +646,7 @@ struct ubifs_sb_node {
__u8 uuid[16];
__le32 ro_compat_version;
__u8 padding2[3968];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_mst_node - master node.
@@ -711,7 +713,7 @@ struct ubifs_mst_node {
__le32 idx_lebs;
__le32 leb_cnt;
__u8 padding[344];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_ref_node - logical eraseblock reference node.
@@ -727,7 +729,7 @@ struct ubifs_ref_node {
__le32 offs;
__le32 jhead;
__u8 padding[28];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_branch - key/reference/length branch
@@ -741,7 +743,7 @@ struct ubifs_branch {
__le32 offs;
__le32 len;
__u8 key[];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_idx_node - indexing node.
@@ -755,7 +757,7 @@ struct ubifs_idx_node {
__le16 child_cnt;
__le16 level;
__u8 branches[];
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_cs_node - commit start node.
@@ -765,7 +767,7 @@ struct ubifs_idx_node {
struct ubifs_cs_node {
struct ubifs_ch ch;
__le64 cmt_no;
-} __attribute__ ((packed));
+} __packed;
/**
* struct ubifs_orph_node - orphan node.
@@ -777,6 +779,6 @@ struct ubifs_orph_node {
struct ubifs_ch ch;
__le64 cmt_no;
__le64 inos[];
-} __attribute__ ((packed));
+} __packed;
#endif /* __UBIFS_MEDIA_H__ */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 8c40ad3c6721..93d1412a06f0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -389,9 +389,9 @@ struct ubifs_gced_idx_leb {
* The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
* @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
* make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock
- * with 'ubifs_writepage()' (see file.c). All the other inode fields are
- * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
+ * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would
+ * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields
+ * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one
* could consider to rework locking and base it on "shadow" fields.
*/
struct ubifs_inode {
@@ -937,6 +937,40 @@ struct ubifs_mount_opts {
unsigned int compr_type:2;
};
+/**
+ * struct ubifs_budg_info - UBIFS budgeting information.
+ * @idx_growth: amount of bytes budgeted for index growth
+ * @data_growth: amount of bytes budgeted for cached data
+ * @dd_growth: amount of bytes budgeted for cached data that will make
+ * other data dirty
+ * @uncommitted_idx: amount of bytes that were budgeted for growth of the
+ * index, but which still have to be taken into account because the
+ * index has not been committed so far
+ * @old_idx_sz: size of index on flash
+ * @min_idx_lebs: minimum number of LEBs required for the index
+ * @nospace: non-zero if the file-system does not have flash space (used as
+ * optimization)
+ * @nospace_rp: the same as @nospace, but additionally means that even reserved
+ * pool is full
+ * @page_budget: budget for a page (constant, never changed after mount)
+ * @inode_budget: budget for an inode (constant, never changed after mount)
+ * @dent_budget: budget for a directory entry (constant, never changed after
+ * mount)
+ */
+struct ubifs_budg_info {
+ long long idx_growth;
+ long long data_growth;
+ long long dd_growth;
+ long long uncommitted_idx;
+ unsigned long long old_idx_sz;
+ int min_idx_lebs;
+ unsigned int nospace:1;
+ unsigned int nospace_rp:1;
+ int page_budget;
+ int inode_budget;
+ int dent_budget;
+};
+
struct ubifs_debug_info;
/**
@@ -980,6 +1014,7 @@ struct ubifs_debug_info;
* @cmt_wq: wait queue to sleep on if the log is full and a commit is running
*
* @big_lpt: flag that LPT is too big to write whole during commit
+ * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up
* @no_chk_data_crc: do not check CRCs when reading data nodes (except during
* recovery)
* @bulk_read: enable bulk-reads
@@ -1057,32 +1092,14 @@ struct ubifs_debug_info;
* @dirty_zn_cnt: number of dirty znodes
* @clean_zn_cnt: number of clean znodes
*
- * @budg_idx_growth: amount of bytes budgeted for index growth
- * @budg_data_growth: amount of bytes budgeted for cached data
- * @budg_dd_growth: amount of bytes budgeted for cached data that will make
- * other data dirty
- * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index,
- * but which still have to be taken into account because
- * the index has not been committed so far
- * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
- * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst,
- * @nospace, and @nospace_rp;
- * @min_idx_lebs: minimum number of LEBs required for the index
- * @old_idx_sz: size of index on flash
+ * @space_lock: protects @bi and @lst
+ * @lst: lprops statistics
+ * @bi: budgeting information
* @calc_idx_sz: temporary variable which is used to calculate new index size
* (contains accurate new index size at end of TNC commit start)
- * @lst: lprops statistics
- * @nospace: non-zero if the file-system does not have flash space (used as
- * optimization)
- * @nospace_rp: the same as @nospace, but additionally means that even reserved
- * pool is full
- *
- * @page_budget: budget for a page
- * @inode_budget: budget for an inode
- * @dent_budget: budget for a directory entry
*
* @ref_node_alsz: size of the LEB reference node aligned to the min. flash
- * I/O unit
+ * I/O unit
* @mst_node_alsz: master node aligned size
* @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
* @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
@@ -1189,7 +1206,6 @@ struct ubifs_debug_info;
* @replaying: %1 during journal replay
* @mounting: %1 while mounting
* @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
- * @replay_tree: temporary tree used during journal replay
* @replay_list: temporary list used during journal replay
* @replay_buds: list of buds to replay
* @cs_sqnum: sequence number of first node in the log (commit start node)
@@ -1238,6 +1254,7 @@ struct ubifs_info {
wait_queue_head_t cmt_wq;
unsigned int big_lpt:1;
+ unsigned int space_fixup:1;
unsigned int no_chk_data_crc:1;
unsigned int bulk_read:1;
unsigned int default_compr:2;
@@ -1308,21 +1325,10 @@ struct ubifs_info {
atomic_long_t dirty_zn_cnt;
atomic_long_t clean_zn_cnt;
- long long budg_idx_growth;
- long long budg_data_growth;
- long long budg_dd_growth;
- long long budg_uncommitted_idx;
spinlock_t space_lock;
- int min_idx_lebs;
- unsigned long long old_idx_sz;
- unsigned long long calc_idx_sz;
struct ubifs_lp_stats lst;
- unsigned int nospace:1;
- unsigned int nospace_rp:1;
-
- int page_budget;
- int inode_budget;
- int dent_budget;
+ struct ubifs_budg_info bi;
+ unsigned long long calc_idx_sz;
int ref_node_alsz;
int mst_node_alsz;
@@ -1430,7 +1436,6 @@ struct ubifs_info {
unsigned int replaying:1;
unsigned int mounting:1;
unsigned int remounting_rw:1;
- struct rb_root replay_tree;
struct list_head replay_list;
struct list_head replay_buds;
unsigned long long cs_sqnum;
@@ -1628,6 +1633,7 @@ int ubifs_write_master(struct ubifs_info *c);
int ubifs_read_superblock(struct ubifs_info *c);
struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
+int ubifs_fixup_free_space(struct ubifs_info *c);
/* replay.c */
int ubifs_validate_entry(struct ubifs_info *c,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 3299f469e712..16f19f55e63f 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -80,8 +80,8 @@ enum {
SECURITY_XATTR,
};
-static const struct inode_operations none_inode_operations;
-static const struct file_operations none_file_operations;
+static const struct inode_operations empty_iops;
+static const struct file_operations empty_fops;
/**
* create_xattr - create an extended attribute.
@@ -131,8 +131,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
/* Re-define all operations to be "nothing" */
inode->i_mapping->a_ops = &empty_aops;
- inode->i_op = &none_inode_operations;
- inode->i_fop = &none_file_operations;
+ inode->i_op = &empty_iops;
+ inode->i_fop = &empty_fops;
inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
ui = ubifs_inode(inode);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index f1dce848ef96..4d76594c2a8f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,6 +783,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
struct fileIdentDesc *fi, cfi;
struct kernel_lb_addr tloc;
+ dentry_unhash(dentry);
+
retval = -ENOENT;
fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
if (!fi)
@@ -1081,6 +1083,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
struct kernel_lb_addr tloc;
struct udf_inode_info *old_iinfo = UDF_I(old_inode);
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
if (ofi) {
if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index e765743cf9f3..b4d791a83207 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -409,7 +409,7 @@ out:
}
/**
- * ufs_getfrag_bloc() - `get_block_t' function, interface between UFS and
+ * ufs_getfrag_block() - `get_block_t' function, interface between UFS and
* readpage, writepage and so on
*/
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 29309e25417f..953ebdfc5bf7 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,6 +258,8 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
struct inode * inode = dentry->d_inode;
int err= -ENOTEMPTY;
+ dentry_unhash(dentry);
+
lock_ufs(dir->i_sb);
if (ufs_empty_dir (inode)) {
err = ufs_unlink(dir, dentry);
@@ -282,6 +284,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ufs_dir_entry *old_de;
int err = -ENOENT;
+ if (new_inode && S_ISDIR(new_inode->i_mode))
+ dentry_unhash(new_dentry);
+
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
if (!old_de)
goto out;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9ef9ed2cfe2e..5e68099db2a5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,7 +33,6 @@
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>
-#include <linux/list_sort.h>
#include "xfs_sb.h"
#include "xfs_inum.h"
@@ -709,6 +708,27 @@ xfs_buf_get_empty(
return bp;
}
+/*
+ * Return a buffer allocated as an empty buffer and associated to external
+ * memory via xfs_buf_associate_memory() back to its empty state.
+ */
+void
+xfs_buf_set_empty(
+ struct xfs_buf *bp,
+ size_t len)
+{
+ if (bp->b_pages)
+ _xfs_buf_free_pages(bp);
+
+ bp->b_pages = NULL;
+ bp->b_page_count = 0;
+ bp->b_addr = NULL;
+ bp->b_file_offset = 0;
+ bp->b_buffer_length = bp->b_count_desired = len;
+ bp->b_bn = XFS_BUF_DADDR_NULL;
+ bp->b_flags &= ~XBF_MAPPED;
+}
+
static inline struct page *
mem_to_page(
void *addr)
@@ -1402,12 +1422,12 @@ restart:
int
xfs_buftarg_shrink(
struct shrinker *shrink,
- int nr_to_scan,
- gfp_t mask)
+ struct shrink_control *sc)
{
struct xfs_buftarg *btp = container_of(shrink,
struct xfs_buftarg, bt_shrinker);
struct xfs_buf *bp;
+ int nr_to_scan = sc->nr_to_scan;
LIST_HEAD(dispose);
if (!nr_to_scan)
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a9a1c4512645..50a7d5fb3b73 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -178,6 +178,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
xfs_buf_flags_t);
extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
+extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
extern void xfs_buf_hold(xfs_buf_t *);
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index d61611c88012..244e797dae32 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -191,3 +191,32 @@ xfs_ioc_trim(
return -XFS_ERROR(EFAULT);
return 0;
}
+
+int
+xfs_discard_extents(
+ struct xfs_mount *mp,
+ struct list_head *list)
+{
+ struct xfs_busy_extent *busyp;
+ int error = 0;
+
+ list_for_each_entry(busyp, list, list) {
+ trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+ busyp->length);
+
+ error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+ XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+ XFS_FSB_TO_BB(mp, busyp->length),
+ GFP_NOFS, 0);
+ if (error && error != EOPNOTSUPP) {
+ xfs_info(mp,
+ "discard failed for extent [0x%llu,%u], error %d",
+ (unsigned long long)busyp->bno,
+ busyp->length,
+ error);
+ return error;
+ }
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
index e82b6dd3e127..344879aea646 100644
--- a/fs/xfs/linux-2.6/xfs_discard.h
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -2,7 +2,9 @@
#define XFS_DISCARD_H 1
struct fstrim_range;
+struct list_head;
extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+extern int xfs_discard_extents(struct xfs_mount *, struct list_head *);
#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index b3486dfa5520..54e623bfbb85 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -586,7 +586,8 @@ xfs_file_compat_ioctl(
case XFS_IOC_RESVSP_32:
case XFS_IOC_UNRESVSP_32:
case XFS_IOC_RESVSP64_32:
- case XFS_IOC_UNRESVSP64_32: {
+ case XFS_IOC_UNRESVSP64_32:
+ case XFS_IOC_ZERO_RANGE_32: {
struct xfs_flock64 bf;
if (xfs_compat_flock64_copyin(&bf, arg))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 08b605792a99..80f4060e8970 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -184,6 +184,7 @@ typedef struct compat_xfs_flock64 {
#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
+#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64)
typedef struct compat_xfs_fsop_geom_v1 {
__u32 blocksize; /* filesystem (data) block size */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 244be9cbfe78..8633521b3b2e 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -70,6 +70,7 @@
#include <linux/ctype.h>
#include <linux/writeback.h>
#include <linux/capability.h>
+#include <linux/list_sort.h>
#include <asm/page.h>
#include <asm/div64.h>
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 9f76cceb678d..bd672def95ac 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -41,23 +41,6 @@ __xfs_printk(
printk("%sXFS: %pV\n", level, vaf);
}
-void xfs_printk(
- const char *level,
- const struct xfs_mount *mp,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- __xfs_printk(level, mp, &vaf);
- va_end(args);
-}
-
#define define_xfs_printk_level(func, kern_level) \
void func(const struct xfs_mount *mp, const char *fmt, ...) \
{ \
@@ -95,8 +78,7 @@ xfs_alert_tag(
int do_panic = 0;
if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
- xfs_printk(KERN_ALERT, mp,
- "XFS: Transforming an alert into a BUG.");
+ xfs_alert(mp, "Transforming an alert into a BUG.");
do_panic = 1;
}
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
index f1b3fc1b6c4e..7fb7ea007672 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -3,9 +3,6 @@
struct xfs_mount;
-extern void xfs_printk(const char *level, const struct xfs_mount *mp,
- const char *fmt, ...)
- __attribute__ ((format (printf, 3, 4)));
extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
@@ -28,7 +25,9 @@ extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
#else
-static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+static inline void
+__attribute__ ((format (printf, 2, 3)))
+xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
{
}
#endif
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b38e58d02299..98b9c91fcdf1 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -110,8 +110,10 @@ mempool_t *xfs_ioend_pool;
#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
-#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */
-#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */
+#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
+#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
+#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
+#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
/*
* Table driven mount option parser.
@@ -355,6 +357,10 @@ xfs_parseargs(
mp->m_flags |= XFS_MOUNT_DELAYLOG;
} else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
+ } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
+ mp->m_flags |= XFS_MOUNT_DISCARD;
+ } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
+ mp->m_flags &= ~XFS_MOUNT_DISCARD;
} else if (!strcmp(this_char, "ihashsize")) {
xfs_warn(mp,
"ihashsize no longer used, option is deprecated.");
@@ -388,6 +394,13 @@ xfs_parseargs(
return EINVAL;
}
+ if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
+ !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
+ xfs_warn(mp,
+ "the discard option is incompatible with the nodelaylog option");
+ return EINVAL;
+ }
+
#ifndef CONFIG_XFS_QUOTA
if (XFS_IS_QUOTA_RUNNING(mp)) {
xfs_warn(mp, "quota support not available in this kernel.");
@@ -488,6 +501,7 @@ xfs_showargs(
{ XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
{ XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
{ XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
+ { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD },
{ 0, NULL }
};
static struct proc_xfs_info xfs_info_unset[] = {
@@ -1787,10 +1801,6 @@ init_xfs_fs(void)
if (error)
goto out_cleanup_procfs;
- error = xfs_init_workqueues();
- if (error)
- goto out_sysctl_unregister;
-
vfs_initquota();
error = register_filesystem(&xfs_fs_type);
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3e898a48122d..8ecad5ff9f9b 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -267,6 +267,16 @@ xfs_sync_inode_attr(
error = xfs_iflush(ip, flags);
+ /*
+ * We don't want to try again on non-blocking flushes that can't run
+ * again immediately. If an inode really must be written, then that's
+ * what the SYNC_WAIT flag is for.
+ */
+ if (error == EAGAIN) {
+ ASSERT(!(flags & SYNC_WAIT));
+ error = 0;
+ }
+
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return error;
@@ -1022,13 +1032,14 @@ xfs_reclaim_inodes(
static int
xfs_reclaim_inode_shrink(
struct shrinker *shrink,
- int nr_to_scan,
- gfp_t gfp_mask)
+ struct shrink_control *sc)
{
struct xfs_mount *mp;
struct xfs_perag *pag;
xfs_agnumber_t ag;
int reclaimable;
+ int nr_to_scan = sc->nr_to_scan;
+ gfp_t gfp_mask = sc->gfp_mask;
mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
if (nr_to_scan) {
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 2d0bcb479075..d48b7a579ae1 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1151,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap,
);
-#define XFS_BUSY_SYNC \
- { 0, "async" }, \
- { 1, "sync" }
-
-TRACE_EVENT(xfs_alloc_busy,
- TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len, int sync),
- TP_ARGS(trans, agno, agbno, len, sync),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(struct xfs_trans *, tp)
- __field(int, tid)
- __field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
- __field(xfs_extlen_t, len)
- __field(int, sync)
- ),
- TP_fast_assign(
- __entry->dev = trans->t_mountp->m_super->s_dev;
- __entry->tp = trans;
- __entry->tid = trans->t_ticket->t_tid;
- __entry->agno = agno;
- __entry->agbno = agbno;
- __entry->len = len;
- __entry->sync = sync;
- ),
- TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->tp,
- __entry->tid,
- __entry->agno,
- __entry->agbno,
- __entry->len,
- __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
-
-);
-
-TRACE_EVENT(xfs_alloc_unbusy,
+DECLARE_EVENT_CLASS(xfs_busy_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len),
TP_ARGS(mp, agno, agbno, len),
@@ -1210,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy,
__entry->agbno,
__entry->len)
);
+#define DEFINE_BUSY_EVENT(name) \
+DEFINE_EVENT(xfs_busy_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(mp, agno, agbno, len))
+DEFINE_BUSY_EVENT(xfs_alloc_busy);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
-#define XFS_BUSY_STATES \
- { 0, "missing" }, \
- { 1, "found" }
-
-TRACE_EVENT(xfs_alloc_busysearch,
+TRACE_EVENT(xfs_alloc_busy_trim,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len, int found),
- TP_ARGS(mp, agno, agbno, len, found),
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ xfs_agblock_t tbno, xfs_extlen_t tlen),
+ TP_ARGS(mp, agno, agbno, len, tbno, tlen),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, agbno)
__field(xfs_extlen_t, len)
- __field(int, found)
+ __field(xfs_agblock_t, tbno)
+ __field(xfs_extlen_t, tlen)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
__entry->agbno = agbno;
__entry->len = len;
- __entry->found = found;
+ __entry->tbno = tbno;
+ __entry->tlen = tlen;
),
- TP_printk("dev %d:%d agno %u agbno %u len %u %s",
+ TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
__entry->agbno,
__entry->len,
- __print_symbolic(__entry->found, XFS_BUSY_STATES))
+ __entry->tbno,
+ __entry->tlen)
);
TRACE_EVENT(xfs_trans_commit_lsn,
@@ -1418,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class,
__entry->wasfromfl,
__entry->isfl,
__entry->userdata,
- __entry->firstblock)
+ (unsigned long long)__entry->firstblock)
)
#define DEFINE_ALLOC_EVENT(name) \
@@ -1433,11 +1406,14 @@ DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 69228aa8605a..b94dace4e785 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -60,7 +60,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *);
STATIC int xfs_qm_init_quotainos(xfs_mount_t *);
STATIC int xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t);
+STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *);
static struct shrinker xfs_qm_shaker = {
.shrink = xfs_qm_shake,
@@ -2009,10 +2009,10 @@ xfs_qm_shake_freelist(
STATIC int
xfs_qm_shake(
struct shrinker *shrink,
- int nr_to_scan,
- gfp_t gfp_mask)
+ struct shrink_control *sc)
{
int ndqused, nfree, n;
+ gfp_t gfp_mask = sc->gfp_mask;
if (!kmem_shake_allow(gfp_mask))
return 0;
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 58632cc17f2d..6530769a999b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -187,7 +187,9 @@ struct xfs_busy_extent {
xfs_agnumber_t agno;
xfs_agblock_t bno;
xfs_extlen_t length;
- xlog_tid_t tid; /* transaction that created this */
+ unsigned int flags;
+#define XFS_ALLOC_BUSY_DISCARDED 0x01 /* undergoing a discard op. */
+#define XFS_ALLOC_BUSY_SKIP_DISCARD 0x02 /* do not discard */
};
/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 27d64d752eab..95862bbff56b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,19 +41,13 @@
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-/*
- * Prototypes for per-ag allocation routines
- */
-
STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
- xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
-
-/*
- * Internal functions.
- */
+ xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
+ xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
/*
* Lookup the record equal to [bno, len] in the btree given by cur.
@@ -154,19 +148,21 @@ xfs_alloc_compute_aligned(
xfs_extlen_t *reslen) /* result length */
{
xfs_agblock_t bno;
- xfs_extlen_t diff;
xfs_extlen_t len;
- if (args->alignment > 1 && foundlen >= args->minlen) {
- bno = roundup(foundbno, args->alignment);
- diff = bno - foundbno;
- len = diff >= foundlen ? 0 : foundlen - diff;
+ /* Trim busy sections out of found extent */
+ xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+ if (args->alignment > 1 && len >= args->minlen) {
+ xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
+ xfs_extlen_t diff = aligned_bno - bno;
+
+ *resbno = aligned_bno;
+ *reslen = diff >= len ? 0 : len - diff;
} else {
- bno = foundbno;
- len = foundlen;
+ *resbno = bno;
+ *reslen = len;
}
- *resbno = bno;
- *reslen = len;
}
/*
@@ -280,7 +276,6 @@ xfs_alloc_fix_minleft(
return 1;
agf = XFS_BUF_TO_AGF(args->agbp);
diff = be32_to_cpu(agf->agf_freeblks)
- + be32_to_cpu(agf->agf_flcount)
- args->len - args->minleft;
if (diff >= 0)
return 1;
@@ -541,16 +536,8 @@ xfs_alloc_ag_vextent(
if (error)
return error;
- /*
- * Search the busylist for these blocks and mark the
- * transaction as synchronous if blocks are found. This
- * avoids the need to block due to a synchronous log
- * force to ensure correct ordering as the synchronous
- * transaction will guarantee that for us.
- */
- if (xfs_alloc_busy_search(args->mp, args->agno,
- args->agbno, args->len))
- xfs_trans_set_sync(args->tp);
+ ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
+ args->agbno, args->len));
}
if (!args->isfl) {
@@ -577,14 +564,14 @@ xfs_alloc_ag_vextent_exact(
{
xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
- xfs_agblock_t end; /* end of allocated extent */
int error;
xfs_agblock_t fbno; /* start block of found extent */
- xfs_agblock_t fend; /* end block of found extent */
xfs_extlen_t flen; /* length of found extent */
+ xfs_agblock_t tbno; /* start block of trimmed extent */
+ xfs_extlen_t tlen; /* length of trimmed extent */
+ xfs_agblock_t tend; /* end block of trimmed extent */
+ xfs_agblock_t end; /* end of allocated extent */
int i; /* success/failure of operation */
- xfs_agblock_t maxend; /* end of maximal extent */
- xfs_agblock_t minend; /* end of minimal extent */
xfs_extlen_t rlen; /* length of returned extent */
ASSERT(args->alignment == 1);
@@ -614,14 +601,22 @@ xfs_alloc_ag_vextent_exact(
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
ASSERT(fbno <= args->agbno);
- minend = args->agbno + args->minlen;
- maxend = args->agbno + args->maxlen;
- fend = fbno + flen;
/*
- * Give up if the freespace isn't long enough for the minimum request.
+ * Check for overlapping busy extents.
*/
- if (fend < minend)
+ xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
+
+ /*
+ * Give up if the start of the extent is busy, or the freespace isn't
+ * long enough for the minimum request.
+ */
+ if (tbno > args->agbno)
+ goto not_found;
+ if (tlen < args->minlen)
+ goto not_found;
+ tend = tbno + tlen;
+ if (tend < args->agbno + args->minlen)
goto not_found;
/*
@@ -630,14 +625,14 @@ xfs_alloc_ag_vextent_exact(
*
* Fix the length according to mod and prod if given.
*/
- end = XFS_AGBLOCK_MIN(fend, maxend);
+ end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen);
args->len = end - args->agbno;
xfs_alloc_fix_len(args);
if (!xfs_alloc_fix_minleft(args))
goto not_found;
rlen = args->len;
- ASSERT(args->agbno + rlen <= fend);
+ ASSERT(args->agbno + rlen <= tend);
end = args->agbno + rlen;
/*
@@ -686,11 +681,11 @@ xfs_alloc_find_best_extent(
struct xfs_btree_cur **scur, /* searching cursor */
xfs_agblock_t gdiff, /* difference for search comparison */
xfs_agblock_t *sbno, /* extent found by search */
- xfs_extlen_t *slen,
- xfs_extlen_t *slena, /* aligned length */
+ xfs_extlen_t *slen, /* extent length */
+ xfs_agblock_t *sbnoa, /* aligned extent found by search */
+ xfs_extlen_t *slena, /* aligned extent length */
int dir) /* 0 = search right, 1 = search left */
{
- xfs_agblock_t bno;
xfs_agblock_t new;
xfs_agblock_t sdiff;
int error;
@@ -708,16 +703,16 @@ xfs_alloc_find_best_extent(
if (error)
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
+ xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
/*
* The good extent is closer than this one.
*/
if (!dir) {
- if (bno >= args->agbno + gdiff)
+ if (*sbnoa >= args->agbno + gdiff)
goto out_use_good;
} else {
- if (bno <= args->agbno - gdiff)
+ if (*sbnoa <= args->agbno - gdiff)
goto out_use_good;
}
@@ -729,8 +724,8 @@ xfs_alloc_find_best_extent(
xfs_alloc_fix_len(args);
sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, *sbno,
- *slen, &new);
+ args->alignment, *sbnoa,
+ *slena, &new);
/*
* Choose closer size and invalidate other cursor.
@@ -780,7 +775,7 @@ xfs_alloc_ag_vextent_near(
xfs_agblock_t gtbnoa; /* aligned ... */
xfs_extlen_t gtdiff; /* difference to right side entry */
xfs_extlen_t gtlen; /* length of right side entry */
- xfs_extlen_t gtlena = 0; /* aligned ... */
+ xfs_extlen_t gtlena; /* aligned ... */
xfs_agblock_t gtnew; /* useful start bno of right side */
int error; /* error code */
int i; /* result code, temporary */
@@ -789,9 +784,10 @@ xfs_alloc_ag_vextent_near(
xfs_agblock_t ltbnoa; /* aligned ... */
xfs_extlen_t ltdiff; /* difference to left side entry */
xfs_extlen_t ltlen; /* length of left side entry */
- xfs_extlen_t ltlena = 0; /* aligned ... */
+ xfs_extlen_t ltlena; /* aligned ... */
xfs_agblock_t ltnew; /* useful start bno of left side */
xfs_extlen_t rlen; /* length of returned extent */
+ int forced = 0;
#if defined(DEBUG) && defined(__KERNEL__)
/*
* Randomly don't execute the first algorithm.
@@ -800,13 +796,20 @@ xfs_alloc_ag_vextent_near(
dofirst = random32() & 1;
#endif
+
+restart:
+ bno_cur_lt = NULL;
+ bno_cur_gt = NULL;
+ ltlen = 0;
+ gtlena = 0;
+ ltlena = 0;
+
/*
* Get a cursor for the by-size btree.
*/
cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
args->agno, XFS_BTNUM_CNT);
- ltlen = 0;
- bno_cur_lt = bno_cur_gt = NULL;
+
/*
* See if there are any free extents as big as maxlen.
*/
@@ -822,11 +825,13 @@ xfs_alloc_ag_vextent_near(
goto error0;
if (i == 0 || ltlen == 0) {
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_near_noentry(args);
return 0;
}
ASSERT(i == 1);
}
args->wasfromfl = 0;
+
/*
* First algorithm.
* If the requested extent is large wrt the freespaces available
@@ -890,7 +895,7 @@ xfs_alloc_ag_vextent_near(
if (args->len < blen)
continue;
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbno, ltlen, &ltnew);
+ args->alignment, ltbnoa, ltlena, &ltnew);
if (ltnew != NULLAGBLOCK &&
(args->len > blen || ltdiff < bdiff)) {
bdiff = ltdiff;
@@ -1042,11 +1047,12 @@ xfs_alloc_ag_vextent_near(
args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
xfs_alloc_fix_len(args);
ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, ltbno, ltlen, &ltnew);
+ args->alignment, ltbnoa, ltlena, &ltnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_lt, &bno_cur_gt,
- ltdiff, &gtbno, &gtlen, &gtlena,
+ ltdiff, &gtbno, &gtlen,
+ &gtbnoa, &gtlena,
0 /* search right */);
} else {
ASSERT(gtlena >= args->minlen);
@@ -1057,11 +1063,12 @@ xfs_alloc_ag_vextent_near(
args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
xfs_alloc_fix_len(args);
gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
- args->alignment, gtbno, gtlen, &gtnew);
+ args->alignment, gtbnoa, gtlena, &gtnew);
error = xfs_alloc_find_best_extent(args,
&bno_cur_gt, &bno_cur_lt,
- gtdiff, &ltbno, &ltlen, &ltlena,
+ gtdiff, &ltbno, &ltlen,
+ &ltbnoa, &ltlena,
1 /* search left */);
}
@@ -1073,6 +1080,12 @@ xfs_alloc_ag_vextent_near(
* If we couldn't get anything, give up.
*/
if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+ if (!forced++) {
+ trace_xfs_alloc_near_busy(args);
+ xfs_log_force(args->mp, XFS_LOG_SYNC);
+ goto restart;
+ }
+
trace_xfs_alloc_size_neither(args);
args->agbno = NULLAGBLOCK;
return 0;
@@ -1107,12 +1120,13 @@ xfs_alloc_ag_vextent_near(
return 0;
}
rlen = args->len;
- (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
- ltlen, &ltnew);
+ (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+ ltbnoa, ltlena, &ltnew);
ASSERT(ltnew >= ltbno);
- ASSERT(ltnew + rlen <= ltbno + ltlen);
+ ASSERT(ltnew + rlen <= ltbnoa + ltlena);
ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
args->agbno = ltnew;
+
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
ltnew, rlen, XFSA_FIXUP_BNO_OK)))
goto error0;
@@ -1155,26 +1169,35 @@ xfs_alloc_ag_vextent_size(
int i; /* temp status variable */
xfs_agblock_t rbno; /* returned block number */
xfs_extlen_t rlen; /* length of returned extent */
+ int forced = 0;
+restart:
/*
* Allocate and initialize a cursor for the by-size btree.
*/
cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
args->agno, XFS_BTNUM_CNT);
bno_cur = NULL;
+
/*
* Look for an entry >= maxlen+alignment-1 blocks.
*/
if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
args->maxlen + args->alignment - 1, &i)))
goto error0;
+
/*
- * If none, then pick up the last entry in the tree unless the
- * tree is empty.
+ * If none or we have busy extents that we cannot allocate from, then
+ * we have to settle for a smaller extent. In the case that there are
+ * no large extents, this will return the last entry in the tree unless
+ * the tree is empty. In the case that there are only busy large
+ * extents, this will return the largest small extent unless there
+ * are no smaller extents available.
*/
- if (!i) {
- if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
- &flen, &i)))
+ if (!i || forced > 1) {
+ error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+ &fbno, &flen, &i);
+ if (error)
goto error0;
if (i == 0 || flen == 0) {
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -1182,22 +1205,56 @@ xfs_alloc_ag_vextent_size(
return 0;
}
ASSERT(i == 1);
+ xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+ } else {
+ /*
+ * Search for a non-busy extent that is large enough.
+ * If we are at low space, don't check, or if we fall off
+ * the end of the btree, turn off the busy check and
+ * restart.
+ */
+ for (;;) {
+ error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ xfs_alloc_compute_aligned(args, fbno, flen,
+ &rbno, &rlen);
+
+ if (rlen >= args->maxlen)
+ break;
+
+ error = xfs_btree_increment(cnt_cur, 0, &i);
+ if (error)
+ goto error0;
+ if (i == 0) {
+ /*
+ * Our only valid extents must have been busy.
+ * Make it unbusy by forcing the log out and
+ * retrying. If we've been here before, forcing
+ * the log isn't making the extents available,
+ * which means they have probably been freed in
+ * this transaction. In that case, we have to
+ * give up on them and we'll attempt a minlen
+ * allocation the next time around.
+ */
+ xfs_btree_del_cursor(cnt_cur,
+ XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_busy(args);
+ if (!forced++)
+ xfs_log_force(args->mp, XFS_LOG_SYNC);
+ goto restart;
+ }
+ }
}
- /*
- * There's a freespace as big as maxlen+alignment-1, get it.
- */
- else {
- if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- }
+
/*
* In the first case above, we got the last entry in the
* by-size btree. Now we check to see if the space hits maxlen
* once aligned; if not, we search left for something better.
* This can't happen in the second case above.
*/
- xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
(rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1251,13 +1308,19 @@ xfs_alloc_ag_vextent_size(
* Fix up the length.
*/
args->len = rlen;
- xfs_alloc_fix_len(args);
- if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
- xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
- trace_xfs_alloc_size_nominleft(args);
- args->agbno = NULLAGBLOCK;
- return 0;
+ if (rlen < args->minlen) {
+ if (!forced++) {
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_busy(args);
+ xfs_log_force(args->mp, XFS_LOG_SYNC);
+ goto restart;
+ }
+ goto out_nominleft;
}
+ xfs_alloc_fix_len(args);
+
+ if (!xfs_alloc_fix_minleft(args))
+ goto out_nominleft;
rlen = args->len;
XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
/*
@@ -1287,6 +1350,12 @@ error0:
if (bno_cur)
xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
return error;
+
+out_nominleft:
+ xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+ trace_xfs_alloc_size_nominleft(args);
+ args->agbno = NULLAGBLOCK;
+ return 0;
}
/*
@@ -1326,6 +1395,9 @@ xfs_alloc_ag_vextent_small(
if (error)
goto error0;
if (fbno != NULLAGBLOCK) {
+ xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
+ args->userdata);
+
if (args->userdata) {
xfs_buf_t *bp;
@@ -1617,18 +1689,6 @@ xfs_free_ag_extent(
trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
- /*
- * Since blocks move to the free list without the coordination
- * used in xfs_bmap_finish, we can't allow block to be available
- * for reallocation and non-transaction writing (user data)
- * until we know that the transaction that moved it to the free
- * list is permanently on disk. We track the blocks by declaring
- * these blocks as "busy"; the busy list is maintained on a per-ag
- * basis and each transaction records which entries should be removed
- * when the iclog commits to disk. If a busy block is allocated,
- * the iclog is pushed up to the LSN that freed the block.
- */
- xfs_alloc_busy_insert(tp, agno, bno, len);
return 0;
error0:
@@ -1923,21 +1983,6 @@ xfs_alloc_get_freelist(
xfs_alloc_log_agf(tp, agbp, logflags);
*bnop = bno;
- /*
- * As blocks are freed, they are added to the per-ag busy list and
- * remain there until the freeing transaction is committed to disk.
- * Now that we have allocated blocks, this list must be searched to see
- * if a block is being reused. If one is, then the freeing transaction
- * must be pushed to disk before this transaction.
- *
- * We do this by setting the current transaction to a sync transaction
- * which guarantees that the freeing transaction is on disk before this
- * transaction. This is done instead of a synchronous log force here so
- * that we don't sit and wait with the AGF locked in the transaction
- * during the log force.
- */
- if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
- xfs_trans_set_sync(tp);
return 0;
}
@@ -2423,119 +2468,26 @@ xfs_free_extent(
}
error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+ if (!error)
+ xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
error0:
xfs_perag_put(args.pag);
return error;
}
-
-/*
- * AG Busy list management
- * The busy list contains block ranges that have been freed but whose
- * transactions have not yet hit disk. If any block listed in a busy
- * list is reused, the transaction that freed it must be forced to disk
- * before continuing to use the block.
- *
- * xfs_alloc_busy_insert - add to the per-ag busy list
- * xfs_alloc_busy_clear - remove an item from the per-ag busy list
- * xfs_alloc_busy_search - search for a busy extent
- */
-
-/*
- * Insert a new extent into the busy tree.
- *
- * The busy extent tree is indexed by the start block of the busy extent.
- * there can be multiple overlapping ranges in the busy extent tree but only
- * ever one entry at a given start block. The reason for this is that
- * multi-block extents can be freed, then smaller chunks of that extent
- * allocated and freed again before the first transaction commit is on disk.
- * If the exact same start block is freed a second time, we have to wait for
- * that busy extent to pass out of the tree before the new extent is inserted.
- * There are two main cases we have to handle here.
- *
- * The first case is a transaction that triggers a "free - allocate - free"
- * cycle. This can occur during btree manipulations as a btree block is freed
- * to the freelist, then allocated from the free list, then freed again. In
- * this case, the second extxpnet free is what triggers the duplicate and as
- * such the transaction IDs should match. Because the extent was allocated in
- * this transaction, the transaction must be marked as synchronous. This is
- * true for all cases where the free/alloc/free occurs in the one transaction,
- * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
- * This serves to catch violations of the second case quite effectively.
- *
- * The second case is where the free/alloc/free occur in different
- * transactions. In this case, the thread freeing the extent the second time
- * can't mark the extent busy immediately because it is already tracked in a
- * transaction that may be committing. When the log commit for the existing
- * busy extent completes, the busy extent will be removed from the tree. If we
- * allow the second busy insert to continue using that busy extent structure,
- * it can be freed before this transaction is safely in the log. Hence our
- * only option in this case is to force the log to remove the existing busy
- * extent from the list before we insert the new one with the current
- * transaction ID.
- *
- * The problem we are trying to avoid in the free-alloc-free in separate
- * transactions is most easily described with a timeline:
- *
- * Thread 1 Thread 2 Thread 3 xfslogd
- * xact alloc
- * free X
- * mark busy
- * commit xact
- * free xact
- * xact alloc
- * alloc X
- * busy search
- * mark xact sync
- * commit xact
- * free xact
- * force log
- * checkpoint starts
- * ....
- * xact alloc
- * free X
- * mark busy
- * finds match
- * *** KABOOM! ***
- * ....
- * log IO completes
- * unbusy X
- * checkpoint completes
- *
- * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
- * the checkpoint completes, and the busy extent it matched will have been
- * removed from the tree when it is woken. Hence it can then continue safely.
- *
- * However, to ensure this matching process is robust, we need to use the
- * transaction ID for identifying transaction, as delayed logging results in
- * the busy extent and transaction lifecycles being different. i.e. the busy
- * extent is active for a lot longer than the transaction. Hence the
- * transaction structure can be freed and reallocated, then mark the same
- * extent busy again in the new transaction. In this case the new transaction
- * will have a different tid but can have the same address, and hence we need
- * to check against the tid.
- *
- * Future: for delayed logging, we could avoid the log force if the extent was
- * first freed in the current checkpoint sequence. This, however, requires the
- * ability to pin the current checkpoint in memory until this transaction
- * commits to ensure that both the original free and the current one combine
- * logically into the one checkpoint. If the checkpoint sequences are
- * different, however, we still need to wait on a log force.
- */
void
xfs_alloc_busy_insert(
struct xfs_trans *tp,
xfs_agnumber_t agno,
xfs_agblock_t bno,
- xfs_extlen_t len)
+ xfs_extlen_t len,
+ unsigned int flags)
{
struct xfs_busy_extent *new;
struct xfs_busy_extent *busyp;
struct xfs_perag *pag;
struct rb_node **rbp;
- struct rb_node *parent;
- int match;
-
+ struct rb_node *parent = NULL;
new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
if (!new) {
@@ -2544,7 +2496,7 @@ xfs_alloc_busy_insert(
* block, make this a synchronous transaction to insure that
* the block is not reused before this transaction commits.
*/
- trace_xfs_alloc_busy(tp, agno, bno, len, 1);
+ trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
xfs_trans_set_sync(tp);
return;
}
@@ -2552,66 +2504,29 @@ xfs_alloc_busy_insert(
new->agno = agno;
new->bno = bno;
new->length = len;
- new->tid = xfs_log_get_trans_ident(tp);
-
INIT_LIST_HEAD(&new->list);
+ new->flags = flags;
/* trace before insert to be able to see failed inserts */
- trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+ trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
pag = xfs_perag_get(tp->t_mountp, new->agno);
-restart:
spin_lock(&pag->pagb_lock);
rbp = &pag->pagb_tree.rb_node;
- parent = NULL;
- busyp = NULL;
- match = 0;
- while (*rbp && match >= 0) {
+ while (*rbp) {
parent = *rbp;
busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
if (new->bno < busyp->bno) {
- /* may overlap, but exact start block is lower */
rbp = &(*rbp)->rb_left;
- if (new->bno + new->length > busyp->bno)
- match = busyp->tid == new->tid ? 1 : -1;
+ ASSERT(new->bno + new->length <= busyp->bno);
} else if (new->bno > busyp->bno) {
- /* may overlap, but exact start block is higher */
rbp = &(*rbp)->rb_right;
- if (bno < busyp->bno + busyp->length)
- match = busyp->tid == new->tid ? 1 : -1;
+ ASSERT(bno >= busyp->bno + busyp->length);
} else {
- match = busyp->tid == new->tid ? 1 : -1;
- break;
+ ASSERT(0);
}
}
- if (match < 0) {
- /* overlap marked busy in different transaction */
- spin_unlock(&pag->pagb_lock);
- xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
- goto restart;
- }
- if (match > 0) {
- /*
- * overlap marked busy in same transaction. Update if exact
- * start block match, otherwise combine the busy extents into
- * a single range.
- */
- if (busyp->bno == new->bno) {
- busyp->length = max(busyp->length, new->length);
- spin_unlock(&pag->pagb_lock);
- ASSERT(tp->t_flags & XFS_TRANS_SYNC);
- xfs_perag_put(pag);
- kmem_free(new);
- return;
- }
- rb_erase(&busyp->rb_node, &pag->pagb_tree);
- new->length = max(busyp->bno + busyp->length,
- new->bno + new->length) -
- min(busyp->bno, new->bno);
- new->bno = min(busyp->bno, new->bno);
- } else
- busyp = NULL;
rb_link_node(&new->rb_node, parent, rbp);
rb_insert_color(&new->rb_node, &pag->pagb_tree);
@@ -2619,7 +2534,6 @@ restart:
list_add(&new->list, &tp->t_busy);
spin_unlock(&pag->pagb_lock);
xfs_perag_put(pag);
- kmem_free(busyp);
}
/*
@@ -2668,31 +2582,466 @@ xfs_alloc_busy_search(
}
}
spin_unlock(&pag->pagb_lock);
- trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
xfs_perag_put(pag);
return match;
}
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent. If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation. We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entry's block
+ * number or length.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_alloc_busy_update_extent(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ struct xfs_busy_extent *busyp,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata)
+{
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ /*
+ * This extent is currently being discarded. Give the thread
+ * performing the discard a chance to mark the extent unbusy
+ * and retry.
+ */
+ if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
+ spin_unlock(&pag->pagb_lock);
+ delay(1);
+ spin_lock(&pag->pagb_lock);
+ return false;
+ }
+
+ /*
+ * If there is a busy extent overlapping a user allocation, we have
+ * no choice but to force the log and retry the search.
+ *
+ * Fortunately this does not happen during normal operation, but
+ * only if the filesystem is very low on space and has to dip into
+ * the AGFL for normal allocations.
+ */
+ if (userdata)
+ goto out_force_log;
+
+ if (bbno < fbno && bend > fend) {
+ /*
+ * Case 1:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +---------+
+ *        fbno   fend
+ */
+
+ /*
+ * We would have to split the busy extent to be able to track
+ * it correctly, which we cannot do because we would have to
+ * modify the list of busy extents attached to the transaction
+ * or CIL context, which is immutable.
+ *
+ * Force out the log to clear the busy extent and retry the
+ * search.
+ */
+ goto out_force_log;
+ } else if (bbno >= fbno && bend <= fend) {
+ /*
+ * Case 2:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *    +-----------------+
+ *    fbno           fend
+ *
+ * Case 3:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *    +--------------------------+
+ *    fbno                    fend
+ *
+ * Case 4:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +--------------------------+
+ *    fbno                    fend
+ *
+ * Case 5:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +-----------------------------------+
+ *    fbno                             fend
+ *
+ */
+
+ /*
+ * The busy extent is fully covered by the extent we are
+ * allocating, and can simply be removed from the rbtree.
+ * However we cannot remove it from the immutable list
+ * tracking busy extents in the transaction or CIL context,
+ * so set the length to zero to mark it invalid.
+ *
+ * We also need to restart the busy extent search from the
+ * tree root, because erasing the node can rearrange the
+ * tree topology.
+ */
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ busyp->length = 0;
+ return false;
+ } else if (fend < bend) {
+ /*
+ * Case 6:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +---------+
+ *    fbno   fend
+ *
+ * Case 7:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +------------------+
+ *    fbno            fend
+ *
+ */
+ busyp->bno = fend;
+ } else if (bbno < fbno) {
+ /*
+ * Case 8:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +-------------+
+ *        fbno       fend
+ *
+ * Case 9:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +----------------------+
+ *        fbno                fend
+ */
+ busyp->length = fbno - busyp->bno;
+ } else {
+ ASSERT(0);
+ }
+
+ trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
+ return true;
+
+out_force_log:
+ spin_unlock(&pag->pagb_lock);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
+ spin_lock(&pag->pagb_lock);
+ return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ *
+ * Walks the busy extent rbtree of AG agno with pagb_lock held and hands
+ * every busy extent overlapping [fbno, fbno + flen) to
+ * xfs_alloc_busy_update_extent(), passing userdata through unchanged.
+ * That helper returns false after it has erased a node from the tree or
+ * dropped the lock to force the log, so in that case the walk is restarted
+ * from the tree root.  When it returns true the same node is examined
+ * again using its updated bno and length.
+ */
void
-xfs_alloc_busy_clear(
+xfs_alloc_busy_reuse(
struct xfs_mount *mp,
- struct xfs_busy_extent *busyp)
+ xfs_agnumber_t agno,
+ xfs_agblock_t fbno,
+ xfs_extlen_t flen,
+ bool userdata)
{
struct xfs_perag *pag;
+ struct rb_node *rbp;
- trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
- busyp->length);
+ ASSERT(flen > 0);
- ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
- busyp->length) == 1);
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+restart:
+ rbp = pag->pagb_tree.rb_node;
+ while (rbp) {
+ struct xfs_busy_extent *busyp =
+ rb_entry(rbp, struct xfs_busy_extent, rb_node);
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
- list_del_init(&busyp->list);
+ if (fbno + flen <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
- pag = xfs_perag_get(mp, busyp->agno);
- spin_lock(&pag->pagb_lock);
- rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
+ userdata))
+ goto restart;
+ }
spin_unlock(&pag->pagb_lock);
xfs_perag_put(pag);
+}
+
+/*
+ * For a given extent [bno, len], search the busy extent tree to find a
+ * subset of the extent that is not busy.  The result is returned in *rbno
+ * and *rlen.  If *rlen is smaller than args->minlen no suitable extent could
+ * be found, and the higher level code needs to force out the log and retry
+ * the allocation.
+ *
+ * The search runs with args->pag->pagb_lock held.  fbno/flen describe the
+ * candidate range as it is narrowed down; the walk stops as soon as flen
+ * drops below args->minlen or no overlapping busy extent is left.
+ */
+STATIC void
+xfs_alloc_busy_trim(
+ struct xfs_alloc_arg *args,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ xfs_agblock_t *rbno,
+ xfs_extlen_t *rlen)
+{
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ struct rb_node *rbp;
+
+ ASSERT(len > 0);
+
+ spin_lock(&args->pag->pagb_lock);
+restart:
+ fbno = bno;
+ flen = len;
+ rbp = args->pag->pagb_tree.rb_node;
+ while (rbp && flen >= args->minlen) {
+ struct xfs_busy_extent *busyp =
+ rb_entry(rbp, struct xfs_busy_extent, rb_node);
+ xfs_agblock_t fend = fbno + flen;
+ xfs_agblock_t bbno = busyp->bno;
+ xfs_agblock_t bend = bbno + busyp->length;
+
+ if (fend <= bbno) {
+ rbp = rbp->rb_left;
+ continue;
+ } else if (fbno >= bend) {
+ rbp = rbp->rb_right;
+ continue;
+ }
+
+ /*
+ * If this is a metadata allocation, try to reuse the busy
+ * extent instead of trimming the allocation.  A false return
+ * from the helper restarts the whole search with the original
+ * [bno, len]; otherwise the same node is looked at again.
+ */
+ if (!args->userdata &&
+ !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
+ if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
+ busyp, fbno, flen,
+ false))
+ goto restart;
+ continue;
+ }
+
+ if (bbno <= fbno) {
+ /* start overlap */
+
+ /*
+ * Case 1:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +---------+
+ *        fbno   fend
+ *
+ * Case 2:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *    +-------------+
+ *    fbno       fend
+ *
+ * Case 3:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +-------------+
+ *        fbno       fend
+ *
+ * Case 4:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *    +-----------------+
+ *    fbno           fend
+ *
+ * No unbusy region in extent, return failure.
+ */
+ if (fend <= bend)
+ goto fail;
+
+ /*
+ * Case 5:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *        +----------------------+
+ *        fbno                fend
+ *
+ * Case 6:
+ *    bbno           bend
+ *    +BBBBBBBBBBBBBBBBB+
+ *    +--------------------------+
+ *    fbno                    fend
+ *
+ * Needs to be trimmed to:
+ *                       +-------+
+ *                       fbno fend
+ */
+ fbno = bend;
+ } else if (bend >= fend) {
+ /* end overlap */
+
+ /*
+ * Case 7:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +------------------+
+ *    fbno            fend
+ *
+ * Case 8:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +--------------------------+
+ *    fbno                    fend
+ *
+ * Needs to be trimmed to:
+ *    +-------+
+ *    fbno fend
+ */
+ fend = bbno;
+ } else {
+ /* middle overlap */
+
+ /*
+ * Case 9:
+ *             bbno           bend
+ *             +BBBBBBBBBBBBBBBBB+
+ *    +-----------------------------------+
+ *    fbno                             fend
+ *
+ * Can be trimmed to:
+ *    +-------+        OR         +-------+
+ *    fbno fend                   fbno fend
+ *
+ * Backward allocation leads to significant
+ * fragmentation of directories, which degrades
+ * directory performance, therefore we always want to
+ * choose the option that produces forward allocation
+ * patterns.
+ * Preferring the lower bno extent will make the next
+ * request use "fend" as the start of the next
+ * allocation; if the segment is no longer busy at
+ * that point, we'll get a contiguous allocation, but
+ * even if it is still busy, we will get a forward
+ * allocation.
+ * We try to avoid choosing the segment at "bend",
+ * because that can lead to the next allocation
+ * taking the segment at "fbno", which would be a
+ * backward allocation. We only use the segment at
+ * "bend" if it is much larger than the current
+ * requested size (at least four times args->maxlen),
+ * because in that case there's a good chance
+ * subsequent allocations will be contiguous.
+ */
+ if (bbno - fbno >= args->maxlen) {
+ /* left candidate fits perfectly */
+ fend = bbno;
+ } else if (fend - bend >= args->maxlen * 4) {
+ /* right candidate has enough free space */
+ fbno = bend;
+ } else if (bbno - fbno >= args->minlen) {
+ /* left candidate fits minimum requirement */
+ fend = bbno;
+ } else {
+ goto fail;
+ }
+ }
+
+ flen = fend - fbno;
+ }
+ spin_unlock(&args->pag->pagb_lock);
+
+ if (fbno != bno || flen != len) {
+ trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
+ fbno, flen);
+ }
+ *rbno = fbno;
+ *rlen = flen;
+ return;
+fail:
+ /*
+ * Return a zero extent length as a failure indication. All callers
+ * re-check if the trimmed extent satisfies the minlen requirement.
+ */
+ spin_unlock(&args->pag->pagb_lock);
+ trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
+ *rbno = fbno;
+ *rlen = 0;
+}
+
+static void
+xfs_alloc_busy_clear_one(
+ struct xfs_mount *mp, /* mount, only passed to the tracepoint */
+ struct xfs_perag *pag, /* per-AG structure holding the busy tree */
+ struct xfs_busy_extent *busyp) /* entry to unlink and free */
+{
+ if (busyp->length) { /* length 0 marks an entry xfs_alloc_busy_update_extent already erased */
+ trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
+ busyp->length);
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ }
+ list_del_init(&busyp->list); /* unlink from the list the entry is queued on */
kmem_free(busyp);
}
+
+/*
+ * Remove all extents on the passed in list from the busy extents tree.
+ * If do_discard is set skip extents that need to be discarded, and mark
+ * these as undergoing a discard operation instead.
+ *
+ * The per-AG structure and its pagb_lock are only looked up and taken
+ * again when the AG number differs from that of the previous list entry,
+ * so a list grouped by AG (see xfs_alloc_busy_sort) needs one lock cycle
+ * per AG.
+ *
+ * With do_discard set, an entry is kept and its flags are set to
+ * XFS_ALLOC_BUSY_DISCARDED only if it has a non-zero length and does not
+ * have XFS_ALLOC_BUSY_SKIP_DISCARD set; every other entry is removed and
+ * freed through xfs_alloc_busy_clear_one().
+ */
+void
+xfs_alloc_busy_clear(
+ struct xfs_mount *mp,
+ struct list_head *list,
+ bool do_discard)
+{
+ struct xfs_busy_extent *busyp, *n;
+ struct xfs_perag *pag = NULL;
+ xfs_agnumber_t agno = NULLAGNUMBER;
+
+ list_for_each_entry_safe(busyp, n, list, list) {
+ if (busyp->agno != agno) {
+ if (pag) {
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ }
+ pag = xfs_perag_get(mp, busyp->agno);
+ spin_lock(&pag->pagb_lock);
+ agno = busyp->agno;
+ }
+
+ if (do_discard && busyp->length &&
+ !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
+ busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
+ else
+ xfs_alloc_busy_clear_one(mp, pag, busyp);
+ }
+
+ if (pag) {
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ }
+}
+
+/*
+ * Callback for list_sort to sort busy extents by the AG they reside in.
+ *
+ * a and b are the list members embedded in two struct xfs_busy_extent
+ * entries.  The return value is the agno of a's entry minus the agno of
+ * b's entry; priv is not used.
+ */
+int
+xfs_busy_extent_ag_cmp(
+ void *priv,
+ struct list_head *a,
+ struct list_head *b)
+{
+ return container_of(a, struct xfs_busy_extent, list)->agno -
+ container_of(b, struct xfs_busy_extent, list)->agno;
+}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index d0b3bc72005b..2f52b924be79 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -137,14 +137,28 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
#ifdef __KERNEL__
void
xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno,
- xfs_agblock_t bno, xfs_extlen_t len);
+ xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
void
-xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list,
+ bool do_discard);
int
xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t bno, xfs_extlen_t len);
+
+void
+xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
+
+int
+xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
+
+static inline void xfs_alloc_busy_sort(struct list_head *list)
+{
+ list_sort(NULL, list, xfs_busy_extent_ag_cmp); /* order the busy extents by AG number; no private data is passed */
+}
+
#endif /* __KERNEL__ */
/*
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 3916925e2584..2b3518826a69 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -95,6 +95,8 @@ xfs_allocbt_alloc_block(
return 0;
}
+ xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false);
+
xfs_trans_agbtree_delta(cur->bc_tp, 1);
new->s = cpu_to_be32(bno);
@@ -118,18 +120,8 @@ xfs_allocbt_free_block(
if (error)
return error;
- /*
- * Since blocks move to the free list without the coordination used in
- * xfs_bmap_finish, we can't allow block to be available for
- * reallocation and non-transaction writing (user data) until we know
- * that the transaction that moved it to the free list is permanently
- * on disk. We track the blocks by declaring these blocks as "busy";
- * the busy list is maintained on a per-ag basis and each transaction
- * records which entries should be removed when the iclog commits to
- * disk. If a busy block is allocated, the iclog is pushed up to the
- * LSN that freed the block.
- */
- xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+ xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
+ XFS_ALLOC_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
return 0;
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index fa00788de2f5..e546a33214c9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -89,36 +89,19 @@ xfs_bmap_add_attrfork_local(
int *flags); /* inode logging flags */
/*
- * Called by xfs_bmapi to update file extent records and the btree
- * after allocating space (or doing a delayed allocation).
- */
-STATIC int /* error */
-xfs_bmap_add_extent(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- xfs_fsblock_t *first, /* pointer to firstblock variable */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd); /* OK to allocate reserved blocks */
-
-/*
* Called by xfs_bmap_add_extent to handle cases converting a delayed
* allocation to a real allocation.
*/
STATIC int /* error */
xfs_bmap_add_extent_delay_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
xfs_fsblock_t *first, /* pointer to firstblock variable */
xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int rsvd); /* OK to allocate reserved blocks */
+ int *logflagsp); /* inode logging flags */
/*
* Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -127,10 +110,9 @@ xfs_bmap_add_extent_delay_real(
STATIC int /* error */
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp,/* inode logging flags */
- int rsvd); /* OK to allocate reserved blocks */
+ int *logflagsp); /* inode logging flags */
/*
* Called by xfs_bmap_add_extent to handle cases converting a hole
@@ -139,7 +121,7 @@ xfs_bmap_add_extent_hole_delay(
STATIC int /* error */
xfs_bmap_add_extent_hole_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp, /* inode logging flags */
@@ -152,7 +134,7 @@ xfs_bmap_add_extent_hole_real(
STATIC int /* error */
xfs_bmap_add_extent_unwritten_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp); /* inode logging flags */
@@ -180,22 +162,6 @@ xfs_bmap_btree_to_extents(
int whichfork); /* data or attr fork */
/*
- * Called by xfs_bmapi to update file extent records and the btree
- * after removing space (or undoing a delayed allocation).
- */
-STATIC int /* error */
-xfs_bmap_del_extent(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_trans_t *tp, /* current trans pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
- xfs_bmap_free_t *flist, /* list of extents to be freed */
- xfs_btree_cur_t *cur, /* if null, not a btree */
- xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp,/* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd); /* OK to allocate reserved blocks */
-
-/*
* Remove the entry "free" from the free item list. Prev points to the
* previous entry, unless "free" is the head of the list.
*/
@@ -474,14 +440,13 @@ xfs_bmap_add_attrfork_local(
STATIC int /* error */
xfs_bmap_add_extent(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
xfs_fsblock_t *first, /* pointer to firstblock variable */
xfs_bmap_free_t *flist, /* list of extents to be freed */
int *logflagsp, /* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd) /* OK to use reserved data blocks */
+ int whichfork) /* data or attr fork */
{
xfs_btree_cur_t *cur; /* btree cursor or null */
xfs_filblks_t da_new; /* new count del alloc blocks used */
@@ -492,23 +457,27 @@ xfs_bmap_add_extent(
xfs_extnum_t nextents; /* number of extents in file now */
XFS_STATS_INC(xs_add_exlist);
+
cur = *curp;
ifp = XFS_IFORK_PTR(ip, whichfork);
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
- ASSERT(idx <= nextents);
da_old = da_new = 0;
error = 0;
+
+ ASSERT(*idx >= 0);
+ ASSERT(*idx <= nextents);
+
/*
* This is the first extent added to a new/empty file.
* Special case this one, so other routines get to assume there are
* already extents in the list.
*/
if (nextents == 0) {
- xfs_iext_insert(ip, 0, 1, new,
+ xfs_iext_insert(ip, *idx, 1, new,
whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
ASSERT(cur == NULL);
- ifp->if_lastex = 0;
+
if (!isnullstartblock(new->br_startblock)) {
XFS_IFORK_NEXT_SET(ip, whichfork, 1);
logflags = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
@@ -522,27 +491,25 @@ xfs_bmap_add_extent(
if (cur)
ASSERT((cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL) == 0);
- if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
- &logflags, rsvd)))
- goto done;
+ error = xfs_bmap_add_extent_hole_delay(ip, idx, new,
+ &logflags);
}
/*
* Real allocation off the end of the file.
*/
- else if (idx == nextents) {
+ else if (*idx == nextents) {
if (cur)
ASSERT((cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL) == 0);
- if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
- &logflags, whichfork)))
- goto done;
+ error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new,
+ &logflags, whichfork);
} else {
xfs_bmbt_irec_t prev; /* old extent at offset idx */
/*
* Get the record referred to by idx.
*/
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &prev);
/*
* If it's a real allocation record, and the new allocation ends
* after the start of the referred to record, then we're filling
@@ -557,22 +524,18 @@ xfs_bmap_add_extent(
if (cur)
ASSERT(cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL);
- if ((error = xfs_bmap_add_extent_delay_real(ip,
- idx, &cur, new, &da_new, first, flist,
- &logflags, rsvd)))
- goto done;
- } else if (new->br_state == XFS_EXT_NORM) {
- ASSERT(new->br_state == XFS_EXT_NORM);
- if ((error = xfs_bmap_add_extent_unwritten_real(
- ip, idx, &cur, new, &logflags)))
- goto done;
+ error = xfs_bmap_add_extent_delay_real(ip,
+ idx, &cur, new, &da_new,
+ first, flist, &logflags);
} else {
- ASSERT(new->br_state == XFS_EXT_UNWRITTEN);
- if ((error = xfs_bmap_add_extent_unwritten_real(
- ip, idx, &cur, new, &logflags)))
+ ASSERT(new->br_state == XFS_EXT_NORM ||
+ new->br_state == XFS_EXT_UNWRITTEN);
+
+ error = xfs_bmap_add_extent_unwritten_real(ip,
+ idx, &cur, new, &logflags);
+ if (error)
goto done;
}
- ASSERT(*curp == cur || *curp == NULL);
}
/*
* Otherwise we're filling in a hole with an allocation.
@@ -581,13 +544,15 @@ xfs_bmap_add_extent(
if (cur)
ASSERT((cur->bc_private.b.flags &
XFS_BTCUR_BPRV_WASDEL) == 0);
- if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
- new, &logflags, whichfork)))
- goto done;
+ error = xfs_bmap_add_extent_hole_real(ip, idx, cur,
+ new, &logflags, whichfork);
}
}
+ if (error)
+ goto done;
ASSERT(*curp == cur || *curp == NULL);
+
/*
* Convert to a btree if necessary.
*/
@@ -615,7 +580,7 @@ xfs_bmap_add_extent(
ASSERT(nblks <= da_old);
if (nblks < da_old)
xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
- (int64_t)(da_old - nblks), rsvd);
+ (int64_t)(da_old - nblks), 0);
}
/*
* Clear out the allocated field, done with it now in any case.
@@ -640,14 +605,13 @@ done:
STATIC int /* error */
xfs_bmap_add_extent_delay_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */
xfs_fsblock_t *first, /* pointer to firstblock variable */
xfs_bmap_free_t *flist, /* list of extents to be freed */
- int *logflagsp, /* inode logging flags */
- int rsvd) /* OK to use reserved data block allocation */
+ int *logflagsp) /* inode logging flags */
{
xfs_btree_cur_t *cur; /* btree cursor */
int diff; /* temp value */
@@ -673,7 +637,7 @@ xfs_bmap_add_extent_delay_real(
*/
cur = *curp;
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- ep = xfs_iext_get_ext(ifp, idx);
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &PREV);
new_endoff = new->br_startoff + new->br_blockcount;
ASSERT(PREV.br_startoff <= new->br_startoff);
@@ -692,9 +656,9 @@ xfs_bmap_add_extent_delay_real(
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
if (isnullstartblock(LEFT.br_startblock))
state |= BMAP_LEFT_DELAY;
@@ -712,9 +676,9 @@ xfs_bmap_add_extent_delay_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
if (isnullstartblock(RIGHT.br_startblock))
state |= BMAP_RIGHT_DELAY;
@@ -745,14 +709,14 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The left and right neighbors are both contiguous with new.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
LEFT.br_blockcount + PREV.br_blockcount +
RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(ip, idx, 2, state);
- ip->i_df.if_lastex = idx - 1;
+ xfs_iext_remove(ip, *idx + 1, 2, state);
ip->i_d.di_nextents--;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -784,13 +748,14 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The left neighbor is contiguous, the right is not.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
LEFT.br_blockcount + PREV.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx - 1;
- xfs_iext_remove(ip, idx, 1, state);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -814,14 +779,13 @@ xfs_bmap_add_extent_delay_real(
* Filling in all of a previously delayed allocation extent.
* The right neighbor is contiguous, the left is not.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep, new->br_startblock);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount + RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx;
- xfs_iext_remove(ip, idx + 1, 1, state);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -837,6 +801,7 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_blockcount, PREV.br_state)))
goto done;
}
+
*dnew = 0;
break;
@@ -846,11 +811,10 @@ xfs_bmap_add_extent_delay_real(
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep, new->br_startblock);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx;
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -866,6 +830,7 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
+
*dnew = 0;
break;
@@ -874,17 +839,16 @@ xfs_bmap_add_extent_delay_real(
* Filling in the first part of a previous delayed allocation.
* The left neighbor is contiguous.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
LEFT.br_blockcount + new->br_blockcount);
xfs_bmbt_set_startoff(ep,
PREV.br_startoff + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- ip->i_df.if_lastex = idx - 1;
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -904,7 +868,9 @@ xfs_bmap_add_extent_delay_real(
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
startblockval(PREV.br_startblock));
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ --*idx;
*dnew = temp;
break;
@@ -913,12 +879,11 @@ xfs_bmap_add_extent_delay_real(
* Filling in the first part of a previous delayed allocation.
* The left neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startoff(ep, new_endoff);
temp = PREV.br_blockcount - new->br_blockcount;
xfs_bmbt_set_blockcount(ep, temp);
- xfs_iext_insert(ip, idx, 1, new, state);
- ip->i_df.if_lastex = idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -946,9 +911,10 @@ xfs_bmap_add_extent_delay_real(
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
startblockval(PREV.br_startblock) -
(cur ? cur->bc_private.b.allocated : 0));
- ep = xfs_iext_get_ext(ifp, idx + 1);
+ ep = xfs_iext_get_ext(ifp, *idx + 1);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
+
*dnew = temp;
break;
@@ -958,15 +924,13 @@ xfs_bmap_add_extent_delay_real(
* The right neighbor is contiguous with the new allocation.
*/
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx + 1, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx + 1),
new->br_startoff, new->br_startblock,
new->br_blockcount + RIGHT.br_blockcount,
RIGHT.br_state);
- trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
- ip->i_df.if_lastex = idx + 1;
+ trace_xfs_bmap_post_update(ip, *idx + 1, state, _THIS_IP_);
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -983,10 +947,14 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_state)))
goto done;
}
+
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
startblockval(PREV.br_startblock));
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
*dnew = temp;
break;
@@ -996,10 +964,9 @@ xfs_bmap_add_extent_delay_real(
* The right neighbor is not contiguous.
*/
temp = PREV.br_blockcount - new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- xfs_iext_insert(ip, idx + 1, 1, new, state);
- ip->i_df.if_lastex = idx + 1;
+ xfs_iext_insert(ip, *idx + 1, 1, new, state);
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1027,9 +994,11 @@ xfs_bmap_add_extent_delay_real(
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
startblockval(PREV.br_startblock) -
(cur ? cur->bc_private.b.allocated : 0));
- ep = xfs_iext_get_ext(ifp, idx);
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
*dnew = temp;
break;
@@ -1056,7 +1025,7 @@ xfs_bmap_add_extent_delay_real(
*/
temp = new->br_startoff - PREV.br_startoff;
temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff;
- trace_xfs_bmap_pre_update(ip, idx, 0, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, 0, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */
LEFT = *new;
RIGHT.br_state = PREV.br_state;
@@ -1065,8 +1034,7 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_startoff = new_endoff;
RIGHT.br_blockcount = temp2;
/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
- xfs_iext_insert(ip, idx + 1, 2, &LEFT, state);
- ip->i_df.if_lastex = idx + 1;
+ xfs_iext_insert(ip, *idx + 1, 2, &LEFT, state);
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1097,7 +1065,7 @@ xfs_bmap_add_extent_delay_real(
(cur ? cur->bc_private.b.allocated : 0));
if (diff > 0 &&
xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
- -((int64_t)diff), rsvd)) {
+ -((int64_t)diff), 0)) {
/*
* Ick gross gag me with a spoon.
*/
@@ -1109,7 +1077,7 @@ xfs_bmap_add_extent_delay_real(
if (!diff ||
!xfs_icsb_modify_counters(ip->i_mount,
XFS_SBS_FDBLOCKS,
- -((int64_t)diff), rsvd))
+ -((int64_t)diff), 0))
break;
}
if (temp2) {
@@ -1118,18 +1086,20 @@ xfs_bmap_add_extent_delay_real(
if (!diff ||
!xfs_icsb_modify_counters(ip->i_mount,
XFS_SBS_FDBLOCKS,
- -((int64_t)diff), rsvd))
+ -((int64_t)diff), 0))
break;
}
}
}
- ep = xfs_iext_get_ext(ifp, idx);
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(ip, idx + 2, state, _THIS_IP_);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2),
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx + 2, state, _THIS_IP_);
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx + 2),
nullstartblock((int)temp2));
- trace_xfs_bmap_post_update(ip, idx + 2, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx + 2, state, _THIS_IP_);
+
+ ++*idx;
*dnew = temp + temp2;
break;
@@ -1161,7 +1131,7 @@ done:
STATIC int /* error */
xfs_bmap_add_extent_unwritten_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t **curp, /* if *curp is null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp) /* inode logging flags */
@@ -1188,7 +1158,7 @@ xfs_bmap_add_extent_unwritten_real(
error = 0;
cur = *curp;
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- ep = xfs_iext_get_ext(ifp, idx);
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &PREV);
newext = new->br_state;
oldext = (newext == XFS_EXT_UNWRITTEN) ?
@@ -1211,9 +1181,9 @@ xfs_bmap_add_extent_unwritten_real(
* Check and set flags if this segment has a left neighbor.
* Don't set contiguous if the combined extent would be too large.
*/
- if (idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT);
if (isnullstartblock(LEFT.br_startblock))
state |= BMAP_LEFT_DELAY;
@@ -1231,9 +1201,9 @@ xfs_bmap_add_extent_unwritten_real(
* Don't set contiguous if the combined extent would be too large.
* Also check for all-three-contiguous being too large.
*/
- if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
if (isnullstartblock(RIGHT.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -1262,14 +1232,15 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The left and right neighbors are both contiguous with new.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
LEFT.br_blockcount + PREV.br_blockcount +
RIGHT.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(ip, idx, 2, state);
- ip->i_df.if_lastex = idx - 1;
+ xfs_iext_remove(ip, *idx + 1, 2, state);
ip->i_d.di_nextents -= 2;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1305,13 +1276,14 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The left neighbor is contiguous, the right is not.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
LEFT.br_blockcount + PREV.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx - 1;
- xfs_iext_remove(ip, idx, 1, state);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
ip->i_d.di_nextents--;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1341,13 +1313,12 @@ xfs_bmap_add_extent_unwritten_real(
* Setting all of a previous oldext extent to newext.
* The right neighbor is contiguous, the left is not.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount + RIGHT.br_blockcount);
xfs_bmbt_set_state(ep, newext);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx;
- xfs_iext_remove(ip, idx + 1, 1, state);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_remove(ip, *idx + 1, 1, state);
ip->i_d.di_nextents--;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1378,11 +1349,10 @@ xfs_bmap_add_extent_unwritten_real(
* Neither the left nor right neighbors are contiguous with
* the new one.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_state(ep, newext);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx;
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -1404,21 +1374,22 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the first part of a previous oldext extent to newext.
* The left neighbor is contiguous.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1),
LEFT.br_blockcount + new->br_blockcount);
xfs_bmbt_set_startoff(ep,
PREV.br_startoff + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startblock(ep,
new->br_startblock + new->br_blockcount);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ --*idx;
- ip->i_df.if_lastex = idx - 1;
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -1449,17 +1420,16 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the first part of a previous oldext extent to newext.
* The left neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
ASSERT(ep && xfs_bmbt_get_state(ep) == oldext);
xfs_bmbt_set_startoff(ep, new_endoff);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
xfs_bmbt_set_startblock(ep,
new->br_startblock + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_insert(ip, idx, 1, new, state);
- ip->i_df.if_lastex = idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1488,17 +1458,19 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the last part of a previous oldext extent to newext.
* The right neighbor is contiguous with the new allocation.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
- trace_xfs_bmap_pre_update(ip, idx + 1, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1),
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
new->br_startoff, new->br_startblock,
new->br_blockcount + RIGHT.br_blockcount, newext);
- trace_xfs_bmap_post_update(ip, idx + 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ip->i_df.if_lastex = idx + 1;
if (cur == NULL)
rval = XFS_ILOG_DEXT;
else {
@@ -1528,13 +1500,14 @@ xfs_bmap_add_extent_unwritten_real(
* Setting the last part of a previous oldext extent to newext.
* The right neighbor is not contiguous.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
PREV.br_blockcount - new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
- xfs_iext_insert(ip, idx + 1, 1, new, state);
- ip->i_df.if_lastex = idx + 1;
ip->i_d.di_nextents++;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1568,10 +1541,10 @@ xfs_bmap_add_extent_unwritten_real(
* newext. Contiguity is impossible here.
* One extent becomes three extents.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep,
new->br_startoff - PREV.br_startoff);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
r[0] = *new;
r[1].br_startoff = new_endoff;
@@ -1579,8 +1552,10 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startoff + PREV.br_blockcount - new_endoff;
r[1].br_startblock = new->br_startblock + new->br_blockcount;
r[1].br_state = oldext;
- xfs_iext_insert(ip, idx + 1, 2, &r[0], state);
- ip->i_df.if_lastex = idx + 1;
+
+ ++*idx;
+ xfs_iext_insert(ip, *idx, 2, &r[0], state);
+
ip->i_d.di_nextents += 2;
if (cur == NULL)
rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
@@ -1650,12 +1625,10 @@ done:
STATIC int /* error */
xfs_bmap_add_extent_hole_delay(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
- int *logflagsp, /* inode logging flags */
- int rsvd) /* OK to allocate reserved blocks */
+ int *logflagsp) /* inode logging flags */
{
- xfs_bmbt_rec_host_t *ep; /* extent record for idx */
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_filblks_t newlen=0; /* new indirect size */
@@ -1665,16 +1638,15 @@ xfs_bmap_add_extent_hole_delay(
xfs_filblks_t temp=0; /* temp for indirect calculations */
ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
- ep = xfs_iext_get_ext(ifp, idx);
state = 0;
ASSERT(isnullstartblock(new->br_startblock));
/*
* Check and set flags if this segment has a left neighbor
*/
- if (idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
@@ -1684,9 +1656,9 @@ xfs_bmap_add_extent_hole_delay(
* Check and set flags if the current (right) segment exists.
* If it doesn't exist, we're converting the hole at end-of-file.
*/
- if (idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(ep, &right);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
@@ -1719,21 +1691,21 @@ xfs_bmap_add_extent_hole_delay(
* on the left and on the right.
* Merge all three into a single extent record.
*/
+ --*idx;
temp = left.br_blockcount + new->br_blockcount +
right.br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
nullstartblock((int)newlen));
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- xfs_iext_remove(ip, idx, 1, state);
- ip->i_df.if_lastex = idx - 1;
+ xfs_iext_remove(ip, *idx + 1, 1, state);
break;
case BMAP_LEFT_CONTIG:
@@ -1742,17 +1714,17 @@ xfs_bmap_add_extent_hole_delay(
* on the left.
* Merge the new allocation with the left neighbor.
*/
+ --*idx;
temp = left.br_blockcount + new->br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp);
+
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
oldlen = startblockval(left.br_startblock) +
startblockval(new->br_startblock);
newlen = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1),
+ xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
nullstartblock((int)newlen));
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
-
- ip->i_df.if_lastex = idx - 1;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
break;
case BMAP_RIGHT_CONTIG:
@@ -1761,16 +1733,15 @@ xfs_bmap_add_extent_hole_delay(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
temp = new->br_blockcount + right.br_blockcount;
oldlen = startblockval(new->br_startblock) +
startblockval(right.br_startblock);
newlen = xfs_bmap_worst_indlen(ip, temp);
- xfs_bmbt_set_allf(ep, new->br_startoff,
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ new->br_startoff,
nullstartblock((int)newlen), temp, right.br_state);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
-
- ip->i_df.if_lastex = idx;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
break;
case 0:
@@ -1780,14 +1751,13 @@ xfs_bmap_add_extent_hole_delay(
* Insert a new entry.
*/
oldlen = newlen = 0;
- xfs_iext_insert(ip, idx, 1, new, state);
- ip->i_df.if_lastex = idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
break;
}
if (oldlen != newlen) {
ASSERT(oldlen > newlen);
xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS,
- (int64_t)(oldlen - newlen), rsvd);
+ (int64_t)(oldlen - newlen), 0);
/*
* Nothing to do for disk quota accounting here.
*/
@@ -1803,13 +1773,12 @@ xfs_bmap_add_extent_hole_delay(
STATIC int /* error */
xfs_bmap_add_extent_hole_real(
xfs_inode_t *ip, /* incore inode pointer */
- xfs_extnum_t idx, /* extent number to update/insert */
+ xfs_extnum_t *idx, /* extent number to update/insert */
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *new, /* new data to add to file extents */
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- xfs_bmbt_rec_host_t *ep; /* pointer to extent entry ins. point */
int error; /* error return value */
int i; /* temp state */
xfs_ifork_t *ifp; /* inode fork pointer */
@@ -1819,8 +1788,7 @@ xfs_bmap_add_extent_hole_real(
int state; /* state bits, accessed thru macros */
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
- ep = xfs_iext_get_ext(ifp, idx);
+ ASSERT(*idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
state = 0;
if (whichfork == XFS_ATTR_FORK)
@@ -1829,9 +1797,9 @@ xfs_bmap_add_extent_hole_real(
/*
* Check and set flags if this segment has a left neighbor.
*/
- if (idx > 0) {
+ if (*idx > 0) {
state |= BMAP_LEFT_VALID;
- xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
if (isnullstartblock(left.br_startblock))
state |= BMAP_LEFT_DELAY;
}
@@ -1840,9 +1808,9 @@ xfs_bmap_add_extent_hole_real(
* Check and set flags if this segment has a current value.
* Not true if we're inserting into the "hole" at eof.
*/
- if (idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+ if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
state |= BMAP_RIGHT_VALID;
- xfs_bmbt_get_all(ep, &right);
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
if (isnullstartblock(right.br_startblock))
state |= BMAP_RIGHT_DELAY;
}
@@ -1879,14 +1847,15 @@ xfs_bmap_add_extent_hole_real(
* left and on the right.
* Merge all three into a single extent record.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
left.br_blockcount + new->br_blockcount +
right.br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+
+ xfs_iext_remove(ip, *idx + 1, 1, state);
- xfs_iext_remove(ip, idx, 1, state);
- ifp->if_lastex = idx - 1;
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
if (cur == NULL) {
@@ -1921,12 +1890,12 @@ xfs_bmap_add_extent_hole_real(
* on the left.
* Merge the new allocation with the left neighbor.
*/
- trace_xfs_bmap_pre_update(ip, idx - 1, state, _THIS_IP_);
- xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1),
+ --*idx;
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
left.br_blockcount + new->br_blockcount);
- trace_xfs_bmap_post_update(ip, idx - 1, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ifp->if_lastex = idx - 1;
if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
@@ -1952,13 +1921,13 @@ xfs_bmap_add_extent_hole_real(
* on the right.
* Merge the new allocation with the right neighbor.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
- xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock,
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+ xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
+ new->br_startoff, new->br_startblock,
new->br_blockcount + right.br_blockcount,
right.br_state);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
- ifp->if_lastex = idx;
if (cur == NULL) {
rval = xfs_ilog_fext(whichfork);
} else {
@@ -1984,8 +1953,7 @@ xfs_bmap_add_extent_hole_real(
* real allocation.
* Insert a new entry.
*/
- xfs_iext_insert(ip, idx, 1, new, state);
- ifp->if_lastex = idx;
+ xfs_iext_insert(ip, *idx, 1, new, state);
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
if (cur == NULL) {
@@ -2833,13 +2801,12 @@ STATIC int /* error */
xfs_bmap_del_extent(
xfs_inode_t *ip, /* incore inode pointer */
xfs_trans_t *tp, /* current transaction pointer */
- xfs_extnum_t idx, /* extent number to update/delete */
+ xfs_extnum_t *idx, /* extent number to update/delete */
xfs_bmap_free_t *flist, /* list of extents to be freed */
xfs_btree_cur_t *cur, /* if null, not a btree */
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
- int whichfork, /* data or attr fork */
- int rsvd) /* OK to allocate reserved blocks */
+ int whichfork) /* data or attr fork */
{
xfs_filblks_t da_new; /* new delay-alloc indirect blocks */
xfs_filblks_t da_old; /* old delay-alloc indirect blocks */
@@ -2870,10 +2837,10 @@ xfs_bmap_del_extent(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
- ASSERT((idx >= 0) && (idx < ifp->if_bytes /
+ ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
(uint)sizeof(xfs_bmbt_rec_t)));
ASSERT(del->br_blockcount > 0);
- ep = xfs_iext_get_ext(ifp, idx);
+ ep = xfs_iext_get_ext(ifp, *idx);
xfs_bmbt_get_all(ep, &got);
ASSERT(got.br_startoff <= del->br_startoff);
del_endoff = del->br_startoff + del->br_blockcount;
@@ -2947,11 +2914,12 @@ xfs_bmap_del_extent(
/*
* Matches the whole extent. Delete the entry.
*/
- xfs_iext_remove(ip, idx, 1,
+ xfs_iext_remove(ip, *idx, 1,
whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
- ifp->if_lastex = idx;
+ --*idx;
if (delay)
break;
+
XFS_IFORK_NEXT_SET(ip, whichfork,
XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
flags |= XFS_ILOG_CORE;
@@ -2968,21 +2936,20 @@ xfs_bmap_del_extent(
/*
* Deleting the first part of the extent.
*/
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_startoff(ep, del_endoff);
temp = got.br_blockcount - del->br_blockcount;
xfs_bmbt_set_blockcount(ep, temp);
- ifp->if_lastex = idx;
if (delay) {
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
da_old);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
da_new = temp;
break;
}
xfs_bmbt_set_startblock(ep, del_endblock);
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
if (!cur) {
flags |= xfs_ilog_fext(whichfork);
break;
@@ -2998,18 +2965,17 @@ xfs_bmap_del_extent(
* Deleting the last part of the extent.
*/
temp = got.br_blockcount - del->br_blockcount;
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
- ifp->if_lastex = idx;
if (delay) {
temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
da_old);
xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
da_new = temp;
break;
}
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
if (!cur) {
flags |= xfs_ilog_fext(whichfork);
break;
@@ -3026,7 +2992,7 @@ xfs_bmap_del_extent(
* Deleting the middle of the extent.
*/
temp = del->br_startoff - got.br_startoff;
- trace_xfs_bmap_pre_update(ip, idx, state, _THIS_IP_);
+ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
xfs_bmbt_set_blockcount(ep, temp);
new.br_startoff = del_endoff;
temp2 = got_endoff - del_endoff;
@@ -3113,9 +3079,9 @@ xfs_bmap_del_extent(
}
}
}
- trace_xfs_bmap_post_update(ip, idx, state, _THIS_IP_);
- xfs_iext_insert(ip, idx + 1, 1, &new, state);
- ifp->if_lastex = idx + 1;
+ trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
+ xfs_iext_insert(ip, *idx + 1, 1, &new, state);
+ ++*idx;
break;
}
/*
@@ -3142,7 +3108,7 @@ xfs_bmap_del_extent(
ASSERT(da_old >= da_new);
if (da_old > da_new) {
xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- (int64_t)(da_old - da_new), rsvd);
+ (int64_t)(da_old - da_new), 0);
}
done:
*logflagsp = flags;
@@ -4562,29 +4528,24 @@ xfs_bmapi(
if (rt) {
error = xfs_mod_incore_sb(mp,
XFS_SBS_FREXTENTS,
- -((int64_t)extsz), (flags &
- XFS_BMAPI_RSVBLOCKS));
+ -((int64_t)extsz), 0);
} else {
error = xfs_icsb_modify_counters(mp,
XFS_SBS_FDBLOCKS,
- -((int64_t)alen), (flags &
- XFS_BMAPI_RSVBLOCKS));
+ -((int64_t)alen), 0);
}
if (!error) {
error = xfs_icsb_modify_counters(mp,
XFS_SBS_FDBLOCKS,
- -((int64_t)indlen), (flags &
- XFS_BMAPI_RSVBLOCKS));
+ -((int64_t)indlen), 0);
if (error && rt)
xfs_mod_incore_sb(mp,
XFS_SBS_FREXTENTS,
- (int64_t)extsz, (flags &
- XFS_BMAPI_RSVBLOCKS));
+ (int64_t)extsz, 0);
else if (error)
xfs_icsb_modify_counters(mp,
XFS_SBS_FDBLOCKS,
- (int64_t)alen, (flags &
- XFS_BMAPI_RSVBLOCKS));
+ (int64_t)alen, 0);
}
if (error) {
@@ -4701,13 +4662,12 @@ xfs_bmapi(
if (!wasdelay && (flags & XFS_BMAPI_PREALLOC))
got.br_state = XFS_EXT_UNWRITTEN;
}
- error = xfs_bmap_add_extent(ip, lastx, &cur, &got,
+ error = xfs_bmap_add_extent(ip, &lastx, &cur, &got,
firstblock, flist, &tmp_logflags,
- whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
+ whichfork);
logflags |= tmp_logflags;
if (error)
goto error0;
- lastx = ifp->if_lastex;
ep = xfs_iext_get_ext(ifp, lastx);
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
xfs_bmbt_get_all(ep, &got);
@@ -4803,13 +4763,12 @@ xfs_bmapi(
mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
? XFS_EXT_NORM
: XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx, &cur, mval,
+ error = xfs_bmap_add_extent(ip, &lastx, &cur, mval,
firstblock, flist, &tmp_logflags,
- whichfork, (flags & XFS_BMAPI_RSVBLOCKS));
+ whichfork);
logflags |= tmp_logflags;
if (error)
goto error0;
- lastx = ifp->if_lastex;
ep = xfs_iext_get_ext(ifp, lastx);
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
xfs_bmbt_get_all(ep, &got);
@@ -4868,14 +4827,14 @@ xfs_bmapi(
/*
* Else go on to the next record.
*/
- ep = xfs_iext_get_ext(ifp, ++lastx);
prev = got;
- if (lastx >= nextents)
- eof = 1;
- else
+ if (++lastx < nextents) {
+ ep = xfs_iext_get_ext(ifp, lastx);
xfs_bmbt_get_all(ep, &got);
+ } else {
+ eof = 1;
+ }
}
- ifp->if_lastex = lastx;
*nmap = n;
/*
* Transform from btree to extents, give it cur.
@@ -4984,7 +4943,6 @@ xfs_bmapi_single(
ASSERT(!isnullstartblock(got.br_startblock));
ASSERT(bno < got.br_startoff + got.br_blockcount);
*fsb = got.br_startblock + (bno - got.br_startoff);
- ifp->if_lastex = lastx;
return 0;
}
@@ -5026,7 +4984,6 @@ xfs_bunmapi(
int tmp_logflags; /* partial logging flags */
int wasdel; /* was a delayed alloc extent */
int whichfork; /* data or attribute fork */
- int rsvd; /* OK to allocate reserved blocks */
xfs_fsblock_t sum;
trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
@@ -5044,7 +5001,7 @@ xfs_bunmapi(
mp = ip->i_mount;
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0;
+
ASSERT(len > 0);
ASSERT(nexts >= 0);
ASSERT(ifp->if_ext_max ==
@@ -5160,9 +5117,9 @@ xfs_bunmapi(
del.br_blockcount = mod;
}
del.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx, &cur, &del,
+ error = xfs_bmap_add_extent(ip, &lastx, &cur, &del,
firstblock, flist, &logflags,
- XFS_DATA_FORK, 0);
+ XFS_DATA_FORK);
if (error)
goto error0;
goto nodelete;
@@ -5188,9 +5145,12 @@ xfs_bunmapi(
*/
ASSERT(bno >= del.br_blockcount);
bno -= del.br_blockcount;
- if (bno < got.br_startoff) {
- if (--lastx >= 0)
- xfs_bmbt_get_all(--ep, &got);
+ if (got.br_startoff > bno) {
+ if (--lastx >= 0) {
+ ep = xfs_iext_get_ext(ifp,
+ lastx);
+ xfs_bmbt_get_all(ep, &got);
+ }
}
continue;
} else if (del.br_state == XFS_EXT_UNWRITTEN) {
@@ -5214,18 +5174,19 @@ xfs_bunmapi(
prev.br_startoff = start;
}
prev.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx - 1, &cur,
+ lastx--;
+ error = xfs_bmap_add_extent(ip, &lastx, &cur,
&prev, firstblock, flist, &logflags,
- XFS_DATA_FORK, 0);
+ XFS_DATA_FORK);
if (error)
goto error0;
goto nodelete;
} else {
ASSERT(del.br_state == XFS_EXT_NORM);
del.br_state = XFS_EXT_UNWRITTEN;
- error = xfs_bmap_add_extent(ip, lastx, &cur,
+ error = xfs_bmap_add_extent(ip, &lastx, &cur,
&del, firstblock, flist, &logflags,
- XFS_DATA_FORK, 0);
+ XFS_DATA_FORK);
if (error)
goto error0;
goto nodelete;
@@ -5240,13 +5201,13 @@ xfs_bunmapi(
rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
do_div(rtexts, mp->m_sb.sb_rextsize);
xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS,
- (int64_t)rtexts, rsvd);
+ (int64_t)rtexts, 0);
(void)xfs_trans_reserve_quota_nblks(NULL,
ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_RTBLKS);
} else {
xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
- (int64_t)del.br_blockcount, rsvd);
+ (int64_t)del.br_blockcount, 0);
(void)xfs_trans_reserve_quota_nblks(NULL,
ip, -((long)del.br_blockcount), 0,
XFS_QMOPT_RES_REGBLKS);
@@ -5277,31 +5238,29 @@ xfs_bunmapi(
error = XFS_ERROR(ENOSPC);
goto error0;
}
- error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del,
- &tmp_logflags, whichfork, rsvd);
+ error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
+ &tmp_logflags, whichfork);
logflags |= tmp_logflags;
if (error)
goto error0;
bno = del.br_startoff - 1;
nodelete:
- lastx = ifp->if_lastex;
/*
* If not done go on to the next (previous) record.
- * Reset ep in case the extents array was re-alloced.
*/
- ep = xfs_iext_get_ext(ifp, lastx);
if (bno != (xfs_fileoff_t)-1 && bno >= start) {
- if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) ||
- xfs_bmbt_get_startoff(ep) > bno) {
- if (--lastx >= 0)
- ep = xfs_iext_get_ext(ifp, lastx);
- }
- if (lastx >= 0)
+ if (lastx >= 0) {
+ ep = xfs_iext_get_ext(ifp, lastx);
+ if (xfs_bmbt_get_startoff(ep) > bno) {
+ if (--lastx >= 0)
+ ep = xfs_iext_get_ext(ifp,
+ lastx);
+ }
xfs_bmbt_get_all(ep, &got);
+ }
extno++;
}
}
- ifp->if_lastex = lastx;
*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
ASSERT(ifp->if_ext_max ==
XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 3651191daea1..c62234bde053 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -69,7 +69,6 @@ typedef struct xfs_bmap_free
#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */
#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */
#define XFS_BMAPI_ATTRFORK 0x010 /* use attribute fork not data */
-#define XFS_BMAPI_RSVBLOCKS 0x020 /* OK to alloc. reserved data blocks */
#define XFS_BMAPI_PREALLOC 0x040 /* preallocation op: unwritten space */
#define XFS_BMAPI_IGSTATE 0x080 /* Ignore state - */
/* combine contig. space */
@@ -87,7 +86,6 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
{ XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \
- { XFS_BMAPI_RSVBLOCKS, "RSVBLOCKS" }, \
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index be628677c288..9a84a85c03b1 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -202,7 +202,7 @@ xfs_swap_extents(
xfs_inode_t *tip, /* tmp inode */
xfs_swapext_t *sxp)
{
- xfs_mount_t *mp;
+ xfs_mount_t *mp = ip->i_mount;
xfs_trans_t *tp;
xfs_bstat_t *sbp = &sxp->sx_stat;
xfs_ifork_t *tempifp, *ifp, *tifp;
@@ -212,16 +212,12 @@ xfs_swap_extents(
int taforkblks = 0;
__uint64_t tmp;
- mp = ip->i_mount;
-
tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
if (!tempifp) {
error = XFS_ERROR(ENOMEM);
goto out;
}
- sbp = &sxp->sx_stat;
-
/*
* we have to do two separate lock calls here to keep lockdep
* happy. If we try to get all the locks in one call, lock will
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a37480a6e023..a098a20ca63e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -920,7 +920,6 @@ xfs_iread_extents(
/*
* We know that the size is valid (it's checked in iformat_btree)
*/
- ifp->if_lastex = NULLEXTNUM;
ifp->if_bytes = ifp->if_real_bytes = 0;
ifp->if_flags |= XFS_IFEXTENTS;
xfs_iext_add(ifp, 0, nextents);
@@ -1354,7 +1353,7 @@ xfs_itruncate_start(
return 0;
}
last_byte = xfs_file_last_byte(ip);
- trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte);
+ trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte);
if (last_byte > toss_start) {
if (flags & XFS_ITRUNC_DEFINITE) {
xfs_tosspages(ip, toss_start,
@@ -1470,7 +1469,7 @@ xfs_itruncate_finish(
* file but the log buffers containing the free and reallocation
* don't, then we'd end up with garbage in the blocks being freed.
* As long as we make the new_size permanent before actually
- * freeing any blocks it doesn't matter if they get writtten to.
+ * freeing any blocks it doesn't matter if they get written to.
*
* The callers must signal into us whether or not the size
* setting here must be synchronous. There are a few cases
@@ -2558,12 +2557,9 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_EXTENTS:
ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
!(iip->ili_format.ilf_fields & extflag[whichfork]));
- ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) ||
- (ifp->if_bytes == 0));
- ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) ||
- (ifp->if_bytes > 0));
if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
(ifp->if_bytes > 0)) {
+ ASSERT(xfs_iext_get_ext(ifp, 0));
ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
whichfork);
@@ -3112,6 +3108,8 @@ xfs_iext_get_ext(
xfs_extnum_t idx) /* index of target extent */
{
ASSERT(idx >= 0);
+ ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+
if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
return ifp->if_u1.if_ext_irec->er_extbuf;
} else if (ifp->if_flags & XFS_IFEXTIREC) {
@@ -3191,7 +3189,6 @@ xfs_iext_add(
}
ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
ifp->if_real_bytes = 0;
- ifp->if_lastex = nextents + ext_diff;
}
/*
* Otherwise use a linear (direct) extent list.
@@ -3886,8 +3883,10 @@ xfs_iext_idx_to_irec(
xfs_extnum_t page_idx = *idxp; /* extent index in target list */
ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- ASSERT(page_idx >= 0 && page_idx <=
- ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t));
+ ASSERT(page_idx >= 0);
+ ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
+ ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
+
nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
erp_idx = 0;
low = 0;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index ff4e2a30227d..3ae6d58e5473 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -67,7 +67,6 @@ typedef struct xfs_ifork {
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
unsigned char if_ext_max; /* max # of extent records */
- xfs_extnum_t if_lastex; /* last if_extents used */
union {
xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 576fdfe81d60..09983a3344a5 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -970,7 +970,6 @@ xfs_iflush_abort(
{
xfs_inode_log_item_t *iip = ip->i_itemp;
- iip = ip->i_itemp;
if (iip) {
struct xfs_ail *ailp = iip->ili_item.li_ailp;
if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b612ce4520ae..211930246f20 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1449,6 +1449,13 @@ xlog_dealloc_log(xlog_t *log)
xlog_cil_destroy(log);
+ /*
+ * always need to ensure that the extra buffer does not point to memory
+ * owned by another log buffer before we free it.
+ */
+ xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size);
+ xfs_buf_free(log->l_xbuf);
+
iclog = log->l_iclog;
for (i=0; i<log->l_iclog_bufs; i++) {
xfs_buf_free(iclog->ic_bp);
@@ -1458,7 +1465,6 @@ xlog_dealloc_log(xlog_t *log)
}
spinlock_destroy(&log->l_icloglock);
- xfs_buf_free(log->l_xbuf);
log->l_mp->m_log = NULL;
kmem_free(log);
} /* xlog_dealloc_log */
@@ -3248,13 +3254,6 @@ xfs_log_ticket_get(
return ticket;
}
-xlog_tid_t
-xfs_log_get_trans_ident(
- struct xfs_trans *tp)
-{
- return tp->t_ticket->t_tid;
-}
-
/*
* Allocate and initialise a new log ticket.
*/
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 3bd3291ef8d2..78c9039994af 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -189,8 +189,6 @@ void xlog_iodone(struct xfs_buf *);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
-xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
-
void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
struct xfs_log_vec *log_vector,
xfs_lsn_t *commit_lsn, int flags);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9ca59be08977..c7755d5a5fbe 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -29,6 +29,7 @@
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
+#include "xfs_discard.h"
/*
* Perform initial CIL structure initialisation. If the CIL is not
@@ -361,19 +362,28 @@ xlog_cil_committed(
int abort)
{
struct xfs_cil_ctx *ctx = args;
- struct xfs_busy_extent *busyp, *n;
+ struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
ctx->start_lsn, abort);
- list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
- xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
+ xfs_alloc_busy_sort(&ctx->busy_extents);
+ xfs_alloc_busy_clear(mp, &ctx->busy_extents,
+ (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
spin_lock(&ctx->cil->xc_cil_lock);
list_del(&ctx->committing);
spin_unlock(&ctx->cil->xc_cil_lock);
xlog_cil_free_logvec(ctx->lv_chain);
+
+ if (!list_empty(&ctx->busy_extents)) {
+ ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
+
+ xfs_discard_extents(mp, &ctx->busy_extents);
+ xfs_alloc_busy_clear(mp, &ctx->busy_extents, false);
+ }
+
kmem_free(ctx);
}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 5864850e9e34..2d3b6a498d63 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -146,6 +146,8 @@ static inline uint xlog_get_client_id(__be32 i)
shutdown */
#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
+typedef __uint32_t xlog_tid_t;
+
#ifdef __KERNEL__
/*
* Below are states for covering allocation transactions.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5cc464a17c93..04142caedb2b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -205,6 +205,35 @@ xlog_bread(
}
/*
+ * Read at an offset into the buffer. Returns with the buffer in its original
+ * state regardless of the result of the read.
+ */
+STATIC int
+xlog_bread_offset(
+ xlog_t *log,
+ xfs_daddr_t blk_no, /* block to read from */
+ int nbblks, /* blocks to read */
+ xfs_buf_t *bp, /* buffer whose data pointer is temporarily moved */
+ xfs_caddr_t offset) /* address the buffer pointer is set to for the read */
+{
+ xfs_caddr_t orig_offset = XFS_BUF_PTR(bp);
+ int orig_len = bp->b_buffer_length;
+ int error, error2;
+
+ error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
+ if (error)
+ return error;
+
+ error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+
+ /* must reset buffer pointer even on error; a read error takes precedence */
+ error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
+ if (error)
+ return error;
+ return error2;
+}
+
+/*
* Write out the buffer at the given block for the given number of blocks.
* The buffer is kept locked across the write and is returned locked.
* This can only be used for synchronous log writes.
@@ -1229,20 +1258,12 @@ xlog_write_log_records(
*/
ealign = round_down(end_block, sectbb);
if (j == 0 && (start_block + endcount > ealign)) {
- offset = XFS_BUF_PTR(bp);
- balign = BBTOB(ealign - start_block);
- error = XFS_BUF_SET_PTR(bp, offset + balign,
- BBTOB(sectbb));
+ offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block);
+ error = xlog_bread_offset(log, ealign, sectbb,
+ bp, offset);
if (error)
break;
- error = xlog_bread_noalign(log, ealign, sectbb, bp);
- if (error)
- break;
-
- error = XFS_BUF_SET_PTR(bp, offset, bufblks);
- if (error)
- break;
}
offset = xlog_align(log, start_block, endcount, bp);
@@ -3448,19 +3469,9 @@ xlog_do_recovery_pass(
* - order is important.
*/
wrapped_hblks = hblks - split_hblks;
- error = XFS_BUF_SET_PTR(hbp,
- offset + BBTOB(split_hblks),
- BBTOB(hblks - split_hblks));
- if (error)
- goto bread_err2;
-
- error = xlog_bread_noalign(log, 0,
- wrapped_hblks, hbp);
- if (error)
- goto bread_err2;
-
- error = XFS_BUF_SET_PTR(hbp, offset,
- BBTOB(hblks));
+ error = xlog_bread_offset(log, 0,
+ wrapped_hblks, hbp,
+ offset + BBTOB(split_hblks));
if (error)
goto bread_err2;
}
@@ -3511,19 +3522,9 @@ xlog_do_recovery_pass(
* _first_, then the log start (LR header end)
* - order is important.
*/
- error = XFS_BUF_SET_PTR(dbp,
- offset + BBTOB(split_bblks),
- BBTOB(bblks - split_bblks));
- if (error)
- goto bread_err2;
-
- error = xlog_bread_noalign(log, wrapped_hblks,
- bblks - split_bblks,
- dbp);
- if (error)
- goto bread_err2;
-
- error = XFS_BUF_SET_PTR(dbp, offset, h_size);
+ error = xlog_bread_offset(log, 0,
+ bblks - split_bblks, dbp,
+ offset + BBTOB(split_bblks));
if (error)
goto bread_err2;
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb3f9a7b24ed..b49b82363d20 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1900,7 +1900,7 @@ xfs_mod_incore_sb_batch(
uint nmsb,
int rsvd)
{
- xfs_mod_sb_t *msbp = &msb[0];
+ xfs_mod_sb_t *msbp;
int error = 0;
/*
@@ -1910,7 +1910,7 @@ xfs_mod_incore_sb_batch(
* changes will be atomic.
*/
spin_lock(&mp->m_sb_lock);
- for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) {
+ for (msbp = msb; msbp < (msb + nmsb); msbp++) {
ASSERT(msbp->msb_field < XFS_SBS_ICOUNT ||
msbp->msb_field > XFS_SBS_FDBLOCKS);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 19af0ab0d0c6..3d68bb267c5f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -224,6 +224,7 @@ typedef struct xfs_mount {
#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
operations, typically for
disk errors in metadata */
+#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */
#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to
user */
#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 76922793f64f..7c7bc2b786bd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -608,10 +608,8 @@ STATIC void
xfs_trans_free(
struct xfs_trans *tp)
{
- struct xfs_busy_extent *busyp, *n;
-
- list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
- xfs_alloc_busy_clear(tp->t_mountp, busyp);
+ xfs_alloc_busy_sort(&tp->t_busy);
+ xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy, false);
atomic_dec(&tp->t_mountp->m_active_trans);
xfs_trans_free_dqinfo(tp);
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 26d1867d8156..65584b55607d 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
-typedef __uint32_t xlog_tid_t; /* transaction ID type */
-
/*
* These types are 64 bits on disk but are either 32 or 64 bits in memory.
* Disk based types: