summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/inode.c15
-rw-r--r--fs/affs/file.c18
-rw-r--r--fs/affs/inode.c5
-rw-r--r--fs/bfs/file.c15
-rw-r--r--fs/btrfs/ctree.c14
-rw-r--r--fs/btrfs/inode.c16
-rw-r--r--fs/cachefiles/interface.c57
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/key.c2
-rw-r--r--fs/cachefiles/namei.c3
-rw-r--r--fs/cachefiles/rdwr.c114
-rw-r--r--fs/cachefiles/xattr.c2
-rw-r--r--fs/ceph/addr.c60
-rw-r--r--fs/ceph/caps.c18
-rw-r--r--fs/ceph/file.c73
-rw-r--r--fs/ceph/inode.c15
-rw-r--r--fs/ceph/mds_client.c11
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/dcache.c35
-rw-r--r--fs/exec.c21
-rw-r--r--fs/exportfs/expfs.c4
-rw-r--r--fs/f2fs/Kconfig53
-rw-r--r--fs/f2fs/Makefile7
-rw-r--r--fs/f2fs/acl.c414
-rw-r--r--fs/f2fs/acl.h57
-rw-r--r--fs/f2fs/checkpoint.c794
-rw-r--r--fs/f2fs/data.c702
-rw-r--r--fs/f2fs/debug.c361
-rw-r--r--fs/f2fs/dir.c672
-rw-r--r--fs/f2fs/f2fs.h1083
-rw-r--r--fs/f2fs/file.c636
-rw-r--r--fs/f2fs/gc.c742
-rw-r--r--fs/f2fs/gc.h117
-rw-r--r--fs/f2fs/hash.c97
-rw-r--r--fs/f2fs/inode.c268
-rw-r--r--fs/f2fs/namei.c503
-rw-r--r--fs/f2fs/node.c1764
-rw-r--r--fs/f2fs/node.h353
-rw-r--r--fs/f2fs/recovery.c375
-rw-r--r--fs/f2fs/segment.c1791
-rw-r--r--fs/f2fs/segment.h618
-rw-r--r--fs/f2fs/super.c657
-rw-r--r--fs/f2fs/xattr.c440
-rw-r--r--fs/f2fs/xattr.h145
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/fscache/cache.c8
-rw-r--r--fs/fscache/cookie.c78
-rw-r--r--fs/fscache/internal.h15
-rw-r--r--fs/fscache/object-list.c2
-rw-r--r--fs/fscache/object.c101
-rw-r--r--fs/fscache/operation.c140
-rw-r--r--fs/fscache/page.c195
-rw-r--r--fs/fscache/stats.c17
-rw-r--r--fs/hfs/inode.c26
-rw-r--r--fs/hfsplus/inode.c27
-rw-r--r--fs/hpfs/file.c20
-rw-r--r--fs/hpfs/hpfs_fn.h1
-rw-r--r--fs/hpfs/inode.c5
-rw-r--r--fs/jfs/file.c6
-rw-r--r--fs/jfs/inode.c20
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/logfs/readwrite.c10
-rw-r--r--fs/minix/file.c6
-rw-r--r--fs/minix/inode.c17
-rw-r--r--fs/namei.c118
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/ncpfs/inode.c4
-rw-r--r--fs/nfs/fscache.c1
-rw-r--r--fs/nfs/fscache.h20
-rw-r--r--fs/nfs/inode.c20
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c3
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/nfsd/fault_inject.c113
-rw-r--r--fs/nfsd/fault_inject.h28
-rw-r--r--fs/nfsd/netns.h66
-rw-r--r--fs/nfsd/nfs2acl.c2
-rw-r--r--fs/nfsd/nfs3acl.c2
-rw-r--r--fs/nfsd/nfs3proc.c6
-rw-r--r--fs/nfsd/nfs3xdr.c47
-rw-r--r--fs/nfsd/nfs4callback.c69
-rw-r--r--fs/nfsd/nfs4proc.c74
-rw-r--r--fs/nfsd/nfs4recover.c561
-rw-r--r--fs/nfsd/nfs4state.c1015
-rw-r--r--fs/nfsd/nfs4xdr.c324
-rw-r--r--fs/nfsd/nfsctl.c100
-rw-r--r--fs/nfsd/nfsd.h36
-rw-r--r--fs/nfsd/nfsfh.c4
-rw-r--r--fs/nfsd/nfssvc.c203
-rw-r--r--fs/nfsd/nfsxdr.c11
-rw-r--r--fs/nfsd/state.h64
-rw-r--r--fs/nfsd/vfs.c53
-rw-r--r--fs/nfsd/xdr4.h15
-rw-r--r--fs/nilfs2/file.c1
-rw-r--r--fs/nilfs2/inode.c24
-rw-r--r--fs/nilfs2/nilfs.h1
-rw-r--r--fs/nilfs2/recovery.c3
-rw-r--r--fs/ntfs/file.c16
-rw-r--r--fs/ntfs/inode.c8
-rw-r--r--fs/ntfs/inode.h4
-rw-r--r--fs/ocfs2/file.c18
-rw-r--r--fs/omfs/file.c22
-rw-r--r--fs/open.c97
-rw-r--r--fs/proc/base.c7
-rw-r--r--fs/proc/generic.c9
-rw-r--r--fs/proc/proc_sysctl.c7
-rw-r--r--fs/reiserfs/file.c3
-rw-r--r--fs/reiserfs/inode.c15
-rw-r--r--fs/reiserfs/reiserfs.h1
-rw-r--r--fs/stat.c16
-rw-r--r--fs/statfs.c9
-rw-r--r--fs/sysv/file.c5
-rw-r--r--fs/sysv/itree.c17
-rw-r--r--fs/ufs/inode.c15
-rw-r--r--fs/utimes.c6
-rw-r--r--fs/xattr.c72
119 files changed, 15683 insertions, 1514 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index eaff24a19502..cfe512fd1caf 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -220,6 +220,7 @@ source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
+source "fs/f2fs/Kconfig"
endif # MISC_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 1d7af79288a0..9d53192236fc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_F2FS_FS) += f2fs/
obj-y += exofs/ # Multiple modules
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e9bad5093a3f..5f95d1ed9c6d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -45,6 +45,14 @@ static int adfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, adfs_get_block);
}
+static void adfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size)
+ truncate_pagecache(inode, to, inode->i_size);
+}
+
static int adfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -55,11 +63,8 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ adfs_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 2f4c935cb327..af3261b78102 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -39,7 +39,6 @@ const struct file_operations affs_file_operations = {
};
const struct inode_operations affs_file_inode_operations = {
- .truncate = affs_truncate,
.setattr = affs_notify_change,
};
@@ -402,6 +401,16 @@ static int affs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, affs_get_block);
}
+static void affs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ affs_truncate(inode);
+ }
+}
+
static int affs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -412,11 +421,8 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ affs_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 15c484268229..0e092d08680e 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -237,9 +237,12 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+
+ truncate_setsize(inode, attr->ia_size);
+ affs_truncate(inode);
}
setattr_copy(inode, attr);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index f20e8a71062f..ad3ea1497cc3 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -161,6 +161,14 @@ static int bfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, bfs_get_block);
}
+static void bfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size)
+ truncate_pagecache(inode, to, inode->i_size);
+}
+
static int bfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -169,11 +177,8 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep,
bfs_get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ bfs_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c7b67cf24bba..eea5da7a2b9a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
switch (tm->op) {
case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
BUG_ON(tm->slot < n);
- case MOD_LOG_KEY_REMOVE:
- n++;
case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case MOD_LOG_KEY_REMOVE:
btrfs_set_node_key(eb, &tm->key, tm->slot);
btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
+ n++;
break;
case MOD_LOG_KEY_REPLACE:
BUG_ON(tm->slot >= n);
@@ -4611,12 +4611,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 nritems;
int ret;
- if (level) {
- ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
- MOD_LOG_KEY_REMOVE);
- BUG_ON(ret < 0);
- }
-
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
if (level)
@@ -4627,6 +4621,10 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_node_key_ptr_offset(slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
+ } else if (level) {
+ ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+ MOD_LOG_KEY_REMOVE);
+ BUG_ON(ret < 0);
}
nritems--;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67ed24ae86bb..16d9e8e191e6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4262,16 +4262,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- if (unlikely(d_need_lookup(dentry))) {
- memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
- kfree(dentry->d_fsdata);
- dentry->d_fsdata = NULL;
- /* This thing is hashed, drop it for now */
- d_drop(dentry);
- } else {
- ret = btrfs_inode_by_name(dir, dentry, &location);
- }
-
+ ret = btrfs_inode_by_name(dir, dentry, &location);
if (ret < 0)
return ERR_PTR(ret);
@@ -4341,11 +4332,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *ret;
ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
- if (unlikely(d_need_lookup(dentry))) {
- spin_lock(&dentry->d_lock);
- dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
- spin_unlock(&dentry->d_lock);
- }
return ret;
}
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 67bef6d01484..746ce532e130 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -41,12 +41,12 @@ static struct fscache_object *cachefiles_alloc_object(
_enter("{%s},%p,", cache->cache.identifier, cookie);
- lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL);
+ lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp);
if (!lookup_data)
goto nomem_lookup_data;
/* create a new object record and a temporary leaf image */
- object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL);
+ object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp);
if (!object)
goto nomem_object;
@@ -63,7 +63,7 @@ static struct fscache_object *cachefiles_alloc_object(
* - stick the length on the front and leave space on the back for the
* encoder
*/
- buffer = kmalloc((2 + 512) + 3, GFP_KERNEL);
+ buffer = kmalloc((2 + 512) + 3, cachefiles_gfp);
if (!buffer)
goto nomem_buffer;
@@ -219,7 +219,7 @@ static void cachefiles_update_object(struct fscache_object *_object)
return;
}
- auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL);
+ auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
if (!auxdata) {
_leave(" [nomem]");
return;
@@ -441,6 +441,54 @@ truncate_failed:
}
/*
+ * Invalidate an object
+ */
+static void cachefiles_invalidate_object(struct fscache_operation *op)
+{
+ struct cachefiles_object *object;
+ struct cachefiles_cache *cache;
+ const struct cred *saved_cred;
+ struct path path;
+ uint64_t ni_size;
+ int ret;
+
+ object = container_of(op->object, struct cachefiles_object, fscache);
+ cache = container_of(object->fscache.cache,
+ struct cachefiles_cache, cache);
+
+ op->object->cookie->def->get_attr(op->object->cookie->netfs_data,
+ &ni_size);
+
+ _enter("{OBJ%x},[%llu]",
+ op->object->debug_id, (unsigned long long)ni_size);
+
+ if (object->backer) {
+ ASSERT(S_ISREG(object->backer->d_inode->i_mode));
+
+ fscache_set_store_limit(&object->fscache, ni_size);
+
+ path.dentry = object->backer;
+ path.mnt = cache->mnt;
+
+ cachefiles_begin_secure(cache, &saved_cred);
+ ret = vfs_truncate(&path, 0);
+ if (ret == 0)
+ ret = vfs_truncate(&path, ni_size);
+ cachefiles_end_secure(cache, saved_cred);
+
+ if (ret != 0) {
+ fscache_set_store_limit(&object->fscache, 0);
+ if (ret == -EIO)
+ cachefiles_io_error_obj(object,
+ "Invalidate failed");
+ }
+ }
+
+ fscache_op_complete(op, true);
+ _leave("");
+}
+
+/*
* dissociate a cache from all the pages it was backing
*/
static void cachefiles_dissociate_pages(struct fscache_cache *cache)
@@ -455,6 +503,7 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
.lookup_complete = cachefiles_lookup_complete,
.grab_object = cachefiles_grab_object,
.update_object = cachefiles_update_object,
+ .invalidate_object = cachefiles_invalidate_object,
.drop_object = cachefiles_drop_object,
.put_object = cachefiles_put_object,
.sync_cache = cachefiles_sync_cache,
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index bd6bc1bde2d7..49382519907a 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -23,6 +23,8 @@ extern unsigned cachefiles_debug;
#define CACHEFILES_DEBUG_KLEAVE 2
#define CACHEFILES_DEBUG_KDEBUG 4
+#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+
/*
* node records
*/
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index 81b8b2b3a674..33b58c60f2d1 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -78,7 +78,7 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
_debug("max: %d", max);
- key = kmalloc(max, GFP_KERNEL);
+ key = kmalloc(max, cachefiles_gfp);
if (!key)
return NULL;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index b0b5f7cdfffa..8c01c5fcdf75 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -40,8 +40,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
prefix, fscache_object_states[object->fscache.state],
object->fscache.flags, work_busy(&object->fscache.work),
- object->fscache.events,
- object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
+ object->fscache.events, object->fscache.event_mask);
printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
prefix, object->fscache.n_ops, object->fscache.n_in_progress,
object->fscache.n_exclusive);
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index c994691d9445..480992259707 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -77,25 +77,25 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
struct page *backpage = monitor->back_page, *backpage2;
int ret;
- kenter("{ino=%lx},{%lx,%lx}",
+ _enter("{ino=%lx},{%lx,%lx}",
object->backer->d_inode->i_ino,
backpage->index, backpage->flags);
/* skip if the page was truncated away completely */
if (backpage->mapping != bmapping) {
- kleave(" = -ENODATA [mapping]");
+ _leave(" = -ENODATA [mapping]");
return -ENODATA;
}
backpage2 = find_get_page(bmapping, backpage->index);
if (!backpage2) {
- kleave(" = -ENODATA [gone]");
+ _leave(" = -ENODATA [gone]");
return -ENODATA;
}
if (backpage != backpage2) {
put_page(backpage2);
- kleave(" = -ENODATA [different]");
+ _leave(" = -ENODATA [different]");
return -ENODATA;
}
@@ -114,7 +114,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
if (PageUptodate(backpage))
goto unlock_discard;
- kdebug("reissue read");
+ _debug("reissue read");
ret = bmapping->a_ops->readpage(NULL, backpage);
if (ret < 0)
goto unlock_discard;
@@ -129,7 +129,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
}
/* it'll reappear on the todo list */
- kleave(" = -EINPROGRESS");
+ _leave(" = -EINPROGRESS");
return -EINPROGRESS;
unlock_discard:
@@ -137,7 +137,7 @@ unlock_discard:
spin_lock_irq(&object->work_lock);
list_del(&monitor->op_link);
spin_unlock_irq(&object->work_lock);
- kleave(" = %d", ret);
+ _leave(" = %d", ret);
return ret;
}
@@ -174,11 +174,13 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
_debug("- copy {%lu}", monitor->back_page->index);
recheck:
- if (PageUptodate(monitor->back_page)) {
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING,
+ &object->fscache.cookie->flags)) {
+ error = -ESTALE;
+ } else if (PageUptodate(monitor->back_page)) {
copy_highpage(monitor->netfs_page, monitor->back_page);
-
- pagevec_add(&pagevec, monitor->netfs_page);
- fscache_mark_pages_cached(monitor->op, &pagevec);
+ fscache_mark_page_cached(monitor->op,
+ monitor->netfs_page);
error = 0;
} else if (!PageError(monitor->back_page)) {
/* the page has probably been truncated */
@@ -198,6 +200,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
fscache_end_io(op, monitor->netfs_page, error);
page_cache_release(monitor->netfs_page);
+ fscache_retrieval_complete(op, 1);
fscache_put_retrieval(op);
kfree(monitor);
@@ -239,7 +242,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
_debug("read back %p{%lu,%d}",
netpage, netpage->index, page_count(netpage));
- monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+ monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
if (!monitor)
goto nomem;
@@ -258,13 +261,14 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
goto backing_page_already_present;
if (!newpage) {
- newpage = page_cache_alloc_cold(bmapping);
+ newpage = __page_cache_alloc(cachefiles_gfp |
+ __GFP_COLD);
if (!newpage)
goto nomem_monitor;
}
ret = add_to_page_cache(newpage, bmapping,
- netpage->index, GFP_KERNEL);
+ netpage->index, cachefiles_gfp);
if (ret == 0)
goto installed_new_backing_page;
if (ret != -EEXIST)
@@ -335,11 +339,11 @@ backing_page_already_present:
backing_page_already_uptodate:
_debug("- uptodate");
- pagevec_add(pagevec, netpage);
- fscache_mark_pages_cached(op, pagevec);
+ fscache_mark_page_cached(op, netpage);
copy_highpage(netpage, backpage);
fscache_end_io(op, netpage, 0);
+ fscache_retrieval_complete(op, 1);
success:
_debug("success");
@@ -357,10 +361,13 @@ out:
read_error:
_debug("read error %d", ret);
- if (ret == -ENOMEM)
+ if (ret == -ENOMEM) {
+ fscache_retrieval_complete(op, 1);
goto out;
+ }
io_error:
cachefiles_io_error_obj(object, "Page read error on backing file");
+ fscache_retrieval_complete(op, 1);
ret = -ENOBUFS;
goto out;
@@ -370,6 +377,7 @@ nomem_monitor:
fscache_put_retrieval(monitor->op);
kfree(monitor);
nomem:
+ fscache_retrieval_complete(op, 1);
_leave(" = -ENOMEM");
return -ENOMEM;
}
@@ -408,7 +416,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
_enter("{%p},{%lx},,,", object, page->index);
if (!object->backer)
- return -ENOBUFS;
+ goto enobufs;
inode = object->backer->d_inode;
ASSERT(S_ISREG(inode->i_mode));
@@ -417,7 +425,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
/* calculate the shift required to use bmap */
if (inode->i_sb->s_blocksize > PAGE_SIZE)
- return -ENOBUFS;
+ goto enobufs;
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
@@ -448,15 +456,20 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
&pagevec);
} else if (cachefiles_has_space(cache, 0, 1) == 0) {
/* there's space in the cache we can use */
- pagevec_add(&pagevec, page);
- fscache_mark_pages_cached(op, &pagevec);
+ fscache_mark_page_cached(op, page);
+ fscache_retrieval_complete(op, 1);
ret = -ENODATA;
} else {
- ret = -ENOBUFS;
+ goto enobufs;
}
_leave(" = %d", ret);
return ret;
+
+enobufs:
+ fscache_retrieval_complete(op, 1);
+ _leave(" = -ENOBUFS");
+ return -ENOBUFS;
}
/*
@@ -465,8 +478,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
*/
static int cachefiles_read_backing_file(struct cachefiles_object *object,
struct fscache_retrieval *op,
- struct list_head *list,
- struct pagevec *mark_pvec)
+ struct list_head *list)
{
struct cachefiles_one_read *monitor = NULL;
struct address_space *bmapping = object->backer->d_inode->i_mapping;
@@ -485,7 +497,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
netpage, netpage->index, page_count(netpage));
if (!monitor) {
- monitor = kzalloc(sizeof(*monitor), GFP_KERNEL);
+ monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
if (!monitor)
goto nomem;
@@ -500,13 +512,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
goto backing_page_already_present;
if (!newpage) {
- newpage = page_cache_alloc_cold(bmapping);
+ newpage = __page_cache_alloc(cachefiles_gfp |
+ __GFP_COLD);
if (!newpage)
goto nomem;
}
ret = add_to_page_cache(newpage, bmapping,
- netpage->index, GFP_KERNEL);
+ netpage->index, cachefiles_gfp);
if (ret == 0)
goto installed_new_backing_page;
if (ret != -EEXIST)
@@ -536,10 +549,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
_debug("- monitor add");
ret = add_to_page_cache(netpage, op->mapping, netpage->index,
- GFP_KERNEL);
+ cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
page_cache_release(netpage);
+ fscache_retrieval_complete(op, 1);
continue;
}
goto nomem;
@@ -612,10 +626,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
_debug("- uptodate");
ret = add_to_page_cache(netpage, op->mapping, netpage->index,
- GFP_KERNEL);
+ cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
page_cache_release(netpage);
+ fscache_retrieval_complete(op, 1);
continue;
}
goto nomem;
@@ -626,16 +641,17 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
page_cache_release(backpage);
backpage = NULL;
- if (!pagevec_add(mark_pvec, netpage))
- fscache_mark_pages_cached(op, mark_pvec);
+ fscache_mark_page_cached(op, netpage);
page_cache_get(netpage);
if (!pagevec_add(&lru_pvec, netpage))
__pagevec_lru_add_file(&lru_pvec);
+ /* the netpage is unlocked and marked up to date here */
fscache_end_io(op, netpage, 0);
page_cache_release(netpage);
netpage = NULL;
+ fscache_retrieval_complete(op, 1);
continue;
}
@@ -661,6 +677,7 @@ out:
list_for_each_entry_safe(netpage, _n, list, lru) {
list_del(&netpage->lru);
page_cache_release(netpage);
+ fscache_retrieval_complete(op, 1);
}
_leave(" = %d", ret);
@@ -669,15 +686,17 @@ out:
nomem:
_debug("nomem");
ret = -ENOMEM;
- goto out;
+ goto record_page_complete;
read_error:
_debug("read error %d", ret);
if (ret == -ENOMEM)
- goto out;
+ goto record_page_complete;
io_error:
cachefiles_io_error_obj(object, "Page read error on backing file");
ret = -ENOBUFS;
+record_page_complete:
+ fscache_retrieval_complete(op, 1);
goto out;
}
@@ -709,7 +728,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
*nr_pages);
if (!object->backer)
- return -ENOBUFS;
+ goto all_enobufs;
space = 1;
if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
@@ -722,7 +741,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
/* calculate the shift required to use bmap */
if (inode->i_sb->s_blocksize > PAGE_SIZE)
- return -ENOBUFS;
+ goto all_enobufs;
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
@@ -762,7 +781,10 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
nrbackpages++;
} else if (space && pagevec_add(&pagevec, page) == 0) {
fscache_mark_pages_cached(op, &pagevec);
+ fscache_retrieval_complete(op, 1);
ret = -ENODATA;
+ } else {
+ fscache_retrieval_complete(op, 1);
}
}
@@ -775,18 +797,18 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
/* submit the apparently valid pages to the backing fs to be read from
* disk */
if (nrbackpages > 0) {
- ret2 = cachefiles_read_backing_file(object, op, &backpages,
- &pagevec);
+ ret2 = cachefiles_read_backing_file(object, op, &backpages);
if (ret2 == -ENOMEM || ret2 == -EINTR)
ret = ret2;
}
- if (pagevec_count(&pagevec) > 0)
- fscache_mark_pages_cached(op, &pagevec);
-
_leave(" = %d [nr=%u%s]",
ret, *nr_pages, list_empty(pages) ? " empty" : "");
return ret;
+
+all_enobufs:
+ fscache_retrieval_complete(op, *nr_pages);
+ return -ENOBUFS;
}
/*
@@ -806,7 +828,6 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
{
struct cachefiles_object *object;
struct cachefiles_cache *cache;
- struct pagevec pagevec;
int ret;
object = container_of(op->op.object,
@@ -817,14 +838,12 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
_enter("%p,{%lx},", object, page->index);
ret = cachefiles_has_space(cache, 0, 1);
- if (ret == 0) {
- pagevec_init(&pagevec, 0);
- pagevec_add(&pagevec, page);
- fscache_mark_pages_cached(op, &pagevec);
- } else {
+ if (ret == 0)
+ fscache_mark_page_cached(op, page);
+ else
ret = -ENOBUFS;
- }
+ fscache_retrieval_complete(op, 1);
_leave(" = %d", ret);
return ret;
}
@@ -874,6 +893,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op,
ret = -ENOBUFS;
}
+ fscache_retrieval_complete(op, *nr_pages);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index e18b183b47e1..73b46288b54b 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -174,7 +174,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
ASSERT(dentry);
ASSERT(dentry->d_inode);
- auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL);
+ auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
if (!auxbuf) {
_leave(" = -ENOMEM");
return -ENOMEM;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6690269f5dde..064d1a68d2c1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -267,6 +267,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
kfree(req->r_pages);
}
+static void ceph_unlock_page_vector(struct page **pages, int num_pages)
+{
+ int i;
+
+ for (i = 0; i < num_pages; i++)
+ unlock_page(pages[i]);
+}
+
/*
* start an async read(ahead) operation. return nr_pages we submitted
* a read for on success, or negative error code.
@@ -347,6 +355,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
return nr_pages;
out_pages:
+ ceph_unlock_page_vector(pages, nr_pages);
ceph_release_page_vector(pages, nr_pages);
out:
ceph_osdc_put_request(req);
@@ -1078,23 +1087,51 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = file->private_data;
struct page *page;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- int r;
+ int r, want, got = 0;
+
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+
+ dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos, len, inode->i_size);
+ r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
+ if (r < 0)
+ return r;
+ dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
+ if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
+ ceph_put_cap_refs(ci, got);
+ return -EAGAIN;
+ }
do {
/* get a page */
page = grab_cache_page_write_begin(mapping, index, 0);
- if (!page)
- return -ENOMEM;
- *pagep = page;
+ if (!page) {
+ r = -ENOMEM;
+ break;
+ }
dout("write_begin file %p inode %p page %p %d~%d\n", file,
inode, page, (int)pos, (int)len);
r = ceph_update_writeable_page(file, pos, len, page);
+ if (r)
+ page_cache_release(page);
} while (r == -EAGAIN);
+ if (r) {
+ ceph_put_cap_refs(ci, got);
+ } else {
+ *pagep = page;
+ *(int *)fsdata = got;
+ }
return r;
}
@@ -1108,10 +1145,12 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
int check_cap = 0;
+ int got = (unsigned long)fsdata;
dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
inode, page, (int)pos, (int)copied, (int)len);
@@ -1134,6 +1173,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
up_read(&mdsc->snap_rwsem);
page_cache_release(page);
+ if (copied > 0) {
+ int dirty;
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n",
+ inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
+ ceph_put_cap_refs(ci, got);
+
if (check_cap)
ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3251e9cc6401..a1d9bb30c1bf 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -236,8 +236,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
if (!ctx) {
cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
if (cap) {
+ spin_lock(&mdsc->caps_list_lock);
mdsc->caps_use_count++;
mdsc->caps_total_count++;
+ spin_unlock(&mdsc->caps_list_lock);
}
return cap;
}
@@ -1349,11 +1351,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
if (!ci->i_head_snapc)
ci->i_head_snapc = ceph_get_snap_context(
ci->i_snap_realm->cached_context);
- dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
- ci->i_head_snapc);
+ dout(" inode %p now dirty snapc %p auth cap %p\n",
+ &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ if (ci->i_auth_cap)
+ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ else
+ list_add(&ci->i_dirty_item,
+ &mdsc->cap_dirty_migrating);
spin_unlock(&mdsc->cap_dirty_lock);
if (ci->i_flushing_caps == 0) {
ihold(inode);
@@ -2388,7 +2394,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
&atime);
/* max size increase? */
- if (max_size != ci->i_max_size) {
+ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
ci->i_max_size = max_size;
if (max_size >= ci->i_wanted_max_size) {
@@ -2745,6 +2751,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
/* make sure we re-request max_size, if necessary */
spin_lock(&ci->i_ceph_lock);
+ ci->i_wanted_max_size = 0; /* reset */
ci->i_requested_max_size = 0;
spin_unlock(&ci->i_ceph_lock);
}
@@ -2840,8 +2847,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_IMPORT:
handle_cap_import(mdsc, inode, h, session,
snaptrace, snaptrace_len);
- ceph_check_caps(ceph_inode(inode), 0, session);
- goto done_unlocked;
}
/* the rest require a cap */
@@ -2858,6 +2863,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
switch (op) {
case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT:
+ case CEPH_CAP_OP_IMPORT:
handle_cap_grant(inode, h, session, cap, msg->middle);
goto done_unlocked;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d4dfdcf76d7f..e51558fca3a3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
loff_t endoff = pos + iov->iov_len;
- int want, got = 0;
- int ret, err;
+ int got = 0;
+ int ret, err, written;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
retry_snap:
+ written = 0;
if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
return -ENOSPC;
__ceph_do_pending_vmtruncate(inode);
- dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- inode->i_size);
- if (fi->fmode & CEPH_FILE_MODE_LAZY)
- want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
- else
- want = CEPH_CAP_FILE_BUFFER;
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
- if (ret < 0)
- goto out_put;
-
- dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- ceph_cap_string(got));
-
- if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
- (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
- (fi->flags & CEPH_F_SYNC)) {
- ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
- &iocb->ki_pos);
- } else {
- /*
- * buffered write; drop Fw early to avoid slow
- * revocation if we get stuck on balance_dirty_pages
- */
- int dirty;
-
- spin_lock(&ci->i_ceph_lock);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
- spin_unlock(&ci->i_ceph_lock);
- ceph_put_cap_refs(ci, got);
+ /*
+ * try to do a buffered write. if we don't have sufficient
+ * caps, we'll get -EAGAIN from generic_file_aio_write, or a
+ * short write if we only get caps for some pages.
+ */
+ if (!(iocb->ki_filp->f_flags & O_DIRECT) &&
+ !(inode->i_sb->s_flags & MS_SYNCHRONOUS) &&
+ !(fi->flags & CEPH_F_SYNC)) {
ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ if (ret >= 0)
+ written = ret;
+
if ((ret >= 0 || ret == -EIOCBQUEUED) &&
((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
|| ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
- err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
+ err = vfs_fsync_range(file, pos, pos + written - 1, 1);
if (err < 0)
ret = err;
}
+ if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff)
+ goto out;
+ }
- if (dirty)
- __mark_inode_dirty(inode, dirty);
+ dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
+ inode, ceph_vinop(inode), pos + written,
+ (unsigned)iov->iov_len - written, inode->i_size);
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff);
+ if (ret < 0)
goto out;
- }
+ dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), pos + written,
+ (unsigned)iov->iov_len - written, ceph_cap_string(got));
+ ret = ceph_sync_write(file, iov->iov_base + written,
+ iov->iov_len - written, &iocb->ki_pos);
if (ret >= 0) {
int dirty;
spin_lock(&ci->i_ceph_lock);
@@ -777,13 +767,10 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
}
-
-out_put:
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
- ceph_cap_string(got));
+ inode, ceph_vinop(inode), pos + written,
+ (unsigned)iov->iov_len - written, ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
-
out:
if (ret == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ba95eea201bf..2971eaa65cdc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1466,7 +1466,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
u64 to;
- int wrbuffer_refs, wake = 0;
+ int wrbuffer_refs, finish = 0;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1498,15 +1498,18 @@ retry:
truncate_inode_pages(inode->i_mapping, to);
spin_lock(&ci->i_ceph_lock);
- ci->i_truncate_pending--;
- if (ci->i_truncate_pending == 0)
- wake = 1;
+ if (to == ci->i_truncate_size) {
+ ci->i_truncate_pending = 0;
+ finish = 1;
+ }
spin_unlock(&ci->i_ceph_lock);
+ if (!finish)
+ goto retry;
if (wrbuffer_refs == 0)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- if (wake)
- wake_up_all(&ci->i_cap_wq);
+
+ wake_up_all(&ci->i_cap_wq);
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1bcf712655d9..9165eb8309eb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1590,7 +1590,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
} else if (rpath || rino) {
*ino = rino;
*ppath = rpath;
- *pathlen = strlen(rpath);
+ *pathlen = rpath ? strlen(rpath) : 0;
dout(" path %.*s\n", *pathlen, rpath);
}
@@ -1876,9 +1876,14 @@ finish:
static void __wake_requests(struct ceph_mds_client *mdsc,
struct list_head *head)
{
- struct ceph_mds_request *req, *nreq;
+ struct ceph_mds_request *req;
+ LIST_HEAD(tmp_list);
+
+ list_splice_init(head, &tmp_list);
- list_for_each_entry_safe(req, nreq, head, r_wait) {
+ while (!list_empty(&tmp_list)) {
+ req = list_entry(tmp_list.next,
+ struct ceph_mds_request, r_wait);
list_del_init(&req->r_wait);
__do_request(mdsc, req);
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2eb43f211325..e86aa9948124 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -403,8 +403,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
- if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
- seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, ",osdkeepalivetimeout=%d",
opt->osd_keepalive_timeout);
@@ -849,7 +847,7 @@ static int ceph_register_bdi(struct super_block *sb,
fsc->backing_dev_info.ra_pages =
default_backing_dev_info.ra_pages;
- err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
+ err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
atomic_long_inc_return(&bdi_seq));
if (!err)
sb->s_bdi = &fsc->backing_dev_info;
diff --git a/fs/dcache.c b/fs/dcache.c
index 3a463d0c4fe8..19153a0a810c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -455,24 +455,6 @@ void d_drop(struct dentry *dentry)
EXPORT_SYMBOL(d_drop);
/*
- * d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag
- * @dentry: dentry to drop
- *
- * This is called when we do a lookup on a placeholder dentry that needed to be
- * looked up. The dentry should have been hashed in order for it to be found by
- * the lookup code, but now needs to be unhashed while we do the actual lookup
- * and clear the DCACHE_NEED_LOOKUP flag.
- */
-void d_clear_need_lookup(struct dentry *dentry)
-{
- spin_lock(&dentry->d_lock);
- __d_drop(dentry);
- dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
- spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL(d_clear_need_lookup);
-
-/*
* Finish off a dentry we've decided to kill.
* dentry->d_lock must be held, returns with it unlocked.
* If ref is non-zero, then decrement the refcount too.
@@ -565,13 +547,7 @@ repeat:
if (d_unhashed(dentry))
goto kill_it;
- /*
- * If this dentry needs lookup, don't set the referenced flag so that it
- * is more likely to be cleaned up by the dcache shrinker in case of
- * memory pressure.
- */
- if (!d_need_lookup(dentry))
- dentry->d_flags |= DCACHE_REFERENCED;
+ dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
dentry->d_count--;
@@ -1583,7 +1559,7 @@ EXPORT_SYMBOL(d_find_any_alias);
*/
struct dentry *d_obtain_alias(struct inode *inode)
{
- static const struct qstr anonstring = { .name = "" };
+ static const struct qstr anonstring = QSTR_INIT("/", 1);
struct dentry *tmp;
struct dentry *res;
@@ -1737,13 +1713,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
}
/*
- * We are going to instantiate this dentry, unhash it and clear the
- * lookup flag so we can do that.
- */
- if (unlikely(d_need_lookup(found)))
- d_clear_need_lookup(found);
-
- /*
* Negative dentry: instantiate it unless the inode is a directory and
* already has a dentry.
*/
diff --git a/fs/exec.c b/fs/exec.c
index 237d5342786c..18c45cac368f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1669,7 +1669,6 @@ int get_dumpable(struct mm_struct *mm)
return __get_dumpable(mm->flags);
}
-#ifdef __ARCH_WANT_SYS_EXECVE
SYSCALL_DEFINE3(execve,
const char __user *, filename,
const char __user *const __user *, argv,
@@ -1697,23 +1696,3 @@ asmlinkage long compat_sys_execve(const char __user * filename,
return error;
}
#endif
-#endif
-
-#ifdef __ARCH_WANT_KERNEL_EXECVE
-int kernel_execve(const char *filename,
- const char *const argv[],
- const char *const envp[])
-{
- int ret = do_execve(filename,
- (const char __user *const __user *)argv,
- (const char __user *const __user *)envp);
- if (ret < 0)
- return ret;
-
- /*
- * We were successful. We won't be returning to our caller, but
- * instead to user space by manipulating the kernel stack.
- */
- ret_from_kernel_execve(current_pt_regs());
-}
-#endif
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 606bb074c501..5df4bb4aab14 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -322,10 +322,10 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,
if (parent && (len < 4)) {
*max_len = 4;
- return 255;
+ return FILEID_INVALID;
} else if (len < 2) {
*max_len = 2;
- return 255;
+ return FILEID_INVALID;
}
len = 2;
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
new file mode 100644
index 000000000000..fd27e7e6326e
--- /dev/null
+++ b/fs/f2fs/Kconfig
@@ -0,0 +1,53 @@
+config F2FS_FS
+ tristate "F2FS filesystem support (EXPERIMENTAL)"
+ depends on BLOCK
+ help
+ F2FS is based on Log-structured File System (LFS), which supports
+ versatile "flash-friendly" features. The design has been focused on
+ addressing the fundamental issues in LFS, which are snowball effect
+ of wandering tree and high cleaning overhead.
+
+ Since flash-based storages show different characteristics according to
+ the internal geometry or flash memory management schemes aka FTL, F2FS
+ and tools support various parameters not only for configuring on-disk
+ layout, but also for selecting allocation and cleaning algorithms.
+
+ If unsure, say N.
+
+config F2FS_STAT_FS
+ bool "F2FS Status Information"
+ depends on F2FS_FS && DEBUG_FS
+ default y
+ help
+ /sys/kernel/debug/f2fs/ contains information about all the partitions
+ mounted as f2fs. Each file shows the whole f2fs information.
+
+ /sys/kernel/debug/f2fs/status includes:
+ - major file system information managed by f2fs currently
+ - average SIT information about whole segments
+ - current memory footprint consumed by f2fs.
+
+config F2FS_FS_XATTR
+ bool "F2FS extended attributes"
+ depends on F2FS_FS
+ default y
+ help
+ Extended attributes are name:value pairs associated with inodes by
+ the kernel or by users (see the attr(5) manual page, or visit
+ <http://acl.bestbits.at/> for details).
+
+ If unsure, say N.
+
+config F2FS_FS_POSIX_ACL
+ bool "F2FS Access Control Lists"
+ depends on F2FS_FS_XATTR
+ select FS_POSIX_ACL
+ default y
+ help
+ Posix Access Control Lists (ACLs) support permissions for users and
+ gourps beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
new file mode 100644
index 000000000000..27a0820340b9
--- /dev/null
+++ b/fs/f2fs/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_F2FS_FS) += f2fs.o
+
+f2fs-y := dir.o file.o inode.o namei.o hash.o super.o
+f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
+f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
+f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
+f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
new file mode 100644
index 000000000000..fed74d193ffb
--- /dev/null
+++ b/fs/f2fs/acl.c
@@ -0,0 +1,414 @@
+/*
+ * fs/f2fs/acl.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/acl.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+#define get_inode_mode(i) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
+ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
+
+static inline size_t f2fs_acl_size(int count)
+{
+ if (count <= 4) {
+ return sizeof(struct f2fs_acl_header) +
+ count * sizeof(struct f2fs_acl_entry_short);
+ } else {
+ return sizeof(struct f2fs_acl_header) +
+ 4 * sizeof(struct f2fs_acl_entry_short) +
+ (count - 4) * sizeof(struct f2fs_acl_entry);
+ }
+}
+
+static inline int f2fs_acl_count(size_t size)
+{
+ ssize_t s;
+ size -= sizeof(struct f2fs_acl_header);
+ s = size - 4 * sizeof(struct f2fs_acl_entry_short);
+ if (s < 0) {
+ if (size % sizeof(struct f2fs_acl_entry_short))
+ return -1;
+ return size / sizeof(struct f2fs_acl_entry_short);
+ } else {
+ if (s % sizeof(struct f2fs_acl_entry))
+ return -1;
+ return s / sizeof(struct f2fs_acl_entry) + 4;
+ }
+}
+
+static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
+{
+ int i, count;
+ struct posix_acl *acl;
+ struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value;
+ struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1);
+ const char *end = value + size;
+
+ if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION))
+ return ERR_PTR(-EINVAL);
+
+ count = f2fs_acl_count(size);
+ if (count < 0)
+ return ERR_PTR(-EINVAL);
+ if (count == 0)
+ return NULL;
+
+ acl = posix_acl_alloc(count, GFP_KERNEL);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < count; i++) {
+
+ if ((char *)entry > end)
+ goto fail;
+
+ acl->a_entries[i].e_tag = le16_to_cpu(entry->e_tag);
+ acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm);
+
+ switch (acl->a_entries[i].e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ acl->a_entries[i].e_id = ACL_UNDEFINED_ID;
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry_short));
+ break;
+
+ case ACL_USER:
+ acl->a_entries[i].e_uid =
+ make_kuid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry));
+ break;
+ case ACL_GROUP:
+ acl->a_entries[i].e_gid =
+ make_kgid(&init_user_ns,
+ le32_to_cpu(entry->e_id));
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry));
+ break;
+ default:
+ goto fail;
+ }
+ }
+ if ((char *)entry != end)
+ goto fail;
+ return acl;
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
+}
+
+static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
+{
+ struct f2fs_acl_header *f2fs_acl;
+ struct f2fs_acl_entry *entry;
+ int i;
+
+ f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
+ sizeof(struct f2fs_acl_entry), GFP_KERNEL);
+ if (!f2fs_acl)
+ return ERR_PTR(-ENOMEM);
+
+ f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION);
+ entry = (struct f2fs_acl_entry *)(f2fs_acl + 1);
+
+ for (i = 0; i < acl->a_count; i++) {
+
+ entry->e_tag = cpu_to_le16(acl->a_entries[i].e_tag);
+ entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm);
+
+ switch (acl->a_entries[i].e_tag) {
+ case ACL_USER:
+ entry->e_id = cpu_to_le32(
+ from_kuid(&init_user_ns,
+ acl->a_entries[i].e_uid));
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry));
+ break;
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(
+ from_kgid(&init_user_ns,
+ acl->a_entries[i].e_gid));
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry));
+ break;
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ entry = (struct f2fs_acl_entry *)((char *)entry +
+ sizeof(struct f2fs_acl_entry_short));
+ break;
+ default:
+ goto fail;
+ }
+ }
+ *size = f2fs_acl_size(acl->a_count);
+ return (void *)f2fs_acl;
+
+fail:
+ kfree(f2fs_acl);
+ return ERR_PTR(-EINVAL);
+}
+
+struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ void *value = NULL;
+ struct posix_acl *acl;
+ int retval;
+
+ if (!test_opt(sbi, POSIX_ACL))
+ return NULL;
+
+ acl = get_cached_acl(inode, type);
+ if (acl != ACL_NOT_CACHED)
+ return acl;
+
+ if (type == ACL_TYPE_ACCESS)
+ name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
+
+ retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
+ if (retval > 0) {
+ value = kmalloc(retval, GFP_KERNEL);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ retval = f2fs_getxattr(inode, name_index, "", value, retval);
+ }
+
+ if (retval < 0) {
+ if (retval == -ENODATA)
+ acl = NULL;
+ else
+ acl = ERR_PTR(retval);
+ } else {
+ acl = f2fs_acl_from_disk(value, retval);
+ }
+ kfree(value);
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ int name_index;
+ void *value = NULL;
+ size_t size = 0;
+ int error;
+
+ if (!test_opt(sbi, POSIX_ACL))
+ return 0;
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ error = posix_acl_equiv_mode(acl, &inode->i_mode);
+ if (error < 0)
+ return error;
+ set_acl_inode(fi, inode->i_mode);
+ if (error == 0)
+ acl = NULL;
+ }
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ value = f2fs_acl_to_disk(acl, &size);
+ if (IS_ERR(value)) {
+ cond_clear_inode_flag(fi, FI_ACL_MODE);
+ return (int)PTR_ERR(value);
+ }
+ }
+
+ error = f2fs_setxattr(inode, name_index, "", value, size);
+
+ kfree(value);
+ if (!error)
+ set_cached_acl(inode, type, acl);
+
+ cond_clear_inode_flag(fi, FI_ACL_MODE);
+ return error;
+}
+
+int f2fs_init_acl(struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *acl = NULL;
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ int error = 0;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (test_opt(sbi, POSIX_ACL)) {
+ acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ if (!acl)
+ inode->i_mode &= ~current_umask();
+ }
+
+ if (test_opt(sbi, POSIX_ACL) && acl) {
+
+ if (S_ISDIR(inode->i_mode)) {
+ error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+ if (error)
+ goto cleanup;
+ }
+ error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
+ if (error < 0)
+ return error;
+ if (error > 0)
+ error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+ }
+cleanup:
+ posix_acl_release(acl);
+ return error;
+}
+
+int f2fs_acl_chmod(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct posix_acl *acl;
+ int error;
+ mode_t mode = get_inode_mode(inode);
+
+ if (!test_opt(sbi, POSIX_ACL))
+ return 0;
+ if (S_ISLNK(mode))
+ return -EOPNOTSUPP;
+
+ acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl) || !acl)
+ return PTR_ERR(acl);
+
+ error = posix_acl_chmod(&acl, GFP_KERNEL, mode);
+ if (error)
+ return error;
+ error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+ posix_acl_release(acl);
+ return error;
+}
+
+static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ const char *xname = POSIX_ACL_XATTR_DEFAULT;
+ size_t size;
+
+ if (!test_opt(sbi, POSIX_ACL))
+ return 0;
+
+ if (type == ACL_TYPE_ACCESS)
+ xname = POSIX_ACL_XATTR_ACCESS;
+
+ size = strlen(xname) + 1;
+ if (list && size <= list_size)
+ memcpy(list, xname, size);
+ return size;
+}
+
+static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ struct posix_acl *acl;
+ int error;
+
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ if (!test_opt(sbi, POSIX_ACL))
+ return -EOPNOTSUPP;
+
+ acl = f2fs_get_acl(dentry->d_inode, type);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (!acl)
+ return -ENODATA;
+ error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+ posix_acl_release(acl);
+
+ return error;
+}
+
+static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ struct inode *inode = dentry->d_inode;
+ struct posix_acl *acl = NULL;
+ int error;
+
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ if (!test_opt(sbi, POSIX_ACL))
+ return -EOPNOTSUPP;
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
+ if (value) {
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl) {
+ error = posix_acl_valid(acl);
+ if (error)
+ goto release_and_out;
+ }
+ } else {
+ acl = NULL;
+ }
+
+ error = f2fs_set_acl(inode, type, acl);
+
+release_and_out:
+ posix_acl_release(acl);
+ return error;
+}
+
+const struct xattr_handler f2fs_xattr_acl_default_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
+ .list = f2fs_xattr_list_acl,
+ .get = f2fs_xattr_get_acl,
+ .set = f2fs_xattr_set_acl,
+};
+
+const struct xattr_handler f2fs_xattr_acl_access_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .list = f2fs_xattr_list_acl,
+ .get = f2fs_xattr_get_acl,
+ .set = f2fs_xattr_set_acl,
+};
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
new file mode 100644
index 000000000000..80f430674417
--- /dev/null
+++ b/fs/f2fs/acl.h
@@ -0,0 +1,57 @@
+/*
+ * fs/f2fs/acl.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/acl.h
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_ACL_H__
+#define __F2FS_ACL_H__
+
+#include <linux/posix_acl_xattr.h>
+
+#define F2FS_ACL_VERSION 0x0001
+
+struct f2fs_acl_entry {
+ __le16 e_tag;
+ __le16 e_perm;
+ __le32 e_id;
+};
+
+struct f2fs_acl_entry_short {
+ __le16 e_tag;
+ __le16 e_perm;
+};
+
+struct f2fs_acl_header {
+ __le32 a_version;
+};
+
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+
+extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type);
+extern int f2fs_acl_chmod(struct inode *inode);
+extern int f2fs_init_acl(struct inode *inode, struct inode *dir);
+#else
+#define f2fs_check_acl NULL
+#define f2fs_get_acl NULL
+#define f2fs_set_acl NULL
+
+static inline int f2fs_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+
+static inline int f2fs_init_acl(struct inode *inode, struct inode *dir)
+{
+ return 0;
+}
+#endif
+#endif /* __F2FS_ACL_H__ */
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
new file mode 100644
index 000000000000..6ef36c37e2be
--- /dev/null
+++ b/fs/f2fs/checkpoint.c
@@ -0,0 +1,794 @@
+/*
+ * fs/f2fs/checkpoint.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/f2fs_fs.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+static struct kmem_cache *orphan_entry_slab;
+static struct kmem_cache *inode_entry_slab;
+
+/*
+ * We guarantee no failure on the returned page.
+ */
+struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ struct address_space *mapping = sbi->meta_inode->i_mapping;
+ struct page *page = NULL;
+repeat:
+ page = grab_cache_page(mapping, index);
+ if (!page) {
+ cond_resched();
+ goto repeat;
+ }
+
+ /* We wait writeback only inside grab_meta_page() */
+ wait_on_page_writeback(page);
+ SetPageUptodate(page);
+ return page;
+}
+
+/*
+ * We guarantee no failure on the returned page.
+ */
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ struct address_space *mapping = sbi->meta_inode->i_mapping;
+ struct page *page;
+repeat:
+ page = grab_cache_page(mapping, index);
+ if (!page) {
+ cond_resched();
+ goto repeat;
+ }
+ if (f2fs_readpage(sbi, page, index, READ_SYNC)) {
+ f2fs_put_page(page, 1);
+ goto repeat;
+ }
+ mark_page_accessed(page);
+
+ /* We do not allow returning an errorneous page */
+ return page;
+}
+
+static int f2fs_write_meta_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ int err;
+
+ wait_on_page_writeback(page);
+
+ err = write_meta_page(sbi, page, wbc);
+ if (err) {
+ wbc->pages_skipped++;
+ set_page_dirty(page);
+ }
+
+ dec_page_count(sbi, F2FS_DIRTY_META);
+
+ /* In this case, we should not unlock this page */
+ if (err != AOP_WRITEPAGE_ACTIVATE)
+ unlock_page(page);
+ return err;
+}
+
+static int f2fs_write_meta_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ struct block_device *bdev = sbi->sb->s_bdev;
+ long written;
+
+ if (wbc->for_kupdate)
+ return 0;
+
+ if (get_pages(sbi, F2FS_DIRTY_META) == 0)
+ return 0;
+
+ /* if mounting is failed, skip writing node pages */
+ mutex_lock(&sbi->cp_mutex);
+ written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev));
+ mutex_unlock(&sbi->cp_mutex);
+ wbc->nr_to_write -= written;
+ return 0;
+}
+
+long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
+ long nr_to_write)
+{
+ struct address_space *mapping = sbi->meta_inode->i_mapping;
+ pgoff_t index = 0, end = LONG_MAX;
+ struct pagevec pvec;
+ long nwritten = 0;
+ struct writeback_control wbc = {
+ .for_reclaim = 0,
+ };
+
+ pagevec_init(&pvec, 0);
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ lock_page(page);
+ BUG_ON(page->mapping != mapping);
+ BUG_ON(!PageDirty(page));
+ clear_page_dirty_for_io(page);
+ f2fs_write_meta_page(page, &wbc);
+ if (nwritten++ >= nr_to_write)
+ break;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+
+ if (nwritten)
+ f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX);
+
+ return nwritten;
+}
+
+static int f2fs_set_meta_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+
+ SetPageUptodate(page);
+ if (!PageDirty(page)) {
+ __set_page_dirty_nobuffers(page);
+ inc_page_count(sbi, F2FS_DIRTY_META);
+ F2FS_SET_SB_DIRT(sbi);
+ return 1;
+ }
+ return 0;
+}
+
+const struct address_space_operations f2fs_meta_aops = {
+ .writepage = f2fs_write_meta_page,
+ .writepages = f2fs_write_meta_pages,
+ .set_page_dirty = f2fs_set_meta_page_dirty,
+};
+
+int check_orphan_space(struct f2fs_sb_info *sbi)
+{
+ unsigned int max_orphans;
+ int err = 0;
+
+ /*
+ * considering 512 blocks in a segment 5 blocks are needed for cp
+ * and log segment summaries. Remaining blocks are used to keep
+ * orphan entries with the limitation one reserved segment
+ * for cp pack we can have max 1020*507 orphan entries
+ */
+ max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
+ mutex_lock(&sbi->orphan_inode_mutex);
+ if (sbi->n_orphans >= max_orphans)
+ err = -ENOSPC;
+ mutex_unlock(&sbi->orphan_inode_mutex);
+ return err;
+}
+
+void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct list_head *head, *this;
+ struct orphan_inode_entry *new = NULL, *orphan = NULL;
+
+ mutex_lock(&sbi->orphan_inode_mutex);
+ head = &sbi->orphan_inode_list;
+ list_for_each(this, head) {
+ orphan = list_entry(this, struct orphan_inode_entry, list);
+ if (orphan->ino == ino)
+ goto out;
+ if (orphan->ino > ino)
+ break;
+ orphan = NULL;
+ }
+retry:
+ new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
+ if (!new) {
+ cond_resched();
+ goto retry;
+ }
+ new->ino = ino;
+ INIT_LIST_HEAD(&new->list);
+
+ /* add new_oentry into list which is sorted by inode number */
+ if (orphan) {
+ struct orphan_inode_entry *prev;
+
+ /* get previous entry */
+ prev = list_entry(orphan->list.prev, typeof(*prev), list);
+ if (&prev->list != head)
+ /* insert new orphan inode entry */
+ list_add(&new->list, &prev->list);
+ else
+ list_add(&new->list, head);
+ } else {
+ list_add_tail(&new->list, head);
+ }
+ sbi->n_orphans++;
+out:
+ mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct list_head *this, *next, *head;
+ struct orphan_inode_entry *orphan;
+
+ mutex_lock(&sbi->orphan_inode_mutex);
+ head = &sbi->orphan_inode_list;
+ list_for_each_safe(this, next, head) {
+ orphan = list_entry(this, struct orphan_inode_entry, list);
+ if (orphan->ino == ino) {
+ list_del(&orphan->list);
+ kmem_cache_free(orphan_entry_slab, orphan);
+ sbi->n_orphans--;
+ break;
+ }
+ }
+ mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ struct inode *inode = f2fs_iget(sbi->sb, ino);
+ BUG_ON(IS_ERR(inode));
+ clear_nlink(inode);
+
+ /* truncate all the data during iput */
+ iput(inode);
+}
+
+int recover_orphan_inodes(struct f2fs_sb_info *sbi)
+{
+ block_t start_blk, orphan_blkaddr, i, j;
+
+ if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
+ return 0;
+
+ sbi->por_doing = 1;
+ start_blk = __start_cp_addr(sbi) + 1;
+ orphan_blkaddr = __start_sum_addr(sbi) - 1;
+
+ for (i = 0; i < orphan_blkaddr; i++) {
+ struct page *page = get_meta_page(sbi, start_blk + i);
+ struct f2fs_orphan_block *orphan_blk;
+
+ orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+ for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
+ nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
+ recover_orphan_inode(sbi, ino);
+ }
+ f2fs_put_page(page, 1);
+ }
+ /* clear Orphan Flag */
+ clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
+ sbi->por_doing = 0;
+ return 0;
+}
+
+static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+ struct list_head *head, *this, *next;
+ struct f2fs_orphan_block *orphan_blk = NULL;
+ struct page *page = NULL;
+ unsigned int nentries = 0;
+ unsigned short index = 1;
+ unsigned short orphan_blocks;
+
+ orphan_blocks = (unsigned short)((sbi->n_orphans +
+ (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+
+ mutex_lock(&sbi->orphan_inode_mutex);
+ head = &sbi->orphan_inode_list;
+
+ /* loop for each orphan inode entry and write them in Jornal block */
+ list_for_each_safe(this, next, head) {
+ struct orphan_inode_entry *orphan;
+
+ orphan = list_entry(this, struct orphan_inode_entry, list);
+
+ if (nentries == F2FS_ORPHANS_PER_BLOCK) {
+ /*
+ * an orphan block is full of 1020 entries,
+ * then we need to flush current orphan blocks
+ * and bring another one in memory
+ */
+ orphan_blk->blk_addr = cpu_to_le16(index);
+ orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+ orphan_blk->entry_count = cpu_to_le32(nentries);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+ index++;
+ start_blk++;
+ nentries = 0;
+ page = NULL;
+ }
+ if (page)
+ goto page_exist;
+
+ page = grab_meta_page(sbi, start_blk);
+ orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+ memset(orphan_blk, 0, sizeof(*orphan_blk));
+page_exist:
+ orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
+ }
+ if (!page)
+ goto end;
+
+ orphan_blk->blk_addr = cpu_to_le16(index);
+ orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+ orphan_blk->entry_count = cpu_to_le32(nentries);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+end:
+ mutex_unlock(&sbi->orphan_inode_mutex);
+}
+
+static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
+ block_t cp_addr, unsigned long long *version)
+{
+ struct page *cp_page_1, *cp_page_2 = NULL;
+ unsigned long blk_size = sbi->blocksize;
+ struct f2fs_checkpoint *cp_block;
+ unsigned long long cur_version = 0, pre_version = 0;
+ unsigned int crc = 0;
+ size_t crc_offset;
+
+ /* Read the 1st cp block in this CP pack */
+ cp_page_1 = get_meta_page(sbi, cp_addr);
+
+ /* get the version number */
+ cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
+ crc_offset = le32_to_cpu(cp_block->checksum_offset);
+ if (crc_offset >= blk_size)
+ goto invalid_cp1;
+
+ crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+ if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+ goto invalid_cp1;
+
+ pre_version = le64_to_cpu(cp_block->checkpoint_ver);
+
+ /* Read the 2nd cp block in this CP pack */
+ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+ cp_page_2 = get_meta_page(sbi, cp_addr);
+
+ cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
+ crc_offset = le32_to_cpu(cp_block->checksum_offset);
+ if (crc_offset >= blk_size)
+ goto invalid_cp2;
+
+ crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
+ if (!f2fs_crc_valid(crc, cp_block, crc_offset))
+ goto invalid_cp2;
+
+ cur_version = le64_to_cpu(cp_block->checkpoint_ver);
+
+ if (cur_version == pre_version) {
+ *version = cur_version;
+ f2fs_put_page(cp_page_2, 1);
+ return cp_page_1;
+ }
+invalid_cp2:
+ f2fs_put_page(cp_page_2, 1);
+invalid_cp1:
+ f2fs_put_page(cp_page_1, 1);
+ return NULL;
+}
+
+int get_valid_checkpoint(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_checkpoint *cp_block;
+ struct f2fs_super_block *fsb = sbi->raw_super;
+ struct page *cp1, *cp2, *cur_page;
+ unsigned long blk_size = sbi->blocksize;
+ unsigned long long cp1_version = 0, cp2_version = 0;
+ unsigned long long cp_start_blk_no;
+
+ sbi->ckpt = kzalloc(blk_size, GFP_KERNEL);
+ if (!sbi->ckpt)
+ return -ENOMEM;
+ /*
+ * Finding out valid cp block involves read both
+ * sets( cp pack1 and cp pack 2)
+ */
+ cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
+ cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
+
+ /* The second checkpoint pack should start at the next segment */
+ cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+ cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
+
+ if (cp1 && cp2) {
+ if (ver_after(cp2_version, cp1_version))
+ cur_page = cp2;
+ else
+ cur_page = cp1;
+ } else if (cp1) {
+ cur_page = cp1;
+ } else if (cp2) {
+ cur_page = cp2;
+ } else {
+ goto fail_no_cp;
+ }
+
+ cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
+ memcpy(sbi->ckpt, cp_block, blk_size);
+
+ f2fs_put_page(cp1, 1);
+ f2fs_put_page(cp2, 1);
+ return 0;
+
+fail_no_cp:
+ kfree(sbi->ckpt);
+ return -EINVAL;
+}
+
+void set_dirty_dir_page(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct list_head *head = &sbi->dir_inode_list;
+ struct dir_inode_entry *new;
+ struct list_head *this;
+
+ if (!S_ISDIR(inode->i_mode))
+ return;
+retry:
+ new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+ if (!new) {
+ cond_resched();
+ goto retry;
+ }
+ new->inode = inode;
+ INIT_LIST_HEAD(&new->list);
+
+ spin_lock(&sbi->dir_inode_lock);
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode == inode) {
+ kmem_cache_free(inode_entry_slab, new);
+ goto out;
+ }
+ }
+ list_add_tail(&new->list, head);
+ sbi->n_dirty_dirs++;
+
+ BUG_ON(!S_ISDIR(inode->i_mode));
+out:
+ inc_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_inc_dirty_dents(inode);
+ SetPagePrivate(page);
+
+ spin_unlock(&sbi->dir_inode_lock);
+}
+
+void remove_dirty_dir_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct list_head *head = &sbi->dir_inode_list;
+ struct list_head *this;
+
+ if (!S_ISDIR(inode->i_mode))
+ return;
+
+ spin_lock(&sbi->dir_inode_lock);
+ if (atomic_read(&F2FS_I(inode)->dirty_dents))
+ goto out;
+
+ list_for_each(this, head) {
+ struct dir_inode_entry *entry;
+ entry = list_entry(this, struct dir_inode_entry, list);
+ if (entry->inode == inode) {
+ list_del(&entry->list);
+ kmem_cache_free(inode_entry_slab, entry);
+ sbi->n_dirty_dirs--;
+ break;
+ }
+ }
+out:
+ spin_unlock(&sbi->dir_inode_lock);
+}
+
+void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+{
+ struct list_head *head = &sbi->dir_inode_list;
+ struct dir_inode_entry *entry;
+ struct inode *inode;
+retry:
+ spin_lock(&sbi->dir_inode_lock);
+ if (list_empty(head)) {
+ spin_unlock(&sbi->dir_inode_lock);
+ return;
+ }
+ entry = list_entry(head->next, struct dir_inode_entry, list);
+ inode = igrab(entry->inode);
+ spin_unlock(&sbi->dir_inode_lock);
+ if (inode) {
+ filemap_flush(inode->i_mapping);
+ iput(inode);
+ } else {
+ /*
+ * We should submit bio, since it exists several
+ * wribacking dentry pages in the freeing inode.
+ */
+ f2fs_submit_bio(sbi, DATA, true);
+ }
+ goto retry;
+}
+
+/*
+ * Freeze all the FS-operations for checkpoint.
+ */
+void block_operations(struct f2fs_sb_info *sbi)
+{
+ int t;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .for_reclaim = 0,
+ };
+
+ /* Stop renaming operation */
+ mutex_lock_op(sbi, RENAME);
+ mutex_lock_op(sbi, DENTRY_OPS);
+
+retry_dents:
+ /* write all the dirty dentry pages */
+ sync_dirty_dir_inodes(sbi);
+
+ mutex_lock_op(sbi, DATA_WRITE);
+ if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
+ mutex_unlock_op(sbi, DATA_WRITE);
+ goto retry_dents;
+ }
+
+ /* block all the operations */
+ for (t = DATA_NEW; t <= NODE_TRUNC; t++)
+ mutex_lock_op(sbi, t);
+
+ mutex_lock(&sbi->write_inode);
+
+ /*
+ * POR: we should ensure that there is no dirty node pages
+ * until finishing nat/sit flush.
+ */
+retry:
+ sync_node_pages(sbi, 0, &wbc);
+
+ mutex_lock_op(sbi, NODE_WRITE);
+
+ if (get_pages(sbi, F2FS_DIRTY_NODES)) {
+ mutex_unlock_op(sbi, NODE_WRITE);
+ goto retry;
+ }
+ mutex_unlock(&sbi->write_inode);
+}
+
+static void unblock_operations(struct f2fs_sb_info *sbi)
+{
+ int t;
+ for (t = NODE_WRITE; t >= RENAME; t--)
+ mutex_unlock_op(sbi, t);
+}
+
+static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ nid_t last_nid = 0;
+ block_t start_blk;
+ struct page *cp_page;
+ unsigned int data_sum_blocks, orphan_blocks;
+ unsigned int crc32 = 0;
+ void *kaddr;
+ int i;
+
+ /* Flush all the NAT/SIT pages */
+ while (get_pages(sbi, F2FS_DIRTY_META))
+ sync_meta_pages(sbi, META, LONG_MAX);
+
+ next_free_nid(sbi, &last_nid);
+
+ /*
+ * modify checkpoint
+ * version number is already updated
+ */
+ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
+ ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
+ ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
+ for (i = 0; i < 3; i++) {
+ ckpt->cur_node_segno[i] =
+ cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
+ ckpt->cur_node_blkoff[i] =
+ cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
+ ckpt->alloc_type[i + CURSEG_HOT_NODE] =
+ curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
+ }
+ for (i = 0; i < 3; i++) {
+ ckpt->cur_data_segno[i] =
+ cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
+ ckpt->cur_data_blkoff[i] =
+ cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
+ ckpt->alloc_type[i + CURSEG_HOT_DATA] =
+ curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
+ }
+
+ ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
+ ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
+ ckpt->next_free_nid = cpu_to_le32(last_nid);
+
+ /* 2 cp + n data seg summary + orphan inode blocks */
+ data_sum_blocks = npages_for_summary_flush(sbi);
+ if (data_sum_blocks < 3)
+ set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+ else
+ clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
+
+ orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
+ / F2FS_ORPHANS_PER_BLOCK;
+ ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks);
+
+ if (is_umount) {
+ set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+ data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE);
+ } else {
+ clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
+ data_sum_blocks + orphan_blocks);
+ }
+
+ if (sbi->n_orphans)
+ set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+ else
+ clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
+
+ /* update SIT/NAT bitmap */
+ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
+ get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
+
+ crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
+ *(__le32 *)((unsigned char *)ckpt +
+ le32_to_cpu(ckpt->checksum_offset))
+ = cpu_to_le32(crc32);
+
+ start_blk = __start_cp_addr(sbi);
+
+ /* write out checkpoint buffer at block 0 */
+ cp_page = grab_meta_page(sbi, start_blk++);
+ kaddr = page_address(cp_page);
+ memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+ set_page_dirty(cp_page);
+ f2fs_put_page(cp_page, 1);
+
+ if (sbi->n_orphans) {
+ write_orphan_inodes(sbi, start_blk);
+ start_blk += orphan_blocks;
+ }
+
+ write_data_summaries(sbi, start_blk);
+ start_blk += data_sum_blocks;
+ if (is_umount) {
+ write_node_summaries(sbi, start_blk);
+ start_blk += NR_CURSEG_NODE_TYPE;
+ }
+
+ /* writeout checkpoint block */
+ cp_page = grab_meta_page(sbi, start_blk);
+ kaddr = page_address(cp_page);
+ memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+ set_page_dirty(cp_page);
+ f2fs_put_page(cp_page, 1);
+
+ /* wait for previous submitted node/meta pages writeback */
+ while (get_pages(sbi, F2FS_WRITEBACK))
+ congestion_wait(BLK_RW_ASYNC, HZ / 50);
+
+ filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
+ filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
+
+ /* update user_block_counts */
+ sbi->last_valid_block_count = sbi->total_valid_block_count;
+ sbi->alloc_valid_block_count = 0;
+
+ /* Here, we only have one bio having CP pack */
+ if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))
+ sbi->sb->s_flags |= MS_RDONLY;
+ else
+ sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+
+ clear_prefree_segments(sbi);
+ F2FS_RESET_SB_DIRT(sbi);
+}
+
+/*
+ * We guarantee that this checkpoint procedure should not fail.
+ */
+void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ unsigned long long ckpt_ver;
+
+ if (!blocked) {
+ mutex_lock(&sbi->cp_mutex);
+ block_operations(sbi);
+ }
+
+ f2fs_submit_bio(sbi, DATA, true);
+ f2fs_submit_bio(sbi, NODE, true);
+ f2fs_submit_bio(sbi, META, true);
+
+ /*
+ * update checkpoint pack index
+ * Increase the version number so that
+ * SIT entries and seg summaries are written at correct place
+ */
+ ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver);
+ ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
+
+ /* write cached NAT/SIT entries to NAT/SIT area */
+ flush_nat_entries(sbi);
+ flush_sit_entries(sbi);
+
+ reset_victim_segmap(sbi);
+
+ /* unlock all the fs_lock[] in do_checkpoint() */
+ do_checkpoint(sbi, is_umount);
+
+ unblock_operations(sbi);
+ mutex_unlock(&sbi->cp_mutex);
+}
+
+void init_orphan_info(struct f2fs_sb_info *sbi)
+{
+ mutex_init(&sbi->orphan_inode_mutex);
+ INIT_LIST_HEAD(&sbi->orphan_inode_list);
+ sbi->n_orphans = 0;
+}
+
+int create_checkpoint_caches(void)
+{
+ orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
+ sizeof(struct orphan_inode_entry), NULL);
+ if (unlikely(!orphan_entry_slab))
+ return -ENOMEM;
+ inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
+ sizeof(struct dir_inode_entry), NULL);
+ if (unlikely(!inode_entry_slab)) {
+ kmem_cache_destroy(orphan_entry_slab);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void destroy_checkpoint_caches(void)
+{
+ kmem_cache_destroy(orphan_entry_slab);
+ kmem_cache_destroy(inode_entry_slab);
+}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
new file mode 100644
index 000000000000..655aeabc1dd4
--- /dev/null
+++ b/fs/f2fs/data.c
@@ -0,0 +1,702 @@
+/*
+ * fs/f2fs/data.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+/*
+ * Lock ordering for the change of data block address:
+ * ->data_page
+ * ->node_page
+ * update block addresses in the node page
+ */
+static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
+{
+ struct f2fs_node *rn;
+ __le32 *addr_array;
+ struct page *node_page = dn->node_page;
+ unsigned int ofs_in_node = dn->ofs_in_node;
+
+ wait_on_page_writeback(node_page);
+
+ rn = (struct f2fs_node *)page_address(node_page);
+
+ /* Get physical address of data block */
+ addr_array = blkaddr_in_node(rn);
+ addr_array[ofs_in_node] = cpu_to_le32(new_addr);
+ set_page_dirty(node_page);
+}
+
+int reserve_new_block(struct dnode_of_data *dn)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+
+ if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
+ return -EPERM;
+ if (!inc_valid_block_count(sbi, dn->inode, 1))
+ return -ENOSPC;
+
+ __set_data_blkaddr(dn, NEW_ADDR);
+ dn->data_blkaddr = NEW_ADDR;
+ sync_inode_page(dn);
+ return 0;
+}
+
+static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct buffer_head *bh_result)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ pgoff_t start_fofs, end_fofs;
+ block_t start_blkaddr;
+
+ read_lock(&fi->ext.ext_lock);
+ if (fi->ext.len == 0) {
+ read_unlock(&fi->ext.ext_lock);
+ return 0;
+ }
+
+ sbi->total_hit_ext++;
+ start_fofs = fi->ext.fofs;
+ end_fofs = fi->ext.fofs + fi->ext.len - 1;
+ start_blkaddr = fi->ext.blk_addr;
+
+ if (pgofs >= start_fofs && pgofs <= end_fofs) {
+ unsigned int blkbits = inode->i_sb->s_blocksize_bits;
+ size_t count;
+
+ clear_buffer_new(bh_result);
+ map_bh(bh_result, inode->i_sb,
+ start_blkaddr + pgofs - start_fofs);
+ count = end_fofs - pgofs + 1;
+ if (count < (UINT_MAX >> blkbits))
+ bh_result->b_size = (count << blkbits);
+ else
+ bh_result->b_size = UINT_MAX;
+
+ sbi->read_hit_ext++;
+ read_unlock(&fi->ext.ext_lock);
+ return 1;
+ }
+ read_unlock(&fi->ext.ext_lock);
+ return 0;
+}
+
+void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
+{
+ struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+ pgoff_t fofs, start_fofs, end_fofs;
+ block_t start_blkaddr, end_blkaddr;
+
+ BUG_ON(blk_addr == NEW_ADDR);
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node;
+
+ /* Update the page address in the parent node */
+ __set_data_blkaddr(dn, blk_addr);
+
+ write_lock(&fi->ext.ext_lock);
+
+ start_fofs = fi->ext.fofs;
+ end_fofs = fi->ext.fofs + fi->ext.len - 1;
+ start_blkaddr = fi->ext.blk_addr;
+ end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
+
+ /* Drop and initialize the matched extent */
+ if (fi->ext.len == 1 && fofs == start_fofs)
+ fi->ext.len = 0;
+
+ /* Initial extent */
+ if (fi->ext.len == 0) {
+ if (blk_addr != NULL_ADDR) {
+ fi->ext.fofs = fofs;
+ fi->ext.blk_addr = blk_addr;
+ fi->ext.len = 1;
+ }
+ goto end_update;
+ }
+
+ /* Frone merge */
+ if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
+ fi->ext.fofs--;
+ fi->ext.blk_addr--;
+ fi->ext.len++;
+ goto end_update;
+ }
+
+ /* Back merge */
+ if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
+ fi->ext.len++;
+ goto end_update;
+ }
+
+ /* Split the existing extent */
+ if (fi->ext.len > 1 &&
+ fofs >= start_fofs && fofs <= end_fofs) {
+ if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
+ fi->ext.len = fofs - start_fofs;
+ } else {
+ fi->ext.fofs = fofs + 1;
+ fi->ext.blk_addr = start_blkaddr +
+ fofs - start_fofs + 1;
+ fi->ext.len -= fofs - start_fofs + 1;
+ }
+ goto end_update;
+ }
+ write_unlock(&fi->ext.ext_lock);
+ return;
+
+end_update:
+ write_unlock(&fi->ext.ext_lock);
+ sync_inode_page(dn);
+ return;
+}
+
+struct page *find_data_page(struct inode *inode, pgoff_t index)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ struct dnode_of_data dn;
+ struct page *page;
+ int err;
+
+ page = find_get_page(mapping, index);
+ if (page && PageUptodate(page))
+ return page;
+ f2fs_put_page(page, 0);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+ if (err)
+ return ERR_PTR(err);
+ f2fs_put_dnode(&dn);
+
+ if (dn.data_blkaddr == NULL_ADDR)
+ return ERR_PTR(-ENOENT);
+
+ /* By fallocate(), there is no cached page, but with NEW_ADDR */
+ if (dn.data_blkaddr == NEW_ADDR)
+ return ERR_PTR(-EINVAL);
+
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+ }
+ unlock_page(page);
+ return page;
+}
+
+/*
+ * If it tries to access a hole, return an error.
+ * Because, the callers, functions in dir.c and GC, should be able to know
+ * whether this page exists or not.
+ */
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ struct dnode_of_data dn;
+ struct page *page;
+ int err;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+ if (err)
+ return ERR_PTR(err);
+ f2fs_put_dnode(&dn);
+
+ if (dn.data_blkaddr == NULL_ADDR)
+ return ERR_PTR(-ENOENT);
+
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ if (PageUptodate(page))
+ return page;
+
+ BUG_ON(dn.data_blkaddr == NEW_ADDR);
+ BUG_ON(dn.data_blkaddr == NULL_ADDR);
+
+ err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+ }
+ return page;
+}
+
+/*
+ * Caller ensures that this data page is never allocated.
+ * A new zero-filled data page is allocated in the page cache.
+ */
+struct page *get_new_data_page(struct inode *inode, pgoff_t index,
+ bool new_i_size)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page;
+ struct dnode_of_data dn;
+ int err;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, index, 0);
+ if (err)
+ return ERR_PTR(err);
+
+ if (dn.data_blkaddr == NULL_ADDR) {
+ if (reserve_new_block(&dn)) {
+ f2fs_put_dnode(&dn);
+ return ERR_PTR(-ENOSPC);
+ }
+ }
+ f2fs_put_dnode(&dn);
+
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ if (PageUptodate(page))
+ return page;
+
+ if (dn.data_blkaddr == NEW_ADDR) {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ } else {
+ err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+ }
+ }
+ SetPageUptodate(page);
+
+ if (new_i_size &&
+ i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
+ i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+ mark_inode_dirty_sync(inode);
+ }
+ return page;
+}
+
+static void read_end_io(struct bio *bio, int err)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+
+ if (uptodate) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ } while (bvec >= bio->bi_io_vec);
+ kfree(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * Fill the locked page with data located in the block address.
+ * Read operation is synchronous, and caller must unlock the page.
+ */
+int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
+ block_t blk_addr, int type)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ bool sync = (type == READ_SYNC);
+ struct bio *bio;
+
+ /* This page can be already read by other threads */
+ if (PageUptodate(page)) {
+ if (!sync)
+ unlock_page(page);
+ return 0;
+ }
+
+ down_read(&sbi->bio_sem);
+
+ /* Allocate a new bio */
+ bio = f2fs_bio_alloc(bdev, 1);
+
+ /* Initialize the bio */
+ bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ bio->bi_end_io = read_end_io;
+
+ if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+ kfree(bio->bi_private);
+ bio_put(bio);
+ up_read(&sbi->bio_sem);
+ return -EFAULT;
+ }
+
+ submit_bio(type, bio);
+ up_read(&sbi->bio_sem);
+
+ /* wait for read completion if sync */
+ if (sync) {
+ lock_page(page);
+ if (PageError(page))
+ return -EIO;
+ }
+ return 0;
+}
+
+/*
+ * This function should be used by the data read flow only where it
+ * does not check the "create" flag that indicates block allocation.
+ * The reason for this special functionality is to exploit VFS readahead
+ * mechanism.
+ */
+static int get_data_block_ro(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ unsigned int blkbits = inode->i_sb->s_blocksize_bits;
+ unsigned maxblocks = bh_result->b_size >> blkbits;
+ struct dnode_of_data dn;
+ pgoff_t pgofs;
+ int err;
+
+ /* Get the page offset from the block offset(iblock) */
+ pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
+
+ if (check_extent_cache(inode, pgofs, bh_result))
+ return 0;
+
+ /* When reading holes, we need its node page */
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE);
+ if (err)
+ return (err == -ENOENT) ? 0 : err;
+
+ /* It does not support data allocation */
+ BUG_ON(create);
+
+ if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
+ int i;
+ unsigned int end_offset;
+
+ end_offset = IS_INODE(dn.node_page) ?
+ ADDRS_PER_INODE :
+ ADDRS_PER_BLOCK;
+
+ clear_buffer_new(bh_result);
+
+ /* Give more consecutive addresses for the read ahead */
+ for (i = 0; i < end_offset - dn.ofs_in_node; i++)
+ if (((datablock_addr(dn.node_page,
+ dn.ofs_in_node + i))
+ != (dn.data_blkaddr + i)) || maxblocks == i)
+ break;
+ map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+ bh_result->b_size = (i << blkbits);
+ }
+ f2fs_put_dnode(&dn);
+ return 0;
+}
+
+static int f2fs_read_data_page(struct file *file, struct page *page)
+{
+ return mpage_readpage(page, get_data_block_ro);
+}
+
+static int f2fs_read_data_pages(struct file *file,
+ struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro);
+}
+
+int do_write_data_page(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ block_t old_blk_addr, new_blk_addr;
+ struct dnode_of_data dn;
+ int err = 0;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, page->index, RDONLY_NODE);
+ if (err)
+ return err;
+
+ old_blk_addr = dn.data_blkaddr;
+
+ /* This page is already truncated */
+ if (old_blk_addr == NULL_ADDR)
+ goto out_writepage;
+
+ set_page_writeback(page);
+
+ /*
+ * If current allocation needs SSR,
+ * it had better in-place writes for updated data.
+ */
+ if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
+ need_inplace_update(inode)) {
+ rewrite_data_page(F2FS_SB(inode->i_sb), page,
+ old_blk_addr);
+ } else {
+ write_data_page(inode, page, &dn,
+ old_blk_addr, &new_blk_addr);
+ update_extent_cache(new_blk_addr, &dn);
+ F2FS_I(inode)->data_version =
+ le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
+ }
+out_writepage:
+ f2fs_put_dnode(&dn);
+ return err;
+}
+
+static int f2fs_write_data_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ loff_t i_size = i_size_read(inode);
+ const pgoff_t end_index = ((unsigned long long) i_size)
+ >> PAGE_CACHE_SHIFT;
+ unsigned offset;
+ int err = 0;
+
+ if (page->index < end_index)
+ goto out;
+
+ /*
+ * If the offset is out-of-range of file size,
+ * this page does not have to be written to disk.
+ */
+ offset = i_size & (PAGE_CACHE_SIZE - 1);
+ if ((page->index >= end_index + 1) || !offset) {
+ if (S_ISDIR(inode->i_mode)) {
+ dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_dec_dirty_dents(inode);
+ }
+ goto unlock_out;
+ }
+
+ zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+out:
+ if (sbi->por_doing)
+ goto redirty_out;
+
+ if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page))
+ goto redirty_out;
+
+ mutex_lock_op(sbi, DATA_WRITE);
+ if (S_ISDIR(inode->i_mode)) {
+ dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_dec_dirty_dents(inode);
+ }
+ err = do_write_data_page(page);
+ if (err && err != -ENOENT) {
+ wbc->pages_skipped++;
+ set_page_dirty(page);
+ }
+ mutex_unlock_op(sbi, DATA_WRITE);
+
+ if (wbc->for_reclaim)
+ f2fs_submit_bio(sbi, DATA, true);
+
+ if (err == -ENOENT)
+ goto unlock_out;
+
+ clear_cold_data(page);
+ unlock_page(page);
+
+ if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode))
+ f2fs_balance_fs(sbi);
+ return 0;
+
+unlock_out:
+ unlock_page(page);
+ return (err == -ENOENT) ? 0 : err;
+
+redirty_out:
+ wbc->pages_skipped++;
+ set_page_dirty(page);
+ return AOP_WRITEPAGE_ACTIVATE;
+}
+
+#define MAX_DESIRED_PAGES_WP 4096
+
+static int f2fs_write_data_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ int ret;
+ long excess_nrtw = 0, desired_nrtw;
+
+ if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
+ desired_nrtw = MAX_DESIRED_PAGES_WP;
+ excess_nrtw = desired_nrtw - wbc->nr_to_write;
+ wbc->nr_to_write = desired_nrtw;
+ }
+
+ if (!S_ISDIR(inode->i_mode))
+ mutex_lock(&sbi->writepages);
+ ret = generic_writepages(mapping, wbc);
+ if (!S_ISDIR(inode->i_mode))
+ mutex_unlock(&sbi->writepages);
+ f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
+
+ remove_dirty_dir_inode(inode);
+
+ wbc->nr_to_write -= excess_nrtw;
+ return ret;
+}
+
+static int f2fs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct page *page;
+ pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
+ struct dnode_of_data dn;
+ int err = 0;
+
+ /* for nobh_write_end */
+ *fsdata = NULL;
+
+ f2fs_balance_fs(sbi);
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ mutex_lock_op(sbi, DATA_NEW);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, index, 0);
+ if (err) {
+ mutex_unlock_op(sbi, DATA_NEW);
+ f2fs_put_page(page, 1);
+ return err;
+ }
+
+ if (dn.data_blkaddr == NULL_ADDR) {
+ err = reserve_new_block(&dn);
+ if (err) {
+ f2fs_put_dnode(&dn);
+ mutex_unlock_op(sbi, DATA_NEW);
+ f2fs_put_page(page, 1);
+ return err;
+ }
+ }
+ f2fs_put_dnode(&dn);
+
+ mutex_unlock_op(sbi, DATA_NEW);
+
+ if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+ return 0;
+
+ if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+ unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+ unsigned end = start + len;
+
+ /* Reading beyond i_size is simple: memset to zero */
+ zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+ return 0;
+ }
+
+ if (dn.data_blkaddr == NEW_ADDR) {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ } else {
+ err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return err;
+ }
+ }
+ SetPageUptodate(page);
+ clear_cold_data(page);
+ return 0;
+}
+
+static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+
+ if (rw == WRITE)
+ return 0;
+
+ /* Needs synchronization with the cleaner */
+ return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ get_data_block_ro);
+}
+
+static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
+{
+ struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ if (S_ISDIR(inode->i_mode) && PageDirty(page)) {
+ dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_dec_dirty_dents(inode);
+ }
+ ClearPagePrivate(page);
+}
+
+static int f2fs_release_data_page(struct page *page, gfp_t wait)
+{
+ ClearPagePrivate(page);
+ return 0;
+}
+
+static int f2fs_set_data_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+
+ SetPageUptodate(page);
+ if (!PageDirty(page)) {
+ __set_page_dirty_nobuffers(page);
+ set_dirty_dir_page(inode, page);
+ return 1;
+ }
+ return 0;
+}
+
+const struct address_space_operations f2fs_dblock_aops = {
+ .readpage = f2fs_read_data_page,
+ .readpages = f2fs_read_data_pages,
+ .writepage = f2fs_write_data_page,
+ .writepages = f2fs_write_data_pages,
+ .write_begin = f2fs_write_begin,
+ .write_end = nobh_write_end,
+ .set_page_dirty = f2fs_set_data_page_dirty,
+ .invalidatepage = f2fs_invalidate_data_page,
+ .releasepage = f2fs_release_data_page,
+ .direct_IO = f2fs_direct_IO,
+};
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
new file mode 100644
index 000000000000..0e0380a588ad
--- /dev/null
+++ b/fs/f2fs/debug.c
@@ -0,0 +1,361 @@
+/*
+ * f2fs debugging statistics
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ * Copyright (c) 2012 Linux Foundation
+ * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/backing-dev.h>
+#include <linux/proc_fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/blkdev.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "gc.h"
+
+static LIST_HEAD(f2fs_stat_list);
+static struct dentry *debugfs_root;
+
+static void update_general_status(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_stat_info *si = sbi->stat_info;
+ int i;
+
+ /* valid check of the segment numbers */
+ si->hit_ext = sbi->read_hit_ext;
+ si->total_ext = sbi->total_hit_ext;
+ si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
+ si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
+ si->ndirty_dirs = sbi->n_dirty_dirs;
+ si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+ si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
+ si->rsvd_segs = reserved_segments(sbi);
+ si->overp_segs = overprovision_segments(sbi);
+ si->valid_count = valid_user_blocks(sbi);
+ si->valid_node_count = valid_node_count(sbi);
+ si->valid_inode_count = valid_inode_count(sbi);
+ si->utilization = utilization(sbi);
+
+ si->free_segs = free_segments(sbi);
+ si->free_secs = free_sections(sbi);
+ si->prefree_count = prefree_segments(sbi);
+ si->dirty_count = dirty_segments(sbi);
+ si->node_pages = sbi->node_inode->i_mapping->nrpages;
+ si->meta_pages = sbi->meta_inode->i_mapping->nrpages;
+ si->nats = NM_I(sbi)->nat_cnt;
+ si->sits = SIT_I(sbi)->dirty_sentries;
+ si->fnids = NM_I(sbi)->fcnt;
+ si->bg_gc = sbi->bg_gc;
+ si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
+ * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
+ / 2;
+ si->util_valid = (int)(written_block_count(sbi) >>
+ sbi->log_blocks_per_seg)
+ * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
+ / 2;
+ si->util_invalid = 50 - si->util_free - si->util_valid;
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
+ struct curseg_info *curseg = CURSEG_I(sbi, i);
+ si->curseg[i] = curseg->segno;
+ si->cursec[i] = curseg->segno / sbi->segs_per_sec;
+ si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
+ }
+
+ for (i = 0; i < 2; i++) {
+ si->segment_count[i] = sbi->segment_count[i];
+ si->block_count[i] = sbi->block_count[i];
+ }
+}
+
+/*
+ * This function calculates BDF of every segments
+ */
+static void update_sit_info(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_stat_info *si = sbi->stat_info;
+ unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int segno, vblocks;
+ int ndirty = 0;
+
+ bimodal = 0;
+ total_vblocks = 0;
+ blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
+ hblks_per_sec = blks_per_sec / 2;
+ mutex_lock(&sit_i->sentry_lock);
+ for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+ vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ dist = abs(vblocks - hblks_per_sec);
+ bimodal += dist * dist;
+
+ if (vblocks > 0 && vblocks < blks_per_sec) {
+ total_vblocks += vblocks;
+ ndirty++;
+ }
+ }
+ mutex_unlock(&sit_i->sentry_lock);
+ dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100;
+ si->bimodal = bimodal / dist;
+ if (si->dirty_count)
+ si->avg_vblocks = total_vblocks / ndirty;
+ else
+ si->avg_vblocks = 0;
+}
+
+/*
+ * This function calculates memory footprint.
+ */
+static void update_mem_info(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_stat_info *si = sbi->stat_info;
+ unsigned npages;
+
+ if (si->base_mem)
+ goto get_cache;
+
+ si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
+ si->base_mem += 2 * sizeof(struct f2fs_inode_info);
+ si->base_mem += sizeof(*sbi->ckpt);
+
+ /* build sm */
+ si->base_mem += sizeof(struct f2fs_sm_info);
+
+ /* build sit */
+ si->base_mem += sizeof(struct sit_info);
+ si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
+ si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
+ if (sbi->segs_per_sec > 1)
+ si->base_mem += sbi->total_sections *
+ sizeof(struct sec_entry);
+ si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
+
+ /* build free segmap */
+ si->base_mem += sizeof(struct free_segmap_info);
+ si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ si->base_mem += f2fs_bitmap_size(sbi->total_sections);
+
+ /* build curseg */
+ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
+ si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
+
+ /* build dirty segmap */
+ si->base_mem += sizeof(struct dirty_seglist_info);
+ si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi));
+
+ /* buld nm */
+ si->base_mem += sizeof(struct f2fs_nm_info);
+ si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
+
+ /* build gc */
+ si->base_mem += sizeof(struct f2fs_gc_kthread);
+
+get_cache:
+ /* free nids */
+ si->cache_mem = NM_I(sbi)->fcnt;
+ si->cache_mem += NM_I(sbi)->nat_cnt;
+ npages = sbi->node_inode->i_mapping->nrpages;
+ si->cache_mem += npages << PAGE_CACHE_SHIFT;
+ npages = sbi->meta_inode->i_mapping->nrpages;
+ si->cache_mem += npages << PAGE_CACHE_SHIFT;
+ si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
+ si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+}
+
+static int stat_show(struct seq_file *s, void *v)
+{
+ struct f2fs_stat_info *si, *next;
+ int i = 0;
+ int j;
+
+ list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
+
+ mutex_lock(&si->stat_lock);
+ if (!si->sbi) {
+ mutex_unlock(&si->stat_lock);
+ continue;
+ }
+ update_general_status(si->sbi);
+
+ seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++);
+ seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ",
+ si->nat_area_segs, si->sit_area_segs);
+ seq_printf(s, "[SSA: %d] [MAIN: %d",
+ si->ssa_area_segs, si->main_area_segs);
+ seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
+ si->overp_segs, si->rsvd_segs);
+ seq_printf(s, "Utilization: %d%% (%d valid blocks)\n",
+ si->utilization, si->valid_count);
+ seq_printf(s, " - Node: %u (Inode: %u, ",
+ si->valid_node_count, si->valid_inode_count);
+ seq_printf(s, "Other: %u)\n - Data: %u\n",
+ si->valid_node_count - si->valid_inode_count,
+ si->valid_count - si->valid_node_count);
+ seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
+ si->main_area_segs, si->main_area_sections,
+ si->main_area_zones);
+ seq_printf(s, " - COLD data: %d, %d, %d\n",
+ si->curseg[CURSEG_COLD_DATA],
+ si->cursec[CURSEG_COLD_DATA],
+ si->curzone[CURSEG_COLD_DATA]);
+ seq_printf(s, " - WARM data: %d, %d, %d\n",
+ si->curseg[CURSEG_WARM_DATA],
+ si->cursec[CURSEG_WARM_DATA],
+ si->curzone[CURSEG_WARM_DATA]);
+ seq_printf(s, " - HOT data: %d, %d, %d\n",
+ si->curseg[CURSEG_HOT_DATA],
+ si->cursec[CURSEG_HOT_DATA],
+ si->curzone[CURSEG_HOT_DATA]);
+ seq_printf(s, " - Dir dnode: %d, %d, %d\n",
+ si->curseg[CURSEG_HOT_NODE],
+ si->cursec[CURSEG_HOT_NODE],
+ si->curzone[CURSEG_HOT_NODE]);
+ seq_printf(s, " - File dnode: %d, %d, %d\n",
+ si->curseg[CURSEG_WARM_NODE],
+ si->cursec[CURSEG_WARM_NODE],
+ si->curzone[CURSEG_WARM_NODE]);
+ seq_printf(s, " - Indir nodes: %d, %d, %d\n",
+ si->curseg[CURSEG_COLD_NODE],
+ si->cursec[CURSEG_COLD_NODE],
+ si->curzone[CURSEG_COLD_NODE]);
+ seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n",
+ si->main_area_segs - si->dirty_count -
+ si->prefree_count - si->free_segs,
+ si->dirty_count);
+ seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
+ si->prefree_count, si->free_segs, si->free_secs);
+ seq_printf(s, "GC calls: %d (BG: %d)\n",
+ si->call_count, si->bg_gc);
+ seq_printf(s, " - data segments : %d\n", si->data_segs);
+ seq_printf(s, " - node segments : %d\n", si->node_segs);
+ seq_printf(s, "Try to move %d blocks\n", si->tot_blks);
+ seq_printf(s, " - data blocks : %d\n", si->data_blks);
+ seq_printf(s, " - node blocks : %d\n", si->node_blks);
+ seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
+ si->hit_ext, si->total_ext);
+ seq_printf(s, "\nBalancing F2FS Async:\n");
+ seq_printf(s, " - nodes %4d in %4d\n",
+ si->ndirty_node, si->node_pages);
+ seq_printf(s, " - dents %4d in dirs:%4d\n",
+ si->ndirty_dent, si->ndirty_dirs);
+ seq_printf(s, " - meta %4d in %4d\n",
+ si->ndirty_meta, si->meta_pages);
+ seq_printf(s, " - NATs %5d > %lu\n",
+ si->nats, NM_WOUT_THRESHOLD);
+ seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
+ si->sits, si->fnids);
+ seq_printf(s, "\nDistribution of User Blocks:");
+ seq_printf(s, " [ valid | invalid | free ]\n");
+ seq_printf(s, " [");
+
+ for (j = 0; j < si->util_valid; j++)
+ seq_printf(s, "-");
+ seq_printf(s, "|");
+
+ for (j = 0; j < si->util_invalid; j++)
+ seq_printf(s, "-");
+ seq_printf(s, "|");
+
+ for (j = 0; j < si->util_free; j++)
+ seq_printf(s, "-");
+ seq_printf(s, "]\n\n");
+ seq_printf(s, "SSR: %u blocks in %u segments\n",
+ si->block_count[SSR], si->segment_count[SSR]);
+ seq_printf(s, "LFS: %u blocks in %u segments\n",
+ si->block_count[LFS], si->segment_count[LFS]);
+
+ /* segment usage info */
+ update_sit_info(si->sbi);
+ seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n",
+ si->bimodal, si->avg_vblocks);
+
+ /* memory footprint */
+ update_mem_info(si->sbi);
+ seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
+ (si->base_mem + si->cache_mem) >> 10,
+ si->base_mem >> 10, si->cache_mem >> 10);
+ mutex_unlock(&si->stat_lock);
+ }
+ return 0;
+}
+
+static int stat_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, stat_show, inode->i_private);
+}
+
+static const struct file_operations stat_fops = {
+ .open = stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int init_stats(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ struct f2fs_stat_info *si;
+
+ sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
+ if (!sbi->stat_info)
+ return -ENOMEM;
+
+ si = sbi->stat_info;
+ mutex_init(&si->stat_lock);
+ list_add_tail(&si->stat_list, &f2fs_stat_list);
+
+ si->all_area_segs = le32_to_cpu(raw_super->segment_count);
+ si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit);
+ si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat);
+ si->ssa_area_segs = le32_to_cpu(raw_super->segment_count_ssa);
+ si->main_area_segs = le32_to_cpu(raw_super->segment_count_main);
+ si->main_area_sections = le32_to_cpu(raw_super->section_count);
+ si->main_area_zones = si->main_area_sections /
+ le32_to_cpu(raw_super->secs_per_zone);
+ si->sbi = sbi;
+ return 0;
+}
+
+int f2fs_build_stats(struct f2fs_sb_info *sbi)
+{
+ int retval;
+
+ retval = init_stats(sbi);
+ if (retval)
+ return retval;
+
+ if (!debugfs_root)
+ debugfs_root = debugfs_create_dir("f2fs", NULL);
+
+ debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops);
+ return 0;
+}
+
+void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_stat_info *si = sbi->stat_info;
+
+ list_del(&si->stat_list);
+ mutex_lock(&si->stat_lock);
+ si->sbi = NULL;
+ mutex_unlock(&si->stat_lock);
+ kfree(sbi->stat_info);
+}
+
+void destroy_root_stats(void)
+{
+ debugfs_remove_recursive(debugfs_root);
+ debugfs_root = NULL;
+}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
new file mode 100644
index 000000000000..b4e24f32b54e
--- /dev/null
+++ b/fs/f2fs/dir.c
@@ -0,0 +1,672 @@
+/*
+ * fs/f2fs/dir.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "acl.h"
+
+static unsigned long dir_blocks(struct inode *inode)
+{
+ return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1))
+ >> PAGE_CACHE_SHIFT;
+}
+
+static unsigned int dir_buckets(unsigned int level)
+{
+ if (level < MAX_DIR_HASH_DEPTH / 2)
+ return 1 << level;
+ else
+ return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
+}
+
+static unsigned int bucket_blocks(unsigned int level)
+{
+ if (level < MAX_DIR_HASH_DEPTH / 2)
+ return 2;
+ else
+ return 4;
+}
+
+static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
+ [F2FS_FT_UNKNOWN] = DT_UNKNOWN,
+ [F2FS_FT_REG_FILE] = DT_REG,
+ [F2FS_FT_DIR] = DT_DIR,
+ [F2FS_FT_CHRDEV] = DT_CHR,
+ [F2FS_FT_BLKDEV] = DT_BLK,
+ [F2FS_FT_FIFO] = DT_FIFO,
+ [F2FS_FT_SOCK] = DT_SOCK,
+ [F2FS_FT_SYMLINK] = DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = F2FS_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = F2FS_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = F2FS_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = F2FS_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK,
+};
+
+static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
+{
+ mode_t mode = inode->i_mode;
+ de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
+}
+
+static unsigned long dir_block_index(unsigned int level, unsigned int idx)
+{
+ unsigned long i;
+ unsigned long bidx = 0;
+
+ for (i = 0; i < level; i++)
+ bidx += dir_buckets(i) * bucket_blocks(i);
+ bidx += idx * bucket_blocks(level);
+ return bidx;
+}
+
+static bool early_match_name(const char *name, int namelen,
+ f2fs_hash_t namehash, struct f2fs_dir_entry *de)
+{
+ if (le16_to_cpu(de->name_len) != namelen)
+ return false;
+
+ if (de->hash_code != namehash)
+ return false;
+
+ return true;
+}
+
+static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
+ const char *name, int namelen, int *max_slots,
+ f2fs_hash_t namehash, struct page **res_page)
+{
+ struct f2fs_dir_entry *de;
+ unsigned long bit_pos, end_pos, next_pos;
+ struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
+ int slots;
+
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK, 0);
+ while (bit_pos < NR_DENTRY_IN_BLOCK) {
+ de = &dentry_blk->dentry[bit_pos];
+ slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+
+ if (early_match_name(name, namelen, namehash, de)) {
+ if (!memcmp(dentry_blk->filename[bit_pos],
+ name, namelen)) {
+ *res_page = dentry_page;
+ goto found;
+ }
+ }
+ next_pos = bit_pos + slots;
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK, next_pos);
+ if (bit_pos >= NR_DENTRY_IN_BLOCK)
+ end_pos = NR_DENTRY_IN_BLOCK;
+ else
+ end_pos = bit_pos;
+ if (*max_slots < end_pos - next_pos)
+ *max_slots = end_pos - next_pos;
+ }
+
+ de = NULL;
+ kunmap(dentry_page);
+found:
+ return de;
+}
+
+static struct f2fs_dir_entry *find_in_level(struct inode *dir,
+ unsigned int level, const char *name, int namelen,
+ f2fs_hash_t namehash, struct page **res_page)
+{
+ int s = GET_DENTRY_SLOTS(namelen);
+ unsigned int nbucket, nblock;
+ unsigned int bidx, end_block;
+ struct page *dentry_page;
+ struct f2fs_dir_entry *de = NULL;
+ bool room = false;
+ int max_slots = 0;
+
+ BUG_ON(level > MAX_DIR_HASH_DEPTH);
+
+ nbucket = dir_buckets(level);
+ nblock = bucket_blocks(level);
+
+ bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket);
+ end_block = bidx + nblock;
+
+ for (; bidx < end_block; bidx++) {
+ /* no need to allocate new dentry pages to all the indices */
+ dentry_page = find_data_page(dir, bidx);
+ if (IS_ERR(dentry_page)) {
+ room = true;
+ continue;
+ }
+
+ de = find_in_block(dentry_page, name, namelen,
+ &max_slots, namehash, res_page);
+ if (de)
+ break;
+
+ if (max_slots >= s)
+ room = true;
+ f2fs_put_page(dentry_page, 0);
+ }
+
+ if (!de && room && F2FS_I(dir)->chash != namehash) {
+ F2FS_I(dir)->chash = namehash;
+ F2FS_I(dir)->clevel = level;
+ }
+
+ return de;
+}
+
+/*
+ * Find an entry in the specified directory with the wanted name.
+ * It returns the page where the entry was found (as a parameter - res_page),
+ * and the entry itself. Page is returned mapped and unlocked.
+ * Entry is guaranteed to be valid.
+ */
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
+ struct qstr *child, struct page **res_page)
+{
+ const char *name = child->name;
+ int namelen = child->len;
+ unsigned long npages = dir_blocks(dir);
+ struct f2fs_dir_entry *de = NULL;
+ f2fs_hash_t name_hash;
+ unsigned int max_depth;
+ unsigned int level;
+
+ if (npages == 0)
+ return NULL;
+
+ *res_page = NULL;
+
+ name_hash = f2fs_dentry_hash(name, namelen);
+ max_depth = F2FS_I(dir)->i_current_depth;
+
+ for (level = 0; level < max_depth; level++) {
+ de = find_in_level(dir, level, name,
+ namelen, name_hash, res_page);
+ if (de)
+ break;
+ }
+ if (!de && F2FS_I(dir)->chash != name_hash) {
+ F2FS_I(dir)->chash = name_hash;
+ F2FS_I(dir)->clevel = level - 1;
+ }
+ return de;
+}
+
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
+{
+ struct page *page = NULL;
+ struct f2fs_dir_entry *de = NULL;
+ struct f2fs_dentry_block *dentry_blk = NULL;
+
+ page = get_lock_data_page(dir, 0);
+ if (IS_ERR(page))
+ return NULL;
+
+ dentry_blk = kmap(page);
+ de = &dentry_blk->dentry[1];
+ *p = page;
+ unlock_page(page);
+ return de;
+}
+
+ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
+{
+ ino_t res = 0;
+ struct f2fs_dir_entry *de;
+ struct page *page;
+
+ de = f2fs_find_entry(dir, qstr, &page);
+ if (de) {
+ res = le32_to_cpu(de->ino);
+ kunmap(page);
+ f2fs_put_page(page, 0);
+ }
+
+ return res;
+}
+
+void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
+ struct page *page, struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+
+ mutex_lock_op(sbi, DENTRY_OPS);
+ lock_page(page);
+ wait_on_page_writeback(page);
+ de->ino = cpu_to_le32(inode->i_ino);
+ set_de_type(de, inode);
+ kunmap(page);
+ set_page_dirty(page);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(dir);
+
+ /* update parent inode number before releasing dentry page */
+ F2FS_I(inode)->i_pino = dir->i_ino;
+
+ f2fs_put_page(page, 1);
+ mutex_unlock_op(sbi, DENTRY_OPS);
+}
+
+void init_dent_inode(struct dentry *dentry, struct page *ipage)
+{
+ struct f2fs_node *rn;
+
+ if (IS_ERR(ipage))
+ return;
+
+ wait_on_page_writeback(ipage);
+
+ /* copy dentry info. to this inode page */
+ rn = (struct f2fs_node *)page_address(ipage);
+ rn->i.i_namelen = cpu_to_le32(dentry->d_name.len);
+ memcpy(rn->i.i_name, dentry->d_name.name, dentry->d_name.len);
+ set_page_dirty(ipage);
+}
+
+static int init_inode_metadata(struct inode *inode, struct dentry *dentry)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+ int err;
+ err = new_inode_page(inode, dentry);
+ if (err)
+ return err;
+
+ if (S_ISDIR(inode->i_mode)) {
+ err = f2fs_make_empty(inode, dir);
+ if (err) {
+ remove_inode_page(inode);
+ return err;
+ }
+ }
+
+ err = f2fs_init_acl(inode, dir);
+ if (err) {
+ remove_inode_page(inode);
+ return err;
+ }
+ } else {
+ struct page *ipage;
+ ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+ init_dent_inode(dentry, ipage);
+ f2fs_put_page(ipage, 1);
+ }
+ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
+ inc_nlink(inode);
+ f2fs_write_inode(inode, NULL);
+ }
+ return 0;
+}
+
+static void update_parent_metadata(struct inode *dir, struct inode *inode,
+ unsigned int current_depth)
+{
+ bool need_dir_update = false;
+
+ if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+ if (S_ISDIR(inode->i_mode)) {
+ inc_nlink(dir);
+ need_dir_update = true;
+ }
+ clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
+ }
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ if (F2FS_I(dir)->i_current_depth != current_depth) {
+ F2FS_I(dir)->i_current_depth = current_depth;
+ need_dir_update = true;
+ }
+
+ if (need_dir_update)
+ f2fs_write_inode(dir, NULL);
+ else
+ mark_inode_dirty(dir);
+
+ if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
+ clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+}
+
+static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots)
+{
+ int bit_start = 0;
+ int zero_start, zero_end;
+next:
+ zero_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK,
+ bit_start);
+ if (zero_start >= NR_DENTRY_IN_BLOCK)
+ return NR_DENTRY_IN_BLOCK;
+
+ zero_end = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK,
+ zero_start);
+ if (zero_end - zero_start >= slots)
+ return zero_start;
+
+ bit_start = zero_end + 1;
+
+ if (zero_end + 1 >= NR_DENTRY_IN_BLOCK)
+ return NR_DENTRY_IN_BLOCK;
+ goto next;
+}
+
+int f2fs_add_link(struct dentry *dentry, struct inode *inode)
+{
+ unsigned int bit_pos;
+ unsigned int level;
+ unsigned int current_depth;
+ unsigned long bidx, block;
+ f2fs_hash_t dentry_hash;
+ struct f2fs_dir_entry *de;
+ unsigned int nbucket, nblock;
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct page *dentry_page = NULL;
+ struct f2fs_dentry_block *dentry_blk = NULL;
+ int slots = GET_DENTRY_SLOTS(namelen);
+ int err = 0;
+ int i;
+
+ dentry_hash = f2fs_dentry_hash(name, dentry->d_name.len);
+ level = 0;
+ current_depth = F2FS_I(dir)->i_current_depth;
+ if (F2FS_I(dir)->chash == dentry_hash) {
+ level = F2FS_I(dir)->clevel;
+ F2FS_I(dir)->chash = 0;
+ }
+
+start:
+ if (current_depth == MAX_DIR_HASH_DEPTH)
+ return -ENOSPC;
+
+ /* Increase the depth, if required */
+ if (level == current_depth)
+ ++current_depth;
+
+ nbucket = dir_buckets(level);
+ nblock = bucket_blocks(level);
+
+ bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
+
+ for (block = bidx; block <= (bidx + nblock - 1); block++) {
+ mutex_lock_op(sbi, DENTRY_OPS);
+ dentry_page = get_new_data_page(dir, block, true);
+ if (IS_ERR(dentry_page)) {
+ mutex_unlock_op(sbi, DENTRY_OPS);
+ return PTR_ERR(dentry_page);
+ }
+
+ dentry_blk = kmap(dentry_page);
+ bit_pos = room_for_filename(dentry_blk, slots);
+ if (bit_pos < NR_DENTRY_IN_BLOCK)
+ goto add_dentry;
+
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ mutex_unlock_op(sbi, DENTRY_OPS);
+ }
+
+ /* Move to next level to find the empty slot for new dentry */
+ ++level;
+ goto start;
+add_dentry:
+ err = init_inode_metadata(inode, dentry);
+ if (err)
+ goto fail;
+
+ wait_on_page_writeback(dentry_page);
+
+ de = &dentry_blk->dentry[bit_pos];
+ de->hash_code = dentry_hash;
+ de->name_len = cpu_to_le16(namelen);
+ memcpy(dentry_blk->filename[bit_pos], name, namelen);
+ de->ino = cpu_to_le32(inode->i_ino);
+ set_de_type(de, inode);
+ for (i = 0; i < slots; i++)
+ test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+ set_page_dirty(dentry_page);
+
+ update_parent_metadata(dir, inode, current_depth);
+
+ /* update parent inode number before releasing dentry page */
+ F2FS_I(inode)->i_pino = dir->i_ino;
+fail:
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ mutex_unlock_op(sbi, DENTRY_OPS);
+ return err;
+}
+
+/*
+ * It only removes the dentry from the dentry page,corresponding name
+ * entry in name page does not need to be touched during deletion.
+ */
+void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
+ struct inode *inode)
+{
+ struct f2fs_dentry_block *dentry_blk;
+ unsigned int bit_pos;
+ struct address_space *mapping = page->mapping;
+ struct inode *dir = mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
+ void *kaddr = page_address(page);
+ int i;
+
+ mutex_lock_op(sbi, DENTRY_OPS);
+
+ lock_page(page);
+ wait_on_page_writeback(page);
+
+ dentry_blk = (struct f2fs_dentry_block *)kaddr;
+ bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
+ for (i = 0; i < slots; i++)
+ test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+ /* Let's check and deallocate this dentry page */
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK,
+ 0);
+ kunmap(page); /* kunmap - pair of f2fs_find_entry */
+ set_page_dirty(page);
+
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+
+ if (inode && S_ISDIR(inode->i_mode)) {
+ drop_nlink(dir);
+ f2fs_write_inode(dir, NULL);
+ } else {
+ mark_inode_dirty(dir);
+ }
+
+ if (inode) {
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ drop_nlink(inode);
+ if (S_ISDIR(inode->i_mode)) {
+ drop_nlink(inode);
+ i_size_write(inode, 0);
+ }
+ f2fs_write_inode(inode, NULL);
+ if (inode->i_nlink == 0)
+ add_orphan_inode(sbi, inode->i_ino);
+ }
+
+ if (bit_pos == NR_DENTRY_IN_BLOCK) {
+ truncate_hole(dir, page->index, page->index + 1);
+ clear_page_dirty_for_io(page);
+ ClearPageUptodate(page);
+ dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_dec_dirty_dents(dir);
+ }
+ f2fs_put_page(page, 1);
+
+ mutex_unlock_op(sbi, DENTRY_OPS);
+}
+
+int f2fs_make_empty(struct inode *inode, struct inode *parent)
+{
+ struct page *dentry_page;
+ struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dir_entry *de;
+ void *kaddr;
+
+ dentry_page = get_new_data_page(inode, 0, true);
+ if (IS_ERR(dentry_page))
+ return PTR_ERR(dentry_page);
+
+ kaddr = kmap_atomic(dentry_page);
+ dentry_blk = (struct f2fs_dentry_block *)kaddr;
+
+ de = &dentry_blk->dentry[0];
+ de->name_len = cpu_to_le16(1);
+ de->hash_code = 0;
+ de->ino = cpu_to_le32(inode->i_ino);
+ memcpy(dentry_blk->filename[0], ".", 1);
+ set_de_type(de, inode);
+
+ de = &dentry_blk->dentry[1];
+ de->hash_code = 0;
+ de->name_len = cpu_to_le16(2);
+ de->ino = cpu_to_le32(parent->i_ino);
+ memcpy(dentry_blk->filename[1], "..", 2);
+ set_de_type(de, inode);
+
+ test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
+ test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
+ kunmap_atomic(kaddr);
+
+ set_page_dirty(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ return 0;
+}
+
+bool f2fs_empty_dir(struct inode *dir)
+{
+ unsigned long bidx;
+ struct page *dentry_page;
+ unsigned int bit_pos;
+ struct f2fs_dentry_block *dentry_blk;
+ unsigned long nblock = dir_blocks(dir);
+
+ for (bidx = 0; bidx < nblock; bidx++) {
+ void *kaddr;
+ dentry_page = get_lock_data_page(dir, bidx);
+ if (IS_ERR(dentry_page)) {
+ if (PTR_ERR(dentry_page) == -ENOENT)
+ continue;
+ else
+ return false;
+ }
+
+ kaddr = kmap_atomic(dentry_page);
+ dentry_blk = (struct f2fs_dentry_block *)kaddr;
+ if (bidx == 0)
+ bit_pos = 2;
+ else
+ bit_pos = 0;
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK,
+ bit_pos);
+ kunmap_atomic(kaddr);
+
+ f2fs_put_page(dentry_page, 1);
+
+ if (bit_pos < NR_DENTRY_IN_BLOCK)
+ return false;
+ }
+ return true;
+}
+
+static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+ unsigned long pos = file->f_pos;
+ struct inode *inode = file->f_dentry->d_inode;
+ unsigned long npages = dir_blocks(inode);
+ unsigned char *types = NULL;
+ unsigned int bit_pos = 0, start_bit_pos = 0;
+ int over = 0;
+ struct f2fs_dentry_block *dentry_blk = NULL;
+ struct f2fs_dir_entry *de = NULL;
+ struct page *dentry_page = NULL;
+ unsigned int n = 0;
+ unsigned char d_type = DT_UNKNOWN;
+ int slots;
+
+ types = f2fs_filetype_table;
+ bit_pos = (pos % NR_DENTRY_IN_BLOCK);
+ n = (pos / NR_DENTRY_IN_BLOCK);
+
+ for ( ; n < npages; n++) {
+ dentry_page = get_lock_data_page(inode, n);
+ if (IS_ERR(dentry_page))
+ continue;
+
+ start_bit_pos = bit_pos;
+ dentry_blk = kmap(dentry_page);
+ while (bit_pos < NR_DENTRY_IN_BLOCK) {
+ d_type = DT_UNKNOWN;
+ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
+ NR_DENTRY_IN_BLOCK,
+ bit_pos);
+ if (bit_pos >= NR_DENTRY_IN_BLOCK)
+ break;
+
+ de = &dentry_blk->dentry[bit_pos];
+ if (types && de->file_type < F2FS_FT_MAX)
+ d_type = types[de->file_type];
+
+ over = filldir(dirent,
+ dentry_blk->filename[bit_pos],
+ le16_to_cpu(de->name_len),
+ (n * NR_DENTRY_IN_BLOCK) + bit_pos,
+ le32_to_cpu(de->ino), d_type);
+ if (over) {
+ file->f_pos += bit_pos - start_bit_pos;
+ goto success;
+ }
+ slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
+ bit_pos += slots;
+ }
+ bit_pos = 0;
+ file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK;
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ dentry_page = NULL;
+ }
+success:
+ if (dentry_page && !IS_ERR(dentry_page)) {
+ kunmap(dentry_page);
+ f2fs_put_page(dentry_page, 1);
+ }
+
+ return 0;
+}
+
+const struct file_operations f2fs_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .readdir = f2fs_readdir,
+ .fsync = f2fs_sync_file,
+ .unlocked_ioctl = f2fs_ioctl,
+};
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
new file mode 100644
index 000000000000..a18d63db2fb6
--- /dev/null
+++ b/fs/f2fs/f2fs.h
@@ -0,0 +1,1083 @@
+/*
+ * fs/f2fs/f2fs.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef _LINUX_F2FS_H
+#define _LINUX_F2FS_H
+
+#include <linux/types.h>
+#include <linux/page-flags.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include <linux/magic.h>
+
+/*
+ * For mount options
+ */
+#define F2FS_MOUNT_BG_GC 0x00000001
+#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002
+#define F2FS_MOUNT_DISCARD 0x00000004
+#define F2FS_MOUNT_NOHEAP 0x00000008
+#define F2FS_MOUNT_XATTR_USER 0x00000010
+#define F2FS_MOUNT_POSIX_ACL 0x00000020
+#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
+
+#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
+#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
+#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option)
+
+#define ver_after(a, b) (typecheck(unsigned long long, a) && \
+ typecheck(unsigned long long, b) && \
+ ((long long)((a) - (b)) > 0))
+
+typedef u64 block_t;
+typedef u32 nid_t;
+
+struct f2fs_mount_info {
+ unsigned int opt;
+};
+
+static inline __u32 f2fs_crc32(void *buff, size_t len)
+{
+ return crc32_le(F2FS_SUPER_MAGIC, buff, len);
+}
+
+static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
+{
+ return f2fs_crc32(buff, buff_size) == blk_crc;
+}
+
+/*
+ * For checkpoint manager
+ */
+enum {
+ NAT_BITMAP,
+ SIT_BITMAP
+};
+
+/* for the list of orphan inodes */
+struct orphan_inode_entry {
+ struct list_head list; /* list head */
+ nid_t ino; /* inode number */
+};
+
+/* for the list of directory inodes */
+struct dir_inode_entry {
+ struct list_head list; /* list head */
+ struct inode *inode; /* vfs inode pointer */
+};
+
+/* for the list of fsync inodes, used only during recovery */
+struct fsync_inode_entry {
+ struct list_head list; /* list head */
+ struct inode *inode; /* vfs inode pointer */
+ block_t blkaddr; /* block address locating the last inode */
+};
+
+#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
+#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits))
+
+#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne)
+#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid)
+#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
+#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
+
+static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
+{
+ int before = nats_in_cursum(rs);
+ rs->n_nats = cpu_to_le16(before + i);
+ return before;
+}
+
+static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
+{
+ int before = sits_in_cursum(rs);
+ rs->n_sits = cpu_to_le16(before + i);
+ return before;
+}
+
+/*
+ * For INODE and NODE manager
+ */
+#define XATTR_NODE_OFFSET (-1) /*
+ * store xattrs to one node block per
+ * file keeping -1 as its node offset to
+ * distinguish from index node blocks.
+ */
+#define RDONLY_NODE 1 /*
+ * specify a read-only mode when getting
+ * a node block. 0 is read-write mode.
+ * used by get_dnode_of_data().
+ */
+#define F2FS_LINK_MAX 32000 /* maximum link count per file */
+
+/* for in-memory extent cache entry */
+struct extent_info {
+ rwlock_t ext_lock; /* rwlock for consistency */
+ unsigned int fofs; /* start offset in a file */
+ u32 blk_addr; /* start block address of the extent */
+ unsigned int len; /* lenth of the extent */
+};
+
+/*
+ * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
+ */
+#define FADVISE_COLD_BIT 0x01
+
+struct f2fs_inode_info {
+ struct inode vfs_inode; /* serve a vfs inode */
+ unsigned long i_flags; /* keep an inode flags for ioctl */
+ unsigned char i_advise; /* use to give file attribute hints */
+ unsigned int i_current_depth; /* use only in directory structure */
+ unsigned int i_pino; /* parent inode number */
+ umode_t i_acl_mode; /* keep file acl mode temporarily */
+
+ /* Use below internally in f2fs*/
+ unsigned long flags; /* use to pass per-file flags */
+ unsigned long long data_version;/* lastes version of data for fsync */
+ atomic_t dirty_dents; /* # of dirty dentry pages */
+ f2fs_hash_t chash; /* hash value of given file name */
+ unsigned int clevel; /* maximum level of given file name */
+ nid_t i_xattr_nid; /* node id that contains xattrs */
+ struct extent_info ext; /* in-memory extent cache entry */
+};
+
+static inline void get_extent_info(struct extent_info *ext,
+ struct f2fs_extent i_ext)
+{
+ write_lock(&ext->ext_lock);
+ ext->fofs = le32_to_cpu(i_ext.fofs);
+ ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
+ ext->len = le32_to_cpu(i_ext.len);
+ write_unlock(&ext->ext_lock);
+}
+
+static inline void set_raw_extent(struct extent_info *ext,
+ struct f2fs_extent *i_ext)
+{
+ read_lock(&ext->ext_lock);
+ i_ext->fofs = cpu_to_le32(ext->fofs);
+ i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
+ i_ext->len = cpu_to_le32(ext->len);
+ read_unlock(&ext->ext_lock);
+}
+
+struct f2fs_nm_info {
+ block_t nat_blkaddr; /* base disk address of NAT */
+ nid_t max_nid; /* maximum possible node ids */
+ nid_t init_scan_nid; /* the first nid to be scanned */
+ nid_t next_scan_nid; /* the next nid to be scanned */
+
+ /* NAT cache management */
+ struct radix_tree_root nat_root;/* root of the nat entry cache */
+ rwlock_t nat_tree_lock; /* protect nat_tree_lock */
+ unsigned int nat_cnt; /* the # of cached nat entries */
+ struct list_head nat_entries; /* cached nat entry list (clean) */
+ struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
+
+ /* free node ids management */
+ struct list_head free_nid_list; /* a list for free nids */
+ spinlock_t free_nid_list_lock; /* protect free nid list */
+ unsigned int fcnt; /* the number of free node id */
+ struct mutex build_lock; /* lock for build free nids */
+
+ /* for checkpoint */
+ char *nat_bitmap; /* NAT bitmap pointer */
+ int bitmap_size; /* bitmap size */
+};
+
+/*
+ * this structure is used as one of function parameters.
+ * all the information are dedicated to a given direct node block determined
+ * by the data offset in a file.
+ */
+struct dnode_of_data {
+ struct inode *inode; /* vfs inode pointer */
+ struct page *inode_page; /* its inode page, NULL is possible */
+ struct page *node_page; /* cached direct node page */
+ nid_t nid; /* node id of the direct node block */
+ unsigned int ofs_in_node; /* data offset in the node page */
+ bool inode_page_locked; /* inode page is locked or not */
+ block_t data_blkaddr; /* block address of the node block */
+};
+
+static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
+ struct page *ipage, struct page *npage, nid_t nid)
+{
+ dn->inode = inode;
+ dn->inode_page = ipage;
+ dn->node_page = npage;
+ dn->nid = nid;
+ dn->inode_page_locked = 0;
+}
+
+/*
+ * For SIT manager
+ *
+ * By default, there are 6 active log areas across the whole main area.
+ * When considering hot and cold data separation to reduce cleaning overhead,
+ * we split 3 for data logs and 3 for node logs as hot, warm, and cold types,
+ * respectively.
+ * In the current design, you should not change the numbers intentionally.
+ * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6
+ * logs individually according to the underlying devices. (default: 6)
+ * Just in case, on-disk layout covers maximum 16 logs that consist of 8 for
+ * data and 8 for node logs.
+ */
+#define NR_CURSEG_DATA_TYPE (3)
+#define NR_CURSEG_NODE_TYPE (3)
+#define NR_CURSEG_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
+
+enum {
+ CURSEG_HOT_DATA = 0, /* directory entry blocks */
+ CURSEG_WARM_DATA, /* data blocks */
+ CURSEG_COLD_DATA, /* multimedia or GCed data blocks */
+ CURSEG_HOT_NODE, /* direct node blocks of directory files */
+ CURSEG_WARM_NODE, /* direct node blocks of normal files */
+ CURSEG_COLD_NODE, /* indirect node blocks */
+ NO_CHECK_TYPE
+};
+
+struct f2fs_sm_info {
+ struct sit_info *sit_info; /* whole segment information */
+ struct free_segmap_info *free_info; /* free segment information */
+ struct dirty_seglist_info *dirty_info; /* dirty segment information */
+ struct curseg_info *curseg_array; /* active segment information */
+
+ struct list_head wblist_head; /* list of under-writeback pages */
+ spinlock_t wblist_lock; /* lock for checkpoint */
+
+ block_t seg0_blkaddr; /* block address of 0'th segment */
+ block_t main_blkaddr; /* start block address of main area */
+ block_t ssa_blkaddr; /* start block address of SSA area */
+
+ unsigned int segment_count; /* total # of segments */
+ unsigned int main_segments; /* # of segments in main area */
+ unsigned int reserved_segments; /* # of reserved segments */
+ unsigned int ovp_segments; /* # of overprovision segments */
+};
+
+/*
+ * For directory operation
+ */
+#define NODE_DIR1_BLOCK (ADDRS_PER_INODE + 1)
+#define NODE_DIR2_BLOCK (ADDRS_PER_INODE + 2)
+#define NODE_IND1_BLOCK (ADDRS_PER_INODE + 3)
+#define NODE_IND2_BLOCK (ADDRS_PER_INODE + 4)
+#define NODE_DIND_BLOCK (ADDRS_PER_INODE + 5)
+
+/*
+ * For superblock
+ */
+/*
+ * COUNT_TYPE for monitoring
+ *
+ * f2fs monitors the number of several block types such as on-writeback,
+ * dirty dentry blocks, dirty node blocks, and dirty meta blocks.
+ */
+enum count_type {
+ F2FS_WRITEBACK,
+ F2FS_DIRTY_DENTS,
+ F2FS_DIRTY_NODES,
+ F2FS_DIRTY_META,
+ NR_COUNT_TYPE,
+};
+
+/*
+ * FS_LOCK nesting subclasses for the lock validator:
+ *
+ * The locking order between these classes is
+ * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW
+ * -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC
+ */
+enum lock_type {
+ RENAME, /* for renaming operations */
+ DENTRY_OPS, /* for directory operations */
+ DATA_WRITE, /* for data write */
+ DATA_NEW, /* for data allocation */
+ DATA_TRUNC, /* for data truncate */
+ NODE_NEW, /* for node allocation */
+ NODE_TRUNC, /* for node truncate */
+ NODE_WRITE, /* for node write */
+ NR_LOCK_TYPE,
+};
+
+/*
+ * The below are the page types of bios used in submti_bio().
+ * The available types are:
+ * DATA User data pages. It operates as async mode.
+ * NODE Node pages. It operates as async mode.
+ * META FS metadata pages such as SIT, NAT, CP.
+ * NR_PAGE_TYPE The number of page types.
+ * META_FLUSH Make sure the previous pages are written
+ * with waiting the bio's completion
+ * ... Only can be used with META.
+ */
+enum page_type {
+ DATA,
+ NODE,
+ META,
+ NR_PAGE_TYPE,
+ META_FLUSH,
+};
+
+struct f2fs_sb_info {
+ struct super_block *sb; /* pointer to VFS super block */
+ struct buffer_head *raw_super_buf; /* buffer head of raw sb */
+ struct f2fs_super_block *raw_super; /* raw super block pointer */
+ int s_dirty; /* dirty flag for checkpoint */
+
+ /* for node-related operations */
+ struct f2fs_nm_info *nm_info; /* node manager */
+ struct inode *node_inode; /* cache node blocks */
+
+ /* for segment-related operations */
+ struct f2fs_sm_info *sm_info; /* segment manager */
+ struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */
+ sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */
+ struct rw_semaphore bio_sem; /* IO semaphore */
+
+ /* for checkpoint */
+ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
+ struct inode *meta_inode; /* cache meta blocks */
+ struct mutex cp_mutex; /* for checkpoint procedure */
+ struct mutex fs_lock[NR_LOCK_TYPE]; /* for blocking FS operations */
+ struct mutex write_inode; /* mutex for write inode */
+ struct mutex writepages; /* mutex for writepages() */
+ int por_doing; /* recovery is doing or not */
+
+ /* for orphan inode management */
+ struct list_head orphan_inode_list; /* orphan inode list */
+ struct mutex orphan_inode_mutex; /* for orphan inode list */
+ unsigned int n_orphans; /* # of orphan inodes */
+
+ /* for directory inode management */
+ struct list_head dir_inode_list; /* dir inode list */
+ spinlock_t dir_inode_lock; /* for dir inode list lock */
+ unsigned int n_dirty_dirs; /* # of dir inodes */
+
+ /* basic file system units */
+ unsigned int log_sectors_per_block; /* log2 sectors per block */
+ unsigned int log_blocksize; /* log2 block size */
+ unsigned int blocksize; /* block size */
+ unsigned int root_ino_num; /* root inode number*/
+ unsigned int node_ino_num; /* node inode number*/
+ unsigned int meta_ino_num; /* meta inode number*/
+ unsigned int log_blocks_per_seg; /* log2 blocks per segment */
+ unsigned int blocks_per_seg; /* blocks per segment */
+ unsigned int segs_per_sec; /* segments per section */
+ unsigned int secs_per_zone; /* sections per zone */
+ unsigned int total_sections; /* total section count */
+ unsigned int total_node_count; /* total node block count */
+ unsigned int total_valid_node_count; /* valid node block count */
+ unsigned int total_valid_inode_count; /* valid inode count */
+ int active_logs; /* # of active logs */
+
+ block_t user_block_count; /* # of user blocks */
+ block_t total_valid_block_count; /* # of valid blocks */
+ block_t alloc_valid_block_count; /* # of allocated blocks */
+ block_t last_valid_block_count; /* for recovery */
+ u32 s_next_generation; /* for NFS support */
+ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */
+
+ struct f2fs_mount_info mount_opt; /* mount options */
+
+ /* for cleaning operations */
+ struct mutex gc_mutex; /* mutex for GC */
+ struct f2fs_gc_kthread *gc_thread; /* GC thread */
+
+ /*
+ * for stat information.
+ * one is for the LFS mode, and the other is for the SSR mode.
+ */
+ struct f2fs_stat_info *stat_info; /* FS status information */
+ unsigned int segment_count[2]; /* # of allocated segments */
+ unsigned int block_count[2]; /* # of allocated blocks */
+ unsigned int last_victim[2]; /* last victim segment # */
+ int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
+ int bg_gc; /* background gc calls */
+ spinlock_t stat_lock; /* lock for stat operations */
+};
+
+/*
+ * Inline functions
+ */
+static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
+{
+ return container_of(inode, struct f2fs_inode_info, vfs_inode);
+}
+
+static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
+{
+ return (struct f2fs_super_block *)(sbi->raw_super);
+}
+
+static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
+{
+ return (struct f2fs_checkpoint *)(sbi->ckpt);
+}
+
+static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
+{
+ return (struct f2fs_nm_info *)(sbi->nm_info);
+}
+
+static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi)
+{
+ return (struct f2fs_sm_info *)(sbi->sm_info);
+}
+
+static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi)
+{
+ return (struct sit_info *)(SM_I(sbi)->sit_info);
+}
+
+static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi)
+{
+ return (struct free_segmap_info *)(SM_I(sbi)->free_info);
+}
+
+static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
+{
+ return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info);
+}
+
+static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
+{
+ sbi->s_dirty = 1;
+}
+
+static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
+{
+ sbi->s_dirty = 0;
+}
+
+static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+ unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+ return ckpt_flags & f;
+}
+
+static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+ unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+ ckpt_flags |= f;
+ cp->ckpt_flags = cpu_to_le32(ckpt_flags);
+}
+
+static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
+{
+ unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
+ ckpt_flags &= (~f);
+ cp->ckpt_flags = cpu_to_le32(ckpt_flags);
+}
+
+static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+{
+ mutex_lock_nested(&sbi->fs_lock[t], t);
+}
+
+static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t)
+{
+ mutex_unlock(&sbi->fs_lock[t]);
+}
+
+/*
+ * Check whether the given nid is within node id range.
+ */
+static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ BUG_ON((nid >= NM_I(sbi)->max_nid));
+}
+
+#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1
+
+/*
+ * Check whether the inode has blocks or not
+ */
+static inline int F2FS_HAS_BLOCKS(struct inode *inode)
+{
+ if (F2FS_I(inode)->i_xattr_nid)
+ return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1);
+ else
+ return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS);
+}
+
+static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
+ struct inode *inode, blkcnt_t count)
+{
+ block_t valid_block_count;
+
+ spin_lock(&sbi->stat_lock);
+ valid_block_count =
+ sbi->total_valid_block_count + (block_t)count;
+ if (valid_block_count > sbi->user_block_count) {
+ spin_unlock(&sbi->stat_lock);
+ return false;
+ }
+ inode->i_blocks += count;
+ sbi->total_valid_block_count = valid_block_count;
+ sbi->alloc_valid_block_count += (block_t)count;
+ spin_unlock(&sbi->stat_lock);
+ return true;
+}
+
+static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
+ struct inode *inode,
+ blkcnt_t count)
+{
+ spin_lock(&sbi->stat_lock);
+ BUG_ON(sbi->total_valid_block_count < (block_t) count);
+ BUG_ON(inode->i_blocks < count);
+ inode->i_blocks -= count;
+ sbi->total_valid_block_count -= (block_t)count;
+ spin_unlock(&sbi->stat_lock);
+ return 0;
+}
+
+static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
+{
+ atomic_inc(&sbi->nr_pages[count_type]);
+ F2FS_SET_SB_DIRT(sbi);
+}
+
+static inline void inode_inc_dirty_dents(struct inode *inode)
+{
+ atomic_inc(&F2FS_I(inode)->dirty_dents);
+}
+
+static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
+{
+ atomic_dec(&sbi->nr_pages[count_type]);
+}
+
+static inline void inode_dec_dirty_dents(struct inode *inode)
+{
+ atomic_dec(&F2FS_I(inode)->dirty_dents);
+}
+
+static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
+{
+ return atomic_read(&sbi->nr_pages[count_type]);
+}
+
+static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
+{
+ block_t ret;
+ spin_lock(&sbi->stat_lock);
+ ret = sbi->total_valid_block_count;
+ spin_unlock(&sbi->stat_lock);
+ return ret;
+}
+
+static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+
+ /* return NAT or SIT bitmap */
+ if (flag == NAT_BITMAP)
+ return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize);
+ else if (flag == SIT_BITMAP)
+ return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
+
+ return 0;
+}
+
+static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ int offset = (flag == NAT_BITMAP) ?
+ le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
+ return &ckpt->sit_nat_version_bitmap + offset;
+}
+
+static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
+{
+ block_t start_addr;
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver);
+
+ start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
+
+ /*
+ * odd numbered checkpoint should at cp segment 0
+ * and even segent must be at cp segment 1
+ */
+ if (!(ckpt_version & 1))
+ start_addr += sbi->blocks_per_seg;
+
+ return start_addr;
+}
+
+static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
+{
+ return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
+}
+
+static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
+ struct inode *inode,
+ unsigned int count)
+{
+ block_t valid_block_count;
+ unsigned int valid_node_count;
+
+ spin_lock(&sbi->stat_lock);
+
+ valid_block_count = sbi->total_valid_block_count + (block_t)count;
+ sbi->alloc_valid_block_count += (block_t)count;
+ valid_node_count = sbi->total_valid_node_count + count;
+
+ if (valid_block_count > sbi->user_block_count) {
+ spin_unlock(&sbi->stat_lock);
+ return false;
+ }
+
+ if (valid_node_count > sbi->total_node_count) {
+ spin_unlock(&sbi->stat_lock);
+ return false;
+ }
+
+ if (inode)
+ inode->i_blocks += count;
+ sbi->total_valid_node_count = valid_node_count;
+ sbi->total_valid_block_count = valid_block_count;
+ spin_unlock(&sbi->stat_lock);
+
+ return true;
+}
+
+static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
+ struct inode *inode,
+ unsigned int count)
+{
+ spin_lock(&sbi->stat_lock);
+
+ BUG_ON(sbi->total_valid_block_count < count);
+ BUG_ON(sbi->total_valid_node_count < count);
+ BUG_ON(inode->i_blocks < count);
+
+ inode->i_blocks -= count;
+ sbi->total_valid_node_count -= count;
+ sbi->total_valid_block_count -= (block_t)count;
+
+ spin_unlock(&sbi->stat_lock);
+}
+
+static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
+{
+ unsigned int ret;
+ spin_lock(&sbi->stat_lock);
+ ret = sbi->total_valid_node_count;
+ spin_unlock(&sbi->stat_lock);
+ return ret;
+}
+
+static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
+{
+ spin_lock(&sbi->stat_lock);
+ BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count);
+ sbi->total_valid_inode_count++;
+ spin_unlock(&sbi->stat_lock);
+}
+
+static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi)
+{
+ spin_lock(&sbi->stat_lock);
+ BUG_ON(!sbi->total_valid_inode_count);
+ sbi->total_valid_inode_count--;
+ spin_unlock(&sbi->stat_lock);
+ return 0;
+}
+
+static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
+{
+ unsigned int ret;
+ spin_lock(&sbi->stat_lock);
+ ret = sbi->total_valid_inode_count;
+ spin_unlock(&sbi->stat_lock);
+ return ret;
+}
+
+static inline void f2fs_put_page(struct page *page, int unlock)
+{
+ if (!page || IS_ERR(page))
+ return;
+
+ if (unlock) {
+ BUG_ON(!PageLocked(page));
+ unlock_page(page);
+ }
+ page_cache_release(page);
+}
+
+static inline void f2fs_put_dnode(struct dnode_of_data *dn)
+{
+ if (dn->node_page)
+ f2fs_put_page(dn->node_page, 1);
+ if (dn->inode_page && dn->node_page != dn->inode_page)
+ f2fs_put_page(dn->inode_page, 0);
+ dn->node_page = NULL;
+ dn->inode_page = NULL;
+}
+
+static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
+ size_t size, void (*ctor)(void *))
+{
+ return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
+}
+
+#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino)
+
+static inline bool IS_INODE(struct page *page)
+{
+ struct f2fs_node *p = (struct f2fs_node *)page_address(page);
+ return RAW_IS_INODE(p);
+}
+
+static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
+{
+ return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr;
+}
+
+static inline block_t datablock_addr(struct page *node_page,
+ unsigned int offset)
+{
+ struct f2fs_node *raw_node;
+ __le32 *addr_array;
+ raw_node = (struct f2fs_node *)page_address(node_page);
+ addr_array = blkaddr_in_node(raw_node);
+ return le32_to_cpu(addr_array[offset]);
+}
+
+static inline int f2fs_test_bit(unsigned int nr, char *addr)
+{
+ int mask;
+
+ addr += (nr >> 3);
+ mask = 1 << (7 - (nr & 0x07));
+ return mask & *addr;
+}
+
+static inline int f2fs_set_bit(unsigned int nr, char *addr)
+{
+ int mask;
+ int ret;
+
+ addr += (nr >> 3);
+ mask = 1 << (7 - (nr & 0x07));
+ ret = mask & *addr;
+ *addr |= mask;
+ return ret;
+}
+
+static inline int f2fs_clear_bit(unsigned int nr, char *addr)
+{
+ int mask;
+ int ret;
+
+ addr += (nr >> 3);
+ mask = 1 << (7 - (nr & 0x07));
+ ret = mask & *addr;
+ *addr &= ~mask;
+ return ret;
+}
+
+/* used for f2fs_inode_info->flags */
+enum {
+ FI_NEW_INODE, /* indicate newly allocated inode */
+ FI_NEED_CP, /* need to do checkpoint during fsync */
+ FI_INC_LINK, /* need to increment i_nlink */
+ FI_ACL_MODE, /* indicate acl mode */
+ FI_NO_ALLOC, /* should not allocate any blocks */
+};
+
+static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+ set_bit(flag, &fi->flags);
+}
+
+static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
+{
+ return test_bit(flag, &fi->flags);
+}
+
+static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+ clear_bit(flag, &fi->flags);
+}
+
+static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
+{
+ fi->i_acl_mode = mode;
+ set_inode_flag(fi, FI_ACL_MODE);
+}
+
+static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
+{
+ if (is_inode_flag_set(fi, FI_ACL_MODE)) {
+ clear_inode_flag(fi, FI_ACL_MODE);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * file.c
+ */
+int f2fs_sync_file(struct file *, loff_t, loff_t, int);
+void truncate_data_blocks(struct dnode_of_data *);
+void f2fs_truncate(struct inode *);
+int f2fs_setattr(struct dentry *, struct iattr *);
+int truncate_hole(struct inode *, pgoff_t, pgoff_t);
+long f2fs_ioctl(struct file *, unsigned int, unsigned long);
+
+/*
+ * inode.c
+ */
+void f2fs_set_inode_flags(struct inode *);
+struct inode *f2fs_iget_nowait(struct super_block *, unsigned long);
+struct inode *f2fs_iget(struct super_block *, unsigned long);
+void update_inode(struct inode *, struct page *);
+int f2fs_write_inode(struct inode *, struct writeback_control *);
+void f2fs_evict_inode(struct inode *);
+
+/*
+ * namei.c
+ */
+struct dentry *f2fs_get_parent(struct dentry *child);
+
+/*
+ * dir.c
+ */
+struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *,
+ struct page **);
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
+ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
+void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
+ struct page *, struct inode *);
+void init_dent_inode(struct dentry *, struct page *);
+int f2fs_add_link(struct dentry *, struct inode *);
+void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
+int f2fs_make_empty(struct inode *, struct inode *);
+bool f2fs_empty_dir(struct inode *);
+
+/*
+ * super.c
+ */
+int f2fs_sync_fs(struct super_block *, int);
+
+/*
+ * hash.c
+ */
+f2fs_hash_t f2fs_dentry_hash(const char *, int);
+
+/*
+ * node.c
+ */
+struct dnode_of_data;
+struct node_info;
+
+int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
+int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
+int truncate_inode_blocks(struct inode *, pgoff_t);
+int remove_inode_page(struct inode *);
+int new_inode_page(struct inode *, struct dentry *);
+struct page *new_node_page(struct dnode_of_data *, unsigned int);
+void ra_node_page(struct f2fs_sb_info *, nid_t);
+struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_node_page_ra(struct page *, int);
+void sync_inode_page(struct dnode_of_data *);
+int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
+bool alloc_nid(struct f2fs_sb_info *, nid_t *);
+void alloc_nid_done(struct f2fs_sb_info *, nid_t);
+void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
+void recover_node_page(struct f2fs_sb_info *, struct page *,
+ struct f2fs_summary *, struct node_info *, block_t);
+int recover_inode_page(struct f2fs_sb_info *, struct page *);
+int restore_node_summary(struct f2fs_sb_info *, unsigned int,
+ struct f2fs_summary_block *);
+void flush_nat_entries(struct f2fs_sb_info *);
+int build_node_manager(struct f2fs_sb_info *);
+void destroy_node_manager(struct f2fs_sb_info *);
+int create_node_manager_caches(void);
+void destroy_node_manager_caches(void);
+
+/*
+ * segment.c
+ */
+void f2fs_balance_fs(struct f2fs_sb_info *);
+void invalidate_blocks(struct f2fs_sb_info *, block_t);
+void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
+void clear_prefree_segments(struct f2fs_sb_info *);
+int npages_for_summary_flush(struct f2fs_sb_info *);
+void allocate_new_segments(struct f2fs_sb_info *);
+struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
+struct bio *f2fs_bio_alloc(struct block_device *, int);
+void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync);
+int write_meta_page(struct f2fs_sb_info *, struct page *,
+ struct writeback_control *);
+void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
+ block_t, block_t *);
+void write_data_page(struct inode *, struct page *, struct dnode_of_data*,
+ block_t, block_t *);
+void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t);
+void recover_data_page(struct f2fs_sb_info *, struct page *,
+ struct f2fs_summary *, block_t, block_t);
+void rewrite_node_page(struct f2fs_sb_info *, struct page *,
+ struct f2fs_summary *, block_t, block_t);
+void write_data_summaries(struct f2fs_sb_info *, block_t);
+void write_node_summaries(struct f2fs_sb_info *, block_t);
+int lookup_journal_in_cursum(struct f2fs_summary_block *,
+ int, unsigned int, int);
+void flush_sit_entries(struct f2fs_sb_info *);
+int build_segment_manager(struct f2fs_sb_info *);
+void reset_victim_segmap(struct f2fs_sb_info *);
+void destroy_segment_manager(struct f2fs_sb_info *);
+
+/*
+ * checkpoint.c
+ */
+struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
+int check_orphan_space(struct f2fs_sb_info *);
+void add_orphan_inode(struct f2fs_sb_info *, nid_t);
+void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
+int recover_orphan_inodes(struct f2fs_sb_info *);
+int get_valid_checkpoint(struct f2fs_sb_info *);
+void set_dirty_dir_page(struct inode *, struct page *);
+void remove_dirty_dir_inode(struct inode *);
+void sync_dirty_dir_inodes(struct f2fs_sb_info *);
+void block_operations(struct f2fs_sb_info *);
+void write_checkpoint(struct f2fs_sb_info *, bool, bool);
+void init_orphan_info(struct f2fs_sb_info *);
+int create_checkpoint_caches(void);
+void destroy_checkpoint_caches(void);
+
+/*
+ * data.c
+ */
+int reserve_new_block(struct dnode_of_data *);
+void update_extent_cache(block_t, struct dnode_of_data *);
+struct page *find_data_page(struct inode *, pgoff_t);
+struct page *get_lock_data_page(struct inode *, pgoff_t);
+struct page *get_new_data_page(struct inode *, pgoff_t, bool);
+int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
+int do_write_data_page(struct page *);
+
+/*
+ * gc.c
+ */
+int start_gc_thread(struct f2fs_sb_info *);
+void stop_gc_thread(struct f2fs_sb_info *);
+block_t start_bidx_of_node(unsigned int);
+int f2fs_gc(struct f2fs_sb_info *, int);
+void build_gc_manager(struct f2fs_sb_info *);
+int create_gc_caches(void);
+void destroy_gc_caches(void);
+
+/*
+ * recovery.c
+ */
+void recover_fsync_data(struct f2fs_sb_info *);
+bool space_for_roll_forward(struct f2fs_sb_info *);
+
+/*
+ * debug.c
+ */
+#ifdef CONFIG_F2FS_STAT_FS
+struct f2fs_stat_info {
+ struct list_head stat_list;
+ struct f2fs_sb_info *sbi;
+ struct mutex stat_lock;
+ int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
+ int main_area_segs, main_area_sections, main_area_zones;
+ int hit_ext, total_ext;
+ int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
+ int nats, sits, fnids;
+ int total_count, utilization;
+ int bg_gc;
+ unsigned int valid_count, valid_node_count, valid_inode_count;
+ unsigned int bimodal, avg_vblocks;
+ int util_free, util_valid, util_invalid;
+ int rsvd_segs, overp_segs;
+ int dirty_count, node_pages, meta_pages;
+ int prefree_count, call_count;
+ int tot_segs, node_segs, data_segs, free_segs, free_secs;
+ int tot_blks, data_blks, node_blks;
+ int curseg[NR_CURSEG_TYPE];
+ int cursec[NR_CURSEG_TYPE];
+ int curzone[NR_CURSEG_TYPE];
+
+ unsigned int segment_count[2];
+ unsigned int block_count[2];
+ unsigned base_mem, cache_mem;
+};
+
+#define stat_inc_call_count(si) ((si)->call_count++)
+
+#define stat_inc_seg_count(sbi, type) \
+ do { \
+ struct f2fs_stat_info *si = sbi->stat_info; \
+ (si)->tot_segs++; \
+ if (type == SUM_TYPE_DATA) \
+ si->data_segs++; \
+ else \
+ si->node_segs++; \
+ } while (0)
+
+#define stat_inc_tot_blk_count(si, blks) \
+ (si->tot_blks += (blks))
+
+#define stat_inc_data_blk_count(sbi, blks) \
+ do { \
+ struct f2fs_stat_info *si = sbi->stat_info; \
+ stat_inc_tot_blk_count(si, blks); \
+ si->data_blks += (blks); \
+ } while (0)
+
+#define stat_inc_node_blk_count(sbi, blks) \
+ do { \
+ struct f2fs_stat_info *si = sbi->stat_info; \
+ stat_inc_tot_blk_count(si, blks); \
+ si->node_blks += (blks); \
+ } while (0)
+
+int f2fs_build_stats(struct f2fs_sb_info *);
+void f2fs_destroy_stats(struct f2fs_sb_info *);
+void destroy_root_stats(void);
+#else
+#define stat_inc_call_count(si)
+#define stat_inc_seg_count(si, type)
+#define stat_inc_tot_blk_count(si, blks)
+#define stat_inc_data_blk_count(si, blks)
+#define stat_inc_node_blk_count(sbi, blks)
+
+static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
+static inline void destroy_root_stats(void) { }
+#endif
+
+extern const struct file_operations f2fs_dir_operations;
+extern const struct file_operations f2fs_file_operations;
+extern const struct inode_operations f2fs_file_inode_operations;
+extern const struct address_space_operations f2fs_dblock_aops;
+extern const struct address_space_operations f2fs_node_aops;
+extern const struct address_space_operations f2fs_meta_aops;
+extern const struct inode_operations f2fs_dir_inode_operations;
+extern const struct inode_operations f2fs_symlink_inode_operations;
+extern const struct inode_operations f2fs_special_inode_operations;
+#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
new file mode 100644
index 000000000000..f9e085dfb1f0
--- /dev/null
+++ b/fs/f2fs/file.c
@@ -0,0 +1,636 @@
+/*
+ * fs/f2fs/file.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/stat.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/mount.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "xattr.h"
+#include "acl.h"
+
+static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ block_t old_blk_addr;
+ struct dnode_of_data dn;
+ int err;
+
+ f2fs_balance_fs(sbi);
+
+ sb_start_pagefault(inode->i_sb);
+
+ mutex_lock_op(sbi, DATA_NEW);
+
+ /* block allocation */
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, page->index, 0);
+ if (err) {
+ mutex_unlock_op(sbi, DATA_NEW);
+ goto out;
+ }
+
+ old_blk_addr = dn.data_blkaddr;
+
+ if (old_blk_addr == NULL_ADDR) {
+ err = reserve_new_block(&dn);
+ if (err) {
+ f2fs_put_dnode(&dn);
+ mutex_unlock_op(sbi, DATA_NEW);
+ goto out;
+ }
+ }
+ f2fs_put_dnode(&dn);
+
+ mutex_unlock_op(sbi, DATA_NEW);
+
+ lock_page(page);
+ if (page->mapping != inode->i_mapping ||
+ page_offset(page) >= i_size_read(inode) ||
+ !PageUptodate(page)) {
+ unlock_page(page);
+ err = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * check to see if the page is mapped already (no holes)
+ */
+ if (PageMappedToDisk(page))
+ goto out;
+
+ /* fill the page */
+ wait_on_page_writeback(page);
+
+ /* page is wholly or partially inside EOF */
+ if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
+ unsigned offset;
+ offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
+ zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+ }
+ set_page_dirty(page);
+ SetPageUptodate(page);
+
+ file_update_time(vma->vm_file);
+out:
+ sb_end_pagefault(inode->i_sb);
+ return block_page_mkwrite_return(err);
+}
+
+static const struct vm_operations_struct f2fs_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = f2fs_vm_page_mkwrite,
+};
+
+static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode)
+{
+ struct dentry *dentry;
+ nid_t pino;
+
+ inode = igrab(inode);
+ dentry = d_find_any_alias(inode);
+ if (!dentry) {
+ iput(inode);
+ return 0;
+ }
+ pino = dentry->d_parent->d_inode->i_ino;
+ dput(dentry);
+ iput(inode);
+ return !is_checkpointed_node(sbi, pino);
+}
+
+int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ unsigned long long cur_version;
+ int ret = 0;
+ bool need_cp = false;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .for_reclaim = 0,
+ };
+
+ if (inode->i_sb->s_flags & MS_RDONLY)
+ return 0;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (ret)
+ return ret;
+
+ mutex_lock(&inode->i_mutex);
+
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ goto out;
+
+ mutex_lock(&sbi->cp_mutex);
+ cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
+ mutex_unlock(&sbi->cp_mutex);
+
+ if (F2FS_I(inode)->data_version != cur_version &&
+ !(inode->i_state & I_DIRTY))
+ goto out;
+ F2FS_I(inode)->data_version--;
+
+ if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
+ need_cp = true;
+ if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
+ need_cp = true;
+ if (!space_for_roll_forward(sbi))
+ need_cp = true;
+ if (need_to_sync_dir(sbi, inode))
+ need_cp = true;
+
+ f2fs_write_inode(inode, NULL);
+
+ if (need_cp) {
+ /* all the dirty node pages should be flushed for POR */
+ ret = f2fs_sync_fs(inode->i_sb, 1);
+ clear_inode_flag(F2FS_I(inode), FI_NEED_CP);
+ } else {
+ while (sync_node_pages(sbi, inode->i_ino, &wbc) == 0)
+ f2fs_write_inode(inode, NULL);
+ filemap_fdatawait_range(sbi->node_inode->i_mapping,
+ 0, LONG_MAX);
+ }
+out:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
+
+static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+ vma->vm_ops = &f2fs_file_vm_ops;
+ return 0;
+}
+
+static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
+{
+ int nr_free = 0, ofs = dn->ofs_in_node;
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_node *raw_node;
+ __le32 *addr;
+
+ raw_node = page_address(dn->node_page);
+ addr = blkaddr_in_node(raw_node) + ofs;
+
+ for ( ; count > 0; count--, addr++, dn->ofs_in_node++) {
+ block_t blkaddr = le32_to_cpu(*addr);
+ if (blkaddr == NULL_ADDR)
+ continue;
+
+ update_extent_cache(NULL_ADDR, dn);
+ invalidate_blocks(sbi, blkaddr);
+ dec_valid_block_count(sbi, dn->inode, 1);
+ nr_free++;
+ }
+ if (nr_free) {
+ set_page_dirty(dn->node_page);
+ sync_inode_page(dn);
+ }
+ dn->ofs_in_node = ofs;
+ return nr_free;
+}
+
+void truncate_data_blocks(struct dnode_of_data *dn)
+{
+ truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
+}
+
+static void truncate_partial_data_page(struct inode *inode, u64 from)
+{
+ unsigned offset = from & (PAGE_CACHE_SIZE - 1);
+ struct page *page;
+
+ if (!offset)
+ return;
+
+ page = find_data_page(inode, from >> PAGE_CACHE_SHIFT);
+ if (IS_ERR(page))
+ return;
+
+ lock_page(page);
+ wait_on_page_writeback(page);
+ zero_user(page, offset, PAGE_CACHE_SIZE - offset);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+}
+
+static int truncate_blocks(struct inode *inode, u64 from)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ unsigned int blocksize = inode->i_sb->s_blocksize;
+ struct dnode_of_data dn;
+ pgoff_t free_from;
+ int count = 0;
+ int err;
+
+ free_from = (pgoff_t)
+ ((from + blocksize - 1) >> (sbi->log_blocksize));
+
+ mutex_lock_op(sbi, DATA_TRUNC);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, free_from, RDONLY_NODE);
+ if (err) {
+ if (err == -ENOENT)
+ goto free_next;
+ mutex_unlock_op(sbi, DATA_TRUNC);
+ return err;
+ }
+
+ if (IS_INODE(dn.node_page))
+ count = ADDRS_PER_INODE;
+ else
+ count = ADDRS_PER_BLOCK;
+
+ count -= dn.ofs_in_node;
+ BUG_ON(count < 0);
+ if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
+ truncate_data_blocks_range(&dn, count);
+ free_from += count;
+ }
+
+ f2fs_put_dnode(&dn);
+free_next:
+ err = truncate_inode_blocks(inode, free_from);
+ mutex_unlock_op(sbi, DATA_TRUNC);
+
+ /* lastly zero out the first data page */
+ truncate_partial_data_page(inode, from);
+
+ return err;
+}
+
+void f2fs_truncate(struct inode *inode)
+{
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return;
+
+ if (!truncate_blocks(inode, i_size_read(inode))) {
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+ }
+
+ f2fs_balance_fs(F2FS_SB(inode->i_sb));
+}
+
+static int f2fs_getattr(struct vfsmount *mnt,
+ struct dentry *dentry, struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+ stat->blocks <<= 3;
+ return 0;
+}
+
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+static void __setattr_copy(struct inode *inode, const struct iattr *attr)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ unsigned int ia_valid = attr->ia_valid;
+
+ if (ia_valid & ATTR_UID)
+ inode->i_uid = attr->ia_uid;
+ if (ia_valid & ATTR_GID)
+ inode->i_gid = attr->ia_gid;
+ if (ia_valid & ATTR_ATIME)
+ inode->i_atime = timespec_trunc(attr->ia_atime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_MTIME)
+ inode->i_mtime = timespec_trunc(attr->ia_mtime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_CTIME)
+ inode->i_ctime = timespec_trunc(attr->ia_ctime,
+ inode->i_sb->s_time_gran);
+ if (ia_valid & ATTR_MODE) {
+ umode_t mode = attr->ia_mode;
+
+ if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ mode &= ~S_ISGID;
+ set_acl_inode(fi, mode);
+ }
+}
+#else
+#define __setattr_copy setattr_copy
+#endif
+
+int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = dentry->d_inode;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ int err;
+
+ err = inode_change_ok(inode, attr);
+ if (err)
+ return err;
+
+ if ((attr->ia_valid & ATTR_SIZE) &&
+ attr->ia_size != i_size_read(inode)) {
+ truncate_setsize(inode, attr->ia_size);
+ f2fs_truncate(inode);
+ }
+
+ __setattr_copy(inode, attr);
+
+ if (attr->ia_valid & ATTR_MODE) {
+ err = f2fs_acl_chmod(inode);
+ if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
+ inode->i_mode = fi->i_acl_mode;
+ clear_inode_flag(fi, FI_ACL_MODE);
+ }
+ }
+
+ mark_inode_dirty(inode);
+ return err;
+}
+
+const struct inode_operations f2fs_file_inode_operations = {
+ .getattr = f2fs_getattr,
+ .setattr = f2fs_setattr,
+ .get_acl = f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
+static void fill_zero(struct inode *inode, pgoff_t index,
+ loff_t start, loff_t len)
+{
+ struct page *page;
+
+ if (!len)
+ return;
+
+ page = get_new_data_page(inode, index, false);
+
+ if (!IS_ERR(page)) {
+ wait_on_page_writeback(page);
+ zero_user(page, start, len);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+ }
+}
+
+int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
+{
+ pgoff_t index;
+ int err;
+
+ for (index = pg_start; index < pg_end; index++) {
+ struct dnode_of_data dn;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+
+ mutex_lock_op(sbi, DATA_TRUNC);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, index, RDONLY_NODE);
+ if (err) {
+ mutex_unlock_op(sbi, DATA_TRUNC);
+ if (err == -ENOENT)
+ continue;
+ return err;
+ }
+
+ if (dn.data_blkaddr != NULL_ADDR)
+ truncate_data_blocks_range(&dn, 1);
+ f2fs_put_dnode(&dn);
+ mutex_unlock_op(sbi, DATA_TRUNC);
+ }
+ return 0;
+}
+
+static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
+{
+ pgoff_t pg_start, pg_end;
+ loff_t off_start, off_end;
+ int ret = 0;
+
+ pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+
+ off_start = offset & (PAGE_CACHE_SIZE - 1);
+ off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+
+ if (pg_start == pg_end) {
+ fill_zero(inode, pg_start, off_start,
+ off_end - off_start);
+ } else {
+ if (off_start)
+ fill_zero(inode, pg_start++, off_start,
+ PAGE_CACHE_SIZE - off_start);
+ if (off_end)
+ fill_zero(inode, pg_end, 0, off_end);
+
+ if (pg_start < pg_end) {
+ struct address_space *mapping = inode->i_mapping;
+ loff_t blk_start, blk_end;
+
+ blk_start = pg_start << PAGE_CACHE_SHIFT;
+ blk_end = pg_end << PAGE_CACHE_SHIFT;
+ truncate_inode_pages_range(mapping, blk_start,
+ blk_end - 1);
+ ret = truncate_hole(inode, pg_start, pg_end);
+ }
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ i_size_read(inode) <= (offset + len)) {
+ i_size_write(inode, offset);
+ mark_inode_dirty(inode);
+ }
+
+ return ret;
+}
+
+static int expand_inode_data(struct inode *inode, loff_t offset,
+ loff_t len, int mode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ pgoff_t index, pg_start, pg_end;
+ loff_t new_size = i_size_read(inode);
+ loff_t off_start, off_end;
+ int ret = 0;
+
+ ret = inode_newsize_ok(inode, (len + offset));
+ if (ret)
+ return ret;
+
+ pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
+ pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+
+ off_start = offset & (PAGE_CACHE_SIZE - 1);
+ off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+
+ for (index = pg_start; index <= pg_end; index++) {
+ struct dnode_of_data dn;
+
+ mutex_lock_op(sbi, DATA_NEW);
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, index, 0);
+ if (ret) {
+ mutex_unlock_op(sbi, DATA_NEW);
+ break;
+ }
+
+ if (dn.data_blkaddr == NULL_ADDR) {
+ ret = reserve_new_block(&dn);
+ if (ret) {
+ f2fs_put_dnode(&dn);
+ mutex_unlock_op(sbi, DATA_NEW);
+ break;
+ }
+ }
+ f2fs_put_dnode(&dn);
+
+ mutex_unlock_op(sbi, DATA_NEW);
+
+ if (pg_start == pg_end)
+ new_size = offset + len;
+ else if (index == pg_start && off_start)
+ new_size = (index + 1) << PAGE_CACHE_SHIFT;
+ else if (index == pg_end)
+ new_size = (index << PAGE_CACHE_SHIFT) + off_end;
+ else
+ new_size += PAGE_CACHE_SIZE;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ i_size_read(inode) < new_size) {
+ i_size_write(inode, new_size);
+ mark_inode_dirty(inode);
+ }
+
+ return ret;
+}
+
+static long f2fs_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ long ret;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ ret = punch_hole(inode, offset, len, mode);
+ else
+ ret = expand_inode_data(inode, offset, len, mode);
+
+ f2fs_balance_fs(sbi);
+ return ret;
+}
+
+#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+
+static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & F2FS_REG_FLMASK;
+ else
+ return flags & F2FS_OTHER_FLMASK;
+}
+
+long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ unsigned int flags;
+ int ret;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ return put_user(flags, (int __user *) arg);
+ case FS_IOC_SETFLAGS:
+ {
+ unsigned int oldflags;
+
+ ret = mnt_want_write(filp->f_path.mnt);
+ if (ret)
+ return ret;
+
+ if (!inode_owner_or_capable(inode)) {
+ ret = -EACCES;
+ goto out;
+ }
+
+ if (get_user(flags, (int __user *) arg)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ flags = f2fs_mask_flags(inode->i_mode, flags);
+
+ mutex_lock(&inode->i_mutex);
+
+ oldflags = fi->i_flags;
+
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ mutex_unlock(&inode->i_mutex);
+ ret = -EPERM;
+ goto out;
+ }
+ }
+
+ flags = flags & FS_FL_USER_MODIFIABLE;
+ flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
+ fi->i_flags = flags;
+ mutex_unlock(&inode->i_mutex);
+
+ f2fs_set_inode_flags(inode);
+ inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+out:
+ mnt_drop_write(filp->f_path.mnt);
+ return ret;
+ }
+ default:
+ return -ENOTTY;
+ }
+}
+
+const struct file_operations f2fs_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = generic_file_aio_read,
+ .aio_write = generic_file_aio_write,
+ .open = generic_file_open,
+ .mmap = f2fs_file_mmap,
+ .fsync = f2fs_sync_file,
+ .fallocate = f2fs_fallocate,
+ .unlocked_ioctl = f2fs_ioctl,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
+};
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
new file mode 100644
index 000000000000..644aa3808273
--- /dev/null
+++ b/fs/f2fs/gc.c
@@ -0,0 +1,742 @@
+/*
+ * fs/f2fs/gc.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/f2fs_fs.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/blkdev.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+#include "gc.h"
+
+static struct kmem_cache *winode_slab;
+
+static int gc_thread_func(void *data)
+{
+ struct f2fs_sb_info *sbi = data;
+ wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
+ long wait_ms;
+
+ wait_ms = GC_THREAD_MIN_SLEEP_TIME;
+
+ do {
+ if (try_to_freeze())
+ continue;
+ else
+ wait_event_interruptible_timeout(*wq,
+ kthread_should_stop(),
+ msecs_to_jiffies(wait_ms));
+ if (kthread_should_stop())
+ break;
+
+ f2fs_balance_fs(sbi);
+
+ if (!test_opt(sbi, BG_GC))
+ continue;
+
+ /*
+ * [GC triggering condition]
+ * 0. GC is not conducted currently.
+ * 1. There are enough dirty segments.
+ * 2. IO subsystem is idle by checking the # of writeback pages.
+ * 3. IO subsystem is idle by checking the # of requests in
+ * bdev's request list.
+ *
+ * Note) We have to avoid triggering GCs too much frequently.
+ * Because it is possible that some segments can be
+ * invalidated soon after by user update or deletion.
+ * So, I'd like to wait some time to collect dirty segments.
+ */
+ if (!mutex_trylock(&sbi->gc_mutex))
+ continue;
+
+ if (!is_idle(sbi)) {
+ wait_ms = increase_sleep_time(wait_ms);
+ mutex_unlock(&sbi->gc_mutex);
+ continue;
+ }
+
+ if (has_enough_invalid_blocks(sbi))
+ wait_ms = decrease_sleep_time(wait_ms);
+ else
+ wait_ms = increase_sleep_time(wait_ms);
+
+ sbi->bg_gc++;
+
+ if (f2fs_gc(sbi, 1) == GC_NONE)
+ wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
+ else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME)
+ wait_ms = GC_THREAD_MAX_SLEEP_TIME;
+
+ } while (!kthread_should_stop());
+ return 0;
+}
+
+int start_gc_thread(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_gc_kthread *gc_th;
+
+ gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
+ if (!gc_th)
+ return -ENOMEM;
+
+ sbi->gc_thread = gc_th;
+ init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
+ sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
+ GC_THREAD_NAME);
+ if (IS_ERR(gc_th->f2fs_gc_task)) {
+ kfree(gc_th);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void stop_gc_thread(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
+ if (!gc_th)
+ return;
+ kthread_stop(gc_th->f2fs_gc_task);
+ kfree(gc_th);
+ sbi->gc_thread = NULL;
+}
+
+static int select_gc_type(int gc_type)
+{
+ return (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
+}
+
+static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
+ int type, struct victim_sel_policy *p)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+ if (p->alloc_mode) {
+ p->gc_mode = GC_GREEDY;
+ p->dirty_segmap = dirty_i->dirty_segmap[type];
+ p->ofs_unit = 1;
+ } else {
+ p->gc_mode = select_gc_type(gc_type);
+ p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
+ p->ofs_unit = sbi->segs_per_sec;
+ }
+ p->offset = sbi->last_victim[p->gc_mode];
+}
+
+static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
+ struct victim_sel_policy *p)
+{
+ if (p->gc_mode == GC_GREEDY)
+ return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
+ else if (p->gc_mode == GC_CB)
+ return UINT_MAX;
+ else /* No other gc_mode */
+ return 0;
+}
+
+static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned int segno;
+
+ /*
+ * If the gc_type is FG_GC, we can select victim segments
+ * selected by background GC before.
+ * Those segments guarantee they have small valid blocks.
+ */
+ segno = find_next_bit(dirty_i->victim_segmap[BG_GC],
+ TOTAL_SEGS(sbi), 0);
+ if (segno < TOTAL_SEGS(sbi)) {
+ clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
+ return segno;
+ }
+ return NULL_SEGNO;
+}
+
+static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int secno = GET_SECNO(sbi, segno);
+ unsigned int start = secno * sbi->segs_per_sec;
+ unsigned long long mtime = 0;
+ unsigned int vblocks;
+ unsigned char age = 0;
+ unsigned char u;
+ unsigned int i;
+
+ for (i = 0; i < sbi->segs_per_sec; i++)
+ mtime += get_seg_entry(sbi, start + i)->mtime;
+ vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+
+ mtime = div_u64(mtime, sbi->segs_per_sec);
+ vblocks = div_u64(vblocks, sbi->segs_per_sec);
+
+ u = (vblocks * 100) >> sbi->log_blocks_per_seg;
+
+ /* Handle if the system time is changed by user */
+ if (mtime < sit_i->min_mtime)
+ sit_i->min_mtime = mtime;
+ if (mtime > sit_i->max_mtime)
+ sit_i->max_mtime = mtime;
+ if (sit_i->max_mtime != sit_i->min_mtime)
+ age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
+ sit_i->max_mtime - sit_i->min_mtime);
+
+ return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
+}
+
+static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
+ struct victim_sel_policy *p)
+{
+ if (p->alloc_mode == SSR)
+ return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+
+ /* alloc_mode == LFS */
+ if (p->gc_mode == GC_GREEDY)
+ return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
+ else
+ return get_cb_cost(sbi, segno);
+}
+
+/*
+ * This function is called from two pathes.
+ * One is garbage collection and the other is SSR segment selection.
+ * When it is called during GC, it just gets a victim segment
+ * and it does not remove it from dirty seglist.
+ * When it is called from SSR segment selection, it finds a segment
+ * which has minimum valid blocks and removes it from dirty seglist.
+ */
+static int get_victim_by_default(struct f2fs_sb_info *sbi,
+ unsigned int *result, int gc_type, int type, char alloc_mode)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct victim_sel_policy p;
+ unsigned int segno;
+ int nsearched = 0;
+
+ p.alloc_mode = alloc_mode;
+ select_policy(sbi, gc_type, type, &p);
+
+ p.min_segno = NULL_SEGNO;
+ p.min_cost = get_max_cost(sbi, &p);
+
+ mutex_lock(&dirty_i->seglist_lock);
+
+ if (p.alloc_mode == LFS && gc_type == FG_GC) {
+ p.min_segno = check_bg_victims(sbi);
+ if (p.min_segno != NULL_SEGNO)
+ goto got_it;
+ }
+
+ while (1) {
+ unsigned long cost;
+
+ segno = find_next_bit(p.dirty_segmap,
+ TOTAL_SEGS(sbi), p.offset);
+ if (segno >= TOTAL_SEGS(sbi)) {
+ if (sbi->last_victim[p.gc_mode]) {
+ sbi->last_victim[p.gc_mode] = 0;
+ p.offset = 0;
+ continue;
+ }
+ break;
+ }
+ p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
+
+ if (test_bit(segno, dirty_i->victim_segmap[FG_GC]))
+ continue;
+ if (gc_type == BG_GC &&
+ test_bit(segno, dirty_i->victim_segmap[BG_GC]))
+ continue;
+ if (IS_CURSEC(sbi, GET_SECNO(sbi, segno)))
+ continue;
+
+ cost = get_gc_cost(sbi, segno, &p);
+
+ if (p.min_cost > cost) {
+ p.min_segno = segno;
+ p.min_cost = cost;
+ }
+
+ if (cost == get_max_cost(sbi, &p))
+ continue;
+
+ if (nsearched++ >= MAX_VICTIM_SEARCH) {
+ sbi->last_victim[p.gc_mode] = segno;
+ break;
+ }
+ }
+got_it:
+ if (p.min_segno != NULL_SEGNO) {
+ *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
+ if (p.alloc_mode == LFS) {
+ int i;
+ for (i = 0; i < p.ofs_unit; i++)
+ set_bit(*result + i,
+ dirty_i->victim_segmap[gc_type]);
+ }
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
+
+ return (p.min_segno == NULL_SEGNO) ? 0 : 1;
+}
+
+static const struct victim_selection default_v_ops = {
+ .get_victim = get_victim_by_default,
+};
+
+static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
+{
+ struct list_head *this;
+ struct inode_entry *ie;
+
+ list_for_each(this, ilist) {
+ ie = list_entry(this, struct inode_entry, list);
+ if (ie->inode->i_ino == ino)
+ return ie->inode;
+ }
+ return NULL;
+}
+
+static void add_gc_inode(struct inode *inode, struct list_head *ilist)
+{
+ struct list_head *this;
+ struct inode_entry *new_ie, *ie;
+
+ list_for_each(this, ilist) {
+ ie = list_entry(this, struct inode_entry, list);
+ if (ie->inode == inode) {
+ iput(inode);
+ return;
+ }
+ }
+repeat:
+ new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
+ if (!new_ie) {
+ cond_resched();
+ goto repeat;
+ }
+ new_ie->inode = inode;
+ list_add_tail(&new_ie->list, ilist);
+}
+
+static void put_gc_inode(struct list_head *ilist)
+{
+ struct inode_entry *ie, *next_ie;
+ list_for_each_entry_safe(ie, next_ie, ilist, list) {
+ iput(ie->inode);
+ list_del(&ie->list);
+ kmem_cache_free(winode_slab, ie);
+ }
+}
+
+static int check_valid_map(struct f2fs_sb_info *sbi,
+ unsigned int segno, int offset)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct seg_entry *sentry;
+ int ret;
+
+ mutex_lock(&sit_i->sentry_lock);
+ sentry = get_seg_entry(sbi, segno);
+ ret = f2fs_test_bit(offset, sentry->cur_valid_map);
+ mutex_unlock(&sit_i->sentry_lock);
+ return ret ? GC_OK : GC_NEXT;
+}
+
+/*
+ * This function compares node address got in summary with that in NAT.
+ * On validity, copy that node with cold status, otherwise (invalid node)
+ * ignore that.
+ */
+static int gc_node_segment(struct f2fs_sb_info *sbi,
+ struct f2fs_summary *sum, unsigned int segno, int gc_type)
+{
+ bool initial = true;
+ struct f2fs_summary *entry;
+ int off;
+
+next_step:
+ entry = sum;
+ for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
+ nid_t nid = le32_to_cpu(entry->nid);
+ struct page *node_page;
+ int err;
+
+ /*
+ * It makes sure that free segments are able to write
+ * all the dirty node pages before CP after this CP.
+ * So let's check the space of dirty node pages.
+ */
+ if (should_do_checkpoint(sbi)) {
+ mutex_lock(&sbi->cp_mutex);
+ block_operations(sbi);
+ return GC_BLOCKED;
+ }
+
+ err = check_valid_map(sbi, segno, off);
+ if (err == GC_ERROR)
+ return err;
+ else if (err == GC_NEXT)
+ continue;
+
+ if (initial) {
+ ra_node_page(sbi, nid);
+ continue;
+ }
+ node_page = get_node_page(sbi, nid);
+ if (IS_ERR(node_page))
+ continue;
+
+ /* set page dirty and write it */
+ if (!PageWriteback(node_page))
+ set_page_dirty(node_page);
+ f2fs_put_page(node_page, 1);
+ stat_inc_node_blk_count(sbi, 1);
+ }
+ if (initial) {
+ initial = false;
+ goto next_step;
+ }
+
+ if (gc_type == FG_GC) {
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .for_reclaim = 0,
+ };
+ sync_node_pages(sbi, 0, &wbc);
+ }
+ return GC_DONE;
+}
+
+/*
+ * Calculate start block index that this node page contains
+ */
+block_t start_bidx_of_node(unsigned int node_ofs)
+{
+ block_t start_bidx;
+ unsigned int bidx, indirect_blks;
+ int dec;
+
+ indirect_blks = 2 * NIDS_PER_BLOCK + 4;
+
+ start_bidx = 1;
+ if (node_ofs == 0) {
+ start_bidx = 0;
+ } else if (node_ofs <= 2) {
+ bidx = node_ofs - 1;
+ } else if (node_ofs <= indirect_blks) {
+ dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
+ bidx = node_ofs - 2 - dec;
+ } else {
+ dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
+ bidx = node_ofs - 5 - dec;
+ }
+
+ if (start_bidx)
+ start_bidx = bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
+ return start_bidx;
+}
+
+static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+ struct node_info *dni, block_t blkaddr, unsigned int *nofs)
+{
+ struct page *node_page;
+ nid_t nid;
+ unsigned int ofs_in_node;
+ block_t source_blkaddr;
+
+ nid = le32_to_cpu(sum->nid);
+ ofs_in_node = le16_to_cpu(sum->ofs_in_node);
+
+ node_page = get_node_page(sbi, nid);
+ if (IS_ERR(node_page))
+ return GC_NEXT;
+
+ get_node_info(sbi, nid, dni);
+
+ if (sum->version != dni->version) {
+ f2fs_put_page(node_page, 1);
+ return GC_NEXT;
+ }
+
+ *nofs = ofs_of_node(node_page);
+ source_blkaddr = datablock_addr(node_page, ofs_in_node);
+ f2fs_put_page(node_page, 1);
+
+ if (source_blkaddr != blkaddr)
+ return GC_NEXT;
+ return GC_OK;
+}
+
+static void move_data_page(struct inode *inode, struct page *page, int gc_type)
+{
+ if (page->mapping != inode->i_mapping)
+ goto out;
+
+ if (inode != page->mapping->host)
+ goto out;
+
+ if (PageWriteback(page))
+ goto out;
+
+ if (gc_type == BG_GC) {
+ set_page_dirty(page);
+ set_cold_data(page);
+ } else {
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ mutex_lock_op(sbi, DATA_WRITE);
+ if (clear_page_dirty_for_io(page) &&
+ S_ISDIR(inode->i_mode)) {
+ dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ inode_dec_dirty_dents(inode);
+ }
+ set_cold_data(page);
+ do_write_data_page(page);
+ mutex_unlock_op(sbi, DATA_WRITE);
+ clear_cold_data(page);
+ }
+out:
+ f2fs_put_page(page, 1);
+}
+
+/*
+ * This function tries to get parent node of victim data block, and identifies
+ * data block validity. If the block is valid, copy that with cold status and
+ * modify parent node.
+ * If the parent node is not valid or the data block address is different,
+ * the victim data block is ignored.
+ */
+static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
+ struct list_head *ilist, unsigned int segno, int gc_type)
+{
+ struct super_block *sb = sbi->sb;
+ struct f2fs_summary *entry;
+ block_t start_addr;
+ int err, off;
+ int phase = 0;
+
+ start_addr = START_BLOCK(sbi, segno);
+
+next_step:
+ entry = sum;
+ for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
+ struct page *data_page;
+ struct inode *inode;
+ struct node_info dni; /* dnode info for the data */
+ unsigned int ofs_in_node, nofs;
+ block_t start_bidx;
+
+ /*
+ * It makes sure that free segments are able to write
+ * all the dirty node pages before CP after this CP.
+ * So let's check the space of dirty node pages.
+ */
+ if (should_do_checkpoint(sbi)) {
+ mutex_lock(&sbi->cp_mutex);
+ block_operations(sbi);
+ err = GC_BLOCKED;
+ goto stop;
+ }
+
+ err = check_valid_map(sbi, segno, off);
+ if (err == GC_ERROR)
+ goto stop;
+ else if (err == GC_NEXT)
+ continue;
+
+ if (phase == 0) {
+ ra_node_page(sbi, le32_to_cpu(entry->nid));
+ continue;
+ }
+
+ /* Get an inode by ino with checking validity */
+ err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs);
+ if (err == GC_ERROR)
+ goto stop;
+ else if (err == GC_NEXT)
+ continue;
+
+ if (phase == 1) {
+ ra_node_page(sbi, dni.ino);
+ continue;
+ }
+
+ start_bidx = start_bidx_of_node(nofs);
+ ofs_in_node = le16_to_cpu(entry->ofs_in_node);
+
+ if (phase == 2) {
+ inode = f2fs_iget_nowait(sb, dni.ino);
+ if (IS_ERR(inode))
+ continue;
+
+ data_page = find_data_page(inode,
+ start_bidx + ofs_in_node);
+ if (IS_ERR(data_page))
+ goto next_iput;
+
+ f2fs_put_page(data_page, 0);
+ add_gc_inode(inode, ilist);
+ } else {
+ inode = find_gc_inode(dni.ino, ilist);
+ if (inode) {
+ data_page = get_lock_data_page(inode,
+ start_bidx + ofs_in_node);
+ if (IS_ERR(data_page))
+ continue;
+ move_data_page(inode, data_page, gc_type);
+ stat_inc_data_blk_count(sbi, 1);
+ }
+ }
+ continue;
+next_iput:
+ iput(inode);
+ }
+ if (++phase < 4)
+ goto next_step;
+ err = GC_DONE;
+stop:
+ if (gc_type == FG_GC)
+ f2fs_submit_bio(sbi, DATA, true);
+ return err;
+}
+
+static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
+ int gc_type, int type)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ int ret;
+ mutex_lock(&sit_i->sentry_lock);
+ ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS);
+ mutex_unlock(&sit_i->sentry_lock);
+ return ret;
+}
+
+static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
+ struct list_head *ilist, int gc_type)
+{
+ struct page *sum_page;
+ struct f2fs_summary_block *sum;
+ int ret = GC_DONE;
+
+ /* read segment summary of victim */
+ sum_page = get_sum_page(sbi, segno);
+ if (IS_ERR(sum_page))
+ return GC_ERROR;
+
+ /*
+ * CP needs to lock sum_page. In this time, we don't need
+ * to lock this page, because this summary page is not gone anywhere.
+ * Also, this page is not gonna be updated before GC is done.
+ */
+ unlock_page(sum_page);
+ sum = page_address(sum_page);
+
+ switch (GET_SUM_TYPE((&sum->footer))) {
+ case SUM_TYPE_NODE:
+ ret = gc_node_segment(sbi, sum->entries, segno, gc_type);
+ break;
+ case SUM_TYPE_DATA:
+ ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
+ break;
+ }
+ stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
+ stat_inc_call_count(sbi->stat_info);
+
+ f2fs_put_page(sum_page, 0);
+ return ret;
+}
+
+int f2fs_gc(struct f2fs_sb_info *sbi, int nGC)
+{
+ unsigned int segno;
+ int old_free_secs, cur_free_secs;
+ int gc_status, nfree;
+ struct list_head ilist;
+ int gc_type = BG_GC;
+
+ INIT_LIST_HEAD(&ilist);
+gc_more:
+ nfree = 0;
+ gc_status = GC_NONE;
+
+ if (has_not_enough_free_secs(sbi))
+ old_free_secs = reserved_sections(sbi);
+ else
+ old_free_secs = free_sections(sbi);
+
+ while (sbi->sb->s_flags & MS_ACTIVE) {
+ int i;
+ if (has_not_enough_free_secs(sbi))
+ gc_type = FG_GC;
+
+ cur_free_secs = free_sections(sbi) + nfree;
+
+ /* We got free space successfully. */
+ if (nGC < cur_free_secs - old_free_secs)
+ break;
+
+ if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
+ break;
+
+ for (i = 0; i < sbi->segs_per_sec; i++) {
+ /*
+ * do_garbage_collect will give us three gc_status:
+ * GC_ERROR, GC_DONE, and GC_BLOCKED.
+ * If GC is finished uncleanly, we have to return
+ * the victim to dirty segment list.
+ */
+ gc_status = do_garbage_collect(sbi, segno + i,
+ &ilist, gc_type);
+ if (gc_status != GC_DONE)
+ goto stop;
+ nfree++;
+ }
+ }
+stop:
+ if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) {
+ write_checkpoint(sbi, (gc_status == GC_BLOCKED), false);
+ if (nfree)
+ goto gc_more;
+ }
+ mutex_unlock(&sbi->gc_mutex);
+
+ put_gc_inode(&ilist);
+ BUG_ON(!list_empty(&ilist));
+ return gc_status;
+}
+
+void build_gc_manager(struct f2fs_sb_info *sbi)
+{
+ DIRTY_I(sbi)->v_ops = &default_v_ops;
+}
+
+int create_gc_caches(void)
+{
+ winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
+ sizeof(struct inode_entry), NULL);
+ if (!winode_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+void destroy_gc_caches(void)
+{
+ kmem_cache_destroy(winode_slab);
+}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
new file mode 100644
index 000000000000..b026d9354ccd
--- /dev/null
+++ b/fs/f2fs/gc.h
@@ -0,0 +1,117 @@
+/*
+ * fs/f2fs/gc.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#define GC_THREAD_NAME "f2fs_gc_task"
+#define GC_THREAD_MIN_WB_PAGES 1 /*
+ * a threshold to determine
+ * whether IO subsystem is idle
+ * or not
+ */
+#define GC_THREAD_MIN_SLEEP_TIME 10000 /* milliseconds */
+#define GC_THREAD_MAX_SLEEP_TIME 30000
+#define GC_THREAD_NOGC_SLEEP_TIME 10000
+#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
+#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
+
+/* Search max. number of dirty segments to select a victim segment */
+#define MAX_VICTIM_SEARCH 20
+
+enum {
+ GC_NONE = 0,
+ GC_ERROR,
+ GC_OK,
+ GC_NEXT,
+ GC_BLOCKED,
+ GC_DONE,
+};
+
+struct f2fs_gc_kthread {
+ struct task_struct *f2fs_gc_task;
+ wait_queue_head_t gc_wait_queue_head;
+};
+
+struct inode_entry {
+ struct list_head list;
+ struct inode *inode;
+};
+
+/*
+ * inline functions
+ */
+static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
+{
+ if (free_segments(sbi) < overprovision_segments(sbi))
+ return 0;
+ else
+ return (free_segments(sbi) - overprovision_segments(sbi))
+ << sbi->log_blocks_per_seg;
+}
+
+static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
+{
+ return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100;
+}
+
+static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
+{
+ block_t reclaimable_user_blocks = sbi->user_block_count -
+ written_block_count(sbi);
+ return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
+}
+
+static inline long increase_sleep_time(long wait)
+{
+ wait += GC_THREAD_MIN_SLEEP_TIME;
+ if (wait > GC_THREAD_MAX_SLEEP_TIME)
+ wait = GC_THREAD_MAX_SLEEP_TIME;
+ return wait;
+}
+
+static inline long decrease_sleep_time(long wait)
+{
+ wait -= GC_THREAD_MIN_SLEEP_TIME;
+ if (wait <= GC_THREAD_MIN_SLEEP_TIME)
+ wait = GC_THREAD_MIN_SLEEP_TIME;
+ return wait;
+}
+
+static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
+{
+ block_t invalid_user_blocks = sbi->user_block_count -
+ written_block_count(sbi);
+ /*
+ * Background GC is triggered with the following condition.
+ * 1. There are a number of invalid blocks.
+ * 2. There is not enough free space.
+ */
+ if (invalid_user_blocks > limit_invalid_user_blocks(sbi) &&
+ free_user_blocks(sbi) < limit_free_user_blocks(sbi))
+ return true;
+ return false;
+}
+
+static inline int is_idle(struct f2fs_sb_info *sbi)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ struct request_list *rl = &q->root_rl;
+ return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
+}
+
+static inline bool should_do_checkpoint(struct f2fs_sb_info *sbi)
+{
+ unsigned int pages_per_sec = sbi->segs_per_sec *
+ (1 << sbi->log_blocks_per_seg);
+ int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
+ >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+ int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
+ >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+ return free_sections(sbi) <= (node_secs + 2 * dent_secs + 2);
+}
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
new file mode 100644
index 000000000000..a60f04200f8b
--- /dev/null
+++ b/fs/f2fs/hash.c
@@ -0,0 +1,97 @@
+/*
+ * fs/f2fs/hash.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext3/hash.c
+ *
+ * Copyright (C) 2002 by Theodore Ts'o
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/cryptohash.h>
+#include <linux/pagemap.h>
+
+#include "f2fs.h"
+
+/*
+ * Hashing code copied from ext3
+ */
+#define DELTA 0x9E3779B9
+
+static void TEA_transform(unsigned int buf[4], unsigned int const in[])
+{
+ __u32 sum = 0;
+ __u32 b0 = buf[0], b1 = buf[1];
+ __u32 a = in[0], b = in[1], c = in[2], d = in[3];
+ int n = 16;
+
+ do {
+ sum += DELTA;
+ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
+ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
+ } while (--n);
+
+ buf[0] += b0;
+ buf[1] += b1;
+}
+
+static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num)
+{
+ unsigned pad, val;
+ int i;
+
+ pad = (__u32)len | ((__u32)len << 8);
+ pad |= pad << 16;
+
+ val = pad;
+ if (len > num * 4)
+ len = num * 4;
+ for (i = 0; i < len; i++) {
+ if ((i % 4) == 0)
+ val = pad;
+ val = msg[i] + (val << 8);
+ if ((i % 4) == 3) {
+ *buf++ = val;
+ val = pad;
+ num--;
+ }
+ }
+ if (--num >= 0)
+ *buf++ = val;
+ while (--num >= 0)
+ *buf++ = pad;
+}
+
+f2fs_hash_t f2fs_dentry_hash(const char *name, int len)
+{
+ __u32 hash, minor_hash;
+ f2fs_hash_t f2fs_hash;
+ const char *p;
+ __u32 in[8], buf[4];
+
+ /* Initialize the default seed for the hash checksum functions */
+ buf[0] = 0x67452301;
+ buf[1] = 0xefcdab89;
+ buf[2] = 0x98badcfe;
+ buf[3] = 0x10325476;
+
+ p = name;
+ while (len > 0) {
+ str2hashbuf(p, len, in, 4);
+ TEA_transform(buf, in);
+ len -= 16;
+ p += 16;
+ }
+ hash = buf[0];
+ minor_hash = buf[1];
+
+ f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT);
+ return f2fs_hash;
+}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
new file mode 100644
index 000000000000..df5fb381ebf1
--- /dev/null
+++ b/fs/f2fs/inode.c
@@ -0,0 +1,268 @@
+/*
+ * fs/f2fs/inode.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+
+#include "f2fs.h"
+#include "node.h"
+
+struct f2fs_iget_args {
+ u64 ino;
+ int on_free;
+};
+
+void f2fs_set_inode_flags(struct inode *inode)
+{
+ unsigned int flags = F2FS_I(inode)->i_flags;
+
+ inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE |
+ S_NOATIME | S_DIRSYNC);
+
+ if (flags & FS_SYNC_FL)
+ inode->i_flags |= S_SYNC;
+ if (flags & FS_APPEND_FL)
+ inode->i_flags |= S_APPEND;
+ if (flags & FS_IMMUTABLE_FL)
+ inode->i_flags |= S_IMMUTABLE;
+ if (flags & FS_NOATIME_FL)
+ inode->i_flags |= S_NOATIME;
+ if (flags & FS_DIRSYNC_FL)
+ inode->i_flags |= S_DIRSYNC;
+}
+
+static int f2fs_iget_test(struct inode *inode, void *data)
+{
+ struct f2fs_iget_args *args = data;
+
+ if (inode->i_ino != args->ino)
+ return 0;
+ if (inode->i_state & (I_FREEING | I_WILL_FREE)) {
+ args->on_free = 1;
+ return 0;
+ }
+ return 1;
+}
+
+struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino)
+{
+ struct f2fs_iget_args args = {
+ .ino = ino,
+ .on_free = 0
+ };
+ struct inode *inode = ilookup5(sb, ino, f2fs_iget_test, &args);
+
+ if (inode)
+ return inode;
+ if (!args.on_free)
+ return f2fs_iget(sb, ino);
+ return ERR_PTR(-ENOENT);
+}
+
+static int do_read_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct page *node_page;
+ struct f2fs_node *rn;
+ struct f2fs_inode *ri;
+
+ /* Check if ino is within scope */
+ check_nid_range(sbi, inode->i_ino);
+
+ node_page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(node_page))
+ return PTR_ERR(node_page);
+
+ rn = page_address(node_page);
+ ri = &(rn->i);
+
+ inode->i_mode = le16_to_cpu(ri->i_mode);
+ i_uid_write(inode, le32_to_cpu(ri->i_uid));
+ i_gid_write(inode, le32_to_cpu(ri->i_gid));
+ set_nlink(inode, le32_to_cpu(ri->i_links));
+ inode->i_size = le64_to_cpu(ri->i_size);
+ inode->i_blocks = le64_to_cpu(ri->i_blocks);
+
+ inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
+ inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
+ inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
+ inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
+ inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
+ inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+ inode->i_generation = le32_to_cpu(ri->i_generation);
+
+ fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
+ fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
+ fi->i_flags = le32_to_cpu(ri->i_flags);
+ fi->flags = 0;
+ fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1;
+ fi->i_advise = ri->i_advise;
+ fi->i_pino = le32_to_cpu(ri->i_pino);
+ get_extent_info(&fi->ext, ri->i_ext);
+ f2fs_put_page(node_page, 1);
+ return 0;
+}
+
+struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ int ret;
+
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+ if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
+ goto make_now;
+
+ ret = do_read_inode(inode);
+ if (ret)
+ goto bad_inode;
+
+ if (!sbi->por_doing && inode->i_nlink == 0) {
+ ret = -ENOENT;
+ goto bad_inode;
+ }
+
+make_now:
+ if (ino == F2FS_NODE_INO(sbi)) {
+ inode->i_mapping->a_ops = &f2fs_node_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+ } else if (ino == F2FS_META_INO(sbi)) {
+ inode->i_mapping->a_ops = &f2fs_meta_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+ } else if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &f2fs_file_inode_operations;
+ inode->i_fop = &f2fs_file_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &f2fs_dir_inode_operations;
+ inode->i_fop = &f2fs_dir_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
+ __GFP_ZERO);
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = &f2fs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+ inode->i_op = &f2fs_special_inode_operations;
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ } else {
+ ret = -EIO;
+ goto bad_inode;
+ }
+ unlock_new_inode(inode);
+
+ return inode;
+
+bad_inode:
+ iget_failed(inode);
+ return ERR_PTR(ret);
+}
+
+void update_inode(struct inode *inode, struct page *node_page)
+{
+ struct f2fs_node *rn;
+ struct f2fs_inode *ri;
+
+ wait_on_page_writeback(node_page);
+
+ rn = page_address(node_page);
+ ri = &(rn->i);
+
+ ri->i_mode = cpu_to_le16(inode->i_mode);
+ ri->i_advise = F2FS_I(inode)->i_advise;
+ ri->i_uid = cpu_to_le32(i_uid_read(inode));
+ ri->i_gid = cpu_to_le32(i_gid_read(inode));
+ ri->i_links = cpu_to_le32(inode->i_nlink);
+ ri->i_size = cpu_to_le64(i_size_read(inode));
+ ri->i_blocks = cpu_to_le64(inode->i_blocks);
+ set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
+
+ ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
+ ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
+ ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+ ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth);
+ ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid);
+ ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
+ ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
+ ri->i_generation = cpu_to_le32(inode->i_generation);
+ set_page_dirty(node_page);
+}
+
+int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct page *node_page;
+ bool need_lock = false;
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+ inode->i_ino == F2FS_META_INO(sbi))
+ return 0;
+
+ node_page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(node_page))
+ return PTR_ERR(node_page);
+
+ if (!PageDirty(node_page)) {
+ need_lock = true;
+ f2fs_put_page(node_page, 1);
+ mutex_lock(&sbi->write_inode);
+ node_page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(node_page)) {
+ mutex_unlock(&sbi->write_inode);
+ return PTR_ERR(node_page);
+ }
+ }
+ update_inode(inode, node_page);
+ f2fs_put_page(node_page, 1);
+ if (need_lock)
+ mutex_unlock(&sbi->write_inode);
+ return 0;
+}
+
+/*
+ * Called at the last iput() if i_nlink is zero
+ */
+void f2fs_evict_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+
+ truncate_inode_pages(&inode->i_data, 0);
+
+ if (inode->i_ino == F2FS_NODE_INO(sbi) ||
+ inode->i_ino == F2FS_META_INO(sbi))
+ goto no_delete;
+
+ BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents));
+ remove_dirty_dir_inode(inode);
+
+ if (inode->i_nlink || is_bad_inode(inode))
+ goto no_delete;
+
+ set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
+ i_size_write(inode, 0);
+
+ if (F2FS_HAS_BLOCKS(inode))
+ f2fs_truncate(inode);
+
+ remove_inode_page(inode);
+no_delete:
+ clear_inode(inode);
+}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
new file mode 100644
index 000000000000..89b7675dc377
--- /dev/null
+++ b/fs/f2fs/namei.c
@@ -0,0 +1,503 @@
+/*
+ * fs/f2fs/namei.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/ctype.h>
+
+#include "f2fs.h"
+#include "xattr.h"
+#include "acl.h"
+
+static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ nid_t ino;
+ struct inode *inode;
+ bool nid_free = false;
+ int err;
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_lock_op(sbi, NODE_NEW);
+ if (!alloc_nid(sbi, &ino)) {
+ mutex_unlock_op(sbi, NODE_NEW);
+ err = -ENOSPC;
+ goto fail;
+ }
+ mutex_unlock_op(sbi, NODE_NEW);
+
+ inode->i_uid = current_fsuid();
+
+ if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else {
+ inode->i_gid = current_fsgid();
+ }
+
+ inode->i_ino = ino;
+ inode->i_mode = mode;
+ inode->i_blocks = 0;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_generation = sbi->s_next_generation++;
+
+ err = insert_inode_locked(inode);
+ if (err) {
+ err = -EINVAL;
+ nid_free = true;
+ goto out;
+ }
+
+ mark_inode_dirty(inode);
+ return inode;
+
+out:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+fail:
+ iput(inode);
+ if (nid_free)
+ alloc_nid_failed(sbi, ino);
+ return ERR_PTR(err);
+}
+
+static int is_multimedia_file(const unsigned char *s, const char *sub)
+{
+ int slen = strlen(s);
+ int sublen = strlen(sub);
+ int ret;
+
+ if (sublen > slen)
+ return 1;
+
+ ret = memcmp(s + slen - sublen, sub, sublen);
+ if (ret) { /* compare upper case */
+ int i;
+ char upper_sub[8];
+ for (i = 0; i < sublen && i < sizeof(upper_sub); i++)
+ upper_sub[i] = toupper(sub[i]);
+ return memcmp(s + slen - sublen, upper_sub, sublen);
+ }
+
+ return ret;
+}
+
+/*
+ * Set multimedia files as cold files for hot/cold data separation
+ */
+static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
+ const unsigned char *name)
+{
+ int i;
+ __u8 (*extlist)[8] = sbi->raw_super->extension_list;
+
+ int count = le32_to_cpu(sbi->raw_super->extension_count);
+ for (i = 0; i < count; i++) {
+ if (!is_multimedia_file(name, extlist[i])) {
+ F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
+ break;
+ }
+ }
+}
+
+static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ nid_t ino = 0;
+ int err;
+
+ inode = f2fs_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
+ set_cold_file(sbi, inode, dentry->d_name.name);
+
+ inode->i_op = &f2fs_file_inode_operations;
+ inode->i_fop = &f2fs_file_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ ino = inode->i_ino;
+
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out;
+
+ alloc_nid_done(sbi, ino);
+
+ if (!sbi->por_doing)
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi);
+ return 0;
+out:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ alloc_nid_failed(sbi, ino);
+ return err;
+}
+
+static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct inode *inode = old_dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int err;
+
+ inode->i_ctime = CURRENT_TIME;
+ atomic_inc(&inode->i_count);
+
+ set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out;
+
+ d_instantiate(dentry, inode);
+
+ f2fs_balance_fs(sbi);
+ return 0;
+out:
+ clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ iput(inode);
+ return err;
+}
+
+struct dentry *f2fs_get_parent(struct dentry *child)
+{
+ struct qstr dotdot = QSTR_INIT("..", 2);
+ unsigned long ino = f2fs_inode_by_name(child->d_inode, &dotdot);
+ if (!ino)
+ return ERR_PTR(-ENOENT);
+ return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino));
+}
+
+static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
+{
+ struct inode *inode = NULL;
+ struct f2fs_dir_entry *de;
+ struct page *page;
+
+ if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ de = f2fs_find_entry(dir, &dentry->d_name, &page);
+ if (de) {
+ nid_t ino = le32_to_cpu(de->ino);
+ kunmap(page);
+ f2fs_put_page(page, 0);
+
+ inode = f2fs_iget(dir->i_sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ }
+
+ return d_splice_alias(inode, dentry);
+}
+
+static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode = dentry->d_inode;
+ struct f2fs_dir_entry *de;
+ struct page *page;
+ int err = -ENOENT;
+
+ de = f2fs_find_entry(dir, &dentry->d_name, &page);
+ if (!de)
+ goto fail;
+
+ err = check_orphan_space(sbi);
+ if (err) {
+ kunmap(page);
+ f2fs_put_page(page, 0);
+ goto fail;
+ }
+
+ f2fs_delete_entry(de, page, inode);
+
+ /* In order to evict this inode, we set it dirty */
+ mark_inode_dirty(inode);
+ f2fs_balance_fs(sbi);
+fail:
+ return err;
+}
+
+static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ unsigned symlen = strlen(symname) + 1;
+ int err;
+
+ inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &f2fs_symlink_inode_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out;
+
+ err = page_symlink(inode, symname, symlen);
+ alloc_nid_done(sbi, inode->i_ino);
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi);
+
+ return err;
+out:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ alloc_nid_failed(sbi, inode->i_ino);
+ return err;
+}
+
+static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+ struct inode *inode;
+ int err;
+
+ inode = f2fs_new_inode(dir, S_IFDIR | mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &f2fs_dir_inode_operations;
+ inode->i_fop = &f2fs_dir_operations;
+ inode->i_mapping->a_ops = &f2fs_dblock_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+
+ set_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out_fail;
+
+ alloc_nid_done(sbi, inode->i_ino);
+
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi);
+ return 0;
+
+out_fail:
+ clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ alloc_nid_failed(sbi, inode->i_ino);
+ return err;
+}
+
+static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ if (f2fs_empty_dir(inode))
+ return f2fs_unlink(dir, dentry);
+ return -ENOTEMPTY;
+}
+
+static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, dev_t rdev)
+{
+ struct super_block *sb = dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+ int err = 0;
+
+ if (!new_valid_dev(rdev))
+ return -EINVAL;
+
+ inode = f2fs_new_inode(dir, mode);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ init_special_inode(inode, inode->i_mode, rdev);
+ inode->i_op = &f2fs_special_inode_operations;
+
+ err = f2fs_add_link(dentry, inode);
+ if (err)
+ goto out;
+
+ alloc_nid_done(sbi, inode->i_ino);
+ d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
+
+ f2fs_balance_fs(sbi);
+
+ return 0;
+out:
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ alloc_nid_failed(sbi, inode->i_ino);
+ return err;
+}
+
+static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct super_block *sb = old_dir->i_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct page *old_dir_page;
+ struct page *old_page;
+ struct f2fs_dir_entry *old_dir_entry = NULL;
+ struct f2fs_dir_entry *old_entry;
+ struct f2fs_dir_entry *new_entry;
+ int err = -ENOENT;
+
+ old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+ if (!old_entry)
+ goto out;
+
+ if (S_ISDIR(old_inode->i_mode)) {
+ err = -EIO;
+ old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
+ if (!old_dir_entry)
+ goto out_old;
+ }
+
+ mutex_lock_op(sbi, RENAME);
+
+ if (new_inode) {
+ struct page *new_page;
+
+ err = -ENOTEMPTY;
+ if (old_dir_entry && !f2fs_empty_dir(new_inode))
+ goto out_dir;
+
+ err = -ENOENT;
+ new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
+ &new_page);
+ if (!new_entry)
+ goto out_dir;
+
+ f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+
+ new_inode->i_ctime = CURRENT_TIME;
+ if (old_dir_entry)
+ drop_nlink(new_inode);
+ drop_nlink(new_inode);
+ if (!new_inode->i_nlink)
+ add_orphan_inode(sbi, new_inode->i_ino);
+ f2fs_write_inode(new_inode, NULL);
+ } else {
+ err = f2fs_add_link(new_dentry, old_inode);
+ if (err)
+ goto out_dir;
+
+ if (old_dir_entry) {
+ inc_nlink(new_dir);
+ f2fs_write_inode(new_dir, NULL);
+ }
+ }
+
+ old_inode->i_ctime = CURRENT_TIME;
+ set_inode_flag(F2FS_I(old_inode), FI_NEED_CP);
+ mark_inode_dirty(old_inode);
+
+ f2fs_delete_entry(old_entry, old_page, NULL);
+
+ if (old_dir_entry) {
+ if (old_dir != new_dir) {
+ f2fs_set_link(old_inode, old_dir_entry,
+ old_dir_page, new_dir);
+ } else {
+ kunmap(old_dir_page);
+ f2fs_put_page(old_dir_page, 0);
+ }
+ drop_nlink(old_dir);
+ f2fs_write_inode(old_dir, NULL);
+ }
+
+ mutex_unlock_op(sbi, RENAME);
+
+ f2fs_balance_fs(sbi);
+ return 0;
+
+out_dir:
+ if (old_dir_entry) {
+ kunmap(old_dir_page);
+ f2fs_put_page(old_dir_page, 0);
+ }
+ mutex_unlock_op(sbi, RENAME);
+out_old:
+ kunmap(old_page);
+ f2fs_put_page(old_page, 0);
+out:
+ return err;
+}
+
+const struct inode_operations f2fs_dir_inode_operations = {
+ .create = f2fs_create,
+ .lookup = f2fs_lookup,
+ .link = f2fs_link,
+ .unlink = f2fs_unlink,
+ .symlink = f2fs_symlink,
+ .mkdir = f2fs_mkdir,
+ .rmdir = f2fs_rmdir,
+ .mknod = f2fs_mknod,
+ .rename = f2fs_rename,
+ .setattr = f2fs_setattr,
+ .get_acl = f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
+const struct inode_operations f2fs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = page_follow_link_light,
+ .put_link = page_put_link,
+ .setattr = f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
+
+const struct inode_operations f2fs_special_inode_operations = {
+ .setattr = f2fs_setattr,
+ .get_acl = f2fs_get_acl,
+#ifdef CONFIG_F2FS_FS_XATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = f2fs_listxattr,
+ .removexattr = generic_removexattr,
+#endif
+};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
new file mode 100644
index 000000000000..19870361497e
--- /dev/null
+++ b/fs/f2fs/node.c
@@ -0,0 +1,1764 @@
+/*
+ * fs/f2fs/node.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/mpage.h>
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+static struct kmem_cache *nat_entry_slab;
+static struct kmem_cache *free_nid_slab;
+
+static void clear_node_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ unsigned int long flags;
+
+ if (PageDirty(page)) {
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ radix_tree_tag_clear(&mapping->page_tree,
+ page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+ clear_page_dirty_for_io(page);
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ }
+ ClearPageUptodate(page);
+}
+
+static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ pgoff_t index = current_nat_addr(sbi, nid);
+ return get_meta_page(sbi, index);
+}
+
+static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ struct page *src_page;
+ struct page *dst_page;
+ pgoff_t src_off;
+ pgoff_t dst_off;
+ void *src_addr;
+ void *dst_addr;
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ src_off = current_nat_addr(sbi, nid);
+ dst_off = next_nat_addr(sbi, src_off);
+
+ /* get current nat block page with lock */
+ src_page = get_meta_page(sbi, src_off);
+
+ /* Dirty src_page means that it is already the new target NAT page. */
+ if (PageDirty(src_page))
+ return src_page;
+
+ dst_page = grab_meta_page(sbi, dst_off);
+
+ src_addr = page_address(src_page);
+ dst_addr = page_address(dst_page);
+ memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+ set_page_dirty(dst_page);
+ f2fs_put_page(src_page, 1);
+
+ set_to_next_nat(nm_i, nid);
+
+ return dst_page;
+}
+
+/*
+ * Readahead NAT pages
+ */
+static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
+{
+ struct address_space *mapping = sbi->meta_inode->i_mapping;
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct page *page;
+ pgoff_t index;
+ int i;
+
+ for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
+ if (nid >= nm_i->max_nid)
+ nid = 0;
+ index = current_nat_addr(sbi, nid);
+
+ page = grab_cache_page(mapping, index);
+ if (!page)
+ continue;
+ if (f2fs_readpage(sbi, page, index, READ)) {
+ f2fs_put_page(page, 1);
+ continue;
+ }
+ page_cache_release(page);
+ }
+}
+
+static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
+{
+ return radix_tree_lookup(&nm_i->nat_root, n);
+}
+
+static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
+ nid_t start, unsigned int nr, struct nat_entry **ep)
+{
+ return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
+}
+
+static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
+{
+ list_del(&e->list);
+ radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
+ nm_i->nat_cnt--;
+ kmem_cache_free(nat_entry_slab, e);
+}
+
+int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct nat_entry *e;
+ int is_cp = 1;
+
+ read_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, nid);
+ if (e && !e->checkpointed)
+ is_cp = 0;
+ read_unlock(&nm_i->nat_tree_lock);
+ return is_cp;
+}
+
+static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+ struct nat_entry *new;
+
+ new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
+ if (!new)
+ return NULL;
+ if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
+ kmem_cache_free(nat_entry_slab, new);
+ return NULL;
+ }
+ memset(new, 0, sizeof(struct nat_entry));
+ nat_set_nid(new, nid);
+ list_add_tail(&new->list, &nm_i->nat_entries);
+ nm_i->nat_cnt++;
+ return new;
+}
+
+static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
+ struct f2fs_nat_entry *ne)
+{
+ struct nat_entry *e;
+retry:
+ write_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, nid);
+ if (!e) {
+ e = grab_nat_entry(nm_i, nid);
+ if (!e) {
+ write_unlock(&nm_i->nat_tree_lock);
+ goto retry;
+ }
+ nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
+ nat_set_ino(e, le32_to_cpu(ne->ino));
+ nat_set_version(e, ne->version);
+ e->checkpointed = true;
+ }
+ write_unlock(&nm_i->nat_tree_lock);
+}
+
+static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
+ block_t new_blkaddr)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct nat_entry *e;
+retry:
+ write_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, ni->nid);
+ if (!e) {
+ e = grab_nat_entry(nm_i, ni->nid);
+ if (!e) {
+ write_unlock(&nm_i->nat_tree_lock);
+ goto retry;
+ }
+ e->ni = *ni;
+ e->checkpointed = true;
+ BUG_ON(ni->blk_addr == NEW_ADDR);
+ } else if (new_blkaddr == NEW_ADDR) {
+ /*
+ * when nid is reallocated,
+ * previous nat entry can be remained in nat cache.
+ * So, reinitialize it with new information.
+ */
+ e->ni = *ni;
+ BUG_ON(ni->blk_addr != NULL_ADDR);
+ }
+
+ if (new_blkaddr == NEW_ADDR)
+ e->checkpointed = false;
+
+ /* sanity check */
+ BUG_ON(nat_get_blkaddr(e) != ni->blk_addr);
+ BUG_ON(nat_get_blkaddr(e) == NULL_ADDR &&
+ new_blkaddr == NULL_ADDR);
+ BUG_ON(nat_get_blkaddr(e) == NEW_ADDR &&
+ new_blkaddr == NEW_ADDR);
+ BUG_ON(nat_get_blkaddr(e) != NEW_ADDR &&
+ nat_get_blkaddr(e) != NULL_ADDR &&
+ new_blkaddr == NEW_ADDR);
+
+ /* increament version no as node is removed */
+ if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
+ unsigned char version = nat_get_version(e);
+ nat_set_version(e, inc_node_version(version));
+ }
+
+ /* change address */
+ nat_set_blkaddr(e, new_blkaddr);
+ __set_nat_cache_dirty(nm_i, e);
+ write_unlock(&nm_i->nat_tree_lock);
+}
+
+static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD)
+ return 0;
+
+ write_lock(&nm_i->nat_tree_lock);
+ while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
+ struct nat_entry *ne;
+ ne = list_first_entry(&nm_i->nat_entries,
+ struct nat_entry, list);
+ __del_from_nat_cache(nm_i, ne);
+ nr_shrink--;
+ }
+ write_unlock(&nm_i->nat_tree_lock);
+ return nr_shrink;
+}
+
+/*
+ * This function returns always success
+ */
+void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ nid_t start_nid = START_NID(nid);
+ struct f2fs_nat_block *nat_blk;
+ struct page *page = NULL;
+ struct f2fs_nat_entry ne;
+ struct nat_entry *e;
+ int i;
+
+ memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+ ni->nid = nid;
+
+ /* Check nat cache */
+ read_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, nid);
+ if (e) {
+ ni->ino = nat_get_ino(e);
+ ni->blk_addr = nat_get_blkaddr(e);
+ ni->version = nat_get_version(e);
+ }
+ read_unlock(&nm_i->nat_tree_lock);
+ if (e)
+ return;
+
+ /* Check current segment summary */
+ mutex_lock(&curseg->curseg_mutex);
+ i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
+ if (i >= 0) {
+ ne = nat_in_journal(sum, i);
+ node_info_from_raw_nat(ni, &ne);
+ }
+ mutex_unlock(&curseg->curseg_mutex);
+ if (i >= 0)
+ goto cache;
+
+ /* Fill node_info from nat page */
+ page = get_current_nat_page(sbi, start_nid);
+ nat_blk = (struct f2fs_nat_block *)page_address(page);
+ ne = nat_blk->entries[nid - start_nid];
+ node_info_from_raw_nat(ni, &ne);
+ f2fs_put_page(page, 1);
+cache:
+ /* cache nat entry */
+ cache_nat_entry(NM_I(sbi), nid, &ne);
+}
+
+/*
+ * The maximum depth is four.
+ * Offset[0] will have raw inode offset.
+ */
+static int get_node_path(long block, int offset[4], unsigned int noffset[4])
+{
+ const long direct_index = ADDRS_PER_INODE;
+ const long direct_blks = ADDRS_PER_BLOCK;
+ const long dptrs_per_blk = NIDS_PER_BLOCK;
+ const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
+ const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
+ int n = 0;
+ int level = 0;
+
+ noffset[0] = 0;
+
+ if (block < direct_index) {
+ offset[n++] = block;
+ level = 0;
+ goto got;
+ }
+ block -= direct_index;
+ if (block < direct_blks) {
+ offset[n++] = NODE_DIR1_BLOCK;
+ noffset[n] = 1;
+ offset[n++] = block;
+ level = 1;
+ goto got;
+ }
+ block -= direct_blks;
+ if (block < direct_blks) {
+ offset[n++] = NODE_DIR2_BLOCK;
+ noffset[n] = 2;
+ offset[n++] = block;
+ level = 1;
+ goto got;
+ }
+ block -= direct_blks;
+ if (block < indirect_blks) {
+ offset[n++] = NODE_IND1_BLOCK;
+ noffset[n] = 3;
+ offset[n++] = block / direct_blks;
+ noffset[n] = 4 + offset[n - 1];
+ offset[n++] = block % direct_blks;
+ level = 2;
+ goto got;
+ }
+ block -= indirect_blks;
+ if (block < indirect_blks) {
+ offset[n++] = NODE_IND2_BLOCK;
+ noffset[n] = 4 + dptrs_per_blk;
+ offset[n++] = block / direct_blks;
+ noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
+ offset[n++] = block % direct_blks;
+ level = 2;
+ goto got;
+ }
+ block -= indirect_blks;
+ if (block < dindirect_blks) {
+ offset[n++] = NODE_DIND_BLOCK;
+ noffset[n] = 5 + (dptrs_per_blk * 2);
+ offset[n++] = block / indirect_blks;
+ noffset[n] = 6 + (dptrs_per_blk * 2) +
+ offset[n - 1] * (dptrs_per_blk + 1);
+ offset[n++] = (block / direct_blks) % dptrs_per_blk;
+ noffset[n] = 7 + (dptrs_per_blk * 2) +
+ offset[n - 2] * (dptrs_per_blk + 1) +
+ offset[n - 1];
+ offset[n++] = block % direct_blks;
+ level = 3;
+ goto got;
+ } else {
+ BUG();
+ }
+got:
+ return level;
+}
+
+/*
+ * Caller should call f2fs_put_dnode(dn).
+ */
+int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct page *npage[4];
+ struct page *parent;
+ int offset[4];
+ unsigned int noffset[4];
+ nid_t nids[4];
+ int level, i;
+ int err = 0;
+
+ level = get_node_path(index, offset, noffset);
+
+ nids[0] = dn->inode->i_ino;
+ npage[0] = get_node_page(sbi, nids[0]);
+ if (IS_ERR(npage[0]))
+ return PTR_ERR(npage[0]);
+
+ parent = npage[0];
+ nids[1] = get_nid(parent, offset[0], true);
+ dn->inode_page = npage[0];
+ dn->inode_page_locked = true;
+
+ /* get indirect or direct nodes */
+ for (i = 1; i <= level; i++) {
+ bool done = false;
+
+ if (!nids[i] && !ro) {
+ mutex_lock_op(sbi, NODE_NEW);
+
+ /* alloc new node */
+ if (!alloc_nid(sbi, &(nids[i]))) {
+ mutex_unlock_op(sbi, NODE_NEW);
+ err = -ENOSPC;
+ goto release_pages;
+ }
+
+ dn->nid = nids[i];
+ npage[i] = new_node_page(dn, noffset[i]);
+ if (IS_ERR(npage[i])) {
+ alloc_nid_failed(sbi, nids[i]);
+ mutex_unlock_op(sbi, NODE_NEW);
+ err = PTR_ERR(npage[i]);
+ goto release_pages;
+ }
+
+ set_nid(parent, offset[i - 1], nids[i], i == 1);
+ alloc_nid_done(sbi, nids[i]);
+ mutex_unlock_op(sbi, NODE_NEW);
+ done = true;
+ } else if (ro && i == level && level > 1) {
+ npage[i] = get_node_page_ra(parent, offset[i - 1]);
+ if (IS_ERR(npage[i])) {
+ err = PTR_ERR(npage[i]);
+ goto release_pages;
+ }
+ done = true;
+ }
+ if (i == 1) {
+ dn->inode_page_locked = false;
+ unlock_page(parent);
+ } else {
+ f2fs_put_page(parent, 1);
+ }
+
+ if (!done) {
+ npage[i] = get_node_page(sbi, nids[i]);
+ if (IS_ERR(npage[i])) {
+ err = PTR_ERR(npage[i]);
+ f2fs_put_page(npage[0], 0);
+ goto release_out;
+ }
+ }
+ if (i < level) {
+ parent = npage[i];
+ nids[i + 1] = get_nid(parent, offset[i], false);
+ }
+ }
+ dn->nid = nids[level];
+ dn->ofs_in_node = offset[level];
+ dn->node_page = npage[level];
+ dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+ return 0;
+
+release_pages:
+ f2fs_put_page(parent, 1);
+ if (i > 1)
+ f2fs_put_page(npage[0], 0);
+release_out:
+ dn->inode_page = NULL;
+ dn->node_page = NULL;
+ return err;
+}
+
+static void truncate_node(struct dnode_of_data *dn)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct node_info ni;
+
+ get_node_info(sbi, dn->nid, &ni);
+ BUG_ON(ni.blk_addr == NULL_ADDR);
+
+ if (ni.blk_addr != NULL_ADDR)
+ invalidate_blocks(sbi, ni.blk_addr);
+
+ /* Deallocate node address */
+ dec_valid_node_count(sbi, dn->inode, 1);
+ set_node_addr(sbi, &ni, NULL_ADDR);
+
+ if (dn->nid == dn->inode->i_ino) {
+ remove_orphan_inode(sbi, dn->nid);
+ dec_valid_inode_count(sbi);
+ } else {
+ sync_inode_page(dn);
+ }
+
+ clear_node_page_dirty(dn->node_page);
+ F2FS_SET_SB_DIRT(sbi);
+
+ f2fs_put_page(dn->node_page, 1);
+ dn->node_page = NULL;
+}
+
+static int truncate_dnode(struct dnode_of_data *dn)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct page *page;
+
+ if (dn->nid == 0)
+ return 1;
+
+ /* get direct node */
+ page = get_node_page(sbi, dn->nid);
+ if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
+ return 1;
+ else if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ /* Make dnode_of_data for parameter */
+ dn->node_page = page;
+ dn->ofs_in_node = 0;
+ truncate_data_blocks(dn);
+ truncate_node(dn);
+ return 1;
+}
+
+static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
+ int ofs, int depth)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct dnode_of_data rdn = *dn;
+ struct page *page;
+ struct f2fs_node *rn;
+ nid_t child_nid;
+ unsigned int child_nofs;
+ int freed = 0;
+ int i, ret;
+
+ if (dn->nid == 0)
+ return NIDS_PER_BLOCK + 1;
+
+ page = get_node_page(sbi, dn->nid);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ rn = (struct f2fs_node *)page_address(page);
+ if (depth < 3) {
+ for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
+ child_nid = le32_to_cpu(rn->in.nid[i]);
+ if (child_nid == 0)
+ continue;
+ rdn.nid = child_nid;
+ ret = truncate_dnode(&rdn);
+ if (ret < 0)
+ goto out_err;
+ set_nid(page, i, 0, false);
+ }
+ } else {
+ child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
+ for (i = ofs; i < NIDS_PER_BLOCK; i++) {
+ child_nid = le32_to_cpu(rn->in.nid[i]);
+ if (child_nid == 0) {
+ child_nofs += NIDS_PER_BLOCK + 1;
+ continue;
+ }
+ rdn.nid = child_nid;
+ ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
+ if (ret == (NIDS_PER_BLOCK + 1)) {
+ set_nid(page, i, 0, false);
+ child_nofs += ret;
+ } else if (ret < 0 && ret != -ENOENT) {
+ goto out_err;
+ }
+ }
+ freed = child_nofs;
+ }
+
+ if (!ofs) {
+ /* remove current indirect node */
+ dn->node_page = page;
+ truncate_node(dn);
+ freed++;
+ } else {
+ f2fs_put_page(page, 1);
+ }
+ return freed;
+
+out_err:
+ f2fs_put_page(page, 1);
+ return ret;
+}
+
+static int truncate_partial_nodes(struct dnode_of_data *dn,
+ struct f2fs_inode *ri, int *offset, int depth)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct page *pages[2];
+ nid_t nid[3];
+ nid_t child_nid;
+ int err = 0;
+ int i;
+ int idx = depth - 2;
+
+ nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
+ if (!nid[0])
+ return 0;
+
+ /* get indirect nodes in the path */
+ for (i = 0; i < depth - 1; i++) {
+ /* refernece count'll be increased */
+ pages[i] = get_node_page(sbi, nid[i]);
+ if (IS_ERR(pages[i])) {
+ depth = i + 1;
+ err = PTR_ERR(pages[i]);
+ goto fail;
+ }
+ nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
+ }
+
+ /* free direct nodes linked to a partial indirect node */
+ for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) {
+ child_nid = get_nid(pages[idx], i, false);
+ if (!child_nid)
+ continue;
+ dn->nid = child_nid;
+ err = truncate_dnode(dn);
+ if (err < 0)
+ goto fail;
+ set_nid(pages[idx], i, 0, false);
+ }
+
+ if (offset[depth - 1] == 0) {
+ dn->node_page = pages[idx];
+ dn->nid = nid[idx];
+ truncate_node(dn);
+ } else {
+ f2fs_put_page(pages[idx], 1);
+ }
+ offset[idx]++;
+ offset[depth - 1] = 0;
+fail:
+ for (i = depth - 3; i >= 0; i--)
+ f2fs_put_page(pages[i], 1);
+ return err;
+}
+
+/*
+ * All the block addresses of data and nodes should be nullified.
+ */
+int truncate_inode_blocks(struct inode *inode, pgoff_t from)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ int err = 0, cont = 1;
+ int level, offset[4], noffset[4];
+ unsigned int nofs;
+ struct f2fs_node *rn;
+ struct dnode_of_data dn;
+ struct page *page;
+
+ level = get_node_path(from, offset, noffset);
+
+ page = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ set_new_dnode(&dn, inode, page, NULL, 0);
+ unlock_page(page);
+
+ rn = page_address(page);
+ switch (level) {
+ case 0:
+ case 1:
+ nofs = noffset[1];
+ break;
+ case 2:
+ nofs = noffset[1];
+ if (!offset[level - 1])
+ goto skip_partial;
+ err = truncate_partial_nodes(&dn, &rn->i, offset, level);
+ if (err < 0 && err != -ENOENT)
+ goto fail;
+ nofs += 1 + NIDS_PER_BLOCK;
+ break;
+ case 3:
+ nofs = 5 + 2 * NIDS_PER_BLOCK;
+ if (!offset[level - 1])
+ goto skip_partial;
+ err = truncate_partial_nodes(&dn, &rn->i, offset, level);
+ if (err < 0 && err != -ENOENT)
+ goto fail;
+ break;
+ default:
+ BUG();
+ }
+
+skip_partial:
+ while (cont) {
+ dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]);
+ switch (offset[0]) {
+ case NODE_DIR1_BLOCK:
+ case NODE_DIR2_BLOCK:
+ err = truncate_dnode(&dn);
+ break;
+
+ case NODE_IND1_BLOCK:
+ case NODE_IND2_BLOCK:
+ err = truncate_nodes(&dn, nofs, offset[1], 2);
+ break;
+
+ case NODE_DIND_BLOCK:
+ err = truncate_nodes(&dn, nofs, offset[1], 3);
+ cont = 0;
+ break;
+
+ default:
+ BUG();
+ }
+ if (err < 0 && err != -ENOENT)
+ goto fail;
+ if (offset[1] == 0 &&
+ rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) {
+ lock_page(page);
+ wait_on_page_writeback(page);
+ rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
+ set_page_dirty(page);
+ unlock_page(page);
+ }
+ offset[1] = 0;
+ offset[0]++;
+ nofs += err;
+ }
+fail:
+ f2fs_put_page(page, 0);
+ return err > 0 ? 0 : err;
+}
+
+int remove_inode_page(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct page *page;
+ nid_t ino = inode->i_ino;
+ struct dnode_of_data dn;
+
+ mutex_lock_op(sbi, NODE_TRUNC);
+ page = get_node_page(sbi, ino);
+ if (IS_ERR(page)) {
+ mutex_unlock_op(sbi, NODE_TRUNC);
+ return PTR_ERR(page);
+ }
+
+ if (F2FS_I(inode)->i_xattr_nid) {
+ nid_t nid = F2FS_I(inode)->i_xattr_nid;
+ struct page *npage = get_node_page(sbi, nid);
+
+ if (IS_ERR(npage)) {
+ mutex_unlock_op(sbi, NODE_TRUNC);
+ return PTR_ERR(npage);
+ }
+
+ F2FS_I(inode)->i_xattr_nid = 0;
+ set_new_dnode(&dn, inode, page, npage, nid);
+ dn.inode_page_locked = 1;
+ truncate_node(&dn);
+ }
+ if (inode->i_blocks == 1) {
+ /* inernally call f2fs_put_page() */
+ set_new_dnode(&dn, inode, page, page, ino);
+ truncate_node(&dn);
+ } else if (inode->i_blocks == 0) {
+ struct node_info ni;
+ get_node_info(sbi, inode->i_ino, &ni);
+
+ /* called after f2fs_new_inode() is failed */
+ BUG_ON(ni.blk_addr != NULL_ADDR);
+ f2fs_put_page(page, 1);
+ } else {
+ BUG();
+ }
+ mutex_unlock_op(sbi, NODE_TRUNC);
+ return 0;
+}
+
+int new_inode_page(struct inode *inode, struct dentry *dentry)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct page *page;
+ struct dnode_of_data dn;
+
+ /* allocate inode page for new inode */
+ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
+ mutex_lock_op(sbi, NODE_NEW);
+ page = new_node_page(&dn, 0);
+ init_dent_inode(dentry, page);
+ mutex_unlock_op(sbi, NODE_NEW);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ f2fs_put_page(page, 1);
+ return 0;
+}
+
+struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+ struct node_info old_ni, new_ni;
+ struct page *page;
+ int err;
+
+ if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
+ return ERR_PTR(-EPERM);
+
+ page = grab_cache_page(mapping, dn->nid);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ get_node_info(sbi, dn->nid, &old_ni);
+
+ SetPageUptodate(page);
+ fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
+
+ /* Reinitialize old_ni with new node page */
+ BUG_ON(old_ni.blk_addr != NULL_ADDR);
+ new_ni = old_ni;
+ new_ni.ino = dn->inode->i_ino;
+
+ if (!inc_valid_node_count(sbi, dn->inode, 1)) {
+ err = -ENOSPC;
+ goto fail;
+ }
+ set_node_addr(sbi, &new_ni, NEW_ADDR);
+
+ dn->node_page = page;
+ sync_inode_page(dn);
+ set_page_dirty(page);
+ set_cold_node(dn->inode, page);
+ if (ofs == 0)
+ inc_valid_inode_count(sbi);
+
+ return page;
+
+fail:
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+}
+
+static int read_node_page(struct page *page, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ struct node_info ni;
+
+ get_node_info(sbi, page->index, &ni);
+
+ if (ni.blk_addr == NULL_ADDR)
+ return -ENOENT;
+ return f2fs_readpage(sbi, page, ni.blk_addr, type);
+}
+
+/*
+ * Readahead a node page
+ */
+void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+ struct page *apage;
+
+ apage = find_get_page(mapping, nid);
+ if (apage && PageUptodate(apage))
+ goto release_out;
+ f2fs_put_page(apage, 0);
+
+ apage = grab_cache_page(mapping, nid);
+ if (!apage)
+ return;
+
+ if (read_node_page(apage, READA))
+ goto unlock_out;
+
+ page_cache_release(apage);
+ return;
+
+unlock_out:
+ unlock_page(apage);
+release_out:
+ page_cache_release(apage);
+}
+
+struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+ int err;
+ struct page *page;
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+
+ page = grab_cache_page(mapping, nid);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ err = read_node_page(page, READ_SYNC);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+ }
+
+ BUG_ON(nid != nid_of_node(page));
+ mark_page_accessed(page);
+ return page;
+}
+
+/*
+ * Return a locked page for the desired node page.
+ * And, readahead MAX_RA_NODE number of node pages.
+ */
+struct page *get_node_page_ra(struct page *parent, int start)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+ int i, end;
+ int err = 0;
+ nid_t nid;
+ struct page *page;
+
+ /* First, try getting the desired direct node. */
+ nid = get_nid(parent, start, false);
+ if (!nid)
+ return ERR_PTR(-ENOENT);
+
+ page = find_get_page(mapping, nid);
+ if (page && PageUptodate(page))
+ goto page_hit;
+ f2fs_put_page(page, 0);
+
+repeat:
+ page = grab_cache_page(mapping, nid);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ err = read_node_page(page, READA);
+ if (err) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(err);
+ }
+
+ /* Then, try readahead for siblings of the desired node */
+ end = start + MAX_RA_NODE;
+ end = min(end, NIDS_PER_BLOCK);
+ for (i = start + 1; i < end; i++) {
+ nid = get_nid(parent, i, false);
+ if (!nid)
+ continue;
+ ra_node_page(sbi, nid);
+ }
+
+page_hit:
+ lock_page(page);
+ if (PageError(page)) {
+ f2fs_put_page(page, 1);
+ return ERR_PTR(-EIO);
+ }
+
+ /* Has the page been truncated? */
+ if (page->mapping != mapping) {
+ f2fs_put_page(page, 1);
+ goto repeat;
+ }
+ return page;
+}
+
+void sync_inode_page(struct dnode_of_data *dn)
+{
+ if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
+ update_inode(dn->inode, dn->node_page);
+ } else if (dn->inode_page) {
+ if (!dn->inode_page_locked)
+ lock_page(dn->inode_page);
+ update_inode(dn->inode, dn->inode_page);
+ if (!dn->inode_page_locked)
+ unlock_page(dn->inode_page);
+ } else {
+ f2fs_write_inode(dn->inode, NULL);
+ }
+}
+
+int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
+ struct writeback_control *wbc)
+{
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+ pgoff_t index, end;
+ struct pagevec pvec;
+ int step = ino ? 2 : 0;
+ int nwritten = 0, wrote = 0;
+
+ pagevec_init(&pvec, 0);
+
+next_step:
+ index = 0;
+ end = LONG_MAX;
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /*
+ * flushing sequence with step:
+ * 0. indirect nodes
+ * 1. dentry dnodes
+ * 2. file dnodes
+ */
+ if (step == 0 && IS_DNODE(page))
+ continue;
+ if (step == 1 && (!IS_DNODE(page) ||
+ is_cold_node(page)))
+ continue;
+ if (step == 2 && (!IS_DNODE(page) ||
+ !is_cold_node(page)))
+ continue;
+
+ /*
+ * If an fsync mode,
+ * we should not skip writing node pages.
+ */
+ if (ino && ino_of_node(page) == ino)
+ lock_page(page);
+ else if (!trylock_page(page))
+ continue;
+
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+ if (ino && ino_of_node(page) != ino)
+ goto continue_unlock;
+
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ /* called by fsync() */
+ if (ino && IS_DNODE(page)) {
+ int mark = !is_checkpointed_node(sbi, ino);
+ set_fsync_mark(page, 1);
+ if (IS_INODE(page))
+ set_dentry_mark(page, mark);
+ nwritten++;
+ } else {
+ set_fsync_mark(page, 0);
+ set_dentry_mark(page, 0);
+ }
+ mapping->a_ops->writepage(page, wbc);
+ wrote++;
+
+ if (--wbc->nr_to_write == 0)
+ break;
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+
+ if (wbc->nr_to_write == 0) {
+ step = 2;
+ break;
+ }
+ }
+
+ if (step < 2) {
+ step++;
+ goto next_step;
+ }
+
+ if (wrote)
+ f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL);
+
+ return nwritten;
+}
+
+static int f2fs_write_node_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ nid_t nid;
+ unsigned int nofs;
+ block_t new_addr;
+ struct node_info ni;
+
+ if (wbc->for_reclaim) {
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ wbc->pages_skipped++;
+ set_page_dirty(page);
+ return AOP_WRITEPAGE_ACTIVATE;
+ }
+
+ wait_on_page_writeback(page);
+
+ mutex_lock_op(sbi, NODE_WRITE);
+
+ /* get old block addr of this node page */
+ nid = nid_of_node(page);
+ nofs = ofs_of_node(page);
+ BUG_ON(page->index != nid);
+
+ get_node_info(sbi, nid, &ni);
+
+ /* This page is already truncated */
+ if (ni.blk_addr == NULL_ADDR)
+ return 0;
+
+ set_page_writeback(page);
+
+ /* insert node offset */
+ write_node_page(sbi, page, nid, ni.blk_addr, &new_addr);
+ set_node_addr(sbi, &ni, new_addr);
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+
+ mutex_unlock_op(sbi, NODE_WRITE);
+ unlock_page(page);
+ return 0;
+}
+
+static int f2fs_write_node_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ struct block_device *bdev = sbi->sb->s_bdev;
+ long nr_to_write = wbc->nr_to_write;
+
+ if (wbc->for_kupdate)
+ return 0;
+
+ if (get_pages(sbi, F2FS_DIRTY_NODES) == 0)
+ return 0;
+
+ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
+ write_checkpoint(sbi, false, false);
+ return 0;
+ }
+
+ /* if mounting is failed, skip writing node pages */
+ wbc->nr_to_write = bio_get_nr_vecs(bdev);
+ sync_node_pages(sbi, 0, wbc);
+ wbc->nr_to_write = nr_to_write -
+ (bio_get_nr_vecs(bdev) - wbc->nr_to_write);
+ return 0;
+}
+
+static int f2fs_set_node_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+
+ SetPageUptodate(page);
+ if (!PageDirty(page)) {
+ __set_page_dirty_nobuffers(page);
+ inc_page_count(sbi, F2FS_DIRTY_NODES);
+ SetPagePrivate(page);
+ return 1;
+ }
+ return 0;
+}
+
+static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
+{
+ struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ if (PageDirty(page))
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ ClearPagePrivate(page);
+}
+
+static int f2fs_release_node_page(struct page *page, gfp_t wait)
+{
+ ClearPagePrivate(page);
+ return 0;
+}
+
+/*
+ * Structure of the f2fs node operations
+ */
+const struct address_space_operations f2fs_node_aops = {
+ .writepage = f2fs_write_node_page,
+ .writepages = f2fs_write_node_pages,
+ .set_page_dirty = f2fs_set_node_page_dirty,
+ .invalidatepage = f2fs_invalidate_node_page,
+ .releasepage = f2fs_release_node_page,
+};
+
+static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
+{
+ struct list_head *this;
+ struct free_nid *i = NULL;
+ list_for_each(this, head) {
+ i = list_entry(this, struct free_nid, list);
+ if (i->nid == n)
+ break;
+ i = NULL;
+ }
+ return i;
+}
+
+static void __del_from_free_nid_list(struct free_nid *i)
+{
+ list_del(&i->list);
+ kmem_cache_free(free_nid_slab, i);
+}
+
+static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+ struct free_nid *i;
+
+ if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
+ return 0;
+retry:
+ i = kmem_cache_alloc(free_nid_slab, GFP_NOFS);
+ if (!i) {
+ cond_resched();
+ goto retry;
+ }
+ i->nid = nid;
+ i->state = NID_NEW;
+
+ spin_lock(&nm_i->free_nid_list_lock);
+ if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) {
+ spin_unlock(&nm_i->free_nid_list_lock);
+ kmem_cache_free(free_nid_slab, i);
+ return 0;
+ }
+ list_add_tail(&i->list, &nm_i->free_nid_list);
+ nm_i->fcnt++;
+ spin_unlock(&nm_i->free_nid_list_lock);
+ return 1;
+}
+
+static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
+{
+ struct free_nid *i;
+ spin_lock(&nm_i->free_nid_list_lock);
+ i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+ if (i && i->state == NID_NEW) {
+ __del_from_free_nid_list(i);
+ nm_i->fcnt--;
+ }
+ spin_unlock(&nm_i->free_nid_list_lock);
+}
+
+static int scan_nat_page(struct f2fs_nm_info *nm_i,
+ struct page *nat_page, nid_t start_nid)
+{
+ struct f2fs_nat_block *nat_blk = page_address(nat_page);
+ block_t blk_addr;
+ int fcnt = 0;
+ int i;
+
+ /* 0 nid should not be used */
+ if (start_nid == 0)
+ ++start_nid;
+
+ i = start_nid % NAT_ENTRY_PER_BLOCK;
+
+ for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
+ blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
+ BUG_ON(blk_addr == NEW_ADDR);
+ if (blk_addr == NULL_ADDR)
+ fcnt += add_free_nid(nm_i, start_nid);
+ }
+ return fcnt;
+}
+
+static void build_free_nids(struct f2fs_sb_info *sbi)
+{
+ struct free_nid *fnid, *next_fnid;
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ nid_t nid = 0;
+ bool is_cycled = false;
+ int fcnt = 0;
+ int i;
+
+ nid = nm_i->next_scan_nid;
+ nm_i->init_scan_nid = nid;
+
+ ra_nat_pages(sbi, nid);
+
+ while (1) {
+ struct page *page = get_current_nat_page(sbi, nid);
+
+ fcnt += scan_nat_page(nm_i, page, nid);
+ f2fs_put_page(page, 1);
+
+ nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
+
+ if (nid >= nm_i->max_nid) {
+ nid = 0;
+ is_cycled = true;
+ }
+ if (fcnt > MAX_FREE_NIDS)
+ break;
+ if (is_cycled && nm_i->init_scan_nid <= nid)
+ break;
+ }
+
+ nm_i->next_scan_nid = nid;
+
+ /* find free nids from current sum_pages */
+ mutex_lock(&curseg->curseg_mutex);
+ for (i = 0; i < nats_in_cursum(sum); i++) {
+ block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
+ nid = le32_to_cpu(nid_in_journal(sum, i));
+ if (addr == NULL_ADDR)
+ add_free_nid(nm_i, nid);
+ else
+ remove_free_nid(nm_i, nid);
+ }
+ mutex_unlock(&curseg->curseg_mutex);
+
+ /* remove the free nids from current allocated nids */
+ list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) {
+ struct nat_entry *ne;
+
+ read_lock(&nm_i->nat_tree_lock);
+ ne = __lookup_nat_cache(nm_i, fnid->nid);
+ if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
+ remove_free_nid(nm_i, fnid->nid);
+ read_unlock(&nm_i->nat_tree_lock);
+ }
+}
+
+/*
+ * If this function returns success, caller can obtain a new nid
+ * from second parameter of this function.
+ * The returned nid could be used ino as well as nid when inode is created.
+ */
+bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct free_nid *i = NULL;
+ struct list_head *this;
+retry:
+ mutex_lock(&nm_i->build_lock);
+ if (!nm_i->fcnt) {
+ /* scan NAT in order to build free nid list */
+ build_free_nids(sbi);
+ if (!nm_i->fcnt) {
+ mutex_unlock(&nm_i->build_lock);
+ return false;
+ }
+ }
+ mutex_unlock(&nm_i->build_lock);
+
+ /*
+ * We check fcnt again since previous check is racy as
+ * we didn't hold free_nid_list_lock. So other thread
+ * could consume all of free nids.
+ */
+ spin_lock(&nm_i->free_nid_list_lock);
+ if (!nm_i->fcnt) {
+ spin_unlock(&nm_i->free_nid_list_lock);
+ goto retry;
+ }
+
+ BUG_ON(list_empty(&nm_i->free_nid_list));
+ list_for_each(this, &nm_i->free_nid_list) {
+ i = list_entry(this, struct free_nid, list);
+ if (i->state == NID_NEW)
+ break;
+ }
+
+ BUG_ON(i->state != NID_NEW);
+ *nid = i->nid;
+ i->state = NID_ALLOC;
+ nm_i->fcnt--;
+ spin_unlock(&nm_i->free_nid_list_lock);
+ return true;
+}
+
+/*
+ * alloc_nid() should be called prior to this function.
+ */
+void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct free_nid *i;
+
+ spin_lock(&nm_i->free_nid_list_lock);
+ i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+ if (i) {
+ BUG_ON(i->state != NID_ALLOC);
+ __del_from_free_nid_list(i);
+ }
+ spin_unlock(&nm_i->free_nid_list_lock);
+}
+
+/*
+ * alloc_nid() should be called prior to this function.
+ */
+void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ alloc_nid_done(sbi, nid);
+ add_free_nid(NM_I(sbi), nid);
+}
+
+void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
+ struct f2fs_summary *sum, struct node_info *ni,
+ block_t new_blkaddr)
+{
+ rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
+ set_node_addr(sbi, ni, new_blkaddr);
+ clear_node_page_dirty(page);
+}
+
+int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct address_space *mapping = sbi->node_inode->i_mapping;
+ struct f2fs_node *src, *dst;
+ nid_t ino = ino_of_node(page);
+ struct node_info old_ni, new_ni;
+ struct page *ipage;
+
+ ipage = grab_cache_page(mapping, ino);
+ if (!ipage)
+ return -ENOMEM;
+
+ /* Should not use this inode from free nid list */
+ remove_free_nid(NM_I(sbi), ino);
+
+ get_node_info(sbi, ino, &old_ni);
+ SetPageUptodate(ipage);
+ fill_node_footer(ipage, ino, ino, 0, true);
+
+ src = (struct f2fs_node *)page_address(page);
+ dst = (struct f2fs_node *)page_address(ipage);
+
+ memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i);
+ dst->i.i_size = 0;
+ dst->i.i_blocks = cpu_to_le64(1);
+ dst->i.i_links = cpu_to_le32(1);
+ dst->i.i_xattr_nid = 0;
+
+ new_ni = old_ni;
+ new_ni.ino = ino;
+
+ set_node_addr(sbi, &new_ni, NEW_ADDR);
+ inc_valid_inode_count(sbi);
+
+ f2fs_put_page(ipage, 1);
+ return 0;
+}
+
+int restore_node_summary(struct f2fs_sb_info *sbi,
+ unsigned int segno, struct f2fs_summary_block *sum)
+{
+ struct f2fs_node *rn;
+ struct f2fs_summary *sum_entry;
+ struct page *page;
+ block_t addr;
+ int i, last_offset;
+
+ /* alloc temporal page for read node */
+ page = alloc_page(GFP_NOFS | __GFP_ZERO);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ lock_page(page);
+
+ /* scan the node segment */
+ last_offset = sbi->blocks_per_seg;
+ addr = START_BLOCK(sbi, segno);
+ sum_entry = &sum->entries[0];
+
+ for (i = 0; i < last_offset; i++, sum_entry++) {
+ if (f2fs_readpage(sbi, page, addr, READ_SYNC))
+ goto out;
+
+ rn = (struct f2fs_node *)page_address(page);
+ sum_entry->nid = rn->footer.nid;
+ sum_entry->version = 0;
+ sum_entry->ofs_in_node = 0;
+ addr++;
+
+ /*
+ * In order to read next node page,
+ * we must clear PageUptodate flag.
+ */
+ ClearPageUptodate(page);
+ }
+out:
+ unlock_page(page);
+ __free_pages(page, 0);
+ return 0;
+}
+
+static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ int i;
+
+ mutex_lock(&curseg->curseg_mutex);
+
+ if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
+ mutex_unlock(&curseg->curseg_mutex);
+ return false;
+ }
+
+ for (i = 0; i < nats_in_cursum(sum); i++) {
+ struct nat_entry *ne;
+ struct f2fs_nat_entry raw_ne;
+ nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
+
+ raw_ne = nat_in_journal(sum, i);
+retry:
+ write_lock(&nm_i->nat_tree_lock);
+ ne = __lookup_nat_cache(nm_i, nid);
+ if (ne) {
+ __set_nat_cache_dirty(nm_i, ne);
+ write_unlock(&nm_i->nat_tree_lock);
+ continue;
+ }
+ ne = grab_nat_entry(nm_i, nid);
+ if (!ne) {
+ write_unlock(&nm_i->nat_tree_lock);
+ goto retry;
+ }
+ nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr));
+ nat_set_ino(ne, le32_to_cpu(raw_ne.ino));
+ nat_set_version(ne, raw_ne.version);
+ __set_nat_cache_dirty(nm_i, ne);
+ write_unlock(&nm_i->nat_tree_lock);
+ }
+ update_nats_in_cursum(sum, -i);
+ mutex_unlock(&curseg->curseg_mutex);
+ return true;
+}
+
+/*
+ * This function is called during the checkpointing process.
+ */
+void flush_nat_entries(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ struct list_head *cur, *n;
+ struct page *page = NULL;
+ struct f2fs_nat_block *nat_blk = NULL;
+ nid_t start_nid = 0, end_nid = 0;
+ bool flushed;
+
+ flushed = flush_nats_in_journal(sbi);
+
+ if (!flushed)
+ mutex_lock(&curseg->curseg_mutex);
+
+ /* 1) flush dirty nat caches */
+ list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) {
+ struct nat_entry *ne;
+ nid_t nid;
+ struct f2fs_nat_entry raw_ne;
+ int offset = -1;
+ block_t old_blkaddr, new_blkaddr;
+
+ ne = list_entry(cur, struct nat_entry, list);
+ nid = nat_get_nid(ne);
+
+ if (nat_get_blkaddr(ne) == NEW_ADDR)
+ continue;
+ if (flushed)
+ goto to_nat_page;
+
+ /* if there is room for nat enries in curseg->sumpage */
+ offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
+ if (offset >= 0) {
+ raw_ne = nat_in_journal(sum, offset);
+ old_blkaddr = le32_to_cpu(raw_ne.block_addr);
+ goto flush_now;
+ }
+to_nat_page:
+ if (!page || (start_nid > nid || nid > end_nid)) {
+ if (page) {
+ f2fs_put_page(page, 1);
+ page = NULL;
+ }
+ start_nid = START_NID(nid);
+ end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
+
+ /*
+ * get nat block with dirty flag, increased reference
+ * count, mapped and lock
+ */
+ page = get_next_nat_page(sbi, start_nid);
+ nat_blk = page_address(page);
+ }
+
+ BUG_ON(!nat_blk);
+ raw_ne = nat_blk->entries[nid - start_nid];
+ old_blkaddr = le32_to_cpu(raw_ne.block_addr);
+flush_now:
+ new_blkaddr = nat_get_blkaddr(ne);
+
+ raw_ne.ino = cpu_to_le32(nat_get_ino(ne));
+ raw_ne.block_addr = cpu_to_le32(new_blkaddr);
+ raw_ne.version = nat_get_version(ne);
+
+ if (offset < 0) {
+ nat_blk->entries[nid - start_nid] = raw_ne;
+ } else {
+ nat_in_journal(sum, offset) = raw_ne;
+ nid_in_journal(sum, offset) = cpu_to_le32(nid);
+ }
+
+ if (nat_get_blkaddr(ne) == NULL_ADDR) {
+ write_lock(&nm_i->nat_tree_lock);
+ __del_from_nat_cache(nm_i, ne);
+ write_unlock(&nm_i->nat_tree_lock);
+
+ /* We can reuse this freed nid at this point */
+ add_free_nid(NM_I(sbi), nid);
+ } else {
+ write_lock(&nm_i->nat_tree_lock);
+ __clear_nat_cache_dirty(nm_i, ne);
+ ne->checkpointed = true;
+ write_unlock(&nm_i->nat_tree_lock);
+ }
+ }
+ if (!flushed)
+ mutex_unlock(&curseg->curseg_mutex);
+ f2fs_put_page(page, 1);
+
+ /* 2) shrink nat caches if necessary */
+ try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
+}
+
+static int init_node_manager(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ unsigned char *version_bitmap;
+ unsigned int nat_segs, nat_blocks;
+
+ nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
+
+ /* segment_count_nat includes pair segment so divide to 2. */
+ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
+ nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
+ nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
+ nm_i->fcnt = 0;
+ nm_i->nat_cnt = 0;
+
+ INIT_LIST_HEAD(&nm_i->free_nid_list);
+ INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
+ INIT_LIST_HEAD(&nm_i->nat_entries);
+ INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
+
+ mutex_init(&nm_i->build_lock);
+ spin_lock_init(&nm_i->free_nid_list_lock);
+ rwlock_init(&nm_i->nat_tree_lock);
+
+ nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
+ nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
+ nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
+
+ nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL);
+ if (!nm_i->nat_bitmap)
+ return -ENOMEM;
+ version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
+ if (!version_bitmap)
+ return -EFAULT;
+
+ /* copy version bitmap */
+ memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size);
+ return 0;
+}
+
+int build_node_manager(struct f2fs_sb_info *sbi)
+{
+ int err;
+
+ sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
+ if (!sbi->nm_info)
+ return -ENOMEM;
+
+ err = init_node_manager(sbi);
+ if (err)
+ return err;
+
+ build_free_nids(sbi);
+ return 0;
+}
+
+void destroy_node_manager(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct free_nid *i, *next_i;
+ struct nat_entry *natvec[NATVEC_SIZE];
+ nid_t nid = 0;
+ unsigned int found;
+
+ if (!nm_i)
+ return;
+
+ /* destroy free nid list */
+ spin_lock(&nm_i->free_nid_list_lock);
+ list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
+ BUG_ON(i->state == NID_ALLOC);
+ __del_from_free_nid_list(i);
+ nm_i->fcnt--;
+ }
+ BUG_ON(nm_i->fcnt);
+ spin_unlock(&nm_i->free_nid_list_lock);
+
+ /* destroy nat cache */
+ write_lock(&nm_i->nat_tree_lock);
+ while ((found = __gang_lookup_nat_cache(nm_i,
+ nid, NATVEC_SIZE, natvec))) {
+ unsigned idx;
+ for (idx = 0; idx < found; idx++) {
+ struct nat_entry *e = natvec[idx];
+ nid = nat_get_nid(e) + 1;
+ __del_from_nat_cache(nm_i, e);
+ }
+ }
+ BUG_ON(nm_i->nat_cnt);
+ write_unlock(&nm_i->nat_tree_lock);
+
+ kfree(nm_i->nat_bitmap);
+ sbi->nm_info = NULL;
+ kfree(nm_i);
+}
+
+int create_node_manager_caches(void)
+{
+ nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
+ sizeof(struct nat_entry), NULL);
+ if (!nat_entry_slab)
+ return -ENOMEM;
+
+ free_nid_slab = f2fs_kmem_cache_create("free_nid",
+ sizeof(struct free_nid), NULL);
+ if (!free_nid_slab) {
+ kmem_cache_destroy(nat_entry_slab);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void destroy_node_manager_caches(void)
+{
+ kmem_cache_destroy(free_nid_slab);
+ kmem_cache_destroy(nat_entry_slab);
+}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
new file mode 100644
index 000000000000..afdb130f782e
--- /dev/null
+++ b/fs/f2fs/node.h
@@ -0,0 +1,353 @@
+/*
+ * fs/f2fs/node.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+/* start node id of a node block dedicated to the given node id */
+#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)
+
+/* node block offset on the NAT area dedicated to the given start node id */
+#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
+
+/* # of pages to perform readahead before building free nids */
+#define FREE_NID_PAGES 4
+
+/* maximum # of free node ids to produce during build_free_nids */
+#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
+
+/* maximum readahead size for node during getting data blocks */
+#define MAX_RA_NODE 128
+
+/* maximum cached nat entries to manage memory footprint */
+#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK)
+
+/* vector size for gang look-up from nat cache that consists of radix tree */
+#define NATVEC_SIZE 64
+
+/*
+ * For node information
+ */
+struct node_info {
+ nid_t nid; /* node id */
+ nid_t ino; /* inode number of the node's owner */
+ block_t blk_addr; /* block address of the node */
+ unsigned char version; /* version of the node */
+};
+
+struct nat_entry {
+ struct list_head list; /* for clean or dirty nat list */
+ bool checkpointed; /* whether it is checkpointed or not */
+ struct node_info ni; /* in-memory node information */
+};
+
+#define nat_get_nid(nat) (nat->ni.nid)
+#define nat_set_nid(nat, n) (nat->ni.nid = n)
+#define nat_get_blkaddr(nat) (nat->ni.blk_addr)
+#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b)
+#define nat_get_ino(nat) (nat->ni.ino)
+#define nat_set_ino(nat, i) (nat->ni.ino = i)
+#define nat_get_version(nat) (nat->ni.version)
+#define nat_set_version(nat, v) (nat->ni.version = v)
+
+#define __set_nat_cache_dirty(nm_i, ne) \
+ list_move_tail(&ne->list, &nm_i->dirty_nat_entries);
+#define __clear_nat_cache_dirty(nm_i, ne) \
+ list_move_tail(&ne->list, &nm_i->nat_entries);
+#define inc_node_version(version) (++version)
+
+static inline void node_info_from_raw_nat(struct node_info *ni,
+ struct f2fs_nat_entry *raw_ne)
+{
+ ni->ino = le32_to_cpu(raw_ne->ino);
+ ni->blk_addr = le32_to_cpu(raw_ne->block_addr);
+ ni->version = raw_ne->version;
+}
+
+/*
+ * For free nid mangement
+ */
+enum nid_state {
+ NID_NEW, /* newly added to free nid list */
+ NID_ALLOC /* it is allocated */
+};
+
+struct free_nid {
+ struct list_head list; /* for free node id list */
+ nid_t nid; /* node id */
+ int state; /* in use or not: NID_NEW or NID_ALLOC */
+};
+
+static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct free_nid *fnid;
+
+ if (nm_i->fcnt <= 0)
+ return -1;
+ spin_lock(&nm_i->free_nid_list_lock);
+ fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
+ *nid = fnid->nid;
+ spin_unlock(&nm_i->free_nid_list_lock);
+ return 0;
+}
+
+/*
+ * inline functions
+ */
+static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size);
+}
+
+static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ pgoff_t block_off;
+ pgoff_t block_addr;
+ int seg_off;
+
+ block_off = NAT_BLOCK_OFFSET(start);
+ seg_off = block_off >> sbi->log_blocks_per_seg;
+
+ block_addr = (pgoff_t)(nm_i->nat_blkaddr +
+ (seg_off << sbi->log_blocks_per_seg << 1) +
+ (block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
+
+ if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
+ block_addr += sbi->blocks_per_seg;
+
+ return block_addr;
+}
+
+static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi,
+ pgoff_t block_addr)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ block_addr -= nm_i->nat_blkaddr;
+ if ((block_addr >> sbi->log_blocks_per_seg) % 2)
+ block_addr -= sbi->blocks_per_seg;
+ else
+ block_addr += sbi->blocks_per_seg;
+
+ return block_addr + nm_i->nat_blkaddr;
+}
+
+static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
+{
+ unsigned int block_off = NAT_BLOCK_OFFSET(start_nid);
+
+ if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
+ f2fs_clear_bit(block_off, nm_i->nat_bitmap);
+ else
+ f2fs_set_bit(block_off, nm_i->nat_bitmap);
+}
+
+static inline void fill_node_footer(struct page *page, nid_t nid,
+ nid_t ino, unsigned int ofs, bool reset)
+{
+ void *kaddr = page_address(page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ if (reset)
+ memset(rn, 0, sizeof(*rn));
+ rn->footer.nid = cpu_to_le32(nid);
+ rn->footer.ino = cpu_to_le32(ino);
+ rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT);
+}
+
+static inline void copy_node_footer(struct page *dst, struct page *src)
+{
+ void *src_addr = page_address(src);
+ void *dst_addr = page_address(dst);
+ struct f2fs_node *src_rn = (struct f2fs_node *)src_addr;
+ struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr;
+ memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
+}
+
+static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ void *kaddr = page_address(page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ rn->footer.cp_ver = ckpt->checkpoint_ver;
+ rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
+}
+
+static inline nid_t ino_of_node(struct page *node_page)
+{
+ void *kaddr = page_address(node_page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ return le32_to_cpu(rn->footer.ino);
+}
+
+static inline nid_t nid_of_node(struct page *node_page)
+{
+ void *kaddr = page_address(node_page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ return le32_to_cpu(rn->footer.nid);
+}
+
+static inline unsigned int ofs_of_node(struct page *node_page)
+{
+ void *kaddr = page_address(node_page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ unsigned flag = le32_to_cpu(rn->footer.flag);
+ return flag >> OFFSET_BIT_SHIFT;
+}
+
+static inline unsigned long long cpver_of_node(struct page *node_page)
+{
+ void *kaddr = page_address(node_page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ return le64_to_cpu(rn->footer.cp_ver);
+}
+
+static inline block_t next_blkaddr_of_node(struct page *node_page)
+{
+ void *kaddr = page_address(node_page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ return le32_to_cpu(rn->footer.next_blkaddr);
+}
+
+/*
+ * f2fs assigns the following node offsets described as (num).
+ * N = NIDS_PER_BLOCK
+ *
+ * Inode block (0)
+ * |- direct node (1)
+ * |- direct node (2)
+ * |- indirect node (3)
+ * | `- direct node (4 => 4 + N - 1)
+ * |- indirect node (4 + N)
+ * | `- direct node (5 + N => 5 + 2N - 1)
+ * `- double indirect node (5 + 2N)
+ * `- indirect node (6 + 2N)
+ * `- direct node (x(N + 1))
+ */
+static inline bool IS_DNODE(struct page *node_page)
+{
+ unsigned int ofs = ofs_of_node(node_page);
+ if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
+ ofs == 5 + 2 * NIDS_PER_BLOCK)
+ return false;
+ if (ofs >= 6 + 2 * NIDS_PER_BLOCK) {
+ ofs -= 6 + 2 * NIDS_PER_BLOCK;
+ if ((long int)ofs % (NIDS_PER_BLOCK + 1))
+ return false;
+ }
+ return true;
+}
+
+static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
+{
+ struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
+
+ wait_on_page_writeback(p);
+
+ if (i)
+ rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
+ else
+ rn->in.nid[off] = cpu_to_le32(nid);
+ set_page_dirty(p);
+}
+
+static inline nid_t get_nid(struct page *p, int off, bool i)
+{
+ struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
+ if (i)
+ return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]);
+ return le32_to_cpu(rn->in.nid[off]);
+}
+
+/*
+ * Coldness identification:
+ * - Mark cold files in f2fs_inode_info
+ * - Mark cold node blocks in their node footer
+ * - Mark cold data pages in page cache
+ */
+static inline int is_cold_file(struct inode *inode)
+{
+ return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
+}
+
+static inline int is_cold_data(struct page *page)
+{
+ return PageChecked(page);
+}
+
+static inline void set_cold_data(struct page *page)
+{
+ SetPageChecked(page);
+}
+
+static inline void clear_cold_data(struct page *page)
+{
+ ClearPageChecked(page);
+}
+
+static inline int is_cold_node(struct page *page)
+{
+ void *kaddr = page_address(page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ unsigned int flag = le32_to_cpu(rn->footer.flag);
+ return flag & (0x1 << COLD_BIT_SHIFT);
+}
+
+static inline unsigned char is_fsync_dnode(struct page *page)
+{
+ void *kaddr = page_address(page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ unsigned int flag = le32_to_cpu(rn->footer.flag);
+ return flag & (0x1 << FSYNC_BIT_SHIFT);
+}
+
+static inline unsigned char is_dent_dnode(struct page *page)
+{
+ void *kaddr = page_address(page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ unsigned int flag = le32_to_cpu(rn->footer.flag);
+ return flag & (0x1 << DENT_BIT_SHIFT);
+}
+
+static inline void set_cold_node(struct inode *inode, struct page *page)
+{
+ struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
+ unsigned int flag = le32_to_cpu(rn->footer.flag);
+
+ if (S_ISDIR(inode->i_mode))
+ flag &= ~(0x1 << COLD_BIT_SHIFT);
+ else
+ flag |= (0x1 << COLD_BIT_SHIFT);
+ rn->footer.flag = cpu_to_le32(flag);
+}
+
+static inline void set_fsync_mark(struct page *page, int mark)
+{
+ void *kaddr = page_address(page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ unsigned int flag = le32_to_cpu(rn->footer.flag);
+ if (mark)
+ flag |= (0x1 << FSYNC_BIT_SHIFT);
+ else
+ flag &= ~(0x1 << FSYNC_BIT_SHIFT);
+ rn->footer.flag = cpu_to_le32(flag);
+}
+
+static inline void set_dentry_mark(struct page *page, int mark)
+{
+ void *kaddr = page_address(page);
+ struct f2fs_node *rn = (struct f2fs_node *)kaddr;
+ unsigned int flag = le32_to_cpu(rn->footer.flag);
+ if (mark)
+ flag |= (0x1 << DENT_BIT_SHIFT);
+ else
+ flag &= ~(0x1 << DENT_BIT_SHIFT);
+ rn->footer.flag = cpu_to_le32(flag);
+}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
new file mode 100644
index 000000000000..b07e9b6ef376
--- /dev/null
+++ b/fs/f2fs/recovery.c
@@ -0,0 +1,375 @@
+/*
+ * fs/f2fs/recovery.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "node.h"
+#include "segment.h"
+
+static struct kmem_cache *fsync_entry_slab;
+
+bool space_for_roll_forward(struct f2fs_sb_info *sbi)
+{
+ if (sbi->last_valid_block_count + sbi->alloc_valid_block_count
+ > sbi->user_block_count)
+ return false;
+ return true;
+}
+
+static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
+ nid_t ino)
+{
+ struct list_head *this;
+ struct fsync_inode_entry *entry;
+
+ list_for_each(this, head) {
+ entry = list_entry(this, struct fsync_inode_entry, list);
+ if (entry->inode->i_ino == ino)
+ return entry;
+ }
+ return NULL;
+}
+
+static int recover_dentry(struct page *ipage, struct inode *inode)
+{
+ struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
+ struct f2fs_inode *raw_inode = &(raw_node->i);
+ struct dentry dent, parent;
+ struct f2fs_dir_entry *de;
+ struct page *page;
+ struct inode *dir;
+ int err = 0;
+
+ if (!is_dent_dnode(ipage))
+ goto out;
+
+ dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
+ if (IS_ERR(dir)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ parent.d_inode = dir;
+ dent.d_parent = &parent;
+ dent.d_name.len = le32_to_cpu(raw_inode->i_namelen);
+ dent.d_name.name = raw_inode->i_name;
+
+ de = f2fs_find_entry(dir, &dent.d_name, &page);
+ if (de) {
+ kunmap(page);
+ f2fs_put_page(page, 0);
+ } else {
+ f2fs_add_link(&dent, inode);
+ }
+ iput(dir);
+out:
+ kunmap(ipage);
+ return err;
+}
+
+static int recover_inode(struct inode *inode, struct page *node_page)
+{
+ void *kaddr = page_address(node_page);
+ struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
+ struct f2fs_inode *raw_inode = &(raw_node->i);
+
+ inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+ i_size_write(inode, le64_to_cpu(raw_inode->i_size));
+ inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+ inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
+ inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
+ inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+ inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
+ inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+
+ return recover_dentry(node_page, inode);
+}
+
+static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
+{
+ unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
+ struct curseg_info *curseg;
+ struct page *page;
+ block_t blkaddr;
+ int err = 0;
+
+ /* get node pages in the current segment */
+ curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
+ blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff;
+
+ /* read node page */
+ page = alloc_page(GFP_F2FS_ZERO);
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+ lock_page(page);
+
+ while (1) {
+ struct fsync_inode_entry *entry;
+
+ if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+ goto out;
+
+ if (cp_ver != cpver_of_node(page))
+ goto out;
+
+ if (!is_fsync_dnode(page))
+ goto next;
+
+ entry = get_fsync_inode(head, ino_of_node(page));
+ if (entry) {
+ entry->blkaddr = blkaddr;
+ if (IS_INODE(page) && is_dent_dnode(page))
+ set_inode_flag(F2FS_I(entry->inode),
+ FI_INC_LINK);
+ } else {
+ if (IS_INODE(page) && is_dent_dnode(page)) {
+ if (recover_inode_page(sbi, page)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* add this fsync inode to the list */
+ entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
+ if (!entry) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&entry->list);
+ list_add_tail(&entry->list, head);
+
+ entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
+ if (IS_ERR(entry->inode)) {
+ err = PTR_ERR(entry->inode);
+ goto out;
+ }
+ entry->blkaddr = blkaddr;
+ }
+ if (IS_INODE(page)) {
+ err = recover_inode(entry->inode, page);
+ if (err)
+ goto out;
+ }
+next:
+ /* check next segment */
+ blkaddr = next_blkaddr_of_node(page);
+ ClearPageUptodate(page);
+ }
+out:
+ unlock_page(page);
+ __free_pages(page, 0);
+ return err;
+}
+
+static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
+ struct list_head *head)
+{
+ struct list_head *this;
+ struct fsync_inode_entry *entry;
+ list_for_each(this, head) {
+ entry = list_entry(this, struct fsync_inode_entry, list);
+ iput(entry->inode);
+ list_del(&entry->list);
+ kmem_cache_free(fsync_entry_slab, entry);
+ }
+}
+
+static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ struct seg_entry *sentry;
+ unsigned int segno = GET_SEGNO(sbi, blkaddr);
+ unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
+ (sbi->blocks_per_seg - 1);
+ struct f2fs_summary sum;
+ nid_t ino;
+ void *kaddr;
+ struct inode *inode;
+ struct page *node_page;
+ block_t bidx;
+ int i;
+
+ sentry = get_seg_entry(sbi, segno);
+ if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
+ return;
+
+ /* Get the previous summary */
+ for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
+ struct curseg_info *curseg = CURSEG_I(sbi, i);
+ if (curseg->segno == segno) {
+ sum = curseg->sum_blk->entries[blkoff];
+ break;
+ }
+ }
+ if (i > CURSEG_COLD_DATA) {
+ struct page *sum_page = get_sum_page(sbi, segno);
+ struct f2fs_summary_block *sum_node;
+ kaddr = page_address(sum_page);
+ sum_node = (struct f2fs_summary_block *)kaddr;
+ sum = sum_node->entries[blkoff];
+ f2fs_put_page(sum_page, 1);
+ }
+
+ /* Get the node page */
+ node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
+ bidx = start_bidx_of_node(ofs_of_node(node_page)) +
+ le16_to_cpu(sum.ofs_in_node);
+ ino = ino_of_node(node_page);
+ f2fs_put_page(node_page, 1);
+
+ /* Deallocate previous index in the node page */
+ inode = f2fs_iget_nowait(sbi->sb, ino);
+ truncate_hole(inode, bidx, bidx + 1);
+ iput(inode);
+}
+
+static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
+ struct page *page, block_t blkaddr)
+{
+ unsigned int start, end;
+ struct dnode_of_data dn;
+ struct f2fs_summary sum;
+ struct node_info ni;
+
+ start = start_bidx_of_node(ofs_of_node(page));
+ if (IS_INODE(page))
+ end = start + ADDRS_PER_INODE;
+ else
+ end = start + ADDRS_PER_BLOCK;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ if (get_dnode_of_data(&dn, start, 0))
+ return;
+
+ wait_on_page_writeback(dn.node_page);
+
+ get_node_info(sbi, dn.nid, &ni);
+ BUG_ON(ni.ino != ino_of_node(page));
+ BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page));
+
+ for (; start < end; start++) {
+ block_t src, dest;
+
+ src = datablock_addr(dn.node_page, dn.ofs_in_node);
+ dest = datablock_addr(page, dn.ofs_in_node);
+
+ if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) {
+ if (src == NULL_ADDR) {
+ int err = reserve_new_block(&dn);
+ /* We should not get -ENOSPC */
+ BUG_ON(err);
+ }
+
+ /* Check the previous node page having this index */
+ check_index_in_prev_nodes(sbi, dest);
+
+ set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
+
+ /* write dummy data page */
+ recover_data_page(sbi, NULL, &sum, src, dest);
+ update_extent_cache(dest, &dn);
+ }
+ dn.ofs_in_node++;
+ }
+
+ /* write node page in place */
+ set_summary(&sum, dn.nid, 0, 0);
+ if (IS_INODE(dn.node_page))
+ sync_inode_page(&dn);
+
+ copy_node_footer(dn.node_page, page);
+ fill_node_footer(dn.node_page, dn.nid, ni.ino,
+ ofs_of_node(page), false);
+ set_page_dirty(dn.node_page);
+
+ recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
+ f2fs_put_dnode(&dn);
+}
+
+static void recover_data(struct f2fs_sb_info *sbi,
+ struct list_head *head, int type)
+{
+ unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
+ struct curseg_info *curseg;
+ struct page *page;
+ block_t blkaddr;
+
+ /* get node pages in the current segment */
+ curseg = CURSEG_I(sbi, type);
+ blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+
+ /* read node page */
+ page = alloc_page(GFP_NOFS | __GFP_ZERO);
+ if (IS_ERR(page))
+ return;
+ lock_page(page);
+
+ while (1) {
+ struct fsync_inode_entry *entry;
+
+ if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
+ goto out;
+
+ if (cp_ver != cpver_of_node(page))
+ goto out;
+
+ entry = get_fsync_inode(head, ino_of_node(page));
+ if (!entry)
+ goto next;
+
+ do_recover_data(sbi, entry->inode, page, blkaddr);
+
+ if (entry->blkaddr == blkaddr) {
+ iput(entry->inode);
+ list_del(&entry->list);
+ kmem_cache_free(fsync_entry_slab, entry);
+ }
+next:
+ /* check next segment */
+ blkaddr = next_blkaddr_of_node(page);
+ ClearPageUptodate(page);
+ }
+out:
+ unlock_page(page);
+ __free_pages(page, 0);
+
+ allocate_new_segments(sbi);
+}
+
+void recover_fsync_data(struct f2fs_sb_info *sbi)
+{
+ struct list_head inode_list;
+
+ fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
+ sizeof(struct fsync_inode_entry), NULL);
+ if (unlikely(!fsync_entry_slab))
+ return;
+
+ INIT_LIST_HEAD(&inode_list);
+
+ /* step #1: find fsynced inode numbers */
+ if (find_fsync_dnodes(sbi, &inode_list))
+ goto out;
+
+ if (list_empty(&inode_list))
+ goto out;
+
+ /* step #2: recover data */
+ sbi->por_doing = 1;
+ recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+ sbi->por_doing = 0;
+ BUG_ON(!list_empty(&inode_list));
+out:
+ destroy_fsync_dnodes(sbi, &inode_list);
+ kmem_cache_destroy(fsync_entry_slab);
+ write_checkpoint(sbi, false, false);
+}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
new file mode 100644
index 000000000000..1b26e4ea1016
--- /dev/null
+++ b/fs/f2fs/segment.c
@@ -0,0 +1,1791 @@
+/*
+ * fs/f2fs/segment.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+
+#include "f2fs.h"
+#include "segment.h"
+#include "node.h"
+
+static int need_to_flush(struct f2fs_sb_info *sbi)
+{
+ unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) *
+ sbi->segs_per_sec;
+ int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
+ >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+ int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
+ >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
+
+ if (sbi->por_doing)
+ return 0;
+
+ if (free_sections(sbi) <= (node_secs + 2 * dent_secs +
+ reserved_sections(sbi)))
+ return 1;
+ return 0;
+}
+
+/*
+ * This function balances dirty node and dentry pages.
+ * In addition, it controls garbage collection.
+ */
+void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .for_reclaim = 0,
+ };
+
+ if (sbi->por_doing)
+ return;
+
+ /*
+ * We should do checkpoint when there are so many dirty node pages
+ * with enough free segments. After then, we should do GC.
+ */
+ if (need_to_flush(sbi)) {
+ sync_dirty_dir_inodes(sbi);
+ sync_node_pages(sbi, 0, &wbc);
+ }
+
+ if (has_not_enough_free_secs(sbi)) {
+ mutex_lock(&sbi->gc_mutex);
+ f2fs_gc(sbi, 1);
+ }
+}
+
+static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
+ enum dirty_type dirty_type)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+ /* need not be added */
+ if (IS_CURSEG(sbi, segno))
+ return;
+
+ if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+ dirty_i->nr_dirty[dirty_type]++;
+
+ if (dirty_type == DIRTY) {
+ struct seg_entry *sentry = get_seg_entry(sbi, segno);
+ dirty_type = sentry->type;
+ if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+ dirty_i->nr_dirty[dirty_type]++;
+ }
+}
+
+static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
+ enum dirty_type dirty_type)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+ if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
+ dirty_i->nr_dirty[dirty_type]--;
+
+ if (dirty_type == DIRTY) {
+ struct seg_entry *sentry = get_seg_entry(sbi, segno);
+ dirty_type = sentry->type;
+ if (test_and_clear_bit(segno,
+ dirty_i->dirty_segmap[dirty_type]))
+ dirty_i->nr_dirty[dirty_type]--;
+ clear_bit(segno, dirty_i->victim_segmap[FG_GC]);
+ clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
+ }
+}
+
+/*
+ * Should not occur error such as -ENOMEM.
+ * Adding dirty entry into seglist is not critical operation.
+ * If a given segment is one of current working segments, it won't be added.
+ */
+void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned short valid_blocks;
+
+ if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
+ return;
+
+ mutex_lock(&dirty_i->seglist_lock);
+
+ valid_blocks = get_valid_blocks(sbi, segno, 0);
+
+ if (valid_blocks == 0) {
+ __locate_dirty_segment(sbi, segno, PRE);
+ __remove_dirty_segment(sbi, segno, DIRTY);
+ } else if (valid_blocks < sbi->blocks_per_seg) {
+ __locate_dirty_segment(sbi, segno, DIRTY);
+ } else {
+ /* Recovery routine with SSR needs this */
+ __remove_dirty_segment(sbi, segno, DIRTY);
+ }
+
+ mutex_unlock(&dirty_i->seglist_lock);
+ return;
+}
+
+/*
+ * Should call clear_prefree_segments after checkpoint is done.
+ */
+static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned int segno, offset = 0;
+ unsigned int total_segs = TOTAL_SEGS(sbi);
+
+ mutex_lock(&dirty_i->seglist_lock);
+ while (1) {
+ segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
+ offset);
+ if (segno >= total_segs)
+ break;
+ __set_test_and_free(sbi, segno);
+ offset = segno + 1;
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
+}
+
+void clear_prefree_segments(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned int segno, offset = 0;
+ unsigned int total_segs = TOTAL_SEGS(sbi);
+
+ mutex_lock(&dirty_i->seglist_lock);
+ while (1) {
+ segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
+ offset);
+ if (segno >= total_segs)
+ break;
+
+ offset = segno + 1;
+ if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
+ dirty_i->nr_dirty[PRE]--;
+
+ /* Let's use trim */
+ if (test_opt(sbi, DISCARD))
+ blkdev_issue_discard(sbi->sb->s_bdev,
+ START_BLOCK(sbi, segno) <<
+ sbi->log_sectors_per_block,
+ 1 << (sbi->log_sectors_per_block +
+ sbi->log_blocks_per_seg),
+ GFP_NOFS, 0);
+ }
+ mutex_unlock(&dirty_i->seglist_lock);
+}
+
+static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap))
+ sit_i->dirty_sentries++;
+}
+
+static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
+ unsigned int segno, int modified)
+{
+ struct seg_entry *se = get_seg_entry(sbi, segno);
+ se->type = type;
+ if (modified)
+ __mark_sit_entry_dirty(sbi, segno);
+}
+
+static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
+{
+ struct seg_entry *se;
+ unsigned int segno, offset;
+ long int new_vblocks;
+
+ segno = GET_SEGNO(sbi, blkaddr);
+
+ se = get_seg_entry(sbi, segno);
+ new_vblocks = se->valid_blocks + del;
+ offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1);
+
+ BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) ||
+ (new_vblocks > sbi->blocks_per_seg)));
+
+ se->valid_blocks = new_vblocks;
+ se->mtime = get_mtime(sbi);
+ SIT_I(sbi)->max_mtime = se->mtime;
+
+ /* Update valid block bitmap */
+ if (del > 0) {
+ if (f2fs_set_bit(offset, se->cur_valid_map))
+ BUG();
+ } else {
+ if (!f2fs_clear_bit(offset, se->cur_valid_map))
+ BUG();
+ }
+ if (!f2fs_test_bit(offset, se->ckpt_valid_map))
+ se->ckpt_valid_blocks += del;
+
+ __mark_sit_entry_dirty(sbi, segno);
+
+ /* update total number of valid blocks to be written in ckpt area */
+ SIT_I(sbi)->written_valid_blocks += del;
+
+ if (sbi->segs_per_sec > 1)
+ get_sec_entry(sbi, segno)->valid_blocks += del;
+}
+
+static void refresh_sit_entry(struct f2fs_sb_info *sbi,
+ block_t old_blkaddr, block_t new_blkaddr)
+{
+ update_sit_entry(sbi, new_blkaddr, 1);
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ update_sit_entry(sbi, old_blkaddr, -1);
+}
+
+void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
+{
+ unsigned int segno = GET_SEGNO(sbi, addr);
+ struct sit_info *sit_i = SIT_I(sbi);
+
+ BUG_ON(addr == NULL_ADDR);
+ if (addr == NEW_ADDR)
+ return;
+
+ /* add it into sit main buffer */
+ mutex_lock(&sit_i->sentry_lock);
+
+ update_sit_entry(sbi, addr, -1);
+
+ /* add it into dirty seglist */
+ locate_dirty_segment(sbi, segno);
+
+ mutex_unlock(&sit_i->sentry_lock);
+}
+
+/*
+ * This function should be resided under the curseg_mutex lock
+ */
+static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
+ struct f2fs_summary *sum, unsigned short offset)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ void *addr = curseg->sum_blk;
+ addr += offset * sizeof(struct f2fs_summary);
+ memcpy(addr, sum, sizeof(struct f2fs_summary));
+ return;
+}
+
+/*
+ * Calculate the number of current summary pages for writing
+ */
+int npages_for_summary_flush(struct f2fs_sb_info *sbi)
+{
+ int total_size_bytes = 0;
+ int valid_sum_count = 0;
+ int i, sum_space;
+
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ if (sbi->ckpt->alloc_type[i] == SSR)
+ valid_sum_count += sbi->blocks_per_seg;
+ else
+ valid_sum_count += curseg_blkoff(sbi, i);
+ }
+
+ total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1)
+ + sizeof(struct nat_journal) + 2
+ + sizeof(struct sit_journal) + 2;
+ sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE;
+ if (total_size_bytes < sum_space)
+ return 1;
+ else if (total_size_bytes < 2 * sum_space)
+ return 2;
+ return 3;
+}
+
+/*
+ * Caller should put this summary page
+ */
+struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno));
+}
+
+static void write_sum_page(struct f2fs_sb_info *sbi,
+ struct f2fs_summary_block *sum_blk, block_t blk_addr)
+{
+ struct page *page = grab_meta_page(sbi, blk_addr);
+ void *kaddr = page_address(page);
+ memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+}
+
+static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
+ int ofs_unit, int type)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
+ unsigned int segno, next_segno, i;
+ int ofs = 0;
+
+ /*
+ * If there is not enough reserved sections,
+ * we should not reuse prefree segments.
+ */
+ if (has_not_enough_free_secs(sbi))
+ return NULL_SEGNO;
+
+ /*
+ * NODE page should not reuse prefree segment,
+ * since those information is used for SPOR.
+ */
+ if (IS_NODESEG(type))
+ return NULL_SEGNO;
+next:
+ segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++);
+ ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit;
+ if (segno < TOTAL_SEGS(sbi)) {
+ /* skip intermediate segments in a section */
+ if (segno % ofs_unit)
+ goto next;
+
+ /* skip if whole section is not prefree */
+ next_segno = find_next_zero_bit(prefree_segmap,
+ TOTAL_SEGS(sbi), segno + 1);
+ if (next_segno - segno < ofs_unit)
+ goto next;
+
+ /* skip if whole section was not free at the last checkpoint */
+ for (i = 0; i < ofs_unit; i++)
+ if (get_seg_entry(sbi, segno)->ckpt_valid_blocks)
+ goto next;
+ return segno;
+ }
+ return NULL_SEGNO;
+}
+
+/*
+ * Find a new segment from the free segments bitmap to right order
+ * This function should be returned with success, otherwise BUG
+ */
+static void get_new_segment(struct f2fs_sb_info *sbi,
+ unsigned int *newseg, bool new_sec, int dir)
+{
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int total_secs = sbi->total_sections;
+ unsigned int segno, secno, zoneno;
+ unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone;
+ unsigned int hint = *newseg / sbi->segs_per_sec;
+ unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
+ unsigned int left_start = hint;
+ bool init = true;
+ int go_left = 0;
+ int i;
+
+ write_lock(&free_i->segmap_lock);
+
+ if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
+ segno = find_next_zero_bit(free_i->free_segmap,
+ TOTAL_SEGS(sbi), *newseg + 1);
+ if (segno < TOTAL_SEGS(sbi))
+ goto got_it;
+ }
+find_other_zone:
+ secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint);
+ if (secno >= total_secs) {
+ if (dir == ALLOC_RIGHT) {
+ secno = find_next_zero_bit(free_i->free_secmap,
+ total_secs, 0);
+ BUG_ON(secno >= total_secs);
+ } else {
+ go_left = 1;
+ left_start = hint - 1;
+ }
+ }
+ if (go_left == 0)
+ goto skip_left;
+
+ while (test_bit(left_start, free_i->free_secmap)) {
+ if (left_start > 0) {
+ left_start--;
+ continue;
+ }
+ left_start = find_next_zero_bit(free_i->free_secmap,
+ total_secs, 0);
+ BUG_ON(left_start >= total_secs);
+ break;
+ }
+ secno = left_start;
+skip_left:
+ hint = secno;
+ segno = secno * sbi->segs_per_sec;
+ zoneno = secno / sbi->secs_per_zone;
+
+ /* give up on finding another zone */
+ if (!init)
+ goto got_it;
+ if (sbi->secs_per_zone == 1)
+ goto got_it;
+ if (zoneno == old_zoneno)
+ goto got_it;
+ if (dir == ALLOC_LEFT) {
+ if (!go_left && zoneno + 1 >= total_zones)
+ goto got_it;
+ if (go_left && zoneno == 0)
+ goto got_it;
+ }
+ for (i = 0; i < NR_CURSEG_TYPE; i++)
+ if (CURSEG_I(sbi, i)->zone == zoneno)
+ break;
+
+ if (i < NR_CURSEG_TYPE) {
+ /* zone is in user, try another */
+ if (go_left)
+ hint = zoneno * sbi->secs_per_zone - 1;
+ else if (zoneno + 1 >= total_zones)
+ hint = 0;
+ else
+ hint = (zoneno + 1) * sbi->secs_per_zone;
+ init = false;
+ goto find_other_zone;
+ }
+got_it:
+ /* set it as dirty segment in free segmap */
+ BUG_ON(test_bit(segno, free_i->free_segmap));
+ __set_inuse(sbi, segno);
+ *newseg = segno;
+ write_unlock(&free_i->segmap_lock);
+}
+
+static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ struct summary_footer *sum_footer;
+
+ curseg->segno = curseg->next_segno;
+ curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
+ curseg->next_blkoff = 0;
+ curseg->next_segno = NULL_SEGNO;
+
+ sum_footer = &(curseg->sum_blk->footer);
+ memset(sum_footer, 0, sizeof(struct summary_footer));
+ if (IS_DATASEG(type))
+ SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
+ if (IS_NODESEG(type))
+ SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
+ __set_sit_entry_type(sbi, type, curseg->segno, modified);
+}
+
+/*
+ * Allocate a current working segment.
+ * This function always allocates a free segment in LFS manner.
+ */
+static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned int segno = curseg->segno;
+ int dir = ALLOC_LEFT;
+
+ write_sum_page(sbi, curseg->sum_blk,
+ GET_SUM_BLOCK(sbi, curseg->segno));
+ if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
+ dir = ALLOC_RIGHT;
+
+ if (test_opt(sbi, NOHEAP))
+ dir = ALLOC_RIGHT;
+
+ get_new_segment(sbi, &segno, new_sec, dir);
+ curseg->next_segno = segno;
+ reset_curseg(sbi, type, 1);
+ curseg->alloc_type = LFS;
+}
+
+static void __next_free_blkoff(struct f2fs_sb_info *sbi,
+ struct curseg_info *seg, block_t start)
+{
+ struct seg_entry *se = get_seg_entry(sbi, seg->segno);
+ block_t ofs;
+ for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) {
+ if (!f2fs_test_bit(ofs, se->ckpt_valid_map)
+ && !f2fs_test_bit(ofs, se->cur_valid_map))
+ break;
+ }
+ seg->next_blkoff = ofs;
+}
+
+/*
+ * If a segment is written by LFS manner, next block offset is just obtained
+ * by increasing the current block offset. However, if a segment is written by
+ * SSR manner, next block offset obtained by calling __next_free_blkoff
+ */
+static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
+ struct curseg_info *seg)
+{
+ if (seg->alloc_type == SSR)
+ __next_free_blkoff(sbi, seg, seg->next_blkoff + 1);
+ else
+ seg->next_blkoff++;
+}
+
+/*
+ * This function always allocates a used segment (from dirty seglist) by SSR
+ * manner, so it should recover the existing segment information of valid blocks
+ */
+static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned int new_segno = curseg->next_segno;
+ struct f2fs_summary_block *sum_node;
+ struct page *sum_page;
+
+ write_sum_page(sbi, curseg->sum_blk,
+ GET_SUM_BLOCK(sbi, curseg->segno));
+ __set_test_and_inuse(sbi, new_segno);
+
+ mutex_lock(&dirty_i->seglist_lock);
+ __remove_dirty_segment(sbi, new_segno, PRE);
+ __remove_dirty_segment(sbi, new_segno, DIRTY);
+ mutex_unlock(&dirty_i->seglist_lock);
+
+ reset_curseg(sbi, type, 1);
+ curseg->alloc_type = SSR;
+ __next_free_blkoff(sbi, curseg, 0);
+
+ if (reuse) {
+ sum_page = get_sum_page(sbi, new_segno);
+ sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+ memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
+ f2fs_put_page(sum_page, 1);
+ }
+}
+
+/*
+ * flush out current segment and replace it with new segment
+ * This function should be returned with success, otherwise BUG
+ */
+static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
+ int type, bool force)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned int ofs_unit;
+
+ if (force) {
+ new_curseg(sbi, type, true);
+ goto out;
+ }
+
+ ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec;
+ curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type);
+
+ if (curseg->next_segno != NULL_SEGNO)
+ change_curseg(sbi, type, false);
+ else if (type == CURSEG_WARM_NODE)
+ new_curseg(sbi, type, false);
+ else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
+ change_curseg(sbi, type, true);
+ else
+ new_curseg(sbi, type, false);
+out:
+ sbi->segment_count[curseg->alloc_type]++;
+}
+
+void allocate_new_segments(struct f2fs_sb_info *sbi)
+{
+ struct curseg_info *curseg;
+ unsigned int old_curseg;
+ int i;
+
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ curseg = CURSEG_I(sbi, i);
+ old_curseg = curseg->segno;
+ SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
+ locate_dirty_segment(sbi, old_curseg);
+ }
+}
+
+static const struct segment_allocation default_salloc_ops = {
+ .allocate_segment = allocate_segment_by_default,
+};
+
+static void f2fs_end_io_write(struct bio *bio, int err)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_private *p = bio->bi_private;
+
+ do {
+ struct page *page = bvec->bv_page;
+
+ if (--bvec >= bio->bi_io_vec)
+ prefetchw(&bvec->bv_page->flags);
+ if (!uptodate) {
+ SetPageError(page);
+ if (page->mapping)
+ set_bit(AS_EIO, &page->mapping->flags);
+ set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
+ set_page_dirty(page);
+ }
+ end_page_writeback(page);
+ dec_page_count(p->sbi, F2FS_WRITEBACK);
+ } while (bvec >= bio->bi_io_vec);
+
+ if (p->is_sync)
+ complete(p->wait);
+ kfree(p);
+ bio_put(bio);
+}
+
+struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
+{
+ struct bio *bio;
+ struct bio_private *priv;
+retry:
+ priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
+ if (!priv) {
+ cond_resched();
+ goto retry;
+ }
+
+ /* No failure on bio allocation */
+ bio = bio_alloc(GFP_NOIO, npages);
+ bio->bi_bdev = bdev;
+ bio->bi_private = priv;
+ return bio;
+}
+
+static void do_submit_bio(struct f2fs_sb_info *sbi,
+ enum page_type type, bool sync)
+{
+ int rw = sync ? WRITE_SYNC : WRITE;
+ enum page_type btype = type > META ? META : type;
+
+ if (type >= META_FLUSH)
+ rw = WRITE_FLUSH_FUA;
+
+ if (sbi->bio[btype]) {
+ struct bio_private *p = sbi->bio[btype]->bi_private;
+ p->sbi = sbi;
+ sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
+ if (type == META_FLUSH) {
+ DECLARE_COMPLETION_ONSTACK(wait);
+ p->is_sync = true;
+ p->wait = &wait;
+ submit_bio(rw, sbi->bio[btype]);
+ wait_for_completion(&wait);
+ } else {
+ p->is_sync = false;
+ submit_bio(rw, sbi->bio[btype]);
+ }
+ sbi->bio[btype] = NULL;
+ }
+}
+
+void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync)
+{
+ down_write(&sbi->bio_sem);
+ do_submit_bio(sbi, type, sync);
+ up_write(&sbi->bio_sem);
+}
+
+static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
+ block_t blk_addr, enum page_type type)
+{
+ struct block_device *bdev = sbi->sb->s_bdev;
+
+ verify_block_addr(sbi, blk_addr);
+
+ down_write(&sbi->bio_sem);
+
+ inc_page_count(sbi, F2FS_WRITEBACK);
+
+ if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
+ do_submit_bio(sbi, type, false);
+alloc_new:
+ if (sbi->bio[type] == NULL) {
+ sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev));
+ sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ /*
+ * The end_io will be assigned at the sumbission phase.
+ * Until then, let bio_add_page() merge consecutive IOs as much
+ * as possible.
+ */
+ }
+
+ if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ do_submit_bio(sbi, type, false);
+ goto alloc_new;
+ }
+
+ sbi->last_block_in_bio[type] = blk_addr;
+
+ up_write(&sbi->bio_sem);
+}
+
+static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ if (curseg->next_blkoff < sbi->blocks_per_seg)
+ return true;
+ return false;
+}
+
+static int __get_segment_type_2(struct page *page, enum page_type p_type)
+{
+ if (p_type == DATA)
+ return CURSEG_HOT_DATA;
+ else
+ return CURSEG_HOT_NODE;
+}
+
+static int __get_segment_type_4(struct page *page, enum page_type p_type)
+{
+ if (p_type == DATA) {
+ struct inode *inode = page->mapping->host;
+
+ if (S_ISDIR(inode->i_mode))
+ return CURSEG_HOT_DATA;
+ else
+ return CURSEG_COLD_DATA;
+ } else {
+ if (IS_DNODE(page) && !is_cold_node(page))
+ return CURSEG_HOT_NODE;
+ else
+ return CURSEG_COLD_NODE;
+ }
+}
+
+static int __get_segment_type_6(struct page *page, enum page_type p_type)
+{
+ if (p_type == DATA) {
+ struct inode *inode = page->mapping->host;
+
+ if (S_ISDIR(inode->i_mode))
+ return CURSEG_HOT_DATA;
+ else if (is_cold_data(page) || is_cold_file(inode))
+ return CURSEG_COLD_DATA;
+ else
+ return CURSEG_WARM_DATA;
+ } else {
+ if (IS_DNODE(page))
+ return is_cold_node(page) ? CURSEG_WARM_NODE :
+ CURSEG_HOT_NODE;
+ else
+ return CURSEG_COLD_NODE;
+ }
+}
+
+static int __get_segment_type(struct page *page, enum page_type p_type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ switch (sbi->active_logs) {
+ case 2:
+ return __get_segment_type_2(page, p_type);
+ case 4:
+ return __get_segment_type_4(page, p_type);
+ case 6:
+ return __get_segment_type_6(page, p_type);
+ default:
+ BUG();
+ }
+}
+
+static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
+ block_t old_blkaddr, block_t *new_blkaddr,
+ struct f2fs_summary *sum, enum page_type p_type)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct curseg_info *curseg;
+ unsigned int old_cursegno;
+ int type;
+
+ type = __get_segment_type(page, p_type);
+ curseg = CURSEG_I(sbi, type);
+
+ mutex_lock(&curseg->curseg_mutex);
+
+ *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+ old_cursegno = curseg->segno;
+
+ /*
+ * __add_sum_entry should be resided under the curseg_mutex
+ * because, this function updates a summary entry in the
+ * current summary block.
+ */
+ __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+
+ mutex_lock(&sit_i->sentry_lock);
+ __refresh_next_blkoff(sbi, curseg);
+ sbi->block_count[curseg->alloc_type]++;
+
+ /*
+ * SIT information should be updated before segment allocation,
+ * since SSR needs latest valid block information.
+ */
+ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
+
+ if (!__has_curseg_space(sbi, type))
+ sit_i->s_ops->allocate_segment(sbi, type, false);
+
+ locate_dirty_segment(sbi, old_cursegno);
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+ mutex_unlock(&sit_i->sentry_lock);
+
+ if (p_type == NODE)
+ fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
+
+ /* writeout dirty page into bdev */
+ submit_write_page(sbi, page, *new_blkaddr, p_type);
+
+ mutex_unlock(&curseg->curseg_mutex);
+}
+
+int write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
+ struct writeback_control *wbc)
+{
+ if (wbc->for_reclaim)
+ return AOP_WRITEPAGE_ACTIVATE;
+
+ set_page_writeback(page);
+ submit_write_page(sbi, page, page->index, META);
+ return 0;
+}
+
+void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
+ unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
+{
+ struct f2fs_summary sum;
+ set_summary(&sum, nid, 0, 0);
+ do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE);
+}
+
+void write_data_page(struct inode *inode, struct page *page,
+ struct dnode_of_data *dn, block_t old_blkaddr,
+ block_t *new_blkaddr)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_summary sum;
+ struct node_info ni;
+
+ BUG_ON(old_blkaddr == NULL_ADDR);
+ get_node_info(sbi, dn->nid, &ni);
+ set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
+
+ do_write_page(sbi, page, old_blkaddr,
+ new_blkaddr, &sum, DATA);
+}
+
+void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page,
+ block_t old_blk_addr)
+{
+ submit_write_page(sbi, page, old_blk_addr, DATA);
+}
+
+void recover_data_page(struct f2fs_sb_info *sbi,
+ struct page *page, struct f2fs_summary *sum,
+ block_t old_blkaddr, block_t new_blkaddr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct curseg_info *curseg;
+ unsigned int segno, old_cursegno;
+ struct seg_entry *se;
+ int type;
+
+ segno = GET_SEGNO(sbi, new_blkaddr);
+ se = get_seg_entry(sbi, segno);
+ type = se->type;
+
+ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
+ if (old_blkaddr == NULL_ADDR)
+ type = CURSEG_COLD_DATA;
+ else
+ type = CURSEG_WARM_DATA;
+ }
+ curseg = CURSEG_I(sbi, type);
+
+ mutex_lock(&curseg->curseg_mutex);
+ mutex_lock(&sit_i->sentry_lock);
+
+ old_cursegno = curseg->segno;
+
+ /* change the current segment */
+ if (segno != curseg->segno) {
+ curseg->next_segno = segno;
+ change_curseg(sbi, type, true);
+ }
+
+ curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
+ (sbi->blocks_per_seg - 1);
+ __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+
+ refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+
+ locate_dirty_segment(sbi, old_cursegno);
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+
+ mutex_unlock(&sit_i->sentry_lock);
+ mutex_unlock(&curseg->curseg_mutex);
+}
+
+void rewrite_node_page(struct f2fs_sb_info *sbi,
+ struct page *page, struct f2fs_summary *sum,
+ block_t old_blkaddr, block_t new_blkaddr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ int type = CURSEG_WARM_NODE;
+ struct curseg_info *curseg;
+ unsigned int segno, old_cursegno;
+ block_t next_blkaddr = next_blkaddr_of_node(page);
+ unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
+
+ curseg = CURSEG_I(sbi, type);
+
+ mutex_lock(&curseg->curseg_mutex);
+ mutex_lock(&sit_i->sentry_lock);
+
+ segno = GET_SEGNO(sbi, new_blkaddr);
+ old_cursegno = curseg->segno;
+
+ /* change the current segment */
+ if (segno != curseg->segno) {
+ curseg->next_segno = segno;
+ change_curseg(sbi, type, true);
+ }
+ curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
+ (sbi->blocks_per_seg - 1);
+ __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
+
+ /* change the current log to the next block addr in advance */
+ if (next_segno != segno) {
+ curseg->next_segno = next_segno;
+ change_curseg(sbi, type, true);
+ }
+ curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) &
+ (sbi->blocks_per_seg - 1);
+
+ /* rewrite node page */
+ set_page_writeback(page);
+ submit_write_page(sbi, page, new_blkaddr, NODE);
+ f2fs_submit_bio(sbi, NODE, true);
+ refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+
+ locate_dirty_segment(sbi, old_cursegno);
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+
+ mutex_unlock(&sit_i->sentry_lock);
+ mutex_unlock(&curseg->curseg_mutex);
+}
+
+static int read_compacted_summaries(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct curseg_info *seg_i;
+ unsigned char *kaddr;
+ struct page *page;
+ block_t start;
+ int i, j, offset;
+
+ start = start_sum_block(sbi);
+
+ page = get_meta_page(sbi, start++);
+ kaddr = (unsigned char *)page_address(page);
+
+ /* Step 1: restore nat cache */
+ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE);
+
+ /* Step 2: restore sit cache */
+ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
+ memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE,
+ SUM_JOURNAL_SIZE);
+ offset = 2 * SUM_JOURNAL_SIZE;
+
+ /* Step 3: restore summary entries */
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ unsigned short blk_off;
+ unsigned int segno;
+
+ seg_i = CURSEG_I(sbi, i);
+ segno = le32_to_cpu(ckpt->cur_data_segno[i]);
+ blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]);
+ seg_i->next_segno = segno;
+ reset_curseg(sbi, i, 0);
+ seg_i->alloc_type = ckpt->alloc_type[i];
+ seg_i->next_blkoff = blk_off;
+
+ if (seg_i->alloc_type == SSR)
+ blk_off = sbi->blocks_per_seg;
+
+ for (j = 0; j < blk_off; j++) {
+ struct f2fs_summary *s;
+ s = (struct f2fs_summary *)(kaddr + offset);
+ seg_i->sum_blk->entries[j] = *s;
+ offset += SUMMARY_SIZE;
+ if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+ SUM_FOOTER_SIZE)
+ continue;
+
+ f2fs_put_page(page, 1);
+ page = NULL;
+
+ page = get_meta_page(sbi, start++);
+ kaddr = (unsigned char *)page_address(page);
+ offset = 0;
+ }
+ }
+ f2fs_put_page(page, 1);
+ return 0;
+}
+
+static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
+{
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_summary_block *sum;
+ struct curseg_info *curseg;
+ struct page *new;
+ unsigned short blk_off;
+ unsigned int segno = 0;
+ block_t blk_addr = 0;
+
+ /* get segment number and block addr */
+ if (IS_DATASEG(type)) {
+ segno = le32_to_cpu(ckpt->cur_data_segno[type]);
+ blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
+ CURSEG_HOT_DATA]);
+ if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+ blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
+ else
+ blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
+ } else {
+ segno = le32_to_cpu(ckpt->cur_node_segno[type -
+ CURSEG_HOT_NODE]);
+ blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
+ CURSEG_HOT_NODE]);
+ if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+ blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
+ type - CURSEG_HOT_NODE);
+ else
+ blk_addr = GET_SUM_BLOCK(sbi, segno);
+ }
+
+ new = get_meta_page(sbi, blk_addr);
+ sum = (struct f2fs_summary_block *)page_address(new);
+
+ if (IS_NODESEG(type)) {
+ if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) {
+ struct f2fs_summary *ns = &sum->entries[0];
+ int i;
+ for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
+ ns->version = 0;
+ ns->ofs_in_node = 0;
+ }
+ } else {
+ if (restore_node_summary(sbi, segno, sum)) {
+ f2fs_put_page(new, 1);
+ return -EINVAL;
+ }
+ }
+ }
+
+ /* set uncompleted segment to curseg */
+ curseg = CURSEG_I(sbi, type);
+ mutex_lock(&curseg->curseg_mutex);
+ memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE);
+ curseg->next_segno = segno;
+ reset_curseg(sbi, type, 0);
+ curseg->alloc_type = ckpt->alloc_type[type];
+ curseg->next_blkoff = blk_off;
+ mutex_unlock(&curseg->curseg_mutex);
+ f2fs_put_page(new, 1);
+ return 0;
+}
+
+static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
+{
+ int type = CURSEG_HOT_DATA;
+
+ if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+ /* restore for compacted data summary */
+ if (read_compacted_summaries(sbi))
+ return -EINVAL;
+ type = CURSEG_HOT_NODE;
+ }
+
+ for (; type <= CURSEG_COLD_NODE; type++)
+ if (read_normal_summaries(sbi, type))
+ return -EINVAL;
+ return 0;
+}
+
+static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ struct page *page;
+ unsigned char *kaddr;
+ struct f2fs_summary *summary;
+ struct curseg_info *seg_i;
+ int written_size = 0;
+ int i, j;
+
+ page = grab_meta_page(sbi, blkaddr++);
+ kaddr = (unsigned char *)page_address(page);
+
+ /* Step 1: write nat cache */
+ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
+ memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE);
+ written_size += SUM_JOURNAL_SIZE;
+
+ /* Step 2: write sit cache */
+ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
+ memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits,
+ SUM_JOURNAL_SIZE);
+ written_size += SUM_JOURNAL_SIZE;
+
+ set_page_dirty(page);
+
+ /* Step 3: write summary entries */
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+ unsigned short blkoff;
+ seg_i = CURSEG_I(sbi, i);
+ if (sbi->ckpt->alloc_type[i] == SSR)
+ blkoff = sbi->blocks_per_seg;
+ else
+ blkoff = curseg_blkoff(sbi, i);
+
+ for (j = 0; j < blkoff; j++) {
+ if (!page) {
+ page = grab_meta_page(sbi, blkaddr++);
+ kaddr = (unsigned char *)page_address(page);
+ written_size = 0;
+ }
+ summary = (struct f2fs_summary *)(kaddr + written_size);
+ *summary = seg_i->sum_blk->entries[j];
+ written_size += SUMMARY_SIZE;
+ set_page_dirty(page);
+
+ if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+ SUM_FOOTER_SIZE)
+ continue;
+
+ f2fs_put_page(page, 1);
+ page = NULL;
+ }
+ }
+ if (page)
+ f2fs_put_page(page, 1);
+}
+
+static void write_normal_summaries(struct f2fs_sb_info *sbi,
+ block_t blkaddr, int type)
+{
+ int i, end;
+ if (IS_DATASEG(type))
+ end = type + NR_CURSEG_DATA_TYPE;
+ else
+ end = type + NR_CURSEG_NODE_TYPE;
+
+ for (i = type; i < end; i++) {
+ struct curseg_info *sum = CURSEG_I(sbi, i);
+ mutex_lock(&sum->curseg_mutex);
+ write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type));
+ mutex_unlock(&sum->curseg_mutex);
+ }
+}
+
+void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+ if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG))
+ write_compacted_summaries(sbi, start_blk);
+ else
+ write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
+}
+
+void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
+{
+ if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
+ write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
+ return;
+}
+
+int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
+ unsigned int val, int alloc)
+{
+ int i;
+
+ if (type == NAT_JOURNAL) {
+ for (i = 0; i < nats_in_cursum(sum); i++) {
+ if (le32_to_cpu(nid_in_journal(sum, i)) == val)
+ return i;
+ }
+ if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES)
+ return update_nats_in_cursum(sum, 1);
+ } else if (type == SIT_JOURNAL) {
+ for (i = 0; i < sits_in_cursum(sum); i++)
+ if (le32_to_cpu(segno_in_journal(sum, i)) == val)
+ return i;
+ if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
+ return update_sits_in_cursum(sum, 1);
+ }
+ return -1;
+}
+
+static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno);
+ block_t blk_addr = sit_i->sit_base_addr + offset;
+
+ check_seg_range(sbi, segno);
+
+ /* calculate sit block address */
+ if (f2fs_test_bit(offset, sit_i->sit_bitmap))
+ blk_addr += sit_i->sit_blocks;
+
+ return get_meta_page(sbi, blk_addr);
+}
+
+static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
+ unsigned int start)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct page *src_page, *dst_page;
+ pgoff_t src_off, dst_off;
+ void *src_addr, *dst_addr;
+
+ src_off = current_sit_addr(sbi, start);
+ dst_off = next_sit_addr(sbi, src_off);
+
+ /* get current sit block page without lock */
+ src_page = get_meta_page(sbi, src_off);
+ dst_page = grab_meta_page(sbi, dst_off);
+ BUG_ON(PageDirty(src_page));
+
+ src_addr = page_address(src_page);
+ dst_addr = page_address(dst_page);
+ memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+
+ set_page_dirty(dst_page);
+ f2fs_put_page(src_page, 1);
+
+ set_to_next_sit(sit_i, start);
+
+ return dst_page;
+}
+
+static bool flush_sits_in_journal(struct f2fs_sb_info *sbi)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ int i;
+
+ /*
+ * If the journal area in the current summary is full of sit entries,
+ * all the sit entries will be flushed. Otherwise the sit entries
+ * are not able to replace with newly hot sit entries.
+ */
+ if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) {
+ for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
+ unsigned int segno;
+ segno = le32_to_cpu(segno_in_journal(sum, i));
+ __mark_sit_entry_dirty(sbi, segno);
+ }
+ update_sits_in_cursum(sum, -sits_in_cursum(sum));
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * CP calls this function, which flushes SIT entries including sit_journal,
+ * and moves prefree segs to free segs.
+ */
+void flush_sit_entries(struct f2fs_sb_info *sbi)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ unsigned long nsegs = TOTAL_SEGS(sbi);
+ struct page *page = NULL;
+ struct f2fs_sit_block *raw_sit = NULL;
+ unsigned int start = 0, end = 0;
+ unsigned int segno = -1;
+ bool flushed;
+
+ mutex_lock(&curseg->curseg_mutex);
+ mutex_lock(&sit_i->sentry_lock);
+
+ /*
+ * "flushed" indicates whether sit entries in journal are flushed
+ * to the SIT area or not.
+ */
+ flushed = flush_sits_in_journal(sbi);
+
+ while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) {
+ struct seg_entry *se = get_seg_entry(sbi, segno);
+ int sit_offset, offset;
+
+ sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+
+ if (flushed)
+ goto to_sit_page;
+
+ offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1);
+ if (offset >= 0) {
+ segno_in_journal(sum, offset) = cpu_to_le32(segno);
+ seg_info_to_raw_sit(se, &sit_in_journal(sum, offset));
+ goto flush_done;
+ }
+to_sit_page:
+ if (!page || (start > segno) || (segno > end)) {
+ if (page) {
+ f2fs_put_page(page, 1);
+ page = NULL;
+ }
+
+ start = START_SEGNO(sit_i, segno);
+ end = start + SIT_ENTRY_PER_BLOCK - 1;
+
+ /* read sit block that will be updated */
+ page = get_next_sit_page(sbi, start);
+ raw_sit = page_address(page);
+ }
+
+ /* udpate entry in SIT block */
+ seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]);
+flush_done:
+ __clear_bit(segno, bitmap);
+ sit_i->dirty_sentries--;
+ }
+ mutex_unlock(&sit_i->sentry_lock);
+ mutex_unlock(&curseg->curseg_mutex);
+
+ /* writeout last modified SIT block */
+ f2fs_put_page(page, 1);
+
+ set_prefree_as_free_segments(sbi);
+}
+
+static int build_sit_info(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct sit_info *sit_i;
+ unsigned int sit_segs, start;
+ char *src_bitmap, *dst_bitmap;
+ unsigned int bitmap_size;
+
+ /* allocate memory for SIT information */
+ sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL);
+ if (!sit_i)
+ return -ENOMEM;
+
+ SM_I(sbi)->sit_info = sit_i;
+
+ sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry));
+ if (!sit_i->sentries)
+ return -ENOMEM;
+
+ bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!sit_i->dirty_sentries_bitmap)
+ return -ENOMEM;
+
+ for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ sit_i->sentries[start].cur_valid_map
+ = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ sit_i->sentries[start].ckpt_valid_map
+ = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ if (!sit_i->sentries[start].cur_valid_map
+ || !sit_i->sentries[start].ckpt_valid_map)
+ return -ENOMEM;
+ }
+
+ if (sbi->segs_per_sec > 1) {
+ sit_i->sec_entries = vzalloc(sbi->total_sections *
+ sizeof(struct sec_entry));
+ if (!sit_i->sec_entries)
+ return -ENOMEM;
+ }
+
+ /* get information related with SIT */
+ sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1;
+
+ /* setup SIT bitmap from ckeckpoint pack */
+ bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
+ src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
+
+ dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dst_bitmap)
+ return -ENOMEM;
+ memcpy(dst_bitmap, src_bitmap, bitmap_size);
+
+ /* init SIT information */
+ sit_i->s_ops = &default_salloc_ops;
+
+ sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
+ sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
+ sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
+ sit_i->sit_bitmap = dst_bitmap;
+ sit_i->bitmap_size = bitmap_size;
+ sit_i->dirty_sentries = 0;
+ sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
+ sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
+ sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
+ mutex_init(&sit_i->sentry_lock);
+ return 0;
+}
+
+static int build_free_segmap(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_sm_info *sm_info = SM_I(sbi);
+ struct free_segmap_info *free_i;
+ unsigned int bitmap_size, sec_bitmap_size;
+
+ /* allocate memory for free segmap information */
+ free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL);
+ if (!free_i)
+ return -ENOMEM;
+
+ SM_I(sbi)->free_info = free_i;
+
+ bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
+ if (!free_i->free_segmap)
+ return -ENOMEM;
+
+ sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections);
+ free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
+ if (!free_i->free_secmap)
+ return -ENOMEM;
+
+ /* set all segments as dirty temporarily */
+ memset(free_i->free_segmap, 0xff, bitmap_size);
+ memset(free_i->free_secmap, 0xff, sec_bitmap_size);
+
+ /* init free segmap information */
+ free_i->start_segno =
+ (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
+ free_i->free_segments = 0;
+ free_i->free_sections = 0;
+ rwlock_init(&free_i->segmap_lock);
+ return 0;
+}
+
+static int build_curseg(struct f2fs_sb_info *sbi)
+{
+ struct curseg_info *array;
+ int i;
+
+ array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ SM_I(sbi)->curseg_array = array;
+
+ for (i = 0; i < NR_CURSEG_TYPE; i++) {
+ mutex_init(&array[i].curseg_mutex);
+ array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+ if (!array[i].sum_blk)
+ return -ENOMEM;
+ array[i].segno = NULL_SEGNO;
+ array[i].next_blkoff = 0;
+ }
+ return restore_curseg_summaries(sbi);
+}
+
+static void build_sit_entries(struct f2fs_sb_info *sbi)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
+ struct f2fs_summary_block *sum = curseg->sum_blk;
+ unsigned int start;
+
+ for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ struct seg_entry *se = &sit_i->sentries[start];
+ struct f2fs_sit_block *sit_blk;
+ struct f2fs_sit_entry sit;
+ struct page *page;
+ int i;
+
+ mutex_lock(&curseg->curseg_mutex);
+ for (i = 0; i < sits_in_cursum(sum); i++) {
+ if (le32_to_cpu(segno_in_journal(sum, i)) == start) {
+ sit = sit_in_journal(sum, i);
+ mutex_unlock(&curseg->curseg_mutex);
+ goto got_it;
+ }
+ }
+ mutex_unlock(&curseg->curseg_mutex);
+ page = get_current_sit_page(sbi, start);
+ sit_blk = (struct f2fs_sit_block *)page_address(page);
+ sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
+ f2fs_put_page(page, 1);
+got_it:
+ check_block_count(sbi, start, &sit);
+ seg_info_from_raw_sit(se, &sit);
+ if (sbi->segs_per_sec > 1) {
+ struct sec_entry *e = get_sec_entry(sbi, start);
+ e->valid_blocks += se->valid_blocks;
+ }
+ }
+}
+
+static void init_free_segmap(struct f2fs_sb_info *sbi)
+{
+ unsigned int start;
+ int type;
+
+ for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ struct seg_entry *sentry = get_seg_entry(sbi, start);
+ if (!sentry->valid_blocks)
+ __set_free(sbi, start);
+ }
+
+ /* set use the current segments */
+ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) {
+ struct curseg_info *curseg_t = CURSEG_I(sbi, type);
+ __set_test_and_inuse(sbi, curseg_t->segno);
+ }
+}
+
+static void init_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int segno = 0, offset = 0;
+ unsigned short valid_blocks;
+
+ while (segno < TOTAL_SEGS(sbi)) {
+ /* find dirty segment based on free segmap */
+ segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
+ if (segno >= TOTAL_SEGS(sbi))
+ break;
+ offset = segno + 1;
+ valid_blocks = get_valid_blocks(sbi, segno, 0);
+ if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks)
+ continue;
+ mutex_lock(&dirty_i->seglist_lock);
+ __locate_dirty_segment(sbi, segno, DIRTY);
+ mutex_unlock(&dirty_i->seglist_lock);
+ }
+}
+
+static int init_victim_segmap(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+
+ dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC])
+ return -ENOMEM;
+ return 0;
+}
+
+static int build_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i;
+ unsigned int bitmap_size, i;
+
+ /* allocate memory for dirty segments list information */
+ dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL);
+ if (!dirty_i)
+ return -ENOMEM;
+
+ SM_I(sbi)->dirty_info = dirty_i;
+ mutex_init(&dirty_i->seglist_lock);
+
+ bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+
+ for (i = 0; i < NR_DIRTY_TYPE; i++) {
+ dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->nr_dirty[i] = 0;
+ if (!dirty_i->dirty_segmap[i])
+ return -ENOMEM;
+ }
+
+ init_dirty_segmap(sbi);
+ return init_victim_segmap(sbi);
+}
+
+/*
+ * Update min, max modified time for cost-benefit GC algorithm
+ */
+static void init_min_max_mtime(struct f2fs_sb_info *sbi)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int segno;
+
+ mutex_lock(&sit_i->sentry_lock);
+
+ sit_i->min_mtime = LLONG_MAX;
+
+ for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+ unsigned int i;
+ unsigned long long mtime = 0;
+
+ for (i = 0; i < sbi->segs_per_sec; i++)
+ mtime += get_seg_entry(sbi, segno + i)->mtime;
+
+ mtime = div_u64(mtime, sbi->segs_per_sec);
+
+ if (sit_i->min_mtime > mtime)
+ sit_i->min_mtime = mtime;
+ }
+ sit_i->max_mtime = get_mtime(sbi);
+ mutex_unlock(&sit_i->sentry_lock);
+}
+
+int build_segment_manager(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
+ struct f2fs_sm_info *sm_info;
+ int err;
+
+ sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL);
+ if (!sm_info)
+ return -ENOMEM;
+
+ /* init sm info */
+ sbi->sm_info = sm_info;
+ INIT_LIST_HEAD(&sm_info->wblist_head);
+ spin_lock_init(&sm_info->wblist_lock);
+ sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
+ sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
+ sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
+ sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
+ sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
+ sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
+ sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
+
+ err = build_sit_info(sbi);
+ if (err)
+ return err;
+ err = build_free_segmap(sbi);
+ if (err)
+ return err;
+ err = build_curseg(sbi);
+ if (err)
+ return err;
+
+ /* reinit free segmap based on SIT */
+ build_sit_entries(sbi);
+
+ init_free_segmap(sbi);
+ err = build_dirty_segmap(sbi);
+ if (err)
+ return err;
+
+ init_min_max_mtime(sbi);
+ return 0;
+}
+
+static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
+ enum dirty_type dirty_type)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+ mutex_lock(&dirty_i->seglist_lock);
+ kfree(dirty_i->dirty_segmap[dirty_type]);
+ dirty_i->nr_dirty[dirty_type] = 0;
+ mutex_unlock(&dirty_i->seglist_lock);
+}
+
+void reset_victim_segmap(struct f2fs_sb_info *sbi)
+{
+ unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
+ memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size);
+}
+
+static void destroy_victim_segmap(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+
+ kfree(dirty_i->victim_segmap[FG_GC]);
+ kfree(dirty_i->victim_segmap[BG_GC]);
+}
+
+static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
+{
+ struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
+ int i;
+
+ if (!dirty_i)
+ return;
+
+ /* discard pre-free/dirty segments list */
+ for (i = 0; i < NR_DIRTY_TYPE; i++)
+ discard_dirty_segmap(sbi, i);
+
+ destroy_victim_segmap(sbi);
+ SM_I(sbi)->dirty_info = NULL;
+ kfree(dirty_i);
+}
+
+static void destroy_curseg(struct f2fs_sb_info *sbi)
+{
+ struct curseg_info *array = SM_I(sbi)->curseg_array;
+ int i;
+
+ if (!array)
+ return;
+ SM_I(sbi)->curseg_array = NULL;
+ for (i = 0; i < NR_CURSEG_TYPE; i++)
+ kfree(array[i].sum_blk);
+ kfree(array);
+}
+
+static void destroy_free_segmap(struct f2fs_sb_info *sbi)
+{
+ struct free_segmap_info *free_i = SM_I(sbi)->free_info;
+ if (!free_i)
+ return;
+ SM_I(sbi)->free_info = NULL;
+ kfree(free_i->free_segmap);
+ kfree(free_i->free_secmap);
+ kfree(free_i);
+}
+
+static void destroy_sit_info(struct f2fs_sb_info *sbi)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int start;
+
+ if (!sit_i)
+ return;
+
+ if (sit_i->sentries) {
+ for (start = 0; start < TOTAL_SEGS(sbi); start++) {
+ kfree(sit_i->sentries[start].cur_valid_map);
+ kfree(sit_i->sentries[start].ckpt_valid_map);
+ }
+ }
+ vfree(sit_i->sentries);
+ vfree(sit_i->sec_entries);
+ kfree(sit_i->dirty_sentries_bitmap);
+
+ SM_I(sbi)->sit_info = NULL;
+ kfree(sit_i->sit_bitmap);
+ kfree(sit_i);
+}
+
+void destroy_segment_manager(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_sm_info *sm_info = SM_I(sbi);
+ destroy_dirty_segmap(sbi);
+ destroy_curseg(sbi);
+ destroy_free_segmap(sbi);
+ destroy_sit_info(sbi);
+ sbi->sm_info = NULL;
+ kfree(sm_info);
+}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
new file mode 100644
index 000000000000..0948405af6f5
--- /dev/null
+++ b/fs/f2fs/segment.h
@@ -0,0 +1,618 @@
+/*
+ * fs/f2fs/segment.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+/* constant macro */
+#define NULL_SEGNO ((unsigned int)(~0))
+
+/* V: Logical segment # in volume, R: Relative segment # in main area */
+#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
+#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
+
+#define IS_DATASEG(t) \
+ ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \
+ (t == CURSEG_WARM_DATA))
+
+#define IS_NODESEG(t) \
+ ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \
+ (t == CURSEG_WARM_NODE))
+
+#define IS_CURSEG(sbi, segno) \
+ ((segno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
+ (segno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
+ (segno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
+ (segno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
+ (segno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
+ (segno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))
+
+#define IS_CURSEC(sbi, secno) \
+ ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
+ sbi->segs_per_sec) || \
+ (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
+ sbi->segs_per_sec) || \
+ (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
+ sbi->segs_per_sec) || \
+ (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
+ sbi->segs_per_sec) || \
+ (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
+ sbi->segs_per_sec) || \
+ (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
+ sbi->segs_per_sec)) \
+
+#define START_BLOCK(sbi, segno) \
+ (SM_I(sbi)->seg0_blkaddr + \
+ (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
+#define NEXT_FREE_BLKADDR(sbi, curseg) \
+ (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
+
+#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr)
+
+#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \
+ ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
+#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
+#define GET_SEGNO(sbi, blk_addr) \
+ (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \
+ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
+ GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
+#define GET_SECNO(sbi, segno) \
+ ((segno) / sbi->segs_per_sec)
+#define GET_ZONENO_FROM_SEGNO(sbi, segno) \
+ ((segno / sbi->segs_per_sec) / sbi->secs_per_zone)
+
+#define GET_SUM_BLOCK(sbi, segno) \
+ ((sbi->sm_info->ssa_blkaddr) + segno)
+
+#define GET_SUM_TYPE(footer) ((footer)->entry_type)
+#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type)
+
+#define SIT_ENTRY_OFFSET(sit_i, segno) \
+ (segno % sit_i->sents_per_block)
+#define SIT_BLOCK_OFFSET(sit_i, segno) \
+ (segno / SIT_ENTRY_PER_BLOCK)
+#define START_SEGNO(sit_i, segno) \
+ (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
+#define f2fs_bitmap_size(nr) \
+ (BITS_TO_LONGS(nr) * sizeof(unsigned long))
+#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
+
+#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
+ (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
+
+/* during checkpoint, bio_private is used to synchronize the last bio */
+struct bio_private {
+ struct f2fs_sb_info *sbi;
+ bool is_sync;
+ void *wait;
+};
+
+/*
+ * indicate a block allocation direction: RIGHT and LEFT.
+ * RIGHT means allocating new sections towards the end of volume.
+ * LEFT means the opposite direction.
+ */
+enum {
+ ALLOC_RIGHT = 0,
+ ALLOC_LEFT
+};
+
+/*
+ * In the victim_sel_policy->alloc_mode, there are two block allocation modes.
+ * LFS writes data sequentially with cleaning operations.
+ * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
+ */
+enum {
+ LFS = 0,
+ SSR
+};
+
+/*
+ * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes.
+ * GC_CB is based on cost-benefit algorithm.
+ * GC_GREEDY is based on greedy algorithm.
+ */
+enum {
+ GC_CB = 0,
+ GC_GREEDY
+};
+
+/*
+ * BG_GC means the background cleaning job.
+ * FG_GC means the on-demand cleaning job.
+ */
+enum {
+ BG_GC = 0,
+ FG_GC
+};
+
+/* for a function parameter to select a victim segment */
+struct victim_sel_policy {
+ int alloc_mode; /* LFS or SSR */
+ int gc_mode; /* GC_CB or GC_GREEDY */
+ unsigned long *dirty_segmap; /* dirty segment bitmap */
+ unsigned int offset; /* last scanned bitmap offset */
+ unsigned int ofs_unit; /* bitmap search unit */
+ unsigned int min_cost; /* minimum cost */
+ unsigned int min_segno; /* segment # having min. cost */
+};
+
+struct seg_entry {
+ unsigned short valid_blocks; /* # of valid blocks */
+ unsigned char *cur_valid_map; /* validity bitmap of blocks */
+ /*
+ * # of valid blocks and the validity bitmap stored in the the last
+ * checkpoint pack. This information is used by the SSR mode.
+ */
+ unsigned short ckpt_valid_blocks;
+ unsigned char *ckpt_valid_map;
+ unsigned char type; /* segment type like CURSEG_XXX_TYPE */
+ unsigned long long mtime; /* modification time of the segment */
+};
+
+struct sec_entry {
+ unsigned int valid_blocks; /* # of valid blocks in a section */
+};
+
+struct segment_allocation {
+ void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
+};
+
+struct sit_info {
+ const struct segment_allocation *s_ops;
+
+ block_t sit_base_addr; /* start block address of SIT area */
+ block_t sit_blocks; /* # of blocks used by SIT area */
+ block_t written_valid_blocks; /* # of valid blocks in main area */
+ char *sit_bitmap; /* SIT bitmap pointer */
+ unsigned int bitmap_size; /* SIT bitmap size */
+
+ unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */
+ unsigned int dirty_sentries; /* # of dirty sentries */
+ unsigned int sents_per_block; /* # of SIT entries per block */
+ struct mutex sentry_lock; /* to protect SIT cache */
+ struct seg_entry *sentries; /* SIT segment-level cache */
+ struct sec_entry *sec_entries; /* SIT section-level cache */
+
+ /* for cost-benefit algorithm in cleaning procedure */
+ unsigned long long elapsed_time; /* elapsed time after mount */
+ unsigned long long mounted_time; /* mount time */
+ unsigned long long min_mtime; /* min. modification time */
+ unsigned long long max_mtime; /* max. modification time */
+};
+
+struct free_segmap_info {
+ unsigned int start_segno; /* start segment number logically */
+ unsigned int free_segments; /* # of free segments */
+ unsigned int free_sections; /* # of free sections */
+ rwlock_t segmap_lock; /* free segmap lock */
+ unsigned long *free_segmap; /* free segment bitmap */
+ unsigned long *free_secmap; /* free section bitmap */
+};
+
+/* Notice: The order of dirty type is same with CURSEG_XXX in f2fs.h */
+enum dirty_type {
+ DIRTY_HOT_DATA, /* dirty segments assigned as hot data logs */
+ DIRTY_WARM_DATA, /* dirty segments assigned as warm data logs */
+ DIRTY_COLD_DATA, /* dirty segments assigned as cold data logs */
+ DIRTY_HOT_NODE, /* dirty segments assigned as hot node logs */
+ DIRTY_WARM_NODE, /* dirty segments assigned as warm node logs */
+ DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
+ DIRTY, /* to count # of dirty segments */
+ PRE, /* to count # of entirely obsolete segments */
+ NR_DIRTY_TYPE
+};
+
+struct dirty_seglist_info {
+ const struct victim_selection *v_ops; /* victim selction operation */
+ unsigned long *dirty_segmap[NR_DIRTY_TYPE];
+ struct mutex seglist_lock; /* lock for segment bitmaps */
+ int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */
+ unsigned long *victim_segmap[2]; /* BG_GC, FG_GC */
+};
+
+/* victim selection function for cleaning and SSR */
+struct victim_selection {
+ int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
+ int, int, char);
+};
+
+/* for active log information */
+struct curseg_info {
+ struct mutex curseg_mutex; /* lock for consistency */
+ struct f2fs_summary_block *sum_blk; /* cached summary block */
+ unsigned char alloc_type; /* current allocation type */
+ unsigned int segno; /* current segment number */
+ unsigned short next_blkoff; /* next block offset to write */
+ unsigned int zone; /* current zone number */
+ unsigned int next_segno; /* preallocated segment */
+};
+
+/*
+ * inline functions
+ */
+static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
+{
+ return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
+}
+
+static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ return &sit_i->sentries[segno];
+}
+
+static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ return &sit_i->sec_entries[GET_SECNO(sbi, segno)];
+}
+
+static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
+ unsigned int segno, int section)
+{
+ /*
+ * In order to get # of valid blocks in a section instantly from many
+ * segments, f2fs manages two counting structures separately.
+ */
+ if (section > 1)
+ return get_sec_entry(sbi, segno)->valid_blocks;
+ else
+ return get_seg_entry(sbi, segno)->valid_blocks;
+}
+
+static inline void seg_info_from_raw_sit(struct seg_entry *se,
+ struct f2fs_sit_entry *rs)
+{
+ se->valid_blocks = GET_SIT_VBLOCKS(rs);
+ se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs);
+ memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+ memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+ se->type = GET_SIT_TYPE(rs);
+ se->mtime = le64_to_cpu(rs->mtime);
+}
+
+static inline void seg_info_to_raw_sit(struct seg_entry *se,
+ struct f2fs_sit_entry *rs)
+{
+ unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) |
+ se->valid_blocks;
+ rs->vblocks = cpu_to_le16(raw_vblocks);
+ memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
+ memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
+ se->ckpt_valid_blocks = se->valid_blocks;
+ rs->mtime = cpu_to_le64(se->mtime);
+}
+
+static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
+ unsigned int max, unsigned int segno)
+{
+ unsigned int ret;
+ read_lock(&free_i->segmap_lock);
+ ret = find_next_bit(free_i->free_segmap, max, segno);
+ read_unlock(&free_i->segmap_lock);
+ return ret;
+}
+
+static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int secno = segno / sbi->segs_per_sec;
+ unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int next;
+
+ write_lock(&free_i->segmap_lock);
+ clear_bit(segno, free_i->free_segmap);
+ free_i->free_segments++;
+
+ next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno);
+ if (next >= start_segno + sbi->segs_per_sec) {
+ clear_bit(secno, free_i->free_secmap);
+ free_i->free_sections++;
+ }
+ write_unlock(&free_i->segmap_lock);
+}
+
+static inline void __set_inuse(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int secno = segno / sbi->segs_per_sec;
+ set_bit(segno, free_i->free_segmap);
+ free_i->free_segments--;
+ if (!test_and_set_bit(secno, free_i->free_secmap))
+ free_i->free_sections--;
+}
+
+static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int secno = segno / sbi->segs_per_sec;
+ unsigned int start_segno = secno * sbi->segs_per_sec;
+ unsigned int next;
+
+ write_lock(&free_i->segmap_lock);
+ if (test_and_clear_bit(segno, free_i->free_segmap)) {
+ free_i->free_segments++;
+
+ next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi),
+ start_segno);
+ if (next >= start_segno + sbi->segs_per_sec) {
+ if (test_and_clear_bit(secno, free_i->free_secmap))
+ free_i->free_sections++;
+ }
+ }
+ write_unlock(&free_i->segmap_lock);
+}
+
+static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int secno = segno / sbi->segs_per_sec;
+ write_lock(&free_i->segmap_lock);
+ if (!test_and_set_bit(segno, free_i->free_segmap)) {
+ free_i->free_segments--;
+ if (!test_and_set_bit(secno, free_i->free_secmap))
+ free_i->free_sections--;
+ }
+ write_unlock(&free_i->segmap_lock);
+}
+
+static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
+ void *dst_addr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size);
+}
+
+static inline block_t written_block_count(struct f2fs_sb_info *sbi)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ block_t vblocks;
+
+ mutex_lock(&sit_i->sentry_lock);
+ vblocks = sit_i->written_valid_blocks;
+ mutex_unlock(&sit_i->sentry_lock);
+
+ return vblocks;
+}
+
+static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
+{
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int free_segs;
+
+ read_lock(&free_i->segmap_lock);
+ free_segs = free_i->free_segments;
+ read_unlock(&free_i->segmap_lock);
+
+ return free_segs;
+}
+
+static inline int reserved_segments(struct f2fs_sb_info *sbi)
+{
+ return SM_I(sbi)->reserved_segments;
+}
+
+static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
+{
+ struct free_segmap_info *free_i = FREE_I(sbi);
+ unsigned int free_secs;
+
+ read_lock(&free_i->segmap_lock);
+ free_secs = free_i->free_sections;
+ read_unlock(&free_i->segmap_lock);
+
+ return free_secs;
+}
+
+static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
+{
+ return DIRTY_I(sbi)->nr_dirty[PRE];
+}
+
+static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
+{
+ return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
+ DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] +
+ DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] +
+ DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] +
+ DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] +
+ DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE];
+}
+
+static inline int overprovision_segments(struct f2fs_sb_info *sbi)
+{
+ return SM_I(sbi)->ovp_segments;
+}
+
+static inline int overprovision_sections(struct f2fs_sb_info *sbi)
+{
+ return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
+}
+
+static inline int reserved_sections(struct f2fs_sb_info *sbi)
+{
+ return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
+}
+
+static inline bool need_SSR(struct f2fs_sb_info *sbi)
+{
+ return (free_sections(sbi) < overprovision_sections(sbi));
+}
+
+static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ return DIRTY_I(sbi)->v_ops->get_victim(sbi,
+ &(curseg)->next_segno, BG_GC, type, SSR);
+}
+
+static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi)
+{
+ return free_sections(sbi) <= reserved_sections(sbi);
+}
+
+static inline int utilization(struct f2fs_sb_info *sbi)
+{
+ return (long int)valid_user_blocks(sbi) * 100 /
+ (long int)sbi->user_block_count;
+}
+
+/*
+ * Sometimes f2fs may be better to drop out-of-place update policy.
+ * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write
+ * data in the original place likewise other traditional file systems.
+ * But, currently set 100 in percentage, which means it is disabled.
+ * See below need_inplace_update().
+ */
+#define MIN_IPU_UTIL 100
+static inline bool need_inplace_update(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ if (S_ISDIR(inode->i_mode))
+ return false;
+ if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL)
+ return true;
+ return false;
+}
+
+static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
+ int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ return curseg->segno;
+}
+
+static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi,
+ int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ return curseg->alloc_type;
+}
+
+static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ return curseg->next_blkoff;
+}
+
+static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ unsigned int end_segno = SM_I(sbi)->segment_count - 1;
+ BUG_ON(segno > end_segno);
+}
+
+/*
+ * This function is used for only debugging.
+ * NOTE: In future, we have to remove this function.
+ */
+static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
+{
+ struct f2fs_sm_info *sm_info = SM_I(sbi);
+ block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
+ block_t start_addr = sm_info->seg0_blkaddr;
+ block_t end_addr = start_addr + total_blks - 1;
+ BUG_ON(blk_addr < start_addr);
+ BUG_ON(blk_addr > end_addr);
+}
+
+/*
+ * Summary block is always treated as invalid block
+ */
+static inline void check_block_count(struct f2fs_sb_info *sbi,
+ int segno, struct f2fs_sit_entry *raw_sit)
+{
+ struct f2fs_sm_info *sm_info = SM_I(sbi);
+ unsigned int end_segno = sm_info->segment_count - 1;
+ int valid_blocks = 0;
+ int i;
+
+ /* check segment usage */
+ BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
+
+ /* check boundary of a given segment number */
+ BUG_ON(segno > end_segno);
+
+ /* check bitmap with valid block count */
+ for (i = 0; i < sbi->blocks_per_seg; i++)
+ if (f2fs_test_bit(i, raw_sit->valid_map))
+ valid_blocks++;
+ BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
+}
+
+static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
+ unsigned int start)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start);
+ block_t blk_addr = sit_i->sit_base_addr + offset;
+
+ check_seg_range(sbi, start);
+
+ /* calculate sit block address */
+ if (f2fs_test_bit(offset, sit_i->sit_bitmap))
+ blk_addr += sit_i->sit_blocks;
+
+ return blk_addr;
+}
+
+static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
+ pgoff_t block_addr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ block_addr -= sit_i->sit_base_addr;
+ if (block_addr < sit_i->sit_blocks)
+ block_addr += sit_i->sit_blocks;
+ else
+ block_addr -= sit_i->sit_blocks;
+
+ return block_addr + sit_i->sit_base_addr;
+}
+
+static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
+{
+ unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start);
+
+ if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
+ f2fs_clear_bit(block_off, sit_i->sit_bitmap);
+ else
+ f2fs_set_bit(block_off, sit_i->sit_bitmap);
+}
+
+static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec -
+ sit_i->mounted_time;
+}
+
+static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
+ unsigned int ofs_in_node, unsigned char version)
+{
+ sum->nid = cpu_to_le32(nid);
+ sum->ofs_in_node = cpu_to_le16(ofs_in_node);
+ sum->version = version;
+}
+
+static inline block_t start_sum_block(struct f2fs_sb_info *sbi)
+{
+ return __start_cp_addr(sbi) +
+ le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
+}
+
+static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
+{
+ return __start_cp_addr(sbi) +
+ le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count)
+ - (base + 1) + type;
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
new file mode 100644
index 000000000000..13867322cf5a
--- /dev/null
+++ b/fs/f2fs/super.c
@@ -0,0 +1,657 @@
+/*
+ * fs/f2fs/super.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/statfs.h>
+#include <linux/proc_fs.h>
+#include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
+#include <linux/kthread.h>
+#include <linux/parser.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
+#include <linux/random.h>
+#include <linux/exportfs.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+#include "node.h"
+#include "xattr.h"
+
+static struct kmem_cache *f2fs_inode_cachep;
+
+enum {
+ Opt_gc_background_off,
+ Opt_disable_roll_forward,
+ Opt_discard,
+ Opt_noheap,
+ Opt_nouser_xattr,
+ Opt_noacl,
+ Opt_active_logs,
+ Opt_disable_ext_identify,
+ Opt_err,
+};
+
+static match_table_t f2fs_tokens = {
+ {Opt_gc_background_off, "background_gc_off"},
+ {Opt_disable_roll_forward, "disable_roll_forward"},
+ {Opt_discard, "discard"},
+ {Opt_noheap, "no_heap"},
+ {Opt_nouser_xattr, "nouser_xattr"},
+ {Opt_noacl, "noacl"},
+ {Opt_active_logs, "active_logs=%u"},
+ {Opt_disable_ext_identify, "disable_ext_identify"},
+ {Opt_err, NULL},
+};
+
+static void init_once(void *foo)
+{
+ struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
+
+ inode_init_once(&fi->vfs_inode);
+}
+
+static struct inode *f2fs_alloc_inode(struct super_block *sb)
+{
+ struct f2fs_inode_info *fi;
+
+ fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO);
+ if (!fi)
+ return NULL;
+
+ init_once((void *) fi);
+
+ /* Initilize f2fs-specific inode info */
+ fi->vfs_inode.i_version = 1;
+ atomic_set(&fi->dirty_dents, 0);
+ fi->i_current_depth = 1;
+ fi->i_advise = 0;
+ rwlock_init(&fi->ext.ext_lock);
+
+ set_inode_flag(fi, FI_NEW_INODE);
+
+ return &fi->vfs_inode;
+}
+
+static void f2fs_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode));
+}
+
+static void f2fs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, f2fs_i_callback);
+}
+
+static void f2fs_put_super(struct super_block *sb)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ f2fs_destroy_stats(sbi);
+ stop_gc_thread(sbi);
+
+ write_checkpoint(sbi, false, true);
+
+ iput(sbi->node_inode);
+ iput(sbi->meta_inode);
+
+ /* destroy f2fs internal modules */
+ destroy_node_manager(sbi);
+ destroy_segment_manager(sbi);
+
+ kfree(sbi->ckpt);
+
+ sb->s_fs_info = NULL;
+ brelse(sbi->raw_super_buf);
+ kfree(sbi);
+}
+
+int f2fs_sync_fs(struct super_block *sb, int sync)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int ret = 0;
+
+ if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
+ return 0;
+
+ if (sync)
+ write_checkpoint(sbi, false, false);
+
+ return ret;
+}
+
+static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+ block_t total_count, user_block_count, start_count, ovp_count;
+
+ total_count = le64_to_cpu(sbi->raw_super->block_count);
+ user_block_count = sbi->user_block_count;
+ start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr);
+ ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
+ buf->f_type = F2FS_SUPER_MAGIC;
+ buf->f_bsize = sbi->blocksize;
+
+ buf->f_blocks = total_count - start_count;
+ buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
+ buf->f_bavail = user_block_count - valid_user_blocks(sbi);
+
+ buf->f_files = valid_inode_count(sbi);
+ buf->f_ffree = sbi->total_node_count - valid_node_count(sbi);
+
+ buf->f_namelen = F2FS_MAX_NAME_LEN;
+ buf->f_fsid.val[0] = (u32)id;
+ buf->f_fsid.val[1] = (u32)(id >> 32);
+
+ return 0;
+}
+
+static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
+
+ if (test_opt(sbi, BG_GC))
+ seq_puts(seq, ",background_gc_on");
+ else
+ seq_puts(seq, ",background_gc_off");
+ if (test_opt(sbi, DISABLE_ROLL_FORWARD))
+ seq_puts(seq, ",disable_roll_forward");
+ if (test_opt(sbi, DISCARD))
+ seq_puts(seq, ",discard");
+ if (test_opt(sbi, NOHEAP))
+ seq_puts(seq, ",no_heap_alloc");
+#ifdef CONFIG_F2FS_FS_XATTR
+ if (test_opt(sbi, XATTR_USER))
+ seq_puts(seq, ",user_xattr");
+ else
+ seq_puts(seq, ",nouser_xattr");
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ if (test_opt(sbi, POSIX_ACL))
+ seq_puts(seq, ",acl");
+ else
+ seq_puts(seq, ",noacl");
+#endif
+ if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
+ seq_puts(seq, ",disable_ext_indentify");
+
+ seq_printf(seq, ",active_logs=%u", sbi->active_logs);
+
+ return 0;
+}
+
+static struct super_operations f2fs_sops = {
+ .alloc_inode = f2fs_alloc_inode,
+ .destroy_inode = f2fs_destroy_inode,
+ .write_inode = f2fs_write_inode,
+ .show_options = f2fs_show_options,
+ .evict_inode = f2fs_evict_inode,
+ .put_super = f2fs_put_super,
+ .sync_fs = f2fs_sync_fs,
+ .statfs = f2fs_statfs,
+};
+
+static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct inode *inode;
+
+ if (ino < F2FS_ROOT_INO(sbi))
+ return ERR_PTR(-ESTALE);
+
+ /*
+ * f2fs_iget isn't quite right if the inode is currently unallocated!
+ * However f2fs_iget currently does appropriate checks to handle stale
+ * inodes so everything is OK.
+ */
+ inode = f2fs_iget(sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ if (generation && inode->i_generation != generation) {
+ /* we didn't find the right inode.. */
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ return inode;
+}
+
+static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ f2fs_nfs_get_inode);
+}
+
+static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ f2fs_nfs_get_inode);
+}
+
+static const struct export_operations f2fs_export_ops = {
+ .fh_to_dentry = f2fs_fh_to_dentry,
+ .fh_to_parent = f2fs_fh_to_parent,
+ .get_parent = f2fs_get_parent,
+};
+
+static int parse_options(struct f2fs_sb_info *sbi, char *options)
+{
+ substring_t args[MAX_OPT_ARGS];
+ char *p;
+ int arg = 0;
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, f2fs_tokens, args);
+
+ switch (token) {
+ case Opt_gc_background_off:
+ clear_opt(sbi, BG_GC);
+ break;
+ case Opt_disable_roll_forward:
+ set_opt(sbi, DISABLE_ROLL_FORWARD);
+ break;
+ case Opt_discard:
+ set_opt(sbi, DISCARD);
+ break;
+ case Opt_noheap:
+ set_opt(sbi, NOHEAP);
+ break;
+#ifdef CONFIG_F2FS_FS_XATTR
+ case Opt_nouser_xattr:
+ clear_opt(sbi, XATTR_USER);
+ break;
+#else
+ case Opt_nouser_xattr:
+ pr_info("nouser_xattr options not supported\n");
+ break;
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ case Opt_noacl:
+ clear_opt(sbi, POSIX_ACL);
+ break;
+#else
+ case Opt_noacl:
+ pr_info("noacl options not supported\n");
+ break;
+#endif
+ case Opt_active_logs:
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ if (arg != 2 && arg != 4 && arg != 6)
+ return -EINVAL;
+ sbi->active_logs = arg;
+ break;
+ case Opt_disable_ext_identify:
+ set_opt(sbi, DISABLE_EXT_IDENTIFY);
+ break;
+ default:
+ pr_err("Unrecognized mount option \"%s\" or missing value\n",
+ p);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static loff_t max_file_size(unsigned bits)
+{
+ loff_t result = ADDRS_PER_INODE;
+ loff_t leaf_count = ADDRS_PER_BLOCK;
+
+ /* two direct node blocks */
+ result += (leaf_count * 2);
+
+ /* two indirect node blocks */
+ leaf_count *= NIDS_PER_BLOCK;
+ result += (leaf_count * 2);
+
+ /* one double indirect node block */
+ leaf_count *= NIDS_PER_BLOCK;
+ result += leaf_count;
+
+ result <<= bits;
+ return result;
+}
+
+static int sanity_check_raw_super(struct f2fs_super_block *raw_super)
+{
+ unsigned int blocksize;
+
+ if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic))
+ return 1;
+
+ /* Currently, support only 4KB block size */
+ blocksize = 1 << le32_to_cpu(raw_super->log_blocksize);
+ if (blocksize != PAGE_CACHE_SIZE)
+ return 1;
+ if (le32_to_cpu(raw_super->log_sectorsize) !=
+ F2FS_LOG_SECTOR_SIZE)
+ return 1;
+ if (le32_to_cpu(raw_super->log_sectors_per_block) !=
+ F2FS_LOG_SECTORS_PER_BLOCK)
+ return 1;
+ return 0;
+}
+
+static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
+ struct f2fs_checkpoint *ckpt)
+{
+ unsigned int total, fsmeta;
+
+ total = le32_to_cpu(raw_super->segment_count);
+ fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
+ fsmeta += le32_to_cpu(raw_super->segment_count_sit);
+ fsmeta += le32_to_cpu(raw_super->segment_count_nat);
+ fsmeta += le32_to_cpu(ckpt->rsvd_segment_count);
+ fsmeta += le32_to_cpu(raw_super->segment_count_ssa);
+
+ if (fsmeta >= total)
+ return 1;
+ return 0;
+}
+
+static void init_sb_info(struct f2fs_sb_info *sbi)
+{
+ struct f2fs_super_block *raw_super = sbi->raw_super;
+ int i;
+
+ sbi->log_sectors_per_block =
+ le32_to_cpu(raw_super->log_sectors_per_block);
+ sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize);
+ sbi->blocksize = 1 << sbi->log_blocksize;
+ sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+ sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg;
+ sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
+ sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
+ sbi->total_sections = le32_to_cpu(raw_super->section_count);
+ sbi->total_node_count =
+ (le32_to_cpu(raw_super->segment_count_nat) / 2)
+ * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
+ sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
+ sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
+ sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
+
+ for (i = 0; i < NR_COUNT_TYPE; i++)
+ atomic_set(&sbi->nr_pages[i], 0);
+}
+
+static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct f2fs_sb_info *sbi;
+ struct f2fs_super_block *raw_super;
+ struct buffer_head *raw_super_buf;
+ struct inode *root;
+ long err = -EINVAL;
+ int i;
+
+ /* allocate memory for f2fs-specific super block info */
+ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+
+ /* set a temporary block size */
+ if (!sb_set_blocksize(sb, F2FS_BLKSIZE))
+ goto free_sbi;
+
+ /* read f2fs raw super block */
+ raw_super_buf = sb_bread(sb, 0);
+ if (!raw_super_buf) {
+ err = -EIO;
+ goto free_sbi;
+ }
+ raw_super = (struct f2fs_super_block *)
+ ((char *)raw_super_buf->b_data + F2FS_SUPER_OFFSET);
+
+ /* init some FS parameters */
+ sbi->active_logs = NR_CURSEG_TYPE;
+
+ set_opt(sbi, BG_GC);
+
+#ifdef CONFIG_F2FS_FS_XATTR
+ set_opt(sbi, XATTR_USER);
+#endif
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ set_opt(sbi, POSIX_ACL);
+#endif
+ /* parse mount options */
+ if (parse_options(sbi, (char *)data))
+ goto free_sb_buf;
+
+ /* sanity checking of raw super */
+ if (sanity_check_raw_super(raw_super))
+ goto free_sb_buf;
+
+ sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
+ sb->s_max_links = F2FS_LINK_MAX;
+ get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+
+ sb->s_op = &f2fs_sops;
+ sb->s_xattr = f2fs_xattr_handlers;
+ sb->s_export_op = &f2fs_export_ops;
+ sb->s_magic = F2FS_SUPER_MAGIC;
+ sb->s_fs_info = sbi;
+ sb->s_time_gran = 1;
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
+ memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+
+ /* init f2fs-specific super block info */
+ sbi->sb = sb;
+ sbi->raw_super = raw_super;
+ sbi->raw_super_buf = raw_super_buf;
+ mutex_init(&sbi->gc_mutex);
+ mutex_init(&sbi->write_inode);
+ mutex_init(&sbi->writepages);
+ mutex_init(&sbi->cp_mutex);
+ for (i = 0; i < NR_LOCK_TYPE; i++)
+ mutex_init(&sbi->fs_lock[i]);
+ sbi->por_doing = 0;
+ spin_lock_init(&sbi->stat_lock);
+ init_rwsem(&sbi->bio_sem);
+ init_sb_info(sbi);
+
+ /* get an inode for meta space */
+ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
+ if (IS_ERR(sbi->meta_inode)) {
+ err = PTR_ERR(sbi->meta_inode);
+ goto free_sb_buf;
+ }
+
+ err = get_valid_checkpoint(sbi);
+ if (err)
+ goto free_meta_inode;
+
+ /* sanity checking of checkpoint */
+ err = -EINVAL;
+ if (sanity_check_ckpt(raw_super, sbi->ckpt))
+ goto free_cp;
+
+ sbi->total_valid_node_count =
+ le32_to_cpu(sbi->ckpt->valid_node_count);
+ sbi->total_valid_inode_count =
+ le32_to_cpu(sbi->ckpt->valid_inode_count);
+ sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
+ sbi->total_valid_block_count =
+ le64_to_cpu(sbi->ckpt->valid_block_count);
+ sbi->last_valid_block_count = sbi->total_valid_block_count;
+ sbi->alloc_valid_block_count = 0;
+ INIT_LIST_HEAD(&sbi->dir_inode_list);
+ spin_lock_init(&sbi->dir_inode_lock);
+
+ /* init super block */
+ if (!sb_set_blocksize(sb, sbi->blocksize))
+ goto free_cp;
+
+ init_orphan_info(sbi);
+
+ /* setup f2fs internal modules */
+ err = build_segment_manager(sbi);
+ if (err)
+ goto free_sm;
+ err = build_node_manager(sbi);
+ if (err)
+ goto free_nm;
+
+ build_gc_manager(sbi);
+
+ /* get an inode for node space */
+ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
+ if (IS_ERR(sbi->node_inode)) {
+ err = PTR_ERR(sbi->node_inode);
+ goto free_nm;
+ }
+
+ /* if there are nt orphan nodes free them */
+ err = -EINVAL;
+ if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) &&
+ recover_orphan_inodes(sbi))
+ goto free_node_inode;
+
+ /* read root inode and dentry */
+ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto free_node_inode;
+ }
+ if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size)
+ goto free_root_inode;
+
+ sb->s_root = d_make_root(root); /* allocate root dentry */
+ if (!sb->s_root) {
+ err = -ENOMEM;
+ goto free_root_inode;
+ }
+
+ /* recover fsynced data */
+ if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) &&
+ !test_opt(sbi, DISABLE_ROLL_FORWARD))
+ recover_fsync_data(sbi);
+
+ /* After POR, we can run background GC thread */
+ err = start_gc_thread(sbi);
+ if (err)
+ goto fail;
+
+ err = f2fs_build_stats(sbi);
+ if (err)
+ goto fail;
+
+ return 0;
+fail:
+ stop_gc_thread(sbi);
+free_root_inode:
+ dput(sb->s_root);
+ sb->s_root = NULL;
+free_node_inode:
+ iput(sbi->node_inode);
+free_nm:
+ destroy_node_manager(sbi);
+free_sm:
+ destroy_segment_manager(sbi);
+free_cp:
+ kfree(sbi->ckpt);
+free_meta_inode:
+ make_bad_inode(sbi->meta_inode);
+ iput(sbi->meta_inode);
+free_sb_buf:
+ brelse(raw_super_buf);
+free_sbi:
+ kfree(sbi);
+ return err;
+}
+
+static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
+{
+ return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
+}
+
+static struct file_system_type f2fs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "f2fs",
+ .mount = f2fs_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+static int init_inodecache(void)
+{
+ f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
+ sizeof(struct f2fs_inode_info), NULL);
+ if (f2fs_inode_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+static void destroy_inodecache(void)
+{
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(f2fs_inode_cachep);
+}
+
+static int __init init_f2fs_fs(void)
+{
+ int err;
+
+ err = init_inodecache();
+ if (err)
+ goto fail;
+ err = create_node_manager_caches();
+ if (err)
+ goto fail;
+ err = create_gc_caches();
+ if (err)
+ goto fail;
+ err = create_checkpoint_caches();
+ if (err)
+ goto fail;
+ return register_filesystem(&f2fs_fs_type);
+fail:
+ return err;
+}
+
+static void __exit exit_f2fs_fs(void)
+{
+ destroy_root_stats();
+ unregister_filesystem(&f2fs_fs_type);
+ destroy_checkpoint_caches();
+ destroy_gc_caches();
+ destroy_node_manager_caches();
+ destroy_inodecache();
+}
+
+module_init(init_f2fs_fs)
+module_exit(exit_f2fs_fs)
+
+MODULE_AUTHOR("Samsung Electronics's Praesto Team");
+MODULE_DESCRIPTION("Flash Friendly File System");
+MODULE_LICENSE("GPL");
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
new file mode 100644
index 000000000000..7d52e8dc0c59
--- /dev/null
+++ b/fs/f2fs/xattr.c
@@ -0,0 +1,440 @@
+/*
+ * fs/f2fs/xattr.c
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/xattr.c
+ *
+ * Copyright (C) 2001-2003 Andreas Gruenbacher <agruen@suse.de>
+ *
+ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
+ * Extended attributes for symlinks and special files added per
+ * suggestion of Luka Renko <luka.renko@hermes.si>.
+ * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
+ * Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/rwsem.h>
+#include <linux/f2fs_fs.h>
+#include "f2fs.h"
+#include "xattr.h"
+
+static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+ int total_len, prefix_len = 0;
+ const char *prefix = NULL;
+
+ switch (type) {
+ case F2FS_XATTR_INDEX_USER:
+ if (!test_opt(sbi, XATTR_USER))
+ return -EOPNOTSUPP;
+ prefix = XATTR_USER_PREFIX;
+ prefix_len = XATTR_USER_PREFIX_LEN;
+ break;
+ case F2FS_XATTR_INDEX_TRUSTED:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ prefix = XATTR_TRUSTED_PREFIX;
+ prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ total_len = prefix_len + name_len + 1;
+ if (list && total_len <= list_size) {
+ memcpy(list, prefix, prefix_len);
+ memcpy(list+prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+
+ switch (type) {
+ case F2FS_XATTR_INDEX_USER:
+ if (!test_opt(sbi, XATTR_USER))
+ return -EOPNOTSUPP;
+ break;
+ case F2FS_XATTR_INDEX_TRUSTED:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return f2fs_getxattr(dentry->d_inode, type, name,
+ buffer, size);
+}
+
+static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+
+ switch (type) {
+ case F2FS_XATTR_INDEX_USER:
+ if (!test_opt(sbi, XATTR_USER))
+ return -EOPNOTSUPP;
+ break;
+ case F2FS_XATTR_INDEX_TRUSTED:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ return f2fs_setxattr(dentry->d_inode, type, name, value, size);
+}
+
+static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
+ size_t list_size, const char *name, size_t name_len, int type)
+{
+ const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
+ size_t size;
+
+ if (type != F2FS_XATTR_INDEX_ADVISE)
+ return 0;
+
+ size = strlen(xname) + 1;
+ if (list && size <= list_size)
+ memcpy(list, xname, size);
+ return size;
+}
+
+static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
+ void *buffer, size_t size, int type)
+{
+ struct inode *inode = dentry->d_inode;
+
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+
+ *((char *)buffer) = F2FS_I(inode)->i_advise;
+ return sizeof(char);
+}
+
+static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ struct inode *inode = dentry->d_inode;
+
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+ if (value == NULL)
+ return -EINVAL;
+
+ F2FS_I(inode)->i_advise |= *(char *)value;
+ return 0;
+}
+
+const struct xattr_handler f2fs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .flags = F2FS_XATTR_INDEX_USER,
+ .list = f2fs_xattr_generic_list,
+ .get = f2fs_xattr_generic_get,
+ .set = f2fs_xattr_generic_set,
+};
+
+const struct xattr_handler f2fs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .flags = F2FS_XATTR_INDEX_TRUSTED,
+ .list = f2fs_xattr_generic_list,
+ .get = f2fs_xattr_generic_get,
+ .set = f2fs_xattr_generic_set,
+};
+
+const struct xattr_handler f2fs_xattr_advise_handler = {
+ .prefix = F2FS_SYSTEM_ADVISE_PREFIX,
+ .flags = F2FS_XATTR_INDEX_ADVISE,
+ .list = f2fs_xattr_advise_list,
+ .get = f2fs_xattr_advise_get,
+ .set = f2fs_xattr_advise_set,
+};
+
+static const struct xattr_handler *f2fs_xattr_handler_map[] = {
+ [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler,
+ [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
+#endif
+ [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
+ [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
+};
+
+const struct xattr_handler *f2fs_xattr_handlers[] = {
+ &f2fs_xattr_user_handler,
+#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ &f2fs_xattr_acl_access_handler,
+ &f2fs_xattr_acl_default_handler,
+#endif
+ &f2fs_xattr_trusted_handler,
+ &f2fs_xattr_advise_handler,
+ NULL,
+};
+
+static inline const struct xattr_handler *f2fs_xattr_handler(int name_index)
+{
+ const struct xattr_handler *handler = NULL;
+
+ if (name_index > 0 && name_index < ARRAY_SIZE(f2fs_xattr_handler_map))
+ handler = f2fs_xattr_handler_map[name_index];
+ return handler;
+}
+
+int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
+ void *buffer, size_t buffer_size)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_xattr_entry *entry;
+ struct page *page;
+ void *base_addr;
+ int error = 0, found = 0;
+ int value_len, name_len;
+
+ if (name == NULL)
+ return -EINVAL;
+ name_len = strlen(name);
+
+ if (!fi->i_xattr_nid)
+ return -ENODATA;
+
+ page = get_node_page(sbi, fi->i_xattr_nid);
+ base_addr = page_address(page);
+
+ list_for_each_xattr(entry, base_addr) {
+ if (entry->e_name_index != name_index)
+ continue;
+ if (entry->e_name_len != name_len)
+ continue;
+ if (!memcmp(entry->e_name, name, name_len)) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ error = -ENODATA;
+ goto cleanup;
+ }
+
+ value_len = le16_to_cpu(entry->e_value_size);
+
+ if (buffer && value_len > buffer_size) {
+ error = -ERANGE;
+ goto cleanup;
+ }
+
+ if (buffer) {
+ char *pval = entry->e_name + entry->e_name_len;
+ memcpy(buffer, pval, value_len);
+ }
+ error = value_len;
+
+cleanup:
+ f2fs_put_page(page, 1);
+ return error;
+}
+
+ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ struct inode *inode = dentry->d_inode;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_xattr_entry *entry;
+ struct page *page;
+ void *base_addr;
+ int error = 0;
+ size_t rest = buffer_size;
+
+ if (!fi->i_xattr_nid)
+ return 0;
+
+ page = get_node_page(sbi, fi->i_xattr_nid);
+ base_addr = page_address(page);
+
+ list_for_each_xattr(entry, base_addr) {
+ const struct xattr_handler *handler =
+ f2fs_xattr_handler(entry->e_name_index);
+ size_t size;
+
+ if (!handler)
+ continue;
+
+ size = handler->list(dentry, buffer, rest, entry->e_name,
+ entry->e_name_len, handler->flags);
+ if (buffer && size > rest) {
+ error = -ERANGE;
+ goto cleanup;
+ }
+
+ if (buffer)
+ buffer += size;
+ rest -= size;
+ }
+ error = buffer_size - rest;
+cleanup:
+ f2fs_put_page(page, 1);
+ return error;
+}
+
+int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
+ const void *value, size_t value_len)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_xattr_header *header = NULL;
+ struct f2fs_xattr_entry *here, *last;
+ struct page *page;
+ void *base_addr;
+ int error, found, free, name_len, newsize;
+ char *pval;
+
+ if (name == NULL)
+ return -EINVAL;
+ name_len = strlen(name);
+
+ if (value == NULL)
+ value_len = 0;
+
+ if (name_len > 255 || value_len > MAX_VALUE_LEN)
+ return -ERANGE;
+
+ mutex_lock_op(sbi, NODE_NEW);
+ if (!fi->i_xattr_nid) {
+ /* Allocate new attribute block */
+ struct dnode_of_data dn;
+
+ if (!alloc_nid(sbi, &fi->i_xattr_nid)) {
+ mutex_unlock_op(sbi, NODE_NEW);
+ return -ENOSPC;
+ }
+ set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
+ mark_inode_dirty(inode);
+
+ page = new_node_page(&dn, XATTR_NODE_OFFSET);
+ if (IS_ERR(page)) {
+ alloc_nid_failed(sbi, fi->i_xattr_nid);
+ fi->i_xattr_nid = 0;
+ mutex_unlock_op(sbi, NODE_NEW);
+ return PTR_ERR(page);
+ }
+
+ alloc_nid_done(sbi, fi->i_xattr_nid);
+ base_addr = page_address(page);
+ header = XATTR_HDR(base_addr);
+ header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
+ header->h_refcount = cpu_to_le32(1);
+ } else {
+ /* The inode already has an extended attribute block. */
+ page = get_node_page(sbi, fi->i_xattr_nid);
+ if (IS_ERR(page)) {
+ mutex_unlock_op(sbi, NODE_NEW);
+ return PTR_ERR(page);
+ }
+
+ base_addr = page_address(page);
+ header = XATTR_HDR(base_addr);
+ }
+
+ if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) {
+ error = -EIO;
+ goto cleanup;
+ }
+
+ /* find entry with wanted name. */
+ found = 0;
+ list_for_each_xattr(here, base_addr) {
+ if (here->e_name_index != name_index)
+ continue;
+ if (here->e_name_len != name_len)
+ continue;
+ if (!memcmp(here->e_name, name, name_len)) {
+ found = 1;
+ break;
+ }
+ }
+
+ last = here;
+
+ while (!IS_XATTR_LAST_ENTRY(last))
+ last = XATTR_NEXT_ENTRY(last);
+
+ newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) +
+ name_len + value_len);
+
+ /* 1. Check space */
+ if (value) {
+ /* If value is NULL, it is remove operation.
+ * In case of update operation, we caculate free.
+ */
+ free = MIN_OFFSET - ((char *)last - (char *)header);
+ if (found)
+ free = free - ENTRY_SIZE(here);
+
+ if (free < newsize) {
+ error = -ENOSPC;
+ goto cleanup;
+ }
+ }
+
+ /* 2. Remove old entry */
+ if (found) {
+ /* If entry is found, remove old entry.
+ * If not found, remove operation is not needed.
+ */
+ struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here);
+ int oldsize = ENTRY_SIZE(here);
+
+ memmove(here, next, (char *)last - (char *)next);
+ last = (struct f2fs_xattr_entry *)((char *)last - oldsize);
+ memset(last, 0, oldsize);
+ }
+
+ /* 3. Write new entry */
+ if (value) {
+ /* Before we come here, old entry is removed.
+ * We just write new entry. */
+ memset(last, 0, newsize);
+ last->e_name_index = name_index;
+ last->e_name_len = name_len;
+ memcpy(last->e_name, name, name_len);
+ pval = last->e_name + name_len;
+ memcpy(pval, value, value_len);
+ last->e_value_size = cpu_to_le16(value_len);
+ }
+
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+
+ if (is_inode_flag_set(fi, FI_ACL_MODE)) {
+ inode->i_mode = fi->i_acl_mode;
+ inode->i_ctime = CURRENT_TIME;
+ clear_inode_flag(fi, FI_ACL_MODE);
+ }
+ f2fs_write_inode(inode, NULL);
+ mutex_unlock_op(sbi, NODE_NEW);
+
+ return 0;
+cleanup:
+ f2fs_put_page(page, 1);
+ mutex_unlock_op(sbi, NODE_NEW);
+ return error;
+}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
new file mode 100644
index 000000000000..49c9558305e3
--- /dev/null
+++ b/fs/f2fs/xattr.h
@@ -0,0 +1,145 @@
+/*
+ * fs/f2fs/xattr.h
+ *
+ * Copyright (c) 2012 Samsung Electronics Co., Ltd.
+ * http://www.samsung.com/
+ *
+ * Portions of this code from linux/fs/ext2/xattr.h
+ *
+ * On-disk format of extended attributes for the ext2 filesystem.
+ *
+ * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_XATTR_H__
+#define __F2FS_XATTR_H__
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+/* Magic value in attribute blocks */
+#define F2FS_XATTR_MAGIC 0xF2F52011
+
+/* Maximum number of references to one attribute block */
+#define F2FS_XATTR_REFCOUNT_MAX 1024
+
+/* Name indexes */
+#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise"
+#define F2FS_XATTR_INDEX_USER 1
+#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2
+#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
+#define F2FS_XATTR_INDEX_TRUSTED 4
+#define F2FS_XATTR_INDEX_LUSTRE 5
+#define F2FS_XATTR_INDEX_SECURITY 6
+#define F2FS_XATTR_INDEX_ADVISE 7
+
+struct f2fs_xattr_header {
+ __le32 h_magic; /* magic number for identification */
+ __le32 h_refcount; /* reference count */
+ __u32 h_reserved[4]; /* zero right now */
+};
+
+struct f2fs_xattr_entry {
+ __u8 e_name_index;
+ __u8 e_name_len;
+ __le16 e_value_size; /* size of attribute value */
+ char e_name[0]; /* attribute name */
+};
+
+#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr))
+#define XATTR_ENTRY(ptr) ((struct f2fs_xattr_entry *)(ptr))
+#define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr)+1))
+#define XATTR_ROUND (3)
+
+#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND)
+
+#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
+ entry->e_name_len + le16_to_cpu(entry->e_value_size)))
+
+#define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\
+ ENTRY_SIZE(entry)))
+
+#define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+#define list_for_each_xattr(entry, addr) \
+ for (entry = XATTR_FIRST_ENTRY(addr);\
+ !IS_XATTR_LAST_ENTRY(entry);\
+ entry = XATTR_NEXT_ENTRY(entry))
+
+
+#define MIN_OFFSET XATTR_ALIGN(PAGE_SIZE - \
+ sizeof(struct node_footer) - \
+ sizeof(__u32))
+
+#define MAX_VALUE_LEN (MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \
+ sizeof(struct f2fs_xattr_entry))
+
+/*
+ * On-disk structure of f2fs_xattr
+ * We use only 1 block for xattr.
+ *
+ * +--------------------+
+ * | f2fs_xattr_header |
+ * | |
+ * +--------------------+
+ * | f2fs_xattr_entry |
+ * | .e_name_index = 1 |
+ * | .e_name_len = 3 |
+ * | .e_value_size = 14 |
+ * | .e_name = "foo" |
+ * | "value_of_xattr" |<- value_offs = e_name + e_name_len
+ * +--------------------+
+ * | f2fs_xattr_entry |
+ * | .e_name_index = 4 |
+ * | .e_name = "bar" |
+ * +--------------------+
+ * | |
+ * | Free |
+ * | |
+ * +--------------------+<- MIN_OFFSET
+ * | node_footer |
+ * | (nid, ino, offset) |
+ * +--------------------+
+ *
+ **/
+
+#ifdef CONFIG_F2FS_FS_XATTR
+extern const struct xattr_handler f2fs_xattr_user_handler;
+extern const struct xattr_handler f2fs_xattr_trusted_handler;
+extern const struct xattr_handler f2fs_xattr_acl_access_handler;
+extern const struct xattr_handler f2fs_xattr_acl_default_handler;
+extern const struct xattr_handler f2fs_xattr_advise_handler;
+
+extern const struct xattr_handler *f2fs_xattr_handlers[];
+
+extern int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
+ const void *value, size_t value_len);
+extern int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
+ void *buffer, size_t buffer_size);
+extern ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
+ size_t buffer_size);
+
+#else
+
+#define f2fs_xattr_handlers NULL
+static inline int f2fs_setxattr(struct inode *inode, int name_index,
+ const char *name, const void *value, size_t value_len)
+{
+ return -EOPNOTSUPP;
+}
+static inline int f2fs_getxattr(struct inode *inode, int name_index,
+ const char *name, void *buffer, size_t buffer_size)
+{
+ return -EOPNOTSUPP;
+}
+static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
+ size_t buffer_size)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+#endif /* __F2FS_XATTR_H__ */
diff --git a/fs/fhandle.c b/fs/fhandle.c
index cccdc874bb55..999ff5c3cab0 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -52,7 +52,7 @@ static long do_sys_name_to_handle(struct path *path,
handle_bytes = handle_dwords * sizeof(u32);
handle->handle_bytes = handle_bytes;
if ((handle->handle_bytes > f_handle.handle_bytes) ||
- (retval == 255) || (retval == -ENOSPC)) {
+ (retval == FILEID_INVALID) || (retval == -ENOSPC)) {
/* As per old exportfs_encode_fh documentation
* we could return ENOSPC to indicate overflow
* But file system returned 255 always. So handle
diff --git a/fs/file_table.c b/fs/file_table.c
index a72bf9ddd0d2..de9e9653d611 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -458,8 +458,8 @@ void mark_files_ro(struct super_block *sb)
spin_unlock(&f->f_lock);
if (file_check_writeable(f) != 0)
continue;
+ __mnt_drop_write(f->f_path.mnt);
file_release_write(f);
- mnt_drop_write_file(f);
} while_file_list_for_each_entry;
lg_global_unlock(&files_lglock);
}
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index 6a3c48abd677..b52aed1dca97 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -314,10 +314,10 @@ EXPORT_SYMBOL(fscache_add_cache);
*/
void fscache_io_error(struct fscache_cache *cache)
{
- set_bit(FSCACHE_IOERROR, &cache->flags);
-
- printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n",
- cache->ops->name);
+ if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags))
+ printk(KERN_ERR "FS-Cache:"
+ " Cache '%s' stopped due to I/O error\n",
+ cache->ops->name);
}
EXPORT_SYMBOL(fscache_io_error);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 990535071a8a..8dcb114758e3 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -370,6 +370,66 @@ cant_attach_object:
}
/*
+ * Invalidate an object. Callable with spinlocks held.
+ */
+void __fscache_invalidate(struct fscache_cookie *cookie)
+{
+ struct fscache_object *object;
+
+ _enter("{%s}", cookie->def->name);
+
+ fscache_stat(&fscache_n_invalidates);
+
+ /* Only permit invalidation of data files. Invalidating an index will
+ * require the caller to release all its attachments to the tree rooted
+ * there, and if it's doing that, it may as well just retire the
+ * cookie.
+ */
+ ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
+
+ /* We will be updating the cookie too. */
+ BUG_ON(!cookie->def->get_aux);
+
+ /* If there's an object, we tell the object state machine to handle the
+ * invalidation on our behalf, otherwise there's nothing to do.
+ */
+ if (!hlist_empty(&cookie->backing_objects)) {
+ spin_lock(&cookie->lock);
+
+ if (!hlist_empty(&cookie->backing_objects) &&
+ !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING,
+ &cookie->flags)) {
+ object = hlist_entry(cookie->backing_objects.first,
+ struct fscache_object,
+ cookie_link);
+ if (object->state < FSCACHE_OBJECT_DYING)
+ fscache_raise_event(
+ object, FSCACHE_OBJECT_EV_INVALIDATE);
+ }
+
+ spin_unlock(&cookie->lock);
+ }
+
+ _leave("");
+}
+EXPORT_SYMBOL(__fscache_invalidate);
+
+/*
+ * Wait for object invalidation to complete.
+ */
+void __fscache_wait_on_invalidate(struct fscache_cookie *cookie)
+{
+ _enter("%p", cookie);
+
+ wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING,
+ fscache_wait_bit_interruptible,
+ TASK_UNINTERRUPTIBLE);
+
+ _leave("");
+}
+EXPORT_SYMBOL(__fscache_wait_on_invalidate);
+
+/*
* update the index entries backing a cookie
*/
void __fscache_update_cookie(struct fscache_cookie *cookie)
@@ -442,16 +502,34 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
+try_again:
spin_lock(&cookie->lock);
/* break links with all the active objects */
while (!hlist_empty(&cookie->backing_objects)) {
+ int n_reads;
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object,
cookie_link);
_debug("RELEASE OBJ%x", object->debug_id);
+ set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
+ n_reads = atomic_read(&object->n_reads);
+ if (n_reads) {
+ int n_ops = object->n_ops;
+ int n_in_progress = object->n_in_progress;
+ spin_unlock(&cookie->lock);
+ printk(KERN_ERR "FS-Cache:"
+ " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
+ cookie->def->name,
+ n_reads, n_ops, n_in_progress);
+ wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+ printk("Wait finished\n");
+ goto try_again;
+ }
+
/* detach each cache object from the object cookie */
spin_lock(&object->lock);
hlist_del_init(&object->cookie_link);
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index f6aad48d38a8..ee38fef4be51 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -121,12 +121,19 @@ extern int fscache_submit_exclusive_op(struct fscache_object *,
struct fscache_operation *);
extern int fscache_submit_op(struct fscache_object *,
struct fscache_operation *);
-extern int fscache_cancel_op(struct fscache_operation *);
+extern int fscache_cancel_op(struct fscache_operation *,
+ void (*)(struct fscache_operation *));
+extern void fscache_cancel_all_ops(struct fscache_object *);
extern void fscache_abort_object(struct fscache_object *);
extern void fscache_start_operations(struct fscache_object *);
extern void fscache_operation_gc(struct work_struct *);
/*
+ * page.c
+ */
+extern void fscache_invalidate_writes(struct fscache_cookie *);
+
+/*
* proc.c
*/
#ifdef CONFIG_PROC_FS
@@ -194,6 +201,7 @@ extern atomic_t fscache_n_store_vmscan_not_storing;
extern atomic_t fscache_n_store_vmscan_gone;
extern atomic_t fscache_n_store_vmscan_busy;
extern atomic_t fscache_n_store_vmscan_cancelled;
+extern atomic_t fscache_n_store_vmscan_wait;
extern atomic_t fscache_n_marks;
extern atomic_t fscache_n_uncaches;
@@ -205,6 +213,9 @@ extern atomic_t fscache_n_acquires_ok;
extern atomic_t fscache_n_acquires_nobufs;
extern atomic_t fscache_n_acquires_oom;
+extern atomic_t fscache_n_invalidates;
+extern atomic_t fscache_n_invalidates_run;
+
extern atomic_t fscache_n_updates;
extern atomic_t fscache_n_updates_null;
extern atomic_t fscache_n_updates_run;
@@ -237,6 +248,7 @@ extern atomic_t fscache_n_cop_alloc_object;
extern atomic_t fscache_n_cop_lookup_object;
extern atomic_t fscache_n_cop_lookup_complete;
extern atomic_t fscache_n_cop_grab_object;
+extern atomic_t fscache_n_cop_invalidate_object;
extern atomic_t fscache_n_cop_update_object;
extern atomic_t fscache_n_cop_drop_object;
extern atomic_t fscache_n_cop_put_object;
@@ -278,6 +290,7 @@ extern const struct file_operations fscache_stats_fops;
static inline void fscache_raise_event(struct fscache_object *object,
unsigned event)
{
+ BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
if (!test_and_set_bit(event, &object->events) &&
test_bit(event, &object->event_mask))
fscache_enqueue_object(object);
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index ebe29c581380..f27c89d17885 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -245,7 +245,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
obj->n_in_progress,
obj->n_exclusive,
atomic_read(&obj->n_reads),
- obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK,
+ obj->event_mask,
obj->events,
obj->flags,
work_busy(&obj->work));
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index b6b897c550ac..50d41c180211 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,6 +14,7 @@
#define FSCACHE_DEBUG_LEVEL COOKIE
#include <linux/module.h>
+#include <linux/slab.h>
#include "internal.h"
const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -22,6 +23,7 @@ const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
[FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
[FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
[FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
+ [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING",
[FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
[FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
[FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
@@ -39,6 +41,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
[FSCACHE_OBJECT_CREATING] = "CRTN",
[FSCACHE_OBJECT_AVAILABLE] = "AVBL",
[FSCACHE_OBJECT_ACTIVE] = "ACTV",
+ [FSCACHE_OBJECT_INVALIDATING] = "INVL",
[FSCACHE_OBJECT_UPDATING] = "UPDT",
[FSCACHE_OBJECT_DYING] = "DYNG",
[FSCACHE_OBJECT_LC_DYING] = "LCDY",
@@ -54,6 +57,7 @@ static void fscache_put_object(struct fscache_object *);
static void fscache_initialise_object(struct fscache_object *);
static void fscache_lookup_object(struct fscache_object *);
static void fscache_object_available(struct fscache_object *);
+static void fscache_invalidate_object(struct fscache_object *);
static void fscache_release_object(struct fscache_object *);
static void fscache_withdraw_object(struct fscache_object *);
static void fscache_enqueue_dependents(struct fscache_object *);
@@ -79,6 +83,15 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
}
/*
+ * Notify netfs of invalidation completion.
+ */
+static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
+{
+ if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+}
+
+/*
* process events that have been sent to an object's state machine
* - initiates parent lookup
* - does object lookup
@@ -90,6 +103,7 @@ static void fscache_object_state_machine(struct fscache_object *object)
{
enum fscache_object_state new_state;
struct fscache_cookie *cookie;
+ int event;
ASSERT(object != NULL);
@@ -101,7 +115,8 @@ static void fscache_object_state_machine(struct fscache_object *object)
/* wait for the parent object to become ready */
case FSCACHE_OBJECT_INIT:
object->event_mask =
- ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED);
+ FSCACHE_OBJECT_EVENTS_MASK &
+ ~(1 << FSCACHE_OBJECT_EV_CLEARED);
fscache_initialise_object(object);
goto done;
@@ -125,6 +140,16 @@ static void fscache_object_state_machine(struct fscache_object *object)
case FSCACHE_OBJECT_ACTIVE:
goto active_transit;
+ /* Invalidate an object on disk */
+ case FSCACHE_OBJECT_INVALIDATING:
+ clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
+ fscache_stat(&fscache_n_invalidates_run);
+ fscache_stat(&fscache_n_cop_invalidate_object);
+ fscache_invalidate_object(object);
+ fscache_stat_d(&fscache_n_cop_invalidate_object);
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+ goto active_transit;
+
/* update the object metadata on disk */
case FSCACHE_OBJECT_UPDATING:
clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
@@ -251,13 +276,17 @@ static void fscache_object_state_machine(struct fscache_object *object)
/* determine the transition from a lookup state */
lookup_transit:
- switch (fls(object->events & object->event_mask) - 1) {
+ event = fls(object->events & object->event_mask) - 1;
+ switch (event) {
case FSCACHE_OBJECT_EV_WITHDRAW:
case FSCACHE_OBJECT_EV_RETIRE:
case FSCACHE_OBJECT_EV_RELEASE:
case FSCACHE_OBJECT_EV_ERROR:
new_state = FSCACHE_OBJECT_LC_DYING;
goto change_state;
+ case FSCACHE_OBJECT_EV_INVALIDATE:
+ new_state = FSCACHE_OBJECT_INVALIDATING;
+ goto change_state;
case FSCACHE_OBJECT_EV_REQUEUE:
goto done;
case -1:
@@ -268,13 +297,17 @@ lookup_transit:
/* determine the transition from an active state */
active_transit:
- switch (fls(object->events & object->event_mask) - 1) {
+ event = fls(object->events & object->event_mask) - 1;
+ switch (event) {
case FSCACHE_OBJECT_EV_WITHDRAW:
case FSCACHE_OBJECT_EV_RETIRE:
case FSCACHE_OBJECT_EV_RELEASE:
case FSCACHE_OBJECT_EV_ERROR:
new_state = FSCACHE_OBJECT_DYING;
goto change_state;
+ case FSCACHE_OBJECT_EV_INVALIDATE:
+ new_state = FSCACHE_OBJECT_INVALIDATING;
+ goto change_state;
case FSCACHE_OBJECT_EV_UPDATE:
new_state = FSCACHE_OBJECT_UPDATING;
goto change_state;
@@ -287,7 +320,8 @@ active_transit:
/* determine the transition from a terminal state */
terminal_transit:
- switch (fls(object->events & object->event_mask) - 1) {
+ event = fls(object->events & object->event_mask) - 1;
+ switch (event) {
case FSCACHE_OBJECT_EV_WITHDRAW:
new_state = FSCACHE_OBJECT_WITHDRAWING;
goto change_state;
@@ -320,8 +354,8 @@ done:
unsupported_event:
printk(KERN_ERR "FS-Cache:"
- " Unsupported event %lx [mask %lx] in state %s\n",
- object->events, object->event_mask,
+ " Unsupported event %d [%lx/%lx] in state %s\n",
+ event, object->events, object->event_mask,
fscache_object_states[object->state]);
BUG();
}
@@ -587,8 +621,6 @@ static void fscache_object_available(struct fscache_object *object)
if (object->n_in_progress == 0) {
if (object->n_ops > 0) {
ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
- ASSERTIF(object->n_ops > object->n_obj_ops,
- !list_empty(&object->pending_ops));
fscache_start_operations(object);
} else {
ASSERT(list_empty(&object->pending_ops));
@@ -681,6 +713,7 @@ static void fscache_withdraw_object(struct fscache_object *object)
if (object->cookie == cookie) {
hlist_del_init(&object->cookie_link);
object->cookie = NULL;
+ fscache_invalidation_complete(cookie);
detached = true;
}
spin_unlock(&cookie->lock);
@@ -890,3 +923,55 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
return result;
}
EXPORT_SYMBOL(fscache_check_aux);
+
+/*
+ * Asynchronously invalidate an object.
+ */
+static void fscache_invalidate_object(struct fscache_object *object)
+{
+ struct fscache_operation *op;
+ struct fscache_cookie *cookie = object->cookie;
+
+ _enter("{OBJ%x}", object->debug_id);
+
+ /* Reject any new read/write ops and abort any that are pending. */
+ fscache_invalidate_writes(cookie);
+ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+ fscache_cancel_all_ops(object);
+
+ /* Now we have to wait for in-progress reads and writes */
+ op = kzalloc(sizeof(*op), GFP_KERNEL);
+ if (!op) {
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
+ _leave(" [ENOMEM]");
+ return;
+ }
+
+ fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
+ op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+
+ spin_lock(&cookie->lock);
+ if (fscache_submit_exclusive_op(object, op) < 0)
+ goto submit_op_failed;
+ spin_unlock(&cookie->lock);
+ fscache_put_operation(op);
+
+ /* Once we've completed the invalidation, we know there will be no data
+ * stored in the cache and thus we can reinstate the data-check-skip
+ * optimisation.
+ */
+ set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+
+ /* We can allow read and write requests to come in once again. They'll
+ * queue up behind our exclusive invalidation operation.
+ */
+ fscache_invalidation_complete(cookie);
+ _leave("");
+ return;
+
+submit_op_failed:
+ spin_unlock(&cookie->lock);
+ kfree(op);
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
+ _leave(" [EIO]");
+}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 30afdfa7aec7..762a9ec4ffa4 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -37,6 +37,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
ASSERT(op->processor != NULL);
ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
ASSERTCMP(atomic_read(&op->usage), >, 0);
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
fscache_stat(&fscache_n_op_enqueue);
switch (op->flags & FSCACHE_OP_TYPE) {
@@ -64,6 +65,9 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
static void fscache_run_op(struct fscache_object *object,
struct fscache_operation *op)
{
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+
+ op->state = FSCACHE_OP_ST_IN_PROGRESS;
object->n_in_progress++;
if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -84,18 +88,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
+ ASSERTCMP(atomic_read(&op->usage), >, 0);
+
spin_lock(&object->lock);
ASSERTCMP(object->n_ops, >=, object->n_in_progress);
ASSERTCMP(object->n_ops, >=, object->n_exclusive);
ASSERT(list_empty(&op->pend_link));
- ret = -ENOBUFS;
+ op->state = FSCACHE_OP_ST_PENDING;
if (fscache_object_is_active(object)) {
op->object = object;
object->n_ops++;
object->n_exclusive++; /* reads and writes must wait */
- if (object->n_ops > 1) {
+ if (object->n_in_progress > 0) {
atomic_inc(&op->usage);
list_add_tail(&op->pend_link, &object->pending_ops);
fscache_stat(&fscache_n_op_pend);
@@ -121,8 +128,11 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
fscache_stat(&fscache_n_op_pend);
ret = 0;
} else {
- /* not allowed to submit ops in any other state */
- BUG();
+ /* If we're in any other state, there must have been an I/O
+ * error of some nature.
+ */
+ ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags));
+ ret = -EIO;
}
spin_unlock(&object->lock);
@@ -186,6 +196,7 @@ int fscache_submit_op(struct fscache_object *object,
_enter("{OBJ%x OP%x},{%u}",
object->debug_id, op->debug_id, atomic_read(&op->usage));
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
ASSERTCMP(atomic_read(&op->usage), >, 0);
spin_lock(&object->lock);
@@ -196,6 +207,7 @@ int fscache_submit_op(struct fscache_object *object,
ostate = object->state;
smp_rmb();
+ op->state = FSCACHE_OP_ST_PENDING;
if (fscache_object_is_active(object)) {
op->object = object;
object->n_ops++;
@@ -225,12 +237,15 @@ int fscache_submit_op(struct fscache_object *object,
object->state == FSCACHE_OBJECT_LC_DYING ||
object->state == FSCACHE_OBJECT_WITHDRAWING) {
fscache_stat(&fscache_n_op_rejected);
+ op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
} else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
fscache_report_unexpected_submission(object, op, ostate);
ASSERT(!fscache_object_is_active(object));
+ op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
} else {
+ op->state = FSCACHE_OP_ST_CANCELLED;
ret = -ENOBUFS;
}
@@ -283,20 +298,28 @@ void fscache_start_operations(struct fscache_object *object)
/*
* cancel an operation that's pending on an object
*/
-int fscache_cancel_op(struct fscache_operation *op)
+int fscache_cancel_op(struct fscache_operation *op,
+ void (*do_cancel)(struct fscache_operation *))
{
struct fscache_object *object = op->object;
int ret;
_enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
+ ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING);
+ ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED);
+ ASSERTCMP(atomic_read(&op->usage), >, 0);
+
spin_lock(&object->lock);
ret = -EBUSY;
- if (!list_empty(&op->pend_link)) {
+ if (op->state == FSCACHE_OP_ST_PENDING) {
+ ASSERT(!list_empty(&op->pend_link));
fscache_stat(&fscache_n_op_cancelled);
list_del_init(&op->pend_link);
- object->n_ops--;
+ if (do_cancel)
+ do_cancel(op);
+ op->state = FSCACHE_OP_ST_CANCELLED;
if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
object->n_exclusive--;
if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
@@ -311,6 +334,70 @@ int fscache_cancel_op(struct fscache_operation *op)
}
/*
+ * Cancel all pending operations on an object
+ */
+void fscache_cancel_all_ops(struct fscache_object *object)
+{
+ struct fscache_operation *op;
+
+ _enter("OBJ%x", object->debug_id);
+
+ spin_lock(&object->lock);
+
+ while (!list_empty(&object->pending_ops)) {
+ op = list_entry(object->pending_ops.next,
+ struct fscache_operation, pend_link);
+ fscache_stat(&fscache_n_op_cancelled);
+ list_del_init(&op->pend_link);
+
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
+ op->state = FSCACHE_OP_ST_CANCELLED;
+
+ if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+ object->n_exclusive--;
+ if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
+ wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
+ fscache_put_operation(op);
+ cond_resched_lock(&object->lock);
+ }
+
+ spin_unlock(&object->lock);
+ _leave("");
+}
+
+/*
+ * Record the completion or cancellation of an in-progress operation.
+ */
+void fscache_op_complete(struct fscache_operation *op, bool cancelled)
+{
+ struct fscache_object *object = op->object;
+
+ _enter("OBJ%x", object->debug_id);
+
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
+ ASSERTCMP(object->n_in_progress, >, 0);
+ ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
+ object->n_exclusive, >, 0);
+ ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
+ object->n_in_progress, ==, 1);
+
+ spin_lock(&object->lock);
+
+ op->state = cancelled ?
+ FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE;
+
+ if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
+ object->n_exclusive--;
+ object->n_in_progress--;
+ if (object->n_in_progress == 0)
+ fscache_start_operations(object);
+
+ spin_unlock(&object->lock);
+ _leave("");
+}
+EXPORT_SYMBOL(fscache_op_complete);
+
+/*
* release an operation
* - queues pending ops if this is the last in-progress op
*/
@@ -328,8 +415,9 @@ void fscache_put_operation(struct fscache_operation *op)
return;
_debug("PUT OP");
- if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags))
- BUG();
+ ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE,
+ op->state, ==, FSCACHE_OP_ST_CANCELLED);
+ op->state = FSCACHE_OP_ST_DEAD;
fscache_stat(&fscache_n_op_release);
@@ -340,8 +428,14 @@ void fscache_put_operation(struct fscache_operation *op)
object = op->object;
- if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags))
- atomic_dec(&object->n_reads);
+ if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
+ if (atomic_dec_and_test(&object->n_reads)) {
+ clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
+ &object->cookie->flags);
+ wake_up_bit(&object->cookie->flags,
+ FSCACHE_COOKIE_WAITING_ON_READS);
+ }
+ }
/* now... we may get called with the object spinlock held, so we
* complete the cleanup here only if we can immediately acquire the
@@ -359,16 +453,6 @@ void fscache_put_operation(struct fscache_operation *op)
return;
}
- if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
- ASSERTCMP(object->n_exclusive, >, 0);
- object->n_exclusive--;
- }
-
- ASSERTCMP(object->n_in_progress, >, 0);
- object->n_in_progress--;
- if (object->n_in_progress == 0)
- fscache_start_operations(object);
-
ASSERTCMP(object->n_ops, >, 0);
object->n_ops--;
if (object->n_ops == 0)
@@ -407,23 +491,14 @@ void fscache_operation_gc(struct work_struct *work)
spin_unlock(&cache->op_gc_list_lock);
object = op->object;
+ spin_lock(&object->lock);
_debug("GC DEFERRED REL OBJ%x OP%x",
object->debug_id, op->debug_id);
fscache_stat(&fscache_n_op_gc);
ASSERTCMP(atomic_read(&op->usage), ==, 0);
-
- spin_lock(&object->lock);
- if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
- ASSERTCMP(object->n_exclusive, >, 0);
- object->n_exclusive--;
- }
-
- ASSERTCMP(object->n_in_progress, >, 0);
- object->n_in_progress--;
- if (object->n_in_progress == 0)
- fscache_start_operations(object);
+ ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD);
ASSERTCMP(object->n_ops, >, 0);
object->n_ops--;
@@ -431,6 +506,7 @@ void fscache_operation_gc(struct work_struct *work)
fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
spin_unlock(&object->lock);
+ kfree(op);
} while (count++ < 20);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 3f7a59bfa7ad..ff000e52072d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -56,6 +56,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
_enter("%p,%p,%x", cookie, page, gfp);
+try_again:
rcu_read_lock();
val = radix_tree_lookup(&cookie->stores, page->index);
if (!val) {
@@ -104,11 +105,19 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
return true;
page_busy:
- /* we might want to wait here, but that could deadlock the allocator as
- * the work threads writing to the cache may all end up sleeping
- * on memory allocation */
- fscache_stat(&fscache_n_store_vmscan_busy);
- return false;
+ /* We will wait here if we're allowed to, but that could deadlock the
+ * allocator as the work threads writing to the cache may all end up
+ * sleeping on memory allocation, so we may need to impose a timeout
+ * too. */
+ if (!(gfp & __GFP_WAIT)) {
+ fscache_stat(&fscache_n_store_vmscan_busy);
+ return false;
+ }
+
+ fscache_stat(&fscache_n_store_vmscan_wait);
+ __fscache_wait_on_page_write(cookie, page);
+ gfp &= ~__GFP_WAIT;
+ goto try_again;
}
EXPORT_SYMBOL(__fscache_maybe_release_page);
@@ -162,6 +171,7 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_abort_object(object);
}
+ fscache_op_complete(op, true);
_leave("");
}
@@ -223,6 +233,8 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
_enter("{OP%x}", op->op.debug_id);
+ ASSERTCMP(op->n_pages, ==, 0);
+
fscache_hist(fscache_retrieval_histogram, op->start_time);
if (op->context)
fscache_put_context(op->op.object->cookie, op->context);
@@ -291,6 +303,17 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
}
/*
+ * Handle cancellation of a pending retrieval op
+ */
+static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
+{
+ struct fscache_retrieval *op =
+ container_of(_op, struct fscache_retrieval, op);
+
+ op->n_pages = 0;
+}
+
+/*
* wait for an object to become active (or dead)
*/
static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
@@ -307,8 +330,8 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
fscache_stat(stat_op_waits);
if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
fscache_wait_bit_interruptible,
- TASK_INTERRUPTIBLE) < 0) {
- ret = fscache_cancel_op(&op->op);
+ TASK_INTERRUPTIBLE) != 0) {
+ ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
if (ret == 0)
return -ERESTARTSYS;
@@ -320,7 +343,14 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
_debug("<<< GO");
check_if_dead:
+ if (op->op.state == FSCACHE_OP_ST_CANCELLED) {
+ fscache_stat(stat_object_dead);
+ _leave(" = -ENOBUFS [cancelled]");
+ return -ENOBUFS;
+ }
if (unlikely(fscache_object_is_dead(object))) {
+ pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state);
+ fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
fscache_stat(stat_object_dead);
return -ENOBUFS;
}
@@ -353,6 +383,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
if (hlist_empty(&cookie->backing_objects))
goto nobufs;
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+ _leave(" = -ENOBUFS [invalidating]");
+ return -ENOBUFS;
+ }
+
ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
ASSERTCMP(page, !=, NULL);
@@ -364,6 +399,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
_leave(" = -ENOMEM");
return -ENOMEM;
}
+ op->n_pages = 1;
spin_lock(&cookie->lock);
@@ -375,10 +411,10 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
atomic_inc(&object->n_reads);
- set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+ __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
if (fscache_submit_op(object, &op->op) < 0)
- goto nobufs_unlock;
+ goto nobufs_unlock_dec;
spin_unlock(&cookie->lock);
fscache_stat(&fscache_n_retrieval_ops);
@@ -425,6 +461,8 @@ error:
_leave(" = %d", ret);
return ret;
+nobufs_unlock_dec:
+ atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
kfree(op);
@@ -472,6 +510,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
if (hlist_empty(&cookie->backing_objects))
goto nobufs;
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+ _leave(" = -ENOBUFS [invalidating]");
+ return -ENOBUFS;
+ }
+
ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
ASSERTCMP(*nr_pages, >, 0);
ASSERT(!list_empty(pages));
@@ -482,6 +525,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
op = fscache_alloc_retrieval(mapping, end_io_func, context);
if (!op)
return -ENOMEM;
+ op->n_pages = *nr_pages;
spin_lock(&cookie->lock);
@@ -491,10 +535,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
struct fscache_object, cookie_link);
atomic_inc(&object->n_reads);
- set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
+ __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
if (fscache_submit_op(object, &op->op) < 0)
- goto nobufs_unlock;
+ goto nobufs_unlock_dec;
spin_unlock(&cookie->lock);
fscache_stat(&fscache_n_retrieval_ops);
@@ -541,6 +585,8 @@ error:
_leave(" = %d", ret);
return ret;
+nobufs_unlock_dec:
+ atomic_dec(&object->n_reads);
nobufs_unlock:
spin_unlock(&cookie->lock);
kfree(op);
@@ -577,12 +623,18 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
ASSERTCMP(page, !=, NULL);
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+ _leave(" = -ENOBUFS [invalidating]");
+ return -ENOBUFS;
+ }
+
if (fscache_wait_for_deferred_lookup(cookie) < 0)
return -ERESTARTSYS;
op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
if (!op)
return -ENOMEM;
+ op->n_pages = 1;
spin_lock(&cookie->lock);
@@ -658,9 +710,27 @@ static void fscache_write_op(struct fscache_operation *_op)
spin_lock(&object->lock);
cookie = object->cookie;
- if (!fscache_object_is_active(object) || !cookie) {
+ if (!fscache_object_is_active(object)) {
+ /* If we get here, then the on-disk cache object likely longer
+ * exists, so we should just cancel this write operation.
+ */
+ spin_unlock(&object->lock);
+ fscache_op_complete(&op->op, false);
+ _leave(" [inactive]");
+ return;
+ }
+
+ if (!cookie) {
+ /* If we get here, then the cookie belonging to the object was
+ * detached, probably by the cookie being withdrawn due to
+ * memory pressure, which means that the pages we might write
+ * to the cache from no longer exist - therefore, we can just
+ * cancel this write operation.
+ */
spin_unlock(&object->lock);
- _leave("");
+ fscache_op_complete(&op->op, false);
+ _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
+ _op->flags, _op->state, object->state, object->flags);
return;
}
@@ -696,6 +766,7 @@ static void fscache_write_op(struct fscache_operation *_op)
fscache_end_page_write(object, page);
if (ret < 0) {
fscache_abort_object(object);
+ fscache_op_complete(&op->op, true);
} else {
fscache_enqueue_operation(&op->op);
}
@@ -710,6 +781,38 @@ superseded:
spin_unlock(&cookie->stores_lock);
clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
spin_unlock(&object->lock);
+ fscache_op_complete(&op->op, true);
+ _leave("");
+}
+
+/*
+ * Clear the pages pending writing for invalidation
+ */
+void fscache_invalidate_writes(struct fscache_cookie *cookie)
+{
+ struct page *page;
+ void *results[16];
+ int n, i;
+
+ _enter("");
+
+ while (spin_lock(&cookie->stores_lock),
+ n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
+ ARRAY_SIZE(results),
+ FSCACHE_COOKIE_PENDING_TAG),
+ n > 0) {
+ for (i = n - 1; i >= 0; i--) {
+ page = results[i];
+ radix_tree_delete(&cookie->stores, page->index);
+ }
+
+ spin_unlock(&cookie->stores_lock);
+
+ for (i = n - 1; i >= 0; i--)
+ page_cache_release(results[i]);
+ }
+
+ spin_unlock(&cookie->stores_lock);
_leave("");
}
@@ -759,7 +862,12 @@ int __fscache_write_page(struct fscache_cookie *cookie,
fscache_stat(&fscache_n_stores);
- op = kzalloc(sizeof(*op), GFP_NOIO);
+ if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
+ _leave(" = -ENOBUFS [invalidating]");
+ return -ENOBUFS;
+ }
+
+ op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
if (!op)
goto nomem;
@@ -915,6 +1023,40 @@ done:
EXPORT_SYMBOL(__fscache_uncache_page);
/**
+ * fscache_mark_page_cached - Mark a page as being cached
+ * @op: The retrieval op pages are being marked for
+ * @page: The page to be marked
+ *
+ * Mark a netfs page as being cached. After this is called, the netfs
+ * must call fscache_uncache_page() to remove the mark.
+ */
+void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
+{
+ struct fscache_cookie *cookie = op->op.object->cookie;
+
+#ifdef CONFIG_FSCACHE_STATS
+ atomic_inc(&fscache_n_marks);
+#endif
+
+ _debug("- mark %p{%lx}", page, page->index);
+ if (TestSetPageFsCache(page)) {
+ static bool once_only;
+ if (!once_only) {
+ once_only = true;
+ printk(KERN_WARNING "FS-Cache:"
+ " Cookie type %s marked page %lx"
+ " multiple times\n",
+ cookie->def->name, page->index);
+ }
+ }
+
+ if (cookie->def->mark_page_cached)
+ cookie->def->mark_page_cached(cookie->netfs_data,
+ op->mapping, page);
+}
+EXPORT_SYMBOL(fscache_mark_page_cached);
+
+/**
* fscache_mark_pages_cached - Mark pages as being cached
* @op: The retrieval op pages are being marked for
* @pagevec: The pages to be marked
@@ -925,32 +1067,11 @@ EXPORT_SYMBOL(__fscache_uncache_page);
void fscache_mark_pages_cached(struct fscache_retrieval *op,
struct pagevec *pagevec)
{
- struct fscache_cookie *cookie = op->op.object->cookie;
unsigned long loop;
-#ifdef CONFIG_FSCACHE_STATS
- atomic_add(pagevec->nr, &fscache_n_marks);
-#endif
-
- for (loop = 0; loop < pagevec->nr; loop++) {
- struct page *page = pagevec->pages[loop];
-
- _debug("- mark %p{%lx}", page, page->index);
- if (TestSetPageFsCache(page)) {
- static bool once_only;
- if (!once_only) {
- once_only = true;
- printk(KERN_WARNING "FS-Cache:"
- " Cookie type %s marked page %lx"
- " multiple times\n",
- cookie->def->name, page->index);
- }
- }
- }
+ for (loop = 0; loop < pagevec->nr; loop++)
+ fscache_mark_page_cached(op, pagevec->pages[loop]);
- if (cookie->def->mark_pages_cached)
- cookie->def->mark_pages_cached(cookie->netfs_data,
- op->mapping, pagevec);
pagevec_reinit(pagevec);
}
EXPORT_SYMBOL(fscache_mark_pages_cached);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 4765190d537f..8179e8bc4a3d 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -69,6 +69,7 @@ atomic_t fscache_n_store_vmscan_not_storing;
atomic_t fscache_n_store_vmscan_gone;
atomic_t fscache_n_store_vmscan_busy;
atomic_t fscache_n_store_vmscan_cancelled;
+atomic_t fscache_n_store_vmscan_wait;
atomic_t fscache_n_marks;
atomic_t fscache_n_uncaches;
@@ -80,6 +81,9 @@ atomic_t fscache_n_acquires_ok;
atomic_t fscache_n_acquires_nobufs;
atomic_t fscache_n_acquires_oom;
+atomic_t fscache_n_invalidates;
+atomic_t fscache_n_invalidates_run;
+
atomic_t fscache_n_updates;
atomic_t fscache_n_updates_null;
atomic_t fscache_n_updates_run;
@@ -112,6 +116,7 @@ atomic_t fscache_n_cop_alloc_object;
atomic_t fscache_n_cop_lookup_object;
atomic_t fscache_n_cop_lookup_complete;
atomic_t fscache_n_cop_grab_object;
+atomic_t fscache_n_cop_invalidate_object;
atomic_t fscache_n_cop_update_object;
atomic_t fscache_n_cop_drop_object;
atomic_t fscache_n_cop_put_object;
@@ -168,6 +173,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_object_created),
atomic_read(&fscache_n_object_lookups_timed_out));
+ seq_printf(m, "Invals : n=%u run=%u\n",
+ atomic_read(&fscache_n_invalidates),
+ atomic_read(&fscache_n_invalidates_run));
+
seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
atomic_read(&fscache_n_updates),
atomic_read(&fscache_n_updates_null),
@@ -224,11 +233,12 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_store_radix_deletes),
atomic_read(&fscache_n_store_pages_over_limit));
- seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n",
+ seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n",
atomic_read(&fscache_n_store_vmscan_not_storing),
atomic_read(&fscache_n_store_vmscan_gone),
atomic_read(&fscache_n_store_vmscan_busy),
- atomic_read(&fscache_n_store_vmscan_cancelled));
+ atomic_read(&fscache_n_store_vmscan_cancelled),
+ atomic_read(&fscache_n_store_vmscan_wait));
seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n",
atomic_read(&fscache_n_op_pend),
@@ -246,7 +256,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_cop_lookup_object),
atomic_read(&fscache_n_cop_lookup_complete),
atomic_read(&fscache_n_cop_grab_object));
- seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n",
+ seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n",
+ atomic_read(&fscache_n_cop_invalidate_object),
atomic_read(&fscache_n_cop_update_object),
atomic_read(&fscache_n_cop_drop_object),
atomic_read(&fscache_n_cop_put_object),
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 0b35903219bc..d47f11658c17 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -35,6 +35,16 @@ static int hfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page, hfs_get_block);
}
+static void hfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ hfs_file_truncate(inode);
+ }
+}
+
static int hfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -45,11 +55,8 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
hfs_get_block,
&HFS_I(mapping->host)->phys_size);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ hfs_write_failed(mapping, pos + len);
return ret;
}
@@ -120,6 +127,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
ssize_t ret;
@@ -135,7 +143,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
- vmtruncate(inode, isize);
+ hfs_write_failed(mapping, end);
}
return ret;
@@ -617,9 +625,12 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
attr->ia_size != i_size_read(inode)) {
inode_dio_wait(inode);
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+
+ truncate_setsize(inode, attr->ia_size);
+ hfs_file_truncate(inode);
}
setattr_copy(inode, attr);
@@ -668,7 +679,6 @@ static const struct file_operations hfs_file_operations = {
static const struct inode_operations hfs_file_inode_operations = {
.lookup = hfs_file_lookup,
- .truncate = hfs_file_truncate,
.setattr = hfs_inode_setattr,
.setxattr = hfs_setxattr,
.getxattr = hfs_getxattr,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 2172aa5976f5..799b336b59f9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -28,6 +28,16 @@ static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
return block_write_full_page(page, hfsplus_get_block, wbc);
}
+static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ hfsplus_file_truncate(inode);
+ }
+}
+
static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -38,11 +48,8 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
hfsplus_get_block,
&HFSPLUS_I(mapping->host)->phys_size);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ hfsplus_write_failed(mapping, pos + len);
return ret;
}
@@ -116,6 +123,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
ssize_t ret;
@@ -131,7 +139,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
- vmtruncate(inode, isize);
+ hfsplus_write_failed(mapping, end);
}
return ret;
@@ -300,10 +308,8 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
inode_dio_wait(inode);
-
- error = vmtruncate(inode, attr->ia_size);
- if (error)
- return error;
+ truncate_setsize(inode, attr->ia_size);
+ hfsplus_file_truncate(inode);
}
setattr_copy(inode, attr);
@@ -358,7 +364,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
static const struct inode_operations hfsplus_file_inode_operations = {
.lookup = hfsplus_file_lookup,
- .truncate = hfsplus_file_truncate,
.setattr = hfsplus_setattr,
.setxattr = hfsplus_setxattr,
.getxattr = hfsplus_getxattr,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 89d2a5803ae3..fbfe2df5624b 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -50,7 +50,7 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
return disk_secno;
}
-static void hpfs_truncate(struct inode *i)
+void hpfs_truncate(struct inode *i)
{
if (IS_IMMUTABLE(i)) return /*-EPERM*/;
hpfs_lock_assert(i->i_sb);
@@ -105,6 +105,16 @@ static int hpfs_readpage(struct file *file, struct page *page)
return block_read_full_page(page,hpfs_get_block);
}
+static void hpfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ hpfs_truncate(inode);
+ }
+}
+
static int hpfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -115,11 +125,8 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
hpfs_get_block,
&hpfs_i(mapping->host)->mmu_private);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ hpfs_write_failed(mapping, pos + len);
return ret;
}
@@ -166,6 +173,5 @@ const struct file_operations hpfs_file_ops =
const struct inode_operations hpfs_file_iops =
{
- .truncate = hpfs_truncate,
.setattr = hpfs_setattr,
};
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 7102aaecc244..b7ae286646b5 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -252,6 +252,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
/* file.c */
int hpfs_file_fsync(struct file *, loff_t, loff_t, int);
+void hpfs_truncate(struct inode *);
extern const struct file_operations hpfs_file_ops;
extern const struct inode_operations hpfs_file_iops;
extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 804a9a842cbc..5dc06c837105 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -277,9 +277,12 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
goto out_unlock;
+
+ truncate_setsize(inode, attr->ia_size);
+ hpfs_truncate(inode);
}
setattr_copy(inode, attr);
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 9d3afd157f99..dd7442c58358 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -119,9 +119,12 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
iattr->ia_size != i_size_read(inode)) {
inode_dio_wait(inode);
- rc = vmtruncate(inode, iattr->ia_size);
+ rc = inode_newsize_ok(inode, iattr->ia_size);
if (rc)
return rc;
+
+ truncate_setsize(inode, iattr->ia_size);
+ jfs_truncate(inode);
}
setattr_copy(inode, iattr);
@@ -133,7 +136,6 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
}
const struct inode_operations jfs_file_inode_operations = {
- .truncate = jfs_truncate,
.setxattr = jfs_setxattr,
.getxattr = jfs_getxattr,
.listxattr = jfs_listxattr,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 4692bf3ca8cb..b7dc47ba675e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -300,6 +300,16 @@ static int jfs_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
}
+static void jfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ jfs_truncate(inode);
+ }
+}
+
static int jfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -308,11 +318,8 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
jfs_get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ jfs_write_failed(mapping, pos + len);
return ret;
}
@@ -326,6 +333,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
const struct iovec *iov, loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
@@ -341,7 +349,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
- vmtruncate(inode, isize);
+ jfs_write_failed(mapping, end);
}
return ret;
diff --git a/fs/libfs.c b/fs/libfs.c
index 35fc6e74cd88..916da8c4158b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -369,8 +369,6 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)
struct inode *inode = dentry->d_inode;
int error;
- WARN_ON_ONCE(inode->i_op->truncate);
-
error = inode_change_ok(inode, iattr);
if (error)
return error;
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index e1a3b6bf6324..9a59cbade2fb 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1887,9 +1887,15 @@ int logfs_truncate(struct inode *inode, u64 target)
logfs_put_wblocks(sb, NULL, 1);
}
- if (!err)
- err = vmtruncate(inode, target);
+ if (!err) {
+ err = inode_newsize_ok(inode, target);
+ if (err)
+ goto out;
+
+ truncate_setsize(inode, target);
+ }
+ out:
/* I don't trust error recovery yet. */
WARN_ON(err);
return err;
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 4493ce695ab8..adc6f5494231 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -34,9 +34,12 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+
+ truncate_setsize(inode, attr->ia_size);
+ minix_truncate(inode);
}
setattr_copy(inode, attr);
@@ -45,7 +48,6 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
}
const struct inode_operations minix_file_inode_operations = {
- .truncate = minix_truncate,
.setattr = minix_setattr,
.getattr = minix_getattr,
};
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 4fc5f8ab1c44..99541cceb584 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -395,6 +395,16 @@ int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, minix_get_block);
}
+static void minix_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ minix_truncate(inode);
+ }
+}
+
static int minix_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -403,11 +413,8 @@ static int minix_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep,
minix_get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ minix_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/namei.c b/fs/namei.c
index 5f4cdf3ad913..43a97ee1d4c8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1275,9 +1275,7 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
*need_lookup = false;
dentry = d_lookup(dir, name);
if (dentry) {
- if (d_need_lookup(dentry)) {
- *need_lookup = true;
- } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
+ if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
error = d_revalidate(dentry, flags);
if (unlikely(error <= 0)) {
if (error < 0) {
@@ -1383,8 +1381,6 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name,
return -ECHILD;
nd->seq = seq;
- if (unlikely(d_need_lookup(dentry)))
- goto unlazy;
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
status = d_revalidate(dentry, nd->flags);
if (unlikely(status <= 0)) {
@@ -1410,11 +1406,6 @@ unlazy:
if (unlikely(!dentry))
goto need_lookup;
- if (unlikely(d_need_lookup(dentry))) {
- dput(dentry);
- goto need_lookup;
- }
-
if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
status = d_revalidate(dentry, nd->flags);
if (unlikely(status <= 0)) {
@@ -1859,7 +1850,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
if (flags & LOOKUP_ROOT) {
struct inode *inode = nd->root.dentry->d_inode;
if (*name) {
- if (!inode->i_op->lookup)
+ if (!can_lookup(inode))
return -ENOTDIR;
retval = inode_permission(inode, MAY_EXEC);
if (retval)
@@ -1903,6 +1894,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
get_fs_pwd(current->fs, &nd->path);
}
} else {
+ /* Caller must check execute permissions on the starting path component */
struct fd f = fdget_raw(dfd);
struct dentry *dentry;
@@ -1912,16 +1904,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,
dentry = f.file->f_path.dentry;
if (*name) {
- if (!S_ISDIR(dentry->d_inode->i_mode)) {
+ if (!can_lookup(dentry->d_inode)) {
fdput(f);
return -ENOTDIR;
}
-
- retval = inode_permission(dentry->d_inode, MAY_EXEC);
- if (retval) {
- fdput(f);
- return retval;
- }
}
nd->path = f.file->f_path;
@@ -2189,15 +2175,19 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
* path-walking is complete.
*/
static struct filename *
-user_path_parent(int dfd, const char __user *path, struct nameidata *nd)
+user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
+ unsigned int flags)
{
struct filename *s = getname(path);
int error;
+ /* only LOOKUP_REVAL is allowed in extra flags */
+ flags &= LOOKUP_REVAL;
+
if (IS_ERR(s))
return s;
- error = filename_lookup(dfd, s, LOOKUP_PARENT, nd);
+ error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
if (error) {
putname(s);
return ERR_PTR(error);
@@ -3044,12 +3034,22 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
return file;
}
-struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir)
+struct dentry *kern_path_create(int dfd, const char *pathname,
+ struct path *path, unsigned int lookup_flags)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
struct nameidata nd;
int err2;
- int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
+ int error;
+ bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
+
+ /*
+ * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
+ * other flags passed in are ignored!
+ */
+ lookup_flags &= LOOKUP_REVAL;
+
+ error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);
if (error)
return ERR_PTR(error);
@@ -3113,13 +3113,14 @@ void done_path_create(struct path *path, struct dentry *dentry)
}
EXPORT_SYMBOL(done_path_create);
-struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
+struct dentry *user_path_create(int dfd, const char __user *pathname,
+ struct path *path, unsigned int lookup_flags)
{
struct filename *tmp = getname(pathname);
struct dentry *res;
if (IS_ERR(tmp))
return ERR_CAST(tmp);
- res = kern_path_create(dfd, tmp->name, path, is_dir);
+ res = kern_path_create(dfd, tmp->name, path, lookup_flags);
putname(tmp);
return res;
}
@@ -3175,12 +3176,13 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
struct dentry *dentry;
struct path path;
int error;
+ unsigned int lookup_flags = 0;
error = may_mknod(mode);
if (error)
return error;
-
- dentry = user_path_create(dfd, filename, &path, 0);
+retry:
+ dentry = user_path_create(dfd, filename, &path, lookup_flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -3203,6 +3205,10 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
}
out:
done_path_create(&path, dentry);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -3241,8 +3247,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
struct dentry *dentry;
struct path path;
int error;
+ unsigned int lookup_flags = LOOKUP_DIRECTORY;
- dentry = user_path_create(dfd, pathname, &path, 1);
+retry:
+ dentry = user_path_create(dfd, pathname, &path, lookup_flags);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
@@ -3252,6 +3260,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
if (!error)
error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
done_path_create(&path, dentry);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -3327,8 +3339,9 @@ static long do_rmdir(int dfd, const char __user *pathname)
struct filename *name;
struct dentry *dentry;
struct nameidata nd;
-
- name = user_path_parent(dfd, pathname, &nd);
+ unsigned int lookup_flags = 0;
+retry:
+ name = user_path_parent(dfd, pathname, &nd, lookup_flags);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -3370,6 +3383,10 @@ exit2:
exit1:
path_put(&nd.path);
putname(name);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -3423,8 +3440,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
struct dentry *dentry;
struct nameidata nd;
struct inode *inode = NULL;
-
- name = user_path_parent(dfd, pathname, &nd);
+ unsigned int lookup_flags = 0;
+retry:
+ name = user_path_parent(dfd, pathname, &nd, lookup_flags);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -3462,6 +3480,11 @@ exit2:
exit1:
path_put(&nd.path);
putname(name);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ inode = NULL;
+ goto retry;
+ }
return error;
slashes:
@@ -3513,12 +3536,13 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
struct filename *from;
struct dentry *dentry;
struct path path;
+ unsigned int lookup_flags = 0;
from = getname(oldname);
if (IS_ERR(from))
return PTR_ERR(from);
-
- dentry = user_path_create(newdfd, newname, &path, 0);
+retry:
+ dentry = user_path_create(newdfd, newname, &path, lookup_flags);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_putname;
@@ -3527,6 +3551,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
if (!error)
error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
done_path_create(&path, dentry);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out_putname:
putname(from);
return error;
@@ -3613,12 +3641,13 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
if (flags & AT_SYMLINK_FOLLOW)
how |= LOOKUP_FOLLOW;
-
+retry:
error = user_path_at(olddfd, oldname, how, &old_path);
if (error)
return error;
- new_dentry = user_path_create(newdfd, newname, &new_path, 0);
+ new_dentry = user_path_create(newdfd, newname, &new_path,
+ (how & LOOKUP_REVAL));
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto out;
@@ -3635,6 +3664,10 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
out_dput:
done_path_create(&new_path, new_dentry);
+ if (retry_estale(error, how)) {
+ how |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
path_put(&old_path);
@@ -3807,15 +3840,17 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
struct nameidata oldnd, newnd;
struct filename *from;
struct filename *to;
+ unsigned int lookup_flags = 0;
+ bool should_retry = false;
int error;
-
- from = user_path_parent(olddfd, oldname, &oldnd);
+retry:
+ from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
if (IS_ERR(from)) {
error = PTR_ERR(from);
goto exit;
}
- to = user_path_parent(newdfd, newname, &newnd);
+ to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
if (IS_ERR(to)) {
error = PTR_ERR(to);
goto exit1;
@@ -3887,11 +3922,18 @@ exit3:
unlock_rename(new_dir, old_dir);
mnt_drop_write(oldnd.path.mnt);
exit2:
+ if (retry_estale(error, lookup_flags))
+ should_retry = true;
path_put(&newnd.path);
putname(to);
exit1:
path_put(&oldnd.path);
putname(from);
+ if (should_retry) {
+ should_retry = false;
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
exit:
return error;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 398a50ff2438..55605c552787 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -313,7 +313,7 @@ int __mnt_want_write(struct vfsmount *m)
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
- while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
+ while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
cpu_relax();
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d7e9fe77188a..1acdad7fcec7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -976,9 +976,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
goto out;
if (attr->ia_size != i_size_read(inode)) {
- result = vmtruncate(inode, attr->ia_size);
- if (result)
- goto out;
+ truncate_setsize(inode, attr->ia_size);
mark_inode_dirty(inode);
}
}
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index c817787fbdb4..24d1d1c5fcaf 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -307,6 +307,7 @@ void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
nfs_fscache_inode_unlock(inode);
}
}
+EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie);
/*
* Replace a per-inode cookie due to revalidation detecting a file having
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index c5b11b53ff33..277b02782897 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -153,6 +153,22 @@ static inline void nfs_readpage_to_fscache(struct inode *inode,
}
/*
+ * Invalidate the contents of fscache for this inode. This will not sleep.
+ */
+static inline void nfs_fscache_invalidate(struct inode *inode)
+{
+ fscache_invalidate(NFS_I(inode)->fscache);
+}
+
+/*
+ * Wait for an object to finish being invalidated.
+ */
+static inline void nfs_fscache_wait_on_invalidate(struct inode *inode)
+{
+ fscache_wait_on_invalidate(NFS_I(inode)->fscache);
+}
+
+/*
* indicate the client caching state as readable text
*/
static inline const char *nfs_server_fscache_state(struct nfs_server *server)
@@ -162,7 +178,6 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)
return "no ";
}
-
#else /* CONFIG_NFS_FSCACHE */
static inline int nfs_fscache_register(void) { return 0; }
static inline void nfs_fscache_unregister(void) {}
@@ -205,6 +220,9 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
static inline void nfs_readpage_to_fscache(struct inode *inode,
struct page *page, int sync) {}
+
+static inline void nfs_fscache_invalidate(struct inode *inode) {}
+
static inline const char *nfs_server_fscache_state(struct nfs_server *server)
{
return "no ";
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2faae14d89f4..ebeb94ce1b0b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -161,10 +161,12 @@ static void nfs_zap_caches_locked(struct inode *inode)
nfsi->attrtimeo_timestamp = jiffies;
memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
- if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
+ if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
- else
+ nfs_fscache_invalidate(inode);
+ } else {
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
+ }
}
void nfs_zap_caches(struct inode *inode)
@@ -179,6 +181,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
if (mapping->nrpages != 0) {
spin_lock(&inode->i_lock);
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
+ nfs_fscache_invalidate(inode);
spin_unlock(&inode->i_lock);
}
}
@@ -881,7 +884,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
spin_unlock(&inode->i_lock);
nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
- nfs_fscache_reset_inode_cookie(inode);
+ nfs_fscache_wait_on_invalidate(inode);
dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
inode->i_sb->s_id, (long long)NFS_FILEID(inode));
return 0;
@@ -957,6 +960,10 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
i_size_write(inode, nfs_size_to_loff_t(fattr->size));
ret |= NFS_INO_INVALID_ATTR;
}
+
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ nfs_fscache_invalidate(inode);
+
return ret;
}
@@ -1205,8 +1212,10 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
struct nfs_inode *nfsi = NFS_I(inode);
nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode)) {
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ nfs_fscache_invalidate(inode);
+ }
if ((fattr->valid & NFS_ATTR_FATTR) == 0)
return 0;
return nfs_refresh_inode_locked(inode, fattr);
@@ -1494,6 +1503,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
(save_cache_validity & NFS_INO_REVAL_FORCED))
nfsi->cache_validity |= invalid;
+ if (invalid & NFS_INO_INVALID_DATA)
+ nfs_fscache_invalidate(inode);
+
return 0;
out_err:
/*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e7699308364a..08ddcccb8887 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -5,6 +5,7 @@
*/
#include <linux/nfs_fs.h>
#include "internal.h"
+#include "fscache.h"
#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_FILE
@@ -74,6 +75,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
nfs_file_set_open_context(filp, ctx);
+ nfs_fscache_set_inode_cookie(inode, filp);
err = 0;
out_put_ctx:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 493f0f41c554..5d864fb36578 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -64,7 +64,7 @@
#include "pnfs.h"
#include "netns.h"
#include "nfs4session.h"
-
+#include "fscache.h"
#define NFSDBG_FACILITY NFSDBG_PROC
@@ -734,6 +734,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
if (!cinfo->atomic || cinfo->before != dir->i_version)
nfs_force_lookup_revalidate(dir);
dir->i_version = cinfo->after;
+ nfs_fscache_invalidate(dir);
spin_unlock(&dir->i_lock);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5209916e1222..b673be31590e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1794,7 +1794,8 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
if (PagePrivate(page))
return -EBUSY;
- nfs_fscache_release_page(page, GFP_KERNEL);
+ if (!nfs_fscache_release_page(page, GFP_KERNEL))
+ return -EBUSY;
return migrate_page(mapping, newpage, page, mode);
}
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index e6c38159622f..e761ee95617f 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -8,61 +8,144 @@
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/sunrpc/clnt.h>
+#include <asm/uaccess.h>
#include "state.h"
-#include "fault_inject.h"
+#include "netns.h"
struct nfsd_fault_inject_op {
char *file;
- void (*func)(u64);
+ u64 (*forget)(struct nfs4_client *, u64);
+ u64 (*print)(struct nfs4_client *, u64);
};
static struct nfsd_fault_inject_op inject_ops[] = {
{
.file = "forget_clients",
- .func = nfsd_forget_clients,
+ .forget = nfsd_forget_client,
+ .print = nfsd_print_client,
},
{
.file = "forget_locks",
- .func = nfsd_forget_locks,
+ .forget = nfsd_forget_client_locks,
+ .print = nfsd_print_client_locks,
},
{
.file = "forget_openowners",
- .func = nfsd_forget_openowners,
+ .forget = nfsd_forget_client_openowners,
+ .print = nfsd_print_client_openowners,
},
{
.file = "forget_delegations",
- .func = nfsd_forget_delegations,
+ .forget = nfsd_forget_client_delegations,
+ .print = nfsd_print_client_delegations,
},
{
.file = "recall_delegations",
- .func = nfsd_recall_delegations,
+ .forget = nfsd_recall_client_delegations,
+ .print = nfsd_print_client_delegations,
},
};
static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
static struct dentry *debug_dir;
-static int nfsd_inject_set(void *op_ptr, u64 val)
+static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val)
{
- struct nfsd_fault_inject_op *op = op_ptr;
+ u64 count = 0;
if (val == 0)
printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
else
printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
- op->func(val);
- return 0;
+ nfs4_lock_state();
+ count = nfsd_for_n_state(val, op->forget);
+ nfs4_unlock_state();
+ printk(KERN_INFO "NFSD: %s: found %llu", op->file, count);
}
-static int nfsd_inject_get(void *data, u64 *val)
+static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op,
+ struct sockaddr_storage *addr,
+ size_t addr_size)
{
- *val = 0;
- return 0;
+ char buf[INET6_ADDRSTRLEN];
+ struct nfs4_client *clp;
+ u64 count;
+
+ nfs4_lock_state();
+ clp = nfsd_find_client(addr, addr_size);
+ if (clp) {
+ count = op->forget(clp, 0);
+ rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+ printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count);
+ }
+ nfs4_unlock_state();
+}
+
+static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val)
+{
+ nfs4_lock_state();
+ *val = nfsd_for_n_state(0, op->print);
+ nfs4_unlock_state();
}
-DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n");
+static ssize_t fault_inject_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ static u64 val;
+ char read_buf[25];
+ size_t size, ret;
+ loff_t pos = *ppos;
+
+ if (!pos)
+ nfsd_inject_get(file->f_dentry->d_inode->i_private, &val);
+ size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
+
+ if (pos < 0)
+ return -EINVAL;
+ if (pos >= size || !len)
+ return 0;
+ if (len > size - pos)
+ len = size - pos;
+ ret = copy_to_user(buf, read_buf + pos, len);
+ if (ret == len)
+ return -EFAULT;
+ len -= ret;
+ *ppos = pos + len;
+ return len;
+}
+
+static ssize_t fault_inject_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ char write_buf[INET6_ADDRSTRLEN];
+ size_t size = min(sizeof(write_buf) - 1, len);
+ struct net *net = current->nsproxy->net_ns;
+ struct sockaddr_storage sa;
+ u64 val;
+
+ if (copy_from_user(write_buf, buf, size))
+ return -EFAULT;
+ write_buf[size] = '\0';
+
+ size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
+ if (size > 0)
+ nfsd_inject_set_client(file->f_dentry->d_inode->i_private, &sa, size);
+ else {
+ val = simple_strtoll(write_buf, NULL, 0);
+ nfsd_inject_set(file->f_dentry->d_inode->i_private, val);
+ }
+ return len; /* on success, claim we got the whole input */
+}
+
+static const struct file_operations fops_nfsd = {
+ .owner = THIS_MODULE,
+ .read = fault_inject_read,
+ .write = fault_inject_write,
+};
void nfsd_fault_inject_cleanup(void)
{
diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h
deleted file mode 100644
index 90bd0570956c..000000000000
--- a/fs/nfsd/fault_inject.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
- *
- * Function definitions for fault injection
- */
-
-#ifndef LINUX_NFSD_FAULT_INJECT_H
-#define LINUX_NFSD_FAULT_INJECT_H
-
-#ifdef CONFIG_NFSD_FAULT_INJECTION
-int nfsd_fault_inject_init(void);
-void nfsd_fault_inject_cleanup(void);
-void nfsd_forget_clients(u64);
-void nfsd_forget_locks(u64);
-void nfsd_forget_openowners(u64);
-void nfsd_forget_delegations(u64);
-void nfsd_recall_delegations(u64);
-#else /* CONFIG_NFSD_FAULT_INJECTION */
-static inline int nfsd_fault_inject_init(void) { return 0; }
-static inline void nfsd_fault_inject_cleanup(void) {}
-static inline void nfsd_forget_clients(u64 num) {}
-static inline void nfsd_forget_locks(u64 num) {}
-static inline void nfsd_forget_openowners(u64 num) {}
-static inline void nfsd_forget_delegations(u64 num) {}
-static inline void nfsd_recall_delegations(u64 num) {}
-#endif /* CONFIG_NFSD_FAULT_INJECTION */
-
-#endif /* LINUX_NFSD_FAULT_INJECT_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 65c2431ea32f..1051bebff1b0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -24,7 +24,18 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+/* Hash tables for nfs4_clientid state */
+#define CLIENT_HASH_BITS 4
+#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
+#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
+
+#define LOCKOWNER_INO_HASH_BITS 8
+#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
+
+#define SESSION_HASH_SIZE 512
+
struct cld_net;
+struct nfsd4_client_tracking_ops;
struct nfsd_net {
struct cld_net *cld_net;
@@ -38,7 +49,62 @@ struct nfsd_net {
struct lock_manager nfsd4_manager;
bool grace_ended;
time_t boot_time;
+
+ /*
+ * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
+ * used in reboot/reset lease grace period processing
+ *
+ * conf_id_hashtbl[], and conf_name_tree hold confirmed
+ * setclientid_confirmed info.
+ *
+ * unconf_str_hastbl[] and unconf_name_tree hold unconfirmed
+ * setclientid info.
+ */
+ struct list_head *reclaim_str_hashtbl;
+ int reclaim_str_hashtbl_size;
+ struct list_head *conf_id_hashtbl;
+ struct rb_root conf_name_tree;
+ struct list_head *unconf_id_hashtbl;
+ struct rb_root unconf_name_tree;
+ struct list_head *ownerstr_hashtbl;
+ struct list_head *lockowner_ino_hashtbl;
+ struct list_head *sessionid_hashtbl;
+ /*
+ * client_lru holds client queue ordered by nfs4_client.cl_time
+ * for lease renewal.
+ *
+ * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
+ * for last close replay.
+ *
+ * All of the above fields are protected by the client_mutex.
+ */
+ struct list_head client_lru;
+ struct list_head close_lru;
+
+ struct delayed_work laundromat_work;
+
+ /* client_lock protects the client lru list and session hash table */
+ spinlock_t client_lock;
+
+ struct file *rec_file;
+ bool in_grace;
+ struct nfsd4_client_tracking_ops *client_tracking_ops;
+
+ time_t nfsd4_lease;
+ time_t nfsd4_grace;
+
+ bool nfsd_net_up;
+
+ /*
+ * Time of server startup
+ */
+ struct timeval nfssvc_boot;
+
+ struct svc_serv *nfsd_serv;
};
+/* Simple check to find out if a given net was properly initialized */
+#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
+
extern int nfsd_net_id;
#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b314888825d5..9170861c804a 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -253,7 +253,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
(resp->mask & NFS_ACL) ? resp->acl_access : NULL,
(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
while (w > 0) {
- if (!rqstp->rq_respages[rqstp->rq_resused++])
+ if (!*(rqstp->rq_next_page++))
return 0;
w -= PAGE_SIZE;
}
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index a596e9d987e4..9cbc1a841f87 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -184,7 +184,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
(resp->mask & NFS_ACL) ? resp->acl_access : NULL,
(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
while (w > 0) {
- if (!rqstp->rq_respages[rqstp->rq_resused++])
+ if (!*(rqstp->rq_next_page++))
return 0;
w -= PAGE_SIZE;
}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 97d90d1c8608..1fc02dfdc5c4 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -460,7 +460,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
__be32 nfserr;
int count = 0;
loff_t offset;
- int i;
+ struct page **p;
caddr_t page_addr = NULL;
dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
@@ -484,8 +484,8 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
&resp->common,
nfs3svc_encode_entry_plus);
memcpy(resp->verf, argp->verf, 8);
- for (i=1; i<rqstp->rq_resused ; i++) {
- page_addr = page_address(rqstp->rq_respages[i]);
+ for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
+ page_addr = page_address(*p);
if (((caddr_t)resp->buffer >= page_addr) &&
((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 43f46cd9edea..324c0baf7cda 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -7,8 +7,10 @@
*/
#include <linux/namei.h>
+#include <linux/sunrpc/svc_xprt.h>
#include "xdr3.h"
#include "auth.h"
+#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_XDR
@@ -323,7 +325,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readargs *args)
{
unsigned int len;
- int v,pn;
+ int v;
u32 max_blocksize = svc_max_payload(rqstp);
if (!(p = decode_fh(p, &args->fh)))
@@ -338,8 +340,9 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
/* set up the kvec */
v=0;
while (len > 0) {
- pn = rqstp->rq_resused++;
- rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+ struct page *p = *(rqstp->rq_next_page++);
+
+ rqstp->rq_vec[v].iov_base = page_address(p);
rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
len -= rqstp->rq_vec[v].iov_len;
v++;
@@ -461,8 +464,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
len = ntohl(*p++);
if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
return 0;
- args->tname = new =
- page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+ args->tname = new = page_address(*(rqstp->rq_next_page++));
args->tlen = len;
/* first copy and check from the first page */
old = (char*)p;
@@ -533,8 +535,7 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
{
if (!(p = decode_fh(p, &args->fh)))
return 0;
- args->buffer =
- page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+ args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
}
@@ -565,8 +566,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
if (args->count > PAGE_SIZE)
args->count = PAGE_SIZE;
- args->buffer =
- page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+ args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
}
@@ -575,7 +575,7 @@ int
nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readdirargs *args)
{
- int len, pn;
+ int len;
u32 max_blocksize = svc_max_payload(rqstp);
if (!(p = decode_fh(p, &args->fh)))
@@ -590,9 +590,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
args->count = len;
while (len > 0) {
- pn = rqstp->rq_resused++;
+ struct page *p = *(rqstp->rq_next_page++);
if (!args->buffer)
- args->buffer = page_address(rqstp->rq_respages[pn]);
+ args->buffer = page_address(p);
len -= PAGE_SIZE;
}
@@ -720,12 +720,14 @@ int
nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_writeres *resp)
{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
p = encode_wcc_data(rqstp, p, &resp->fh);
if (resp->status == 0) {
*p++ = htonl(resp->count);
*p++ = htonl(resp->committed);
- *p++ = htonl(nfssvc_boot.tv_sec);
- *p++ = htonl(nfssvc_boot.tv_usec);
+ *p++ = htonl(nn->nfssvc_boot.tv_sec);
+ *p++ = htonl(nn->nfssvc_boot.tv_usec);
}
return xdr_ressize_check(rqstp, p);
}
@@ -876,7 +878,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
common);
__be32 *p = cd->buffer;
caddr_t curr_page_addr = NULL;
- int pn; /* current page number */
+ struct page ** page;
int slen; /* string (name) length */
int elen; /* estimated entry length in words */
int num_entry_words = 0; /* actual number of words */
@@ -913,8 +915,9 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
}
/* determine which page in rq_respages[] we are currently filling */
- for (pn=1; pn < cd->rqstp->rq_resused; pn++) {
- curr_page_addr = page_address(cd->rqstp->rq_respages[pn]);
+ for (page = cd->rqstp->rq_respages + 1;
+ page < cd->rqstp->rq_next_page; page++) {
+ curr_page_addr = page_address(*page);
if (((caddr_t)cd->buffer >= curr_page_addr) &&
((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE))
@@ -929,14 +932,14 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
if (plus)
p = encode_entryplus_baggage(cd, p, name, namlen);
num_entry_words = p - cd->buffer;
- } else if (cd->rqstp->rq_respages[pn+1] != NULL) {
+ } else if (*(page+1) != NULL) {
/* temporarily encode entry into next page, then move back to
* current and next page in rq_respages[] */
__be32 *p1, *tmp;
int len1, len2;
/* grab next page for temporary storage of entry */
- p1 = tmp = page_address(cd->rqstp->rq_respages[pn+1]);
+ p1 = tmp = page_address(*(page+1));
p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
@@ -1082,11 +1085,13 @@ int
nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_commitres *resp)
{
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
p = encode_wcc_data(rqstp, p, &resp->fh);
/* Write verifier */
if (resp->status == 0) {
- *p++ = htonl(nfssvc_boot.tv_sec);
- *p++ = htonl(nfssvc_boot.tv_usec);
+ *p++ = htonl(nn->nfssvc_boot.tv_sec);
+ *p++ = htonl(nn->nfssvc_boot.tv_usec);
}
return xdr_ressize_check(rqstp, p);
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index bdf29c96e4cd..99bc85ff0217 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include "nfsd.h"
#include "state.h"
+#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -625,20 +626,46 @@ static const struct rpc_program cb_program = {
.pipe_dir_name = "nfsd4_cb",
};
-static int max_cb_time(void)
+static int max_cb_time(struct net *net)
{
- return max(nfsd4_lease/10, (time_t)1) * HZ;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ return max(nn->nfsd4_lease/10, (time_t)1) * HZ;
}
+static struct rpc_cred *callback_cred;
+
+int set_callback_cred(void)
+{
+ if (callback_cred)
+ return 0;
+ callback_cred = rpc_lookup_machine_cred("nfs");
+ if (!callback_cred)
+ return -ENOMEM;
+ return 0;
+}
+
+static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
+{
+ if (clp->cl_minorversion == 0) {
+ return get_rpccred(callback_cred);
+ } else {
+ struct rpc_auth *auth = client->cl_auth;
+ struct auth_cred acred = {};
+
+ acred.uid = ses->se_cb_sec.uid;
+ acred.gid = ses->se_cb_sec.gid;
+ return auth->au_ops->lookup_cred(client->cl_auth, &acred, 0);
+ }
+}
static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
{
struct rpc_timeout timeparms = {
- .to_initval = max_cb_time(),
+ .to_initval = max_cb_time(clp->net),
.to_retries = 0,
};
struct rpc_create_args args = {
- .net = &init_net,
+ .net = clp->net,
.address = (struct sockaddr *) &conn->cb_addr,
.addrsize = conn->cb_addrlen,
.saddress = (struct sockaddr *) &conn->cb_saddr,
@@ -648,6 +675,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
};
struct rpc_clnt *client;
+ struct rpc_cred *cred;
if (clp->cl_minorversion == 0) {
if (!clp->cl_cred.cr_principal &&
@@ -666,7 +694,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
args.bc_xprt = conn->cb_xprt;
args.prognumber = clp->cl_cb_session->se_cb_prog;
args.protocol = XPRT_TRANSPORT_BC_TCP;
- args.authflavor = RPC_AUTH_UNIX;
+ args.authflavor = ses->se_cb_sec.flavor;
}
/* Create RPC client */
client = rpc_create(&args);
@@ -675,9 +703,14 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
PTR_ERR(client));
return PTR_ERR(client);
}
+ cred = get_backchannel_cred(clp, client, ses);
+ if (IS_ERR(cred)) {
+ rpc_shutdown_client(client);
+ return PTR_ERR(cred);
+ }
clp->cl_cb_client = client;
+ clp->cl_cb_cred = cred;
return 0;
-
}
static void warn_no_callback_path(struct nfs4_client *clp, int reason)
@@ -714,18 +747,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
.rpc_call_done = nfsd4_cb_probe_done,
};
-static struct rpc_cred *callback_cred;
-
-int set_callback_cred(void)
-{
- if (callback_cred)
- return 0;
- callback_cred = rpc_lookup_machine_cred("nfs");
- if (!callback_cred)
- return -ENOMEM;
- return 0;
-}
-
static struct workqueue_struct *callback_wq;
static void run_nfsd4_cb(struct nfsd4_callback *cb)
@@ -743,7 +764,6 @@ static void do_probe_callback(struct nfs4_client *clp)
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
cb->cb_msg.rpc_argp = NULL;
cb->cb_msg.rpc_resp = NULL;
- cb->cb_msg.rpc_cred = callback_cred;
cb->cb_ops = &nfsd4_cb_probe_ops;
@@ -962,6 +982,8 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
if (clp->cl_cb_client) {
rpc_shutdown_client(clp->cl_cb_client);
clp->cl_cb_client = NULL;
+ put_rpccred(clp->cl_cb_cred);
+ clp->cl_cb_cred = NULL;
}
if (clp->cl_cb_conn.cb_xprt) {
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -995,7 +1017,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
run_nfsd4_cb(cb);
}
-void nfsd4_do_callback_rpc(struct work_struct *w)
+static void nfsd4_do_callback_rpc(struct work_struct *w)
{
struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
struct nfs4_client *clp = cb->cb_clp;
@@ -1010,10 +1032,16 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
nfsd4_release_cb(cb);
return;
}
+ cb->cb_msg.rpc_cred = clp->cl_cb_cred;
rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
cb->cb_ops, cb);
}
+void nfsd4_init_callback(struct nfsd4_callback *cb)
+{
+ INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
+}
+
void nfsd4_cb_recall(struct nfs4_delegation *dp)
{
struct nfsd4_callback *cb = &dp->dl_recall;
@@ -1025,7 +1053,6 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
cb->cb_msg.rpc_argp = cb;
cb->cb_msg.rpc_resp = cb;
- cb->cb_msg.rpc_cred = callback_cred;
cb->cb_ops = &nfsd4_cb_recall_ops;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6c9a4b291dba..9d1c5dba2bbb 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -40,6 +40,7 @@
#include "xdr4.h"
#include "vfs.h"
#include "current_stateid.h"
+#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -194,6 +195,7 @@ static __be32
do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
{
struct svc_fh *resfh;
+ int accmode;
__be32 status;
resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
@@ -253,9 +255,10 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
/* set reply cache */
fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
&resfh->fh_handle);
- if (!open->op_created)
- status = do_open_permission(rqstp, resfh, open,
- NFSD_MAY_NOP);
+ accmode = NFSD_MAY_NOP;
+ if (open->op_created)
+ accmode |= NFSD_MAY_OWNER_OVERRIDE;
+ status = do_open_permission(rqstp, resfh, open, accmode);
set_change_info(&open->op_cinfo, current_fh);
fh_dup2(current_fh, resfh);
out:
@@ -304,6 +307,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
struct nfsd4_compoundres *resp;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
(int)open->op_fname.len, open->op_fname.data,
@@ -331,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* check seqid for replay. set nfs4_owner */
resp = rqstp->rq_resp;
- status = nfsd4_process_open1(&resp->cstate, open);
+ status = nfsd4_process_open1(&resp->cstate, open, nn);
if (status == nfserr_replay_me) {
struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
fh_put(&cstate->current_fh);
@@ -354,10 +359,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Openowner is now set, so sequence id will get bumped. Now we need
* these checks before we do any creates: */
status = nfserr_grace;
- if (locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
status = nfserr_no_grace;
- if (!locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+ if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
switch (open->op_claim_type) {
@@ -370,7 +375,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
- status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion);
+ status = nfs4_check_open_reclaim(&open->op_clientid,
+ cstate->minorversion,
+ nn);
if (status)
goto out;
case NFS4_OPEN_CLAIM_FH:
@@ -490,12 +497,13 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&access->ac_supported);
}
-static void gen_boot_verifier(nfs4_verifier *verifier)
+static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
{
__be32 verf[2];
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- verf[0] = (__be32)nfssvc_boot.tv_sec;
- verf[1] = (__be32)nfssvc_boot.tv_usec;
+ verf[0] = (__be32)nn->nfssvc_boot.tv_sec;
+ verf[1] = (__be32)nn->nfssvc_boot.tv_usec;
memcpy(verifier->data, verf, sizeof(verifier->data));
}
@@ -503,7 +511,7 @@ static __be32
nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_commit *commit)
{
- gen_boot_verifier(&commit->co_verf);
+ gen_boot_verifier(&commit->co_verf, SVC_NET(rqstp));
return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
commit->co_count);
}
@@ -684,6 +692,17 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (read->rd_offset >= OFFSET_MAX)
return nfserr_inval;
+ /*
+ * If we do a zero copy read, then a client will see read data
+ * that reflects the state of the file *after* performing the
+ * following compound.
+ *
+ * To ensure proper ordering, we therefore turn off zero copy if
+ * the client wants us to do more in this compound:
+ */
+ if (!nfsd4_last_compound_op(rqstp))
+ rqstp->rq_splice_ok = false;
+
nfs4_lock_state();
/* check stateid */
if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
@@ -876,6 +895,24 @@ out:
return status;
}
+static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write)
+{
+ int i = 1;
+ int buflen = write->wr_buflen;
+
+ vec[0].iov_base = write->wr_head.iov_base;
+ vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len);
+ buflen -= vec[0].iov_len;
+
+ while (buflen) {
+ vec[i].iov_base = page_address(write->wr_pagelist[i - 1]);
+ vec[i].iov_len = min_t(int, PAGE_SIZE, buflen);
+ buflen -= vec[i].iov_len;
+ i++;
+ }
+ return i;
+}
+
static __be32
nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_write *write)
@@ -884,6 +921,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct file *filp = NULL;
__be32 status = nfs_ok;
unsigned long cnt;
+ int nvecs;
/* no need to check permission - this will be done in nfsd_write() */
@@ -904,10 +942,13 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
cnt = write->wr_buflen;
write->wr_how_written = write->wr_stable_how;
- gen_boot_verifier(&write->wr_verifier);
+ gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp));
+
+ nvecs = fill_in_write_vector(rqstp->rq_vec, write);
+ WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
status = nfsd_write(rqstp, &cstate->current_fh, filp,
- write->wr_offset, rqstp->rq_vec, write->wr_vlen,
+ write->wr_offset, rqstp->rq_vec, nvecs,
&cnt, &write->wr_how_written);
if (filp)
fput(filp);
@@ -1666,6 +1707,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_name = "OP_EXCHANGE_ID",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
},
+ [OP_BACKCHANNEL_CTL] = {
+ .op_func = (nfsd4op_func)nfsd4_backchannel_ctl,
+ .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
+ .op_name = "OP_BACKCHANNEL_CTL",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+ },
[OP_BIND_CONN_TO_SESSION] = {
.op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
@@ -1719,6 +1766,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_func = (nfsd4op_func)nfsd4_free_stateid,
.op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
.op_name = "OP_FREE_STATEID",
+ .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
};
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 43295d45cc2b..ba6fdd4a0455 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,13 +58,11 @@ struct nfsd4_client_tracking_ops {
void (*create)(struct nfs4_client *);
void (*remove)(struct nfs4_client *);
int (*check)(struct nfs4_client *);
- void (*grace_done)(struct net *, time_t);
+ void (*grace_done)(struct nfsd_net *, time_t);
};
/* Globals */
-static struct file *rec_file;
static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
-static struct nfsd4_client_tracking_ops *client_tracking_ops;
static int
nfs4_save_creds(const struct cred **original_creds)
@@ -102,33 +100,39 @@ md5_to_hex(char *out, char *md5)
*out = '\0';
}
-__be32
-nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
+static int
+nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
{
struct xdr_netobj cksum;
struct hash_desc desc;
struct scatterlist sg;
- __be32 status = nfserr_jukebox;
+ int status;
dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
clname->len, clname->data);
desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(desc.tfm))
+ if (IS_ERR(desc.tfm)) {
+ status = PTR_ERR(desc.tfm);
goto out_no_tfm;
+ }
+
cksum.len = crypto_hash_digestsize(desc.tfm);
cksum.data = kmalloc(cksum.len, GFP_KERNEL);
- if (cksum.data == NULL)
+ if (cksum.data == NULL) {
+ status = -ENOMEM;
goto out;
+ }
sg_init_one(&sg, clname->data, clname->len);
- if (crypto_hash_digest(&desc, &sg, sg.length, cksum.data))
+ status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data);
+ if (status)
goto out;
md5_to_hex(dname, cksum.data);
- status = nfs_ok;
+ status = 0;
out:
kfree(cksum.data);
crypto_free_hash(desc.tfm);
@@ -136,29 +140,61 @@ out_no_tfm:
return status;
}
+/*
+ * If we had an error generating the recdir name for the legacy tracker
+ * then warn the admin. If the error doesn't appear to be transient,
+ * then disable recovery tracking.
+ */
+static void
+legacy_recdir_name_error(int error)
+{
+ printk(KERN_ERR "NFSD: unable to generate recoverydir "
+ "name (%d).\n", error);
+
+ /*
+ * if the algorithm just doesn't exist, then disable the recovery
+ * tracker altogether. The crypto libs will generally return this if
+ * FIPS is enabled as well.
+ */
+ if (error == -ENOENT) {
+ printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
+ "Reboot recovery will not function correctly!\n");
+
+ /* the argument is ignored by the legacy exit function */
+ nfsd4_client_tracking_exit(NULL);
+ }
+}
+
static void
nfsd4_create_clid_dir(struct nfs4_client *clp)
{
const struct cred *original_cred;
- char *dname = clp->cl_recdir;
+ char dname[HEXDIR_LEN];
struct dentry *dir, *dentry;
+ struct nfs4_client_reclaim *crp;
int status;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
return;
- if (!rec_file)
+ if (!nn->rec_file)
return;
+
+ status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+ if (status)
+ return legacy_recdir_name_error(status);
+
status = nfs4_save_creds(&original_cred);
if (status < 0)
return;
- status = mnt_want_write_file(rec_file);
+ status = mnt_want_write_file(nn->rec_file);
if (status)
return;
- dir = rec_file->f_path.dentry;
+ dir = nn->rec_file->f_path.dentry;
/* lock the parent */
mutex_lock(&dir->d_inode->i_mutex);
@@ -182,18 +218,24 @@ out_put:
dput(dentry);
out_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
- if (status == 0)
- vfs_fsync(rec_file, 0);
- else
+ if (status == 0) {
+ if (nn->in_grace) {
+ crp = nfs4_client_to_reclaim(dname, nn);
+ if (crp)
+ crp->cr_clp = clp;
+ }
+ vfs_fsync(nn->rec_file, 0);
+ } else {
printk(KERN_ERR "NFSD: failed to write recovery record"
" (err %d); please check that %s exists"
" and is writeable", status,
user_recovery_dirname);
- mnt_drop_write_file(rec_file);
+ }
+ mnt_drop_write_file(nn->rec_file);
nfs4_reset_creds(original_cred);
}
-typedef int (recdir_func)(struct dentry *, struct dentry *);
+typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *);
struct name_list {
char name[HEXDIR_LEN];
@@ -219,10 +261,10 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
}
static int
-nfsd4_list_rec_dir(recdir_func *f)
+nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
{
const struct cred *original_cred;
- struct dentry *dir = rec_file->f_path.dentry;
+ struct dentry *dir = nn->rec_file->f_path.dentry;
LIST_HEAD(names);
int status;
@@ -230,13 +272,13 @@ nfsd4_list_rec_dir(recdir_func *f)
if (status < 0)
return status;
- status = vfs_llseek(rec_file, 0, SEEK_SET);
+ status = vfs_llseek(nn->rec_file, 0, SEEK_SET);
if (status < 0) {
nfs4_reset_creds(original_cred);
return status;
}
- status = vfs_readdir(rec_file, nfsd4_build_namelist, &names);
+ status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names);
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
while (!list_empty(&names)) {
struct name_list *entry;
@@ -248,7 +290,7 @@ nfsd4_list_rec_dir(recdir_func *f)
status = PTR_ERR(dentry);
break;
}
- status = f(dir, dentry);
+ status = f(dir, dentry, nn);
dput(dentry);
}
list_del(&entry->list);
@@ -260,14 +302,14 @@ nfsd4_list_rec_dir(recdir_func *f)
}
static int
-nfsd4_unlink_clid_dir(char *name, int namlen)
+nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
{
struct dentry *dir, *dentry;
int status;
dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
- dir = rec_file->f_path.dentry;
+ dir = nn->rec_file->f_path.dentry;
mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
dentry = lookup_one_len(name, dir, namlen);
if (IS_ERR(dentry)) {
@@ -289,37 +331,52 @@ static void
nfsd4_remove_clid_dir(struct nfs4_client *clp)
{
const struct cred *original_cred;
+ struct nfs4_client_reclaim *crp;
+ char dname[HEXDIR_LEN];
int status;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+ if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
return;
- status = mnt_want_write_file(rec_file);
+ status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+ if (status)
+ return legacy_recdir_name_error(status);
+
+ status = mnt_want_write_file(nn->rec_file);
if (status)
goto out;
clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
status = nfs4_save_creds(&original_cred);
if (status < 0)
- goto out;
+ goto out_drop_write;
- status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
+ status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn);
nfs4_reset_creds(original_cred);
- if (status == 0)
- vfs_fsync(rec_file, 0);
- mnt_drop_write_file(rec_file);
+ if (status == 0) {
+ vfs_fsync(nn->rec_file, 0);
+ if (nn->in_grace) {
+ /* remove reclaim record */
+ crp = nfsd4_find_reclaim_client(dname, nn);
+ if (crp)
+ nfs4_remove_reclaim_record(crp, nn);
+ }
+ }
+out_drop_write:
+ mnt_drop_write_file(nn->rec_file);
out:
if (status)
printk("NFSD: Failed to remove expired client state directory"
- " %.*s\n", HEXDIR_LEN, clp->cl_recdir);
+ " %.*s\n", HEXDIR_LEN, dname);
}
static int
-purge_old(struct dentry *parent, struct dentry *child)
+purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
{
int status;
- if (nfs4_has_reclaimed_state(child->d_name.name, false))
+ if (nfs4_has_reclaimed_state(child->d_name.name, nn))
return 0;
status = vfs_rmdir(parent->d_inode, child);
@@ -331,27 +388,29 @@ purge_old(struct dentry *parent, struct dentry *child)
}
static void
-nfsd4_recdir_purge_old(struct net *net, time_t boot_time)
+nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time)
{
int status;
- if (!rec_file)
+ nn->in_grace = false;
+ if (!nn->rec_file)
return;
- status = mnt_want_write_file(rec_file);
+ status = mnt_want_write_file(nn->rec_file);
if (status)
goto out;
- status = nfsd4_list_rec_dir(purge_old);
+ status = nfsd4_list_rec_dir(purge_old, nn);
if (status == 0)
- vfs_fsync(rec_file, 0);
- mnt_drop_write_file(rec_file);
+ vfs_fsync(nn->rec_file, 0);
+ mnt_drop_write_file(nn->rec_file);
out:
+ nfs4_release_reclaim(nn);
if (status)
printk("nfsd4: failed to purge old clients from recovery"
- " directory %s\n", rec_file->f_path.dentry->d_name.name);
+ " directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
}
static int
-load_recdir(struct dentry *parent, struct dentry *child)
+load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
{
if (child->d_name.len != HEXDIR_LEN - 1) {
printk("nfsd4: illegal name %s in recovery directory\n",
@@ -359,21 +418,22 @@ load_recdir(struct dentry *parent, struct dentry *child)
/* Keep trying; maybe the others are OK: */
return 0;
}
- nfs4_client_to_reclaim(child->d_name.name);
+ nfs4_client_to_reclaim(child->d_name.name, nn);
return 0;
}
static int
-nfsd4_recdir_load(void) {
+nfsd4_recdir_load(struct net *net) {
int status;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- if (!rec_file)
+ if (!nn->rec_file)
return 0;
- status = nfsd4_list_rec_dir(load_recdir);
+ status = nfsd4_list_rec_dir(load_recdir, nn);
if (status)
printk("nfsd4: failed loading clients from recovery"
- " directory %s\n", rec_file->f_path.dentry->d_name.name);
+ " directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
return status;
}
@@ -382,15 +442,16 @@ nfsd4_recdir_load(void) {
*/
static int
-nfsd4_init_recdir(void)
+nfsd4_init_recdir(struct net *net)
{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
const struct cred *original_cred;
int status;
printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
user_recovery_dirname);
- BUG_ON(rec_file);
+ BUG_ON(nn->rec_file);
status = nfs4_save_creds(&original_cred);
if (status < 0) {
@@ -400,23 +461,65 @@ nfsd4_init_recdir(void)
return status;
}
- rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
- if (IS_ERR(rec_file)) {
+ nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
+ if (IS_ERR(nn->rec_file)) {
printk("NFSD: unable to find recovery directory %s\n",
user_recovery_dirname);
- status = PTR_ERR(rec_file);
- rec_file = NULL;
+ status = PTR_ERR(nn->rec_file);
+ nn->rec_file = NULL;
}
nfs4_reset_creds(original_cred);
+ if (!status)
+ nn->in_grace = true;
return status;
}
+
+static int
+nfs4_legacy_state_init(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int i;
+
+ nn->reclaim_str_hashtbl = kmalloc(sizeof(struct list_head) *
+ CLIENT_HASH_SIZE, GFP_KERNEL);
+ if (!nn->reclaim_str_hashtbl)
+ return -ENOMEM;
+
+ for (i = 0; i < CLIENT_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
+ nn->reclaim_str_hashtbl_size = 0;
+
+ return 0;
+}
+
+static void
+nfs4_legacy_state_shutdown(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ kfree(nn->reclaim_str_hashtbl);
+}
+
static int
nfsd4_load_reboot_recovery_data(struct net *net)
{
int status;
+ status = nfsd4_init_recdir(net);
+ if (!status)
+ status = nfsd4_recdir_load(net);
+ if (status)
+ printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+ return status;
+}
+
+static int
+nfsd4_legacy_tracking_init(struct net *net)
+{
+ int status;
+
/* XXX: The legacy code won't work in a container */
if (net != &init_net) {
WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client "
@@ -424,30 +527,37 @@ nfsd4_load_reboot_recovery_data(struct net *net)
return -EINVAL;
}
- nfs4_lock_state();
- status = nfsd4_init_recdir();
- if (!status)
- status = nfsd4_recdir_load();
- nfs4_unlock_state();
+ status = nfs4_legacy_state_init(net);
if (status)
- printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
+ return status;
+
+ status = nfsd4_load_reboot_recovery_data(net);
+ if (status)
+ goto err;
+ return 0;
+
+err:
+ nfs4_legacy_state_shutdown(net);
return status;
}
static void
-nfsd4_shutdown_recdir(void)
+nfsd4_shutdown_recdir(struct nfsd_net *nn)
{
- if (!rec_file)
+ if (!nn->rec_file)
return;
- fput(rec_file);
- rec_file = NULL;
+ fput(nn->rec_file);
+ nn->rec_file = NULL;
}
static void
nfsd4_legacy_tracking_exit(struct net *net)
{
- nfs4_release_reclaim();
- nfsd4_shutdown_recdir();
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nfs4_release_reclaim(nn);
+ nfsd4_shutdown_recdir(nn);
+ nfs4_legacy_state_shutdown(net);
}
/*
@@ -480,13 +590,26 @@ nfs4_recoverydir(void)
static int
nfsd4_check_legacy_client(struct nfs4_client *clp)
{
+ int status;
+ char dname[HEXDIR_LEN];
+ struct nfs4_client_reclaim *crp;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
/* did we already find that this client is stable? */
if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
return 0;
+ status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+ if (status) {
+ legacy_recdir_name_error(status);
+ return status;
+ }
+
/* look for it in the reclaim hashtable otherwise */
- if (nfsd4_find_reclaim_client(clp)) {
+ crp = nfsd4_find_reclaim_client(dname, nn);
+ if (crp) {
set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ crp->cr_clp = clp;
return 0;
}
@@ -494,7 +617,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
}
static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
- .init = nfsd4_load_reboot_recovery_data,
+ .init = nfsd4_legacy_tracking_init,
.exit = nfsd4_legacy_tracking_exit,
.create = nfsd4_create_clid_dir,
.remove = nfsd4_remove_clid_dir,
@@ -785,8 +908,7 @@ nfsd4_cld_create(struct nfs4_client *clp)
{
int ret;
struct cld_upcall *cup;
- /* FIXME: determine net from clp */
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
/* Don't upcall if it's already stored */
@@ -823,8 +945,7 @@ nfsd4_cld_remove(struct nfs4_client *clp)
{
int ret;
struct cld_upcall *cup;
- /* FIXME: determine net from clp */
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
/* Don't upcall if it's already removed */
@@ -861,8 +982,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
{
int ret;
struct cld_upcall *cup;
- /* FIXME: determine net from clp */
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
/* Don't upcall if one was already stored during this grace pd */
@@ -892,11 +1012,10 @@ nfsd4_cld_check(struct nfs4_client *clp)
}
static void
-nfsd4_cld_grace_done(struct net *net, time_t boot_time)
+nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
{
int ret;
struct cld_upcall *cup;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
cup = alloc_cld_upcall(cn);
@@ -926,28 +1045,261 @@ static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
.grace_done = nfsd4_cld_grace_done,
};
+/* upcall via usermodehelper */
+static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack";
+module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog),
+ S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program");
+
+static bool cltrack_legacy_disable;
+module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cltrack_legacy_disable,
+ "Disable legacy recoverydir conversion. Default: false");
+
+#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
+#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
+
+static char *
+nfsd4_cltrack_legacy_topdir(void)
+{
+ int copied;
+ size_t len;
+ char *result;
+
+ if (cltrack_legacy_disable)
+ return NULL;
+
+ len = strlen(LEGACY_TOPDIR_ENV_PREFIX) +
+ strlen(nfs4_recoverydir()) + 1;
+
+ result = kmalloc(len, GFP_KERNEL);
+ if (!result)
+ return result;
+
+ copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s",
+ nfs4_recoverydir());
+ if (copied >= len) {
+ /* just return nothing if output was truncated */
+ kfree(result);
+ return NULL;
+ }
+
+ return result;
+}
+
+static char *
+nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
+{
+ int copied;
+ size_t len;
+ char *result;
+
+ if (cltrack_legacy_disable)
+ return NULL;
+
+ /* +1 is for '/' between "topdir" and "recdir" */
+ len = strlen(LEGACY_RECDIR_ENV_PREFIX) +
+ strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN;
+
+ result = kmalloc(len, GFP_KERNEL);
+ if (!result)
+ return result;
+
+ copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/",
+ nfs4_recoverydir());
+ if (copied > (len - HEXDIR_LEN)) {
+ /* just return nothing if output will be truncated */
+ kfree(result);
+ return NULL;
+ }
+
+ copied = nfs4_make_rec_clidname(result + copied, name);
+ if (copied) {
+ kfree(result);
+ return NULL;
+ }
+
+ return result;
+}
+
+static int
+nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
+{
+ char *envp[2];
+ char *argv[4];
+ int ret;
+
+ if (unlikely(!cltrack_prog[0])) {
+ dprintk("%s: cltrack_prog is disabled\n", __func__);
+ return -EACCES;
+ }
+
+ dprintk("%s: cmd: %s\n", __func__, cmd);
+ dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
+ dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)");
+
+ envp[0] = legacy;
+ envp[1] = NULL;
+
+ argv[0] = (char *)cltrack_prog;
+ argv[1] = cmd;
+ argv[2] = arg;
+ argv[3] = NULL;
+
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ /*
+ * Disable the upcall mechanism if we're getting an ENOENT or EACCES
+ * error. The admin can re-enable it on the fly by using sysfs
+ * once the problem has been fixed.
+ */
+ if (ret == -ENOENT || ret == -EACCES) {
+ dprintk("NFSD: %s was not found or isn't executable (%d). "
+ "Setting cltrack_prog to blank string!",
+ cltrack_prog, ret);
+ cltrack_prog[0] = '\0';
+ }
+ dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret);
+
+ return ret;
+}
+
+static char *
+bin_to_hex_dup(const unsigned char *src, int srclen)
+{
+ int i;
+ char *buf, *hex;
+
+ /* +1 for terminating NULL */
+ buf = kmalloc((srclen * 2) + 1, GFP_KERNEL);
+ if (!buf)
+ return buf;
+
+ hex = buf;
+ for (i = 0; i < srclen; i++) {
+ sprintf(hex, "%2.2x", *src++);
+ hex += 2;
+ }
+ return buf;
+}
+
+static int
+nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
+{
+ return nfsd4_umh_cltrack_upcall("init", NULL, NULL);
+}
+
+static void
+nfsd4_umh_cltrack_create(struct nfs4_client *clp)
+{
+ char *hexid;
+
+ hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+ if (!hexid) {
+ dprintk("%s: can't allocate memory for upcall!\n", __func__);
+ return;
+ }
+ nfsd4_umh_cltrack_upcall("create", hexid, NULL);
+ kfree(hexid);
+}
+
+static void
+nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
+{
+ char *hexid;
+
+ hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+ if (!hexid) {
+ dprintk("%s: can't allocate memory for upcall!\n", __func__);
+ return;
+ }
+ nfsd4_umh_cltrack_upcall("remove", hexid, NULL);
+ kfree(hexid);
+}
+
+static int
+nfsd4_umh_cltrack_check(struct nfs4_client *clp)
+{
+ int ret;
+ char *hexid, *legacy;
+
+ hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
+ if (!hexid) {
+ dprintk("%s: can't allocate memory for upcall!\n", __func__);
+ return -ENOMEM;
+ }
+ legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
+ ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy);
+ kfree(legacy);
+ kfree(hexid);
+ return ret;
+}
+
+static void
+nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn,
+ time_t boot_time)
+{
+ char *legacy;
+ char timestr[22]; /* FIXME: better way to determine max size? */
+
+ sprintf(timestr, "%ld", boot_time);
+ legacy = nfsd4_cltrack_legacy_topdir();
+ nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy);
+ kfree(legacy);
+}
+
+static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
+ .init = nfsd4_umh_cltrack_init,
+ .exit = NULL,
+ .create = nfsd4_umh_cltrack_create,
+ .remove = nfsd4_umh_cltrack_remove,
+ .check = nfsd4_umh_cltrack_check,
+ .grace_done = nfsd4_umh_cltrack_grace_done,
+};
+
int
nfsd4_client_tracking_init(struct net *net)
{
int status;
struct path path;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- if (!client_tracking_ops) {
- client_tracking_ops = &nfsd4_cld_tracking_ops;
- status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
- if (!status) {
- if (S_ISDIR(path.dentry->d_inode->i_mode))
- client_tracking_ops =
- &nfsd4_legacy_tracking_ops;
- path_put(&path);
- }
+ /* just run the init if it the method is already decided */
+ if (nn->client_tracking_ops)
+ goto do_init;
+
+ /*
+ * First, try a UMH upcall. It should succeed or fail quickly, so
+ * there's little harm in trying that first.
+ */
+ nn->client_tracking_ops = &nfsd4_umh_tracking_ops;
+ status = nn->client_tracking_ops->init(net);
+ if (!status)
+ return status;
+
+ /*
+ * See if the recoverydir exists and is a directory. If it is,
+ * then use the legacy ops.
+ */
+ nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
+ status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
+ if (!status) {
+ status = S_ISDIR(path.dentry->d_inode->i_mode);
+ path_put(&path);
+ if (status)
+ goto do_init;
}
- status = client_tracking_ops->init(net);
+ /* Finally, try to use nfsdcld */
+ nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+ printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
+ "removed in 3.10. Please transition to using "
+ "nfsdcltrack.\n");
+do_init:
+ status = nn->client_tracking_ops->init(net);
if (status) {
printk(KERN_WARNING "NFSD: Unable to initialize client "
"recovery tracking! (%d)\n", status);
- client_tracking_ops = NULL;
+ nn->client_tracking_ops = NULL;
}
return status;
}
@@ -955,40 +1307,49 @@ nfsd4_client_tracking_init(struct net *net)
void
nfsd4_client_tracking_exit(struct net *net)
{
- if (client_tracking_ops) {
- client_tracking_ops->exit(net);
- client_tracking_ops = NULL;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (nn->client_tracking_ops) {
+ if (nn->client_tracking_ops->exit)
+ nn->client_tracking_ops->exit(net);
+ nn->client_tracking_ops = NULL;
}
}
void
nfsd4_client_record_create(struct nfs4_client *clp)
{
- if (client_tracking_ops)
- client_tracking_ops->create(clp);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ if (nn->client_tracking_ops)
+ nn->client_tracking_ops->create(clp);
}
void
nfsd4_client_record_remove(struct nfs4_client *clp)
{
- if (client_tracking_ops)
- client_tracking_ops->remove(clp);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ if (nn->client_tracking_ops)
+ nn->client_tracking_ops->remove(clp);
}
int
nfsd4_client_record_check(struct nfs4_client *clp)
{
- if (client_tracking_ops)
- return client_tracking_ops->check(clp);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ if (nn->client_tracking_ops)
+ return nn->client_tracking_ops->check(clp);
return -EOPNOTSUPP;
}
void
-nfsd4_record_grace_done(struct net *net, time_t boot_time)
+nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time)
{
- if (client_tracking_ops)
- client_tracking_ops->grace_done(net, boot_time);
+ if (nn->client_tracking_ops)
+ nn->client_tracking_ops->grace_done(nn, boot_time);
}
static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d0237f872cc4..ac8ed96c4199 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -44,16 +44,11 @@
#include "xdr4.h"
#include "vfs.h"
#include "current_stateid.h"
-#include "fault_inject.h"
#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
-/* Globals */
-time_t nfsd4_lease = 90; /* default lease time */
-time_t nfsd4_grace = 90;
-
#define all_ones {{~0,~0},~0}
static const stateid_t one_stateid = {
.si_generation = ~0,
@@ -176,8 +171,6 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
return ret & OWNER_HASH_MASK;
}
-static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
-
/* hash table for nfs4_file */
#define FILE_HASH_BITS 8
#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
@@ -192,7 +185,7 @@ static struct list_head file_hashtbl[FILE_HASH_SIZE];
static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
{
- BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
+ WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
atomic_inc(&fp->fi_access[oflag]);
}
@@ -251,7 +244,7 @@ static inline int get_new_stid(struct nfs4_stid *stid)
* preallocations that can exist at a time, but the state lock
* prevents anyone from using ours before we get here:
*/
- BUG_ON(error);
+ WARN_ON_ONCE(error);
/*
* It shouldn't be a problem to reuse an opaque stateid value.
* I don't think it is for 4.1. But with 4.0 I worry that, for
@@ -340,7 +333,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
dp->dl_time = 0;
atomic_set(&dp->dl_count, 1);
- INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
+ nfsd4_init_callback(&dp->dl_recall);
return dp;
}
@@ -390,14 +383,6 @@ unhash_delegation(struct nfs4_delegation *dp)
* SETCLIENTID state
*/
-/* client_lock protects the client lru list and session hash table */
-static DEFINE_SPINLOCK(client_lock);
-
-/* Hash tables for nfs4_clientid state */
-#define CLIENT_HASH_BITS 4
-#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
-#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
-
static unsigned int clientid_hashval(u32 id)
{
return id & CLIENT_HASH_MASK;
@@ -409,31 +394,6 @@ static unsigned int clientstr_hashval(const char *name)
}
/*
- * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
- * used in reboot/reset lease grace period processing
- *
- * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
- * setclientid_confirmed info.
- *
- * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed
- * setclientid info.
- *
- * client_lru holds client queue ordered by nfs4_client.cl_time
- * for lease renewal.
- *
- * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
- * for last close replay.
- */
-static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE];
-static int reclaim_str_hashtbl_size = 0;
-static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
-static struct list_head client_lru;
-static struct list_head close_lru;
-
-/*
* We store the NONE, READ, WRITE, and BOTH bits separately in the
* st_{access,deny}_bmap field of the stateid, in order to track not
* only what share bits are currently in force, but also what
@@ -526,7 +486,8 @@ static int nfs4_access_to_omode(u32 access)
case NFS4_SHARE_ACCESS_BOTH:
return O_RDWR;
}
- BUG();
+ WARN_ON_ONCE(1);
+ return O_RDONLY;
}
/* release all access and file references for a given stateid */
@@ -652,9 +613,6 @@ static void release_openowner(struct nfs4_openowner *oo)
nfs4_free_openowner(oo);
}
-#define SESSION_HASH_SIZE 512
-static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
-
static inline int
hash_sessionid(struct nfs4_sessionid *sessionid)
{
@@ -785,9 +743,12 @@ out_free:
return NULL;
}
-static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize)
+static void init_forechannel_attrs(struct nfsd4_channel_attrs *new,
+ struct nfsd4_channel_attrs *req,
+ int numslots, int slotsize,
+ struct nfsd_net *nn)
{
- u32 maxrpc = nfsd_serv->sv_max_mesg;
+ u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
new->maxreqs = numslots;
new->maxresp_cached = min_t(u32, req->maxresp_cached,
@@ -906,21 +867,27 @@ static void __free_session(struct nfsd4_session *ses)
static void free_session(struct kref *kref)
{
struct nfsd4_session *ses;
+ struct nfsd_net *nn;
- lockdep_assert_held(&client_lock);
ses = container_of(kref, struct nfsd4_session, se_ref);
+ nn = net_generic(ses->se_client->net, nfsd_net_id);
+
+ lockdep_assert_held(&nn->client_lock);
nfsd4_del_conns(ses);
__free_session(ses);
}
void nfsd4_put_session(struct nfsd4_session *ses)
{
- spin_lock(&client_lock);
+ struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
nfsd4_put_session_locked(ses);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
}
-static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
+static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan,
+ struct nfsd_net *nn)
{
struct nfsd4_session *new;
int numslots, slotsize;
@@ -941,13 +908,14 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
return NULL;
}
- init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize);
+ init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn);
return new;
}
-static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
+static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
{
int idx;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
new->se_client = clp;
gen_sessionid(new);
@@ -957,14 +925,15 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s
new->se_cb_seq_nr = 1;
new->se_flags = cses->flags;
new->se_cb_prog = cses->callback_prog;
+ new->se_cb_sec = cses->cb_sec;
kref_init(&new->se_ref);
idx = hash_sessionid(&new->se_sessionid);
- spin_lock(&client_lock);
- list_add(&new->se_hash, &sessionid_hashtbl[idx]);
+ spin_lock(&nn->client_lock);
+ list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
spin_lock(&clp->cl_lock);
list_add(&new->se_perclnt, &clp->cl_sessions);
spin_unlock(&clp->cl_lock);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
if (cses->flags & SESSION4_BACK_CHAN) {
struct sockaddr *sa = svc_addr(rqstp);
@@ -978,20 +947,20 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s
rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
}
- return new;
}
/* caller must hold client_lock */
static struct nfsd4_session *
-find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
+find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
{
struct nfsd4_session *elem;
int idx;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
dump_sessionid(__func__, sessionid);
idx = hash_sessionid(sessionid);
/* Search in the appropriate list */
- list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
+ list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) {
if (!memcmp(elem->se_sessionid.data, sessionid->data,
NFS4_MAX_SESSIONID_LEN)) {
return elem;
@@ -1016,6 +985,8 @@ unhash_session(struct nfsd4_session *ses)
static inline void
renew_client_locked(struct nfs4_client *clp)
{
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
if (is_client_expired(clp)) {
WARN_ON(1);
printk("%s: client (clientid %08x/%08x) already expired\n",
@@ -1028,16 +999,18 @@ renew_client_locked(struct nfs4_client *clp)
dprintk("renewing client (clientid %08x/%08x)\n",
clp->cl_clientid.cl_boot,
clp->cl_clientid.cl_id);
- list_move_tail(&clp->cl_lru, &client_lru);
+ list_move_tail(&clp->cl_lru, &nn->client_lru);
clp->cl_time = get_seconds();
}
static inline void
renew_client(struct nfs4_client *clp)
{
- spin_lock(&client_lock);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
renew_client_locked(clp);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
}
/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
@@ -1075,7 +1048,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
static inline void
free_client(struct nfs4_client *clp)
{
- lockdep_assert_held(&client_lock);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ lockdep_assert_held(&nn->client_lock);
while (!list_empty(&clp->cl_sessions)) {
struct nfsd4_session *ses;
ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
@@ -1092,15 +1067,16 @@ void
release_session_client(struct nfsd4_session *session)
{
struct nfs4_client *clp = session->se_client;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock))
+ if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
return;
if (is_client_expired(clp)) {
free_client(clp);
session->se_client = NULL;
} else
renew_client_locked(clp);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
}
/* must be called under the client_lock */
@@ -1123,6 +1099,7 @@ destroy_client(struct nfs4_client *clp)
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct list_head reaplist;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
INIT_LIST_HEAD(&reaplist);
spin_lock(&recall_lock);
@@ -1144,12 +1121,15 @@ destroy_client(struct nfs4_client *clp)
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
list_del(&clp->cl_idhash);
- list_del(&clp->cl_strhash);
- spin_lock(&client_lock);
+ if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
+ rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
+ else
+ rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+ spin_lock(&nn->client_lock);
unhash_client_locked(clp);
if (atomic_read(&clp->cl_refcount) == 0)
free_client(clp);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
}
static void expire_client(struct nfs4_client *clp)
@@ -1187,6 +1167,17 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
return 0;
}
+static long long
+compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2)
+{
+ long long res;
+
+ res = o1->len - o2->len;
+ if (res)
+ return res;
+ return (long long)memcmp(o1->data, o2->data, o1->len);
+}
+
static int same_name(const char *n1, const char *n2)
{
return 0 == memcmp(n1, n2, HEXDIR_LEN);
@@ -1247,10 +1238,9 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
}
-static void gen_clid(struct nfs4_client *clp)
+static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
{
static u32 current_clientid = 1;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
clp->cl_clientid.cl_boot = nn->boot_time;
clp->cl_clientid.cl_id = current_clientid++;
@@ -1283,12 +1273,14 @@ static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t
return NULL;
}
-static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
+static struct nfs4_client *create_client(struct xdr_netobj name,
struct svc_rqst *rqstp, nfs4_verifier *verf)
{
struct nfs4_client *clp;
struct sockaddr *sa = svc_addr(rqstp);
int ret;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
clp = alloc_client(name);
if (clp == NULL)
@@ -1297,23 +1289,21 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
INIT_LIST_HEAD(&clp->cl_sessions);
ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
if (ret) {
- spin_lock(&client_lock);
+ spin_lock(&nn->client_lock);
free_client(clp);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
return NULL;
}
idr_init(&clp->cl_stateids);
- memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
atomic_set(&clp->cl_refcount, 0);
clp->cl_cb_state = NFSD4_CB_UNKNOWN;
INIT_LIST_HEAD(&clp->cl_idhash);
- INIT_LIST_HEAD(&clp->cl_strhash);
INIT_LIST_HEAD(&clp->cl_openowners);
INIT_LIST_HEAD(&clp->cl_delegations);
INIT_LIST_HEAD(&clp->cl_lru);
INIT_LIST_HEAD(&clp->cl_callbacks);
spin_lock_init(&clp->cl_lock);
- INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc);
+ nfsd4_init_callback(&clp->cl_cb_null);
clp->cl_time = get_seconds();
clear_bit(0, &clp->cl_cb_slot_busy);
rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -1321,17 +1311,60 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
gen_confirm(clp);
clp->cl_cb_session = NULL;
+ clp->net = net;
return clp;
}
static void
-add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
+add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ struct nfs4_client *clp;
+
+ while (*new) {
+ clp = rb_entry(*new, struct nfs4_client, cl_namenode);
+ parent = *new;
+
+ if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0)
+ new = &((*new)->rb_left);
+ else
+ new = &((*new)->rb_right);
+ }
+
+ rb_link_node(&new_clp->cl_namenode, parent, new);
+ rb_insert_color(&new_clp->cl_namenode, root);
+}
+
+static struct nfs4_client *
+find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
+{
+ long long cmp;
+ struct rb_node *node = root->rb_node;
+ struct nfs4_client *clp;
+
+ while (node) {
+ clp = rb_entry(node, struct nfs4_client, cl_namenode);
+ cmp = compare_blob(&clp->cl_name, name);
+ if (cmp > 0)
+ node = node->rb_left;
+ else if (cmp < 0)
+ node = node->rb_right;
+ else
+ return clp;
+ }
+ return NULL;
+}
+
+static void
+add_to_unconfirmed(struct nfs4_client *clp)
{
unsigned int idhashval;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]);
+ clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
+ add_clp_to_name_tree(clp, &nn->unconf_name_tree);
idhashval = clientid_hashval(clp->cl_clientid.cl_id);
- list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]);
+ list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);
renew_client(clp);
}
@@ -1339,22 +1372,23 @@ static void
move_to_confirmed(struct nfs4_client *clp)
{
unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
- unsigned int strhashval;
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
- list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
- strhashval = clientstr_hashval(clp->cl_recdir);
- list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
+ list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
+ rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+ add_clp_to_name_tree(clp, &nn->conf_name_tree);
+ set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
renew_client(clp);
}
static struct nfs4_client *
-find_confirmed_client(clientid_t *clid, bool sessions)
+find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
{
struct nfs4_client *clp;
unsigned int idhashval = clientid_hashval(clid->cl_id);
- list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) {
+ list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) {
if (same_clid(&clp->cl_clientid, clid)) {
if ((bool)clp->cl_minorversion != sessions)
return NULL;
@@ -1366,12 +1400,12 @@ find_confirmed_client(clientid_t *clid, bool sessions)
}
static struct nfs4_client *
-find_unconfirmed_client(clientid_t *clid, bool sessions)
+find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
{
struct nfs4_client *clp;
unsigned int idhashval = clientid_hashval(clid->cl_id);
- list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) {
+ list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) {
if (same_clid(&clp->cl_clientid, clid)) {
if ((bool)clp->cl_minorversion != sessions)
return NULL;
@@ -1387,27 +1421,15 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
}
static struct nfs4_client *
-find_confirmed_client_by_str(const char *dname, unsigned int hashval)
+find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
{
- struct nfs4_client *clp;
-
- list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
- if (same_name(clp->cl_recdir, dname))
- return clp;
- }
- return NULL;
+ return find_clp_in_name_tree(name, &nn->conf_name_tree);
}
static struct nfs4_client *
-find_unconfirmed_client_by_str(const char *dname, unsigned int hashval)
+find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
{
- struct nfs4_client *clp;
-
- list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
- if (same_name(clp->cl_recdir, dname))
- return clp;
- }
- return NULL;
+ return find_clp_in_name_tree(name, &nn->unconf_name_tree);
}
static void
@@ -1428,7 +1450,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
else
goto out_err;
- conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val,
+ conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val,
se->se_callback_addr_len,
(struct sockaddr *)&conn->cb_addr,
sizeof(conn->cb_addr));
@@ -1572,12 +1594,11 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
{
struct nfs4_client *unconf, *conf, *new;
__be32 status;
- unsigned int strhashval;
- char dname[HEXDIR_LEN];
char addr_str[INET6_ADDRSTRLEN];
nfs4_verifier verf = exid->verifier;
struct sockaddr *sa = svc_addr(rqstp);
bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
rpc_ntop(sa, addr_str, sizeof(addr_str));
dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
@@ -1592,24 +1613,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
switch (exid->spa_how) {
case SP4_NONE:
break;
+ default: /* checked by xdr code */
+ WARN_ON_ONCE(1);
case SP4_SSV:
- return nfserr_serverfault;
- default:
- BUG(); /* checked by xdr code */
case SP4_MACH_CRED:
return nfserr_serverfault; /* no excuse :-/ */
}
- status = nfs4_make_rec_clidname(dname, &exid->clname);
-
- if (status)
- return status;
-
- strhashval = clientstr_hashval(dname);
-
/* Cases below refer to rfc 5661 section 18.35.4: */
nfs4_lock_state();
- conf = find_confirmed_client_by_str(dname, strhashval);
+ conf = find_confirmed_client_by_name(&exid->clname, nn);
if (conf) {
bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
bool verfs_match = same_verf(&verf, &conf->cl_verifier);
@@ -1654,21 +1667,21 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
goto out;
}
- unconf = find_unconfirmed_client_by_str(dname, strhashval);
+ unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
if (unconf) /* case 4, possible retry or client restart */
expire_client(unconf);
/* case 1 (normal case) */
out_new:
- new = create_client(exid->clname, dname, rqstp, &verf);
+ new = create_client(exid->clname, rqstp, &verf);
if (new == NULL) {
status = nfserr_jukebox;
goto out;
}
new->cl_minorversion = 1;
- gen_clid(new);
- add_to_unconfirmed(new, strhashval);
+ gen_clid(new, nn);
+ add_to_unconfirmed(new);
out_copy:
exid->clientid.cl_boot = new->cl_clientid.cl_boot;
exid->clientid.cl_id = new->cl_clientid.cl_id;
@@ -1761,12 +1774,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
struct nfsd4_conn *conn;
struct nfsd4_clid_slot *cs_slot = NULL;
__be32 status = 0;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
return nfserr_inval;
if (check_forechannel_attrs(cr_ses->fore_channel))
return nfserr_toosmall;
- new = alloc_session(&cr_ses->fore_channel);
+ new = alloc_session(&cr_ses->fore_channel, nn);
if (!new)
return nfserr_jukebox;
status = nfserr_jukebox;
@@ -1775,8 +1789,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_session;
nfs4_lock_state();
- unconf = find_unconfirmed_client(&cr_ses->clientid, true);
- conf = find_confirmed_client(&cr_ses->clientid, true);
+ unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
+ conf = find_confirmed_client(&cr_ses->clientid, true, nn);
if (conf) {
cs_slot = &conf->cl_cs_slot;
@@ -1789,7 +1803,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
}
} else if (unconf) {
- unsigned int hash;
struct nfs4_client *old;
if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
!rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1803,8 +1816,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
status = nfserr_seq_misordered;
goto out_free_conn;
}
- hash = clientstr_hashval(unconf->cl_recdir);
- old = find_confirmed_client_by_str(unconf->cl_recdir, hash);
+ old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old)
expire_client(old);
move_to_confirmed(unconf);
@@ -1843,14 +1855,6 @@ out_free_session:
goto out;
}
-static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
-{
- struct nfsd4_compoundres *resp = rqstp->rq_resp;
- struct nfsd4_compoundargs *argp = rqstp->rq_argp;
-
- return argp->opcnt == resp->opcnt;
-}
-
static __be32 nfsd4_map_bcts_dir(u32 *dir)
{
switch (*dir) {
@@ -1865,24 +1869,40 @@ static __be32 nfsd4_map_bcts_dir(u32 *dir)
return nfserr_inval;
}
+__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_backchannel_ctl *bc)
+{
+ struct nfsd4_session *session = cstate->session;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ spin_lock(&nn->client_lock);
+ session->se_cb_prog = bc->bc_cb_program;
+ session->se_cb_sec = bc->bc_cb_sec;
+ spin_unlock(&nn->client_lock);
+
+ nfsd4_probe_callback(session->se_client);
+
+ return nfs_ok;
+}
+
__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate,
struct nfsd4_bind_conn_to_session *bcts)
{
__be32 status;
struct nfsd4_conn *conn;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (!nfsd4_last_compound_op(rqstp))
return nfserr_not_only_op;
- spin_lock(&client_lock);
- cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid);
+ spin_lock(&nn->client_lock);
+ cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
/* Sorta weird: we only need the refcnt'ing because new_conn acquires
* client_lock iself: */
if (cstate->session) {
nfsd4_get_session(cstate->session);
atomic_inc(&cstate->session->se_client->cl_refcount);
}
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
if (!cstate->session)
return nfserr_badsession;
@@ -1910,6 +1930,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
{
struct nfsd4_session *ses;
__be32 status = nfserr_badsession;
+ struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
/* Notes:
* - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
@@ -1923,24 +1944,24 @@ nfsd4_destroy_session(struct svc_rqst *r,
return nfserr_not_only_op;
}
dump_sessionid(__func__, &sessionid->sessionid);
- spin_lock(&client_lock);
- ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
+ spin_lock(&nn->client_lock);
+ ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
if (!ses) {
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
goto out;
}
unhash_session(ses);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
nfs4_lock_state();
nfsd4_probe_callback_sync(ses->se_client);
nfs4_unlock_state();
- spin_lock(&client_lock);
+ spin_lock(&nn->client_lock);
nfsd4_del_conns(ses);
nfsd4_put_session_locked(ses);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
status = nfs_ok;
out:
dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -2006,6 +2027,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
struct nfsd4_slot *slot;
struct nfsd4_conn *conn;
__be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (resp->opcnt != 1)
return nfserr_sequence_pos;
@@ -2018,9 +2040,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (!conn)
return nfserr_jukebox;
- spin_lock(&client_lock);
+ spin_lock(&nn->client_lock);
status = nfserr_badsession;
- session = find_in_sessionid_hashtbl(&seq->sessionid);
+ session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
if (!session)
goto out;
@@ -2094,7 +2116,7 @@ out:
}
}
kfree(conn);
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
dprintk("%s: return %d\n", __func__, ntohl(status));
return status;
}
@@ -2104,10 +2126,11 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
{
struct nfs4_client *conf, *unconf, *clp;
__be32 status = 0;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
nfs4_lock_state();
- unconf = find_unconfirmed_client(&dc->clientid, true);
- conf = find_confirmed_client(&dc->clientid, true);
+ unconf = find_unconfirmed_client(&dc->clientid, true, nn);
+ conf = find_confirmed_client(&dc->clientid, true, nn);
if (conf) {
clp = conf;
@@ -2181,20 +2204,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
struct xdr_netobj clname = setclid->se_name;
nfs4_verifier clverifier = setclid->se_verf;
- unsigned int strhashval;
struct nfs4_client *conf, *unconf, *new;
__be32 status;
- char dname[HEXDIR_LEN];
-
- status = nfs4_make_rec_clidname(dname, &clname);
- if (status)
- return status;
-
- strhashval = clientstr_hashval(dname);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
/* Cases below refer to rfc 3530 section 14.2.33: */
nfs4_lock_state();
- conf = find_confirmed_client_by_str(dname, strhashval);
+ conf = find_confirmed_client_by_name(&clname, nn);
if (conf) {
/* case 0: */
status = nfserr_clid_inuse;
@@ -2209,21 +2225,21 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
}
- unconf = find_unconfirmed_client_by_str(dname, strhashval);
+ unconf = find_unconfirmed_client_by_name(&clname, nn);
if (unconf)
expire_client(unconf);
status = nfserr_jukebox;
- new = create_client(clname, dname, rqstp, &clverifier);
+ new = create_client(clname, rqstp, &clverifier);
if (new == NULL)
goto out;
if (conf && same_verf(&conf->cl_verifier, &clverifier))
/* case 1: probable callback update */
copy_clid(new, conf);
else /* case 4 (new client) or cases 2, 3 (client reboot): */
- gen_clid(new);
+ gen_clid(new, nn);
new->cl_minorversion = 0;
gen_callback(new, setclid, rqstp);
- add_to_unconfirmed(new, strhashval);
+ add_to_unconfirmed(new);
setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
@@ -2243,14 +2259,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
nfs4_verifier confirm = setclientid_confirm->sc_confirm;
clientid_t * clid = &setclientid_confirm->sc_clientid;
__be32 status;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (STALE_CLIENTID(clid, nn))
return nfserr_stale_clientid;
nfs4_lock_state();
- conf = find_confirmed_client(clid, false);
- unconf = find_unconfirmed_client(clid, false);
+ conf = find_confirmed_client(clid, false, nn);
+ unconf = find_unconfirmed_client(clid, false, nn);
/*
* We try hard to give out unique clientid's, so if we get an
* attempt to confirm the same clientid with a different cred,
@@ -2276,9 +2292,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
nfsd4_probe_callback(conf);
expire_client(unconf);
} else { /* case 3: normal case; new or rebooted client */
- unsigned int hash = clientstr_hashval(unconf->cl_recdir);
-
- conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
+ conf = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (conf)
expire_client(conf);
move_to_confirmed(unconf);
@@ -2340,7 +2354,7 @@ nfsd4_init_slabs(void)
if (openowner_slab == NULL)
goto out_nomem;
lockowner_slab = kmem_cache_create("nfsd4_lockowners",
- sizeof(struct nfs4_openowner), 0, 0, NULL);
+ sizeof(struct nfs4_lockowner), 0, 0, NULL);
if (lockowner_slab == NULL)
goto out_nomem;
file_slab = kmem_cache_create("nfsd4_files",
@@ -2404,7 +2418,9 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
{
- list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+
+ list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
list_add(&oo->oo_perclient, &clp->cl_openowners);
}
@@ -2444,11 +2460,13 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
}
static void
-move_to_close_lru(struct nfs4_openowner *oo)
+move_to_close_lru(struct nfs4_openowner *oo, struct net *net)
{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
- list_move_tail(&oo->oo_close_lru, &close_lru);
+ list_move_tail(&oo->oo_close_lru, &nn->close_lru);
oo->oo_time = get_seconds();
}
@@ -2462,13 +2480,14 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
}
static struct nfs4_openowner *
-find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, bool sessions)
+find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
+ bool sessions, struct nfsd_net *nn)
{
struct nfs4_stateowner *so;
struct nfs4_openowner *oo;
struct nfs4_client *clp;
- list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
+ list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) {
if (!so->so_is_open_owner)
continue;
if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
@@ -2555,9 +2574,14 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
struct nfs4_delegation *dp;
- BUG_ON(!fp);
- /* We assume break_lease is only called once per lease: */
- BUG_ON(fp->fi_had_conflict);
+ if (!fp) {
+ WARN(1, "(%p)->fl_owner NULL\n", fl);
+ return;
+ }
+ if (fp->fi_had_conflict) {
+ WARN(1, "duplicate break on %p\n", fp);
+ return;
+ }
/*
* We don't want the locks code to timeout the lease for us;
* we'll remove it ourself if a delegation isn't returned
@@ -2599,14 +2623,13 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4
__be32
nfsd4_process_open1(struct nfsd4_compound_state *cstate,
- struct nfsd4_open *open)
+ struct nfsd4_open *open, struct nfsd_net *nn)
{
clientid_t *clientid = &open->op_clientid;
struct nfs4_client *clp = NULL;
unsigned int strhashval;
struct nfs4_openowner *oo = NULL;
__be32 status;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
if (STALE_CLIENTID(&open->op_clientid, nn))
return nfserr_stale_clientid;
@@ -2619,10 +2642,11 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
return nfserr_jukebox;
strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
- oo = find_openstateowner_str(strhashval, open, cstate->minorversion);
+ oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn);
open->op_openowner = oo;
if (!oo) {
- clp = find_confirmed_client(clientid, cstate->minorversion);
+ clp = find_confirmed_client(clientid, cstate->minorversion,
+ nn);
if (clp == NULL)
return nfserr_expired;
goto new_owner;
@@ -2891,7 +2915,7 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
open->op_why_no_deleg = WND4_CANCELLED;
break;
case NFS4_SHARE_WANT_NO_DELEG:
- BUG(); /* not supposed to get here */
+ WARN_ON_ONCE(1);
}
}
}
@@ -2959,6 +2983,7 @@ out:
}
return;
out_free:
+ unhash_stid(&dp->dl_stid);
nfs4_put_delegation(dp);
out_no_deleg:
flag = NFS4_OPEN_DELEGATE_NONE;
@@ -3104,27 +3129,32 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
free_generic_stateid(open->op_stp);
}
+static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp)
+{
+ struct nfs4_client *found;
+
+ if (STALE_CLIENTID(clid, nn))
+ return nfserr_stale_clientid;
+ found = find_confirmed_client(clid, session, nn);
+ if (clp)
+ *clp = found;
+ return found ? nfs_ok : nfserr_expired;
+}
+
__be32
nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
clientid_t *clid)
{
struct nfs4_client *clp;
__be32 status;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
nfs4_lock_state();
dprintk("process_renew(%08x/%08x): starting\n",
clid->cl_boot, clid->cl_id);
- status = nfserr_stale_clientid;
- if (STALE_CLIENTID(clid, nn))
- goto out;
- clp = find_confirmed_client(clid, cstate->minorversion);
- status = nfserr_expired;
- if (clp == NULL) {
- /* We assume the client took too long to RENEW. */
- dprintk("nfsd4_renew: clientid not found!\n");
+ status = lookup_clientid(clid, cstate->minorversion, nn, &clp);
+ if (status)
goto out;
- }
status = nfserr_cb_path_down;
if (!list_empty(&clp->cl_delegations)
&& clp->cl_cb_state != NFSD4_CB_UP)
@@ -3136,44 +3166,42 @@ out:
}
static void
-nfsd4_end_grace(struct net *net)
+nfsd4_end_grace(struct nfsd_net *nn)
{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
/* do nothing if grace period already ended */
if (nn->grace_ended)
return;
dprintk("NFSD: end of grace period\n");
nn->grace_ended = true;
- nfsd4_record_grace_done(net, nn->boot_time);
+ nfsd4_record_grace_done(nn, nn->boot_time);
locks_end_grace(&nn->nfsd4_manager);
/*
* Now that every NFSv4 client has had the chance to recover and
* to see the (possibly new, possibly shorter) lease time, we
* can safely set the next grace time to the current lease time:
*/
- nfsd4_grace = nfsd4_lease;
+ nn->nfsd4_grace = nn->nfsd4_lease;
}
static time_t
-nfs4_laundromat(void)
+nfs4_laundromat(struct nfsd_net *nn)
{
struct nfs4_client *clp;
struct nfs4_openowner *oo;
struct nfs4_delegation *dp;
struct list_head *pos, *next, reaplist;
- time_t cutoff = get_seconds() - nfsd4_lease;
- time_t t, clientid_val = nfsd4_lease;
- time_t u, test_val = nfsd4_lease;
+ time_t cutoff = get_seconds() - nn->nfsd4_lease;
+ time_t t, clientid_val = nn->nfsd4_lease;
+ time_t u, test_val = nn->nfsd4_lease;
nfs4_lock_state();
dprintk("NFSD: laundromat service - starting\n");
- nfsd4_end_grace(&init_net);
+ nfsd4_end_grace(nn);
INIT_LIST_HEAD(&reaplist);
- spin_lock(&client_lock);
- list_for_each_safe(pos, next, &client_lru) {
+ spin_lock(&nn->client_lock);
+ list_for_each_safe(pos, next, &nn->client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
t = clp->cl_time - cutoff;
@@ -3189,7 +3217,7 @@ nfs4_laundromat(void)
unhash_client_locked(clp);
list_add(&clp->cl_lru, &reaplist);
}
- spin_unlock(&client_lock);
+ spin_unlock(&nn->client_lock);
list_for_each_safe(pos, next, &reaplist) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
dprintk("NFSD: purging unused client (clientid %08x)\n",
@@ -3199,6 +3227,8 @@ nfs4_laundromat(void)
spin_lock(&recall_lock);
list_for_each_safe(pos, next, &del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+ if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
+ continue;
if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
u = dp->dl_time - cutoff;
if (test_val > u)
@@ -3212,8 +3242,8 @@ nfs4_laundromat(void)
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
unhash_delegation(dp);
}
- test_val = nfsd4_lease;
- list_for_each_safe(pos, next, &close_lru) {
+ test_val = nn->nfsd4_lease;
+ list_for_each_safe(pos, next, &nn->close_lru) {
oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
u = oo->oo_time - cutoff;
@@ -3231,16 +3261,19 @@ nfs4_laundromat(void)
static struct workqueue_struct *laundry_wq;
static void laundromat_main(struct work_struct *);
-static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main);
static void
-laundromat_main(struct work_struct *not_used)
+laundromat_main(struct work_struct *laundry)
{
time_t t;
+ struct delayed_work *dwork = container_of(laundry, struct delayed_work,
+ work);
+ struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
+ laundromat_work);
- t = nfs4_laundromat();
+ t = nfs4_laundromat(nn);
dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t);
- queue_delayed_work(laundry_wq, &laundromat_work, t*HZ);
+ queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
}
static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
@@ -3385,16 +3418,17 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
return nfs_ok;
}
-static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, bool sessions)
+static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
+ struct nfs4_stid **s, bool sessions,
+ struct nfsd_net *nn)
{
struct nfs4_client *cl;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return nfserr_bad_stateid;
if (STALE_STATEID(stateid, nn))
return nfserr_stale_stateid;
- cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions);
+ cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn);
if (!cl)
return nfserr_expired;
*s = find_stateid_by_type(cl, stateid, typemask);
@@ -3416,6 +3450,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
struct nfs4_delegation *dp = NULL;
struct svc_fh *current_fh = &cstate->current_fh;
struct inode *ino = current_fh->fh_dentry->d_inode;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
__be32 status;
if (filpp)
@@ -3427,7 +3462,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return check_special_stateids(net, current_fh, stateid, flags);
- status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion);
+ status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
+ &s, cstate->minorversion, nn);
if (status)
return status;
status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
@@ -3441,7 +3477,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
goto out;
if (filpp) {
*filpp = dp->dl_file->fi_deleg_file;
- BUG_ON(!*filpp);
+ if (!*filpp) {
+ WARN_ON_ONCE(1);
+ status = nfserr_serverfault;
+ goto out;
+ }
}
break;
case NFS4_OPEN_STID:
@@ -3568,7 +3608,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
static __be32
nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
stateid_t *stateid, char typemask,
- struct nfs4_ol_stateid **stpp)
+ struct nfs4_ol_stateid **stpp,
+ struct nfsd_net *nn)
{
__be32 status;
struct nfs4_stid *s;
@@ -3577,7 +3618,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
seqid, STATEID_VAL(stateid));
*stpp = NULL;
- status = nfsd4_lookup_stateid(stateid, typemask, &s, cstate->minorversion);
+ status = nfsd4_lookup_stateid(stateid, typemask, &s,
+ cstate->minorversion, nn);
if (status)
return status;
*stpp = openlockstateid(s);
@@ -3586,13 +3628,14 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
}
-static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp)
+static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
+ stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn)
{
__be32 status;
struct nfs4_openowner *oo;
status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
- NFS4_OPEN_STID, stpp);
+ NFS4_OPEN_STID, stpp, nn);
if (status)
return status;
oo = openowner((*stpp)->st_stateowner);
@@ -3608,6 +3651,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfs4_openowner *oo;
struct nfs4_ol_stateid *stp;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3621,7 +3665,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_seqid_op(cstate,
oc->oc_seqid, &oc->oc_req_stateid,
- NFS4_OPEN_STID, &stp);
+ NFS4_OPEN_STID, &stp, nn);
if (status)
goto out;
oo = openowner(stp->st_stateowner);
@@ -3664,7 +3708,7 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
case NFS4_SHARE_ACCESS_BOTH:
break;
default:
- BUG();
+ WARN_ON_ONCE(1);
}
}
@@ -3685,6 +3729,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
{
__be32 status;
struct nfs4_ol_stateid *stp;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3697,7 +3742,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
nfs4_lock_state();
status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
- &od->od_stateid, &stp);
+ &od->od_stateid, &stp, nn);
if (status)
goto out;
status = nfserr_inval;
@@ -3760,6 +3805,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
struct nfs4_openowner *oo;
struct nfs4_ol_stateid *stp;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
dprintk("NFSD: nfsd4_close on file %.*s\n",
(int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3769,7 +3816,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
&close->cl_stateid,
NFS4_OPEN_STID|NFS4_CLOSED_STID,
- &stp);
+ &stp, nn);
if (status)
goto out;
oo = openowner(stp->st_stateowner);
@@ -3791,7 +3838,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* little while to handle CLOSE replay.
*/
if (list_empty(&oo->oo_owner.so_stateids))
- move_to_close_lru(oo);
+ move_to_close_lru(oo, SVC_NET(rqstp));
}
}
out:
@@ -3807,15 +3854,15 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfs4_delegation *dp;
stateid_t *stateid = &dr->dr_stateid;
struct nfs4_stid *s;
- struct inode *inode;
__be32 status;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- inode = cstate->current_fh.fh_dentry->d_inode;
nfs4_lock_state();
- status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion);
+ status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s,
+ cstate->minorversion, nn);
if (status)
goto out;
dp = delegstateid(s);
@@ -3833,8 +3880,6 @@ out:
#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
-#define LOCKOWNER_INO_HASH_BITS 8
-#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
static inline u64
@@ -3852,7 +3897,7 @@ last_byte_offset(u64 start, u64 len)
{
u64 end;
- BUG_ON(!len);
+ WARN_ON_ONCE(!len);
end = start + len;
return end > start ? end - 1: NFS4_MAX_UINT64;
}
@@ -3864,8 +3909,6 @@ static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct
& LOCKOWNER_INO_HASH_MASK;
}
-static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE];
-
/*
* TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
* we can't properly handle lock requests that go beyond the (2^63 - 1)-th
@@ -3931,12 +3974,12 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c
static struct nfs4_lockowner *
find_lockowner_str(struct inode *inode, clientid_t *clid,
- struct xdr_netobj *owner)
+ struct xdr_netobj *owner, struct nfsd_net *nn)
{
unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
struct nfs4_lockowner *lo;
- list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
+ list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
if (same_lockowner_ino(lo, inode, clid, owner))
return lo;
}
@@ -3948,9 +3991,10 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
struct inode *inode = open_stp->st_file->fi_inode;
unsigned int inohash = lockowner_ino_hashval(inode,
clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
- list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]);
+ list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
+ list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]);
list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
}
@@ -4024,8 +4068,10 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, s
struct nfs4_client *cl = oo->oo_owner.so_client;
struct nfs4_lockowner *lo;
unsigned int strhashval;
+ struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id);
- lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner);
+ lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid,
+ &lock->v.new.owner, nn);
if (lo) {
if (!cstate->minorversion)
return nfserr_bad_seqid;
@@ -4065,7 +4111,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
bool new_state = false;
int lkflg;
int err;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
(long long) lock->lk_offset,
@@ -4099,7 +4146,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_confirmed_seqid_op(cstate,
lock->lk_new_open_seqid,
&lock->lk_new_open_stateid,
- &open_stp);
+ &open_stp, nn);
if (status)
goto out;
open_sop = openowner(open_stp->st_stateowner);
@@ -4113,7 +4160,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
&lock->lk_old_lock_stateid,
- NFS4_LOCK_STID, &lock_stp);
+ NFS4_LOCK_STID, &lock_stp, nn);
if (status)
goto out;
lock_sop = lockowner(lock_stp->st_stateowner);
@@ -4124,10 +4171,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
status = nfserr_grace;
- if (locks_in_grace(SVC_NET(rqstp)) && !lock->lk_reclaim)
+ if (locks_in_grace(net) && !lock->lk_reclaim)
goto out;
status = nfserr_no_grace;
- if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim)
+ if (!locks_in_grace(net) && lock->lk_reclaim)
goto out;
file_lock = locks_alloc_lock();
@@ -4238,7 +4285,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct file_lock *file_lock = NULL;
struct nfs4_lockowner *lo;
__be32 status;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (locks_in_grace(SVC_NET(rqstp)))
return nfserr_grace;
@@ -4248,9 +4295,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
- status = nfserr_stale_clientid;
- if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid, nn))
- goto out;
+ if (!nfsd4_has_session(cstate)) {
+ status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL);
+ if (status)
+ goto out;
+ }
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
goto out;
@@ -4278,7 +4327,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner);
+ lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn);
if (lo)
file_lock->fl_owner = (fl_owner_t)lo;
file_lock->fl_pid = current->tgid;
@@ -4313,7 +4362,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct file_lock *file_lock = NULL;
__be32 status;
int err;
-
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
(long long) locku->lu_offset,
(long long) locku->lu_length);
@@ -4324,7 +4374,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfs4_lock_state();
status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
- &locku->lu_stateid, NFS4_LOCK_STID, &stp);
+ &locku->lu_stateid, NFS4_LOCK_STID,
+ &stp, nn);
if (status)
goto out;
filp = find_any_file(stp->st_file);
@@ -4414,23 +4465,21 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
struct list_head matches;
unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
__be32 status;
- struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
clid->cl_boot, clid->cl_id);
- /* XXX check for lease expiration */
-
- status = nfserr_stale_clientid;
- if (STALE_CLIENTID(clid, nn))
- return status;
-
nfs4_lock_state();
+ status = lookup_clientid(clid, cstate->minorversion, nn, NULL);
+ if (status)
+ goto out;
+
status = nfserr_locks_held;
INIT_LIST_HEAD(&matches);
- list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) {
+ list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) {
if (sop->so_is_open_owner)
continue;
if (!same_owner_str(sop, owner, clid))
@@ -4466,73 +4515,74 @@ alloc_reclaim(void)
return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
}
-int
-nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
+bool
+nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn)
{
- unsigned int strhashval = clientstr_hashval(name);
- struct nfs4_client *clp;
+ struct nfs4_client_reclaim *crp;
- clp = find_confirmed_client_by_str(name, strhashval);
- if (!clp)
- return 0;
- return test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+ crp = nfsd4_find_reclaim_client(name, nn);
+ return (crp && crp->cr_clp);
}
/*
* failure => all reset bets are off, nfserr_no_grace...
*/
-int
-nfs4_client_to_reclaim(const char *name)
+struct nfs4_client_reclaim *
+nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn)
{
unsigned int strhashval;
- struct nfs4_client_reclaim *crp = NULL;
+ struct nfs4_client_reclaim *crp;
dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name);
crp = alloc_reclaim();
- if (!crp)
- return 0;
- strhashval = clientstr_hashval(name);
- INIT_LIST_HEAD(&crp->cr_strhash);
- list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]);
- memcpy(crp->cr_recdir, name, HEXDIR_LEN);
- reclaim_str_hashtbl_size++;
- return 1;
+ if (crp) {
+ strhashval = clientstr_hashval(name);
+ INIT_LIST_HEAD(&crp->cr_strhash);
+ list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
+ memcpy(crp->cr_recdir, name, HEXDIR_LEN);
+ crp->cr_clp = NULL;
+ nn->reclaim_str_hashtbl_size++;
+ }
+ return crp;
+}
+
+void
+nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
+{
+ list_del(&crp->cr_strhash);
+ kfree(crp);
+ nn->reclaim_str_hashtbl_size--;
}
void
-nfs4_release_reclaim(void)
+nfs4_release_reclaim(struct nfsd_net *nn)
{
struct nfs4_client_reclaim *crp = NULL;
int i;
for (i = 0; i < CLIENT_HASH_SIZE; i++) {
- while (!list_empty(&reclaim_str_hashtbl[i])) {
- crp = list_entry(reclaim_str_hashtbl[i].next,
+ while (!list_empty(&nn->reclaim_str_hashtbl[i])) {
+ crp = list_entry(nn->reclaim_str_hashtbl[i].next,
struct nfs4_client_reclaim, cr_strhash);
- list_del(&crp->cr_strhash);
- kfree(crp);
- reclaim_str_hashtbl_size--;
+ nfs4_remove_reclaim_record(crp, nn);
}
}
- BUG_ON(reclaim_str_hashtbl_size);
+ WARN_ON_ONCE(nn->reclaim_str_hashtbl_size);
}
/*
* called from OPEN, CLAIM_PREVIOUS with a new clientid. */
struct nfs4_client_reclaim *
-nfsd4_find_reclaim_client(struct nfs4_client *clp)
+nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
{
unsigned int strhashval;
struct nfs4_client_reclaim *crp = NULL;
- dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n",
- clp->cl_name.len, clp->cl_name.data,
- clp->cl_recdir);
+ dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir);
- /* find clp->cl_name in reclaim_str_hashtbl */
- strhashval = clientstr_hashval(clp->cl_recdir);
- list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) {
- if (same_name(crp->cr_recdir, clp->cl_recdir)) {
+ strhashval = clientstr_hashval(recdir);
+ list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) {
+ if (same_name(crp->cr_recdir, recdir)) {
return crp;
}
}
@@ -4543,12 +4593,12 @@ nfsd4_find_reclaim_client(struct nfs4_client *clp)
* Called from OPEN. Look for clientid in reclaim list.
*/
__be32
-nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
+nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)
{
struct nfs4_client *clp;
/* find clientid in conf_id_hashtbl */
- clp = find_confirmed_client(clid, sessions);
+ clp = find_confirmed_client(clid, sessions, nn);
if (clp == NULL)
return nfserr_reclaim_bad;
@@ -4557,124 +4607,177 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
#ifdef CONFIG_NFSD_FAULT_INJECTION
-void nfsd_forget_clients(u64 num)
+u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)
{
- struct nfs4_client *clp, *next;
- int count = 0;
-
- nfs4_lock_state();
- list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
- expire_client(clp);
- if (++count == num)
- break;
- }
- nfs4_unlock_state();
-
- printk(KERN_INFO "NFSD: Forgot %d clients", count);
+ expire_client(clp);
+ return 1;
}
-static void release_lockowner_sop(struct nfs4_stateowner *sop)
+u64 nfsd_print_client(struct nfs4_client *clp, u64 num)
{
- release_lockowner(lockowner(sop));
+ char buf[INET6_ADDRSTRLEN];
+ rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+ printk(KERN_INFO "NFS Client: %s\n", buf);
+ return 1;
}
-static void release_openowner_sop(struct nfs4_stateowner *sop)
+static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
+ const char *type)
{
- release_openowner(openowner(sop));
+ char buf[INET6_ADDRSTRLEN];
+ rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+ printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);
}
-static int nfsd_release_n_owners(u64 num, bool is_open_owner,
- void (*release_sop)(struct nfs4_stateowner *))
+static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *))
{
- int i, count = 0;
- struct nfs4_stateowner *sop, *next;
+ struct nfs4_openowner *oop;
+ struct nfs4_lockowner *lop, *lo_next;
+ struct nfs4_ol_stateid *stp, *st_next;
+ u64 count = 0;
- for (i = 0; i < OWNER_HASH_SIZE; i++) {
- list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) {
- if (sop->so_is_open_owner != is_open_owner)
- continue;
- release_sop(sop);
- if (++count == num)
- return count;
+ list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) {
+ list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) {
+ list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) {
+ if (func)
+ func(lop);
+ if (++count == max)
+ return count;
+ }
}
}
+
return count;
}
-void nfsd_forget_locks(u64 num)
+u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max)
{
- int count;
-
- nfs4_lock_state();
- count = nfsd_release_n_owners(num, false, release_lockowner_sop);
- nfs4_unlock_state();
+ return nfsd_foreach_client_lock(clp, max, release_lockowner);
+}
- printk(KERN_INFO "NFSD: Forgot %d locks", count);
+u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max)
+{
+ u64 count = nfsd_foreach_client_lock(clp, max, NULL);
+ nfsd_print_count(clp, count, "locked files");
+ return count;
}
-void nfsd_forget_openowners(u64 num)
+static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *))
{
- int count;
+ struct nfs4_openowner *oop, *next;
+ u64 count = 0;
- nfs4_lock_state();
- count = nfsd_release_n_owners(num, true, release_openowner_sop);
- nfs4_unlock_state();
+ list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) {
+ if (func)
+ func(oop);
+ if (++count == max)
+ break;
+ }
- printk(KERN_INFO "NFSD: Forgot %d open owners", count);
+ return count;
}
-static int nfsd_process_n_delegations(u64 num, struct list_head *list)
+u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max)
{
- int i, count = 0;
- struct nfs4_file *fp, *fnext;
- struct nfs4_delegation *dp, *dnext;
+ return nfsd_foreach_client_open(clp, max, release_openowner);
+}
- for (i = 0; i < FILE_HASH_SIZE; i++) {
- list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) {
- list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) {
- list_move(&dp->dl_recall_lru, list);
- if (++count == num)
- return count;
- }
- }
- }
+u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max)
+{
+ u64 count = nfsd_foreach_client_open(clp, max, NULL);
+ nfsd_print_count(clp, count, "open files");
+ return count;
+}
+
+static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
+ struct list_head *victims)
+{
+ struct nfs4_delegation *dp, *next;
+ u64 count = 0;
+ list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
+ if (victims)
+ list_move(&dp->dl_recall_lru, victims);
+ if (++count == max)
+ break;
+ }
return count;
}
-void nfsd_forget_delegations(u64 num)
+u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
{
- unsigned int count;
+ struct nfs4_delegation *dp, *next;
LIST_HEAD(victims);
- struct nfs4_delegation *dp, *dnext;
+ u64 count;
spin_lock(&recall_lock);
- count = nfsd_process_n_delegations(num, &victims);
+ count = nfsd_find_all_delegations(clp, max, &victims);
spin_unlock(&recall_lock);
- nfs4_lock_state();
- list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru)
+ list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
unhash_delegation(dp);
- nfs4_unlock_state();
- printk(KERN_INFO "NFSD: Forgot %d delegations", count);
+ return count;
}
-void nfsd_recall_delegations(u64 num)
+u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max)
{
- unsigned int count;
+ struct nfs4_delegation *dp, *next;
LIST_HEAD(victims);
- struct nfs4_delegation *dp, *dnext;
+ u64 count;
spin_lock(&recall_lock);
- count = nfsd_process_n_delegations(num, &victims);
- list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) {
- list_del(&dp->dl_recall_lru);
+ count = nfsd_find_all_delegations(clp, max, &victims);
+ list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
nfsd_break_one_deleg(dp);
- }
spin_unlock(&recall_lock);
- printk(KERN_INFO "NFSD: Recalled %d delegations", count);
+ return count;
+}
+
+u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max)
+{
+ u64 count = 0;
+
+ spin_lock(&recall_lock);
+ count = nfsd_find_all_delegations(clp, max, NULL);
+ spin_unlock(&recall_lock);
+
+ nfsd_print_count(clp, count, "delegations");
+ return count;
+}
+
+u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64))
+{
+ struct nfs4_client *clp, *next;
+ u64 count = 0;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
+
+ if (!nfsd_netns_ready(nn))
+ return 0;
+
+ list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
+ count += func(clp, max - count);
+ if ((max != 0) && (count >= max))
+ break;
+ }
+
+ return count;
+}
+
+struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
+{
+ struct nfs4_client *clp;
+ struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
+
+ if (!nfsd_netns_ready(nn))
+ return NULL;
+
+ list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+ if (memcmp(&clp->cl_addr, addr, addr_size) == 0)
+ return clp;
+ }
+ return NULL;
}
#endif /* CONFIG_NFSD_FAULT_INJECTION */
@@ -4686,27 +4789,10 @@ nfs4_state_init(void)
{
int i;
- for (i = 0; i < CLIENT_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&conf_id_hashtbl[i]);
- INIT_LIST_HEAD(&conf_str_hashtbl[i]);
- INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
- INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
- INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
- }
- for (i = 0; i < SESSION_HASH_SIZE; i++)
- INIT_LIST_HEAD(&sessionid_hashtbl[i]);
for (i = 0; i < FILE_HASH_SIZE; i++) {
INIT_LIST_HEAD(&file_hashtbl[i]);
}
- for (i = 0; i < OWNER_HASH_SIZE; i++) {
- INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
- }
- for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
- INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]);
- INIT_LIST_HEAD(&close_lru);
- INIT_LIST_HEAD(&client_lru);
INIT_LIST_HEAD(&del_recall_lru);
- reclaim_str_hashtbl_size = 0;
}
/*
@@ -4730,12 +4816,100 @@ set_max_delegations(void)
max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
}
-/* initialization to perform when the nfsd service is started: */
+static int nfs4_state_create_net(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int i;
+
+ nn->conf_id_hashtbl = kmalloc(sizeof(struct list_head) *
+ CLIENT_HASH_SIZE, GFP_KERNEL);
+ if (!nn->conf_id_hashtbl)
+ goto err;
+ nn->unconf_id_hashtbl = kmalloc(sizeof(struct list_head) *
+ CLIENT_HASH_SIZE, GFP_KERNEL);
+ if (!nn->unconf_id_hashtbl)
+ goto err_unconf_id;
+ nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
+ OWNER_HASH_SIZE, GFP_KERNEL);
+ if (!nn->ownerstr_hashtbl)
+ goto err_ownerstr;
+ nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) *
+ LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL);
+ if (!nn->lockowner_ino_hashtbl)
+ goto err_lockowner_ino;
+ nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) *
+ SESSION_HASH_SIZE, GFP_KERNEL);
+ if (!nn->sessionid_hashtbl)
+ goto err_sessionid;
+
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+ INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]);
+ INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]);
+ }
+ for (i = 0; i < OWNER_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]);
+ for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]);
+ for (i = 0; i < SESSION_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]);
+ nn->conf_name_tree = RB_ROOT;
+ nn->unconf_name_tree = RB_ROOT;
+ INIT_LIST_HEAD(&nn->client_lru);
+ INIT_LIST_HEAD(&nn->close_lru);
+ spin_lock_init(&nn->client_lock);
+
+ INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
+ get_net(net);
+
+ return 0;
+
+err_sessionid:
+ kfree(nn->lockowner_ino_hashtbl);
+err_lockowner_ino:
+ kfree(nn->ownerstr_hashtbl);
+err_ownerstr:
+ kfree(nn->unconf_id_hashtbl);
+err_unconf_id:
+ kfree(nn->conf_id_hashtbl);
+err:
+ return -ENOMEM;
+}
+
+static void
+nfs4_state_destroy_net(struct net *net)
+{
+ int i;
+ struct nfs4_client *clp = NULL;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct rb_node *node, *tmp;
+
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+ while (!list_empty(&nn->conf_id_hashtbl[i])) {
+ clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
+ destroy_client(clp);
+ }
+ }
+
+ node = rb_first(&nn->unconf_name_tree);
+ while (node != NULL) {
+ tmp = node;
+ node = rb_next(tmp);
+ clp = rb_entry(tmp, struct nfs4_client, cl_namenode);
+ rb_erase(tmp, &nn->unconf_name_tree);
+ destroy_client(clp);
+ }
+
+ kfree(nn->sessionid_hashtbl);
+ kfree(nn->lockowner_ino_hashtbl);
+ kfree(nn->ownerstr_hashtbl);
+ kfree(nn->unconf_id_hashtbl);
+ kfree(nn->conf_id_hashtbl);
+ put_net(net);
+}
int
-nfs4_state_start(void)
+nfs4_state_start_net(struct net *net)
{
- struct net *net = &init_net;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int ret;
@@ -4746,18 +4920,32 @@ nfs4_state_start(void)
* to that instead and then do most of the rest of this on a per-net
* basis.
*/
- get_net(net);
+ if (net != &init_net)
+ return -EINVAL;
+
+ ret = nfs4_state_create_net(net);
+ if (ret)
+ return ret;
nfsd4_client_tracking_init(net);
nn->boot_time = get_seconds();
locks_start_grace(net, &nn->nfsd4_manager);
nn->grace_ended = false;
- printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
- nfsd4_grace);
+ printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
+ nn->nfsd4_grace, net);
+ queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
+ return 0;
+}
+
+/* initialization to perform when the nfsd service is started: */
+
+int
+nfs4_state_start(void)
+{
+ int ret;
+
ret = set_callback_cred();
- if (ret) {
- ret = -ENOMEM;
- goto out_recovery;
- }
+ if (ret)
+ return -ENOMEM;
laundry_wq = create_singlethread_workqueue("nfsd4");
if (laundry_wq == NULL) {
ret = -ENOMEM;
@@ -4766,39 +4954,34 @@ nfs4_state_start(void)
ret = nfsd4_create_callback_queue();
if (ret)
goto out_free_laundry;
- queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ);
+
set_max_delegations();
+
return 0;
+
out_free_laundry:
destroy_workqueue(laundry_wq);
out_recovery:
- nfsd4_client_tracking_exit(net);
- put_net(net);
return ret;
}
-static void
-__nfs4_state_shutdown(void)
+/* should be called with the state lock held */
+void
+nfs4_state_shutdown_net(struct net *net)
{
- int i;
- struct nfs4_client *clp = NULL;
struct nfs4_delegation *dp = NULL;
struct list_head *pos, *next, reaplist;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ cancel_delayed_work_sync(&nn->laundromat_work);
+ locks_end_grace(&nn->nfsd4_manager);
- for (i = 0; i < CLIENT_HASH_SIZE; i++) {
- while (!list_empty(&conf_id_hashtbl[i])) {
- clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
- destroy_client(clp);
- }
- while (!list_empty(&unconf_str_hashtbl[i])) {
- clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash);
- destroy_client(clp);
- }
- }
INIT_LIST_HEAD(&reaplist);
spin_lock(&recall_lock);
list_for_each_safe(pos, next, &del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+ if (dp->dl_stid.sc_client->net != net)
+ continue;
list_move(&dp->dl_recall_lru, &reaplist);
}
spin_unlock(&recall_lock);
@@ -4807,22 +4990,14 @@ __nfs4_state_shutdown(void)
unhash_delegation(dp);
}
- nfsd4_client_tracking_exit(&init_net);
- put_net(&init_net);
+ nfsd4_client_tracking_exit(net);
+ nfs4_state_destroy_net(net);
}
void
nfs4_state_shutdown(void)
{
- struct net *net = &init_net;
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
- cancel_delayed_work_sync(&laundromat_work);
destroy_workqueue(laundry_wq);
- locks_end_grace(&nn->nfsd4_manager);
- nfs4_lock_state();
- __nfs4_state_shutdown();
- nfs4_unlock_state();
nfsd4_destroy_callback_queue();
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index fd548d155088..0dc11586682f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -53,6 +53,7 @@
#include "vfs.h"
#include "state.h"
#include "cache.h"
+#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_XDR
@@ -65,17 +66,17 @@
#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL
static __be32
-check_filename(char *str, int len, __be32 err)
+check_filename(char *str, int len)
{
int i;
if (len == 0)
return nfserr_inval;
if (isdotent(str, len))
- return err;
+ return nfserr_badname;
for (i = 0; i < len; i++)
if (str[i] == '/')
- return err;
+ return nfserr_badname;
return 0;
}
@@ -422,6 +423,86 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
DECODE_TAIL;
}
+static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
+{
+ DECODE_HEAD;
+ u32 dummy, uid, gid;
+ char *machine_name;
+ int i;
+ int nr_secflavs;
+
+ /* callback_sec_params4 */
+ READ_BUF(4);
+ READ32(nr_secflavs);
+ cbs->flavor = (u32)(-1);
+ for (i = 0; i < nr_secflavs; ++i) {
+ READ_BUF(4);
+ READ32(dummy);
+ switch (dummy) {
+ case RPC_AUTH_NULL:
+ /* Nothing to read */
+ if (cbs->flavor == (u32)(-1))
+ cbs->flavor = RPC_AUTH_NULL;
+ break;
+ case RPC_AUTH_UNIX:
+ READ_BUF(8);
+ /* stamp */
+ READ32(dummy);
+
+ /* machine name */
+ READ32(dummy);
+ READ_BUF(dummy);
+ SAVEMEM(machine_name, dummy);
+
+ /* uid, gid */
+ READ_BUF(8);
+ READ32(uid);
+ READ32(gid);
+
+ /* more gids */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy * 4);
+ if (cbs->flavor == (u32)(-1)) {
+ cbs->uid = uid;
+ cbs->gid = gid;
+ cbs->flavor = RPC_AUTH_UNIX;
+ }
+ break;
+ case RPC_AUTH_GSS:
+ dprintk("RPC_AUTH_GSS callback secflavor "
+ "not supported!\n");
+ READ_BUF(8);
+ /* gcbp_service */
+ READ32(dummy);
+ /* gcbp_handle_from_server */
+ READ32(dummy);
+ READ_BUF(dummy);
+ p += XDR_QUADLEN(dummy);
+ /* gcbp_handle_from_client */
+ READ_BUF(4);
+ READ32(dummy);
+ READ_BUF(dummy);
+ break;
+ default:
+ dprintk("Illegal callback secflavor\n");
+ return nfserr_inval;
+ }
+ }
+ DECODE_TAIL;
+}
+
+static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
+{
+ DECODE_HEAD;
+
+ READ_BUF(4);
+ READ32(bc->bc_cb_program);
+ nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
+
+ DECODE_TAIL;
+}
+
static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
{
DECODE_HEAD;
@@ -490,7 +571,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
READ32(create->cr_namelen);
READ_BUF(create->cr_namelen);
SAVEMEM(create->cr_name, create->cr_namelen);
- if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
+ if ((status = check_filename(create->cr_name, create->cr_namelen)))
return status;
status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
@@ -522,7 +603,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
READ32(link->li_namelen);
READ_BUF(link->li_namelen);
SAVEMEM(link->li_name, link->li_namelen);
- if ((status = check_filename(link->li_name, link->li_namelen, nfserr_inval)))
+ if ((status = check_filename(link->li_name, link->li_namelen)))
return status;
DECODE_TAIL;
@@ -616,7 +697,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
READ32(lookup->lo_len);
READ_BUF(lookup->lo_len);
SAVEMEM(lookup->lo_name, lookup->lo_len);
- if ((status = check_filename(lookup->lo_name, lookup->lo_len, nfserr_noent)))
+ if ((status = check_filename(lookup->lo_name, lookup->lo_len)))
return status;
DECODE_TAIL;
@@ -780,7 +861,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ32(open->op_fname.len);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
- if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
+ if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
return status;
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
@@ -795,7 +876,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ32(open->op_fname.len);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
- if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval)))
+ if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
return status;
break;
case NFS4_OPEN_CLAIM_FH:
@@ -907,7 +988,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
READ32(remove->rm_namelen);
READ_BUF(remove->rm_namelen);
SAVEMEM(remove->rm_name, remove->rm_namelen);
- if ((status = check_filename(remove->rm_name, remove->rm_namelen, nfserr_noent)))
+ if ((status = check_filename(remove->rm_name, remove->rm_namelen)))
return status;
DECODE_TAIL;
@@ -925,9 +1006,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
READ32(rename->rn_tnamelen);
READ_BUF(rename->rn_tnamelen);
SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
- if ((status = check_filename(rename->rn_sname, rename->rn_snamelen, nfserr_noent)))
+ if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))
return status;
- if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen, nfserr_inval)))
+ if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen)))
return status;
DECODE_TAIL;
@@ -954,8 +1035,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
READ32(secinfo->si_namelen);
READ_BUF(secinfo->si_namelen);
SAVEMEM(secinfo->si_name, secinfo->si_namelen);
- status = check_filename(secinfo->si_name, secinfo->si_namelen,
- nfserr_noent);
+ status = check_filename(secinfo->si_name, secinfo->si_namelen);
if (status)
return status;
DECODE_TAIL;
@@ -1026,31 +1106,14 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
static __be32
nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
{
-#if 0
- struct nfsd4_compoundargs save = {
- .p = argp->p,
- .end = argp->end,
- .rqstp = argp->rqstp,
- };
- u32 ve_bmval[2];
- struct iattr ve_iattr; /* request */
- struct nfs4_acl *ve_acl; /* request */
-#endif
DECODE_HEAD;
if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval)))
goto out;
/* For convenience's sake, we compare raw xdr'd attributes in
- * nfsd4_proc_verify; however we still decode here just to return
- * correct error in case of bad xdr. */
-#if 0
- status = nfsd4_decode_fattr(ve_bmval, &ve_iattr, &ve_acl);
- if (status == nfserr_inval) {
- status = nfserrno(status);
- goto out;
- }
-#endif
+ * nfsd4_proc_verify */
+
READ_BUF(4);
READ32(verify->ve_attrlen);
READ_BUF(verify->ve_attrlen);
@@ -1063,7 +1126,6 @@ static __be32
nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
{
int avail;
- int v;
int len;
DECODE_HEAD;
@@ -1087,27 +1149,26 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
__FILE__, __LINE__);
goto xdr_error;
}
- argp->rqstp->rq_vec[0].iov_base = p;
- argp->rqstp->rq_vec[0].iov_len = avail;
- v = 0;
- len = write->wr_buflen;
- while (len > argp->rqstp->rq_vec[v].iov_len) {
- len -= argp->rqstp->rq_vec[v].iov_len;
- v++;
- argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]);
- argp->pagelist++;
- if (argp->pagelen >= PAGE_SIZE) {
- argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE;
- argp->pagelen -= PAGE_SIZE;
- } else {
- argp->rqstp->rq_vec[v].iov_len = argp->pagelen;
- argp->pagelen -= len;
- }
+ write->wr_head.iov_base = p;
+ write->wr_head.iov_len = avail;
+ WARN_ON(avail != (XDR_QUADLEN(avail) << 2));
+ write->wr_pagelist = argp->pagelist;
+
+ len = XDR_QUADLEN(write->wr_buflen) << 2;
+ if (len >= avail) {
+ int pages;
+
+ len -= avail;
+
+ pages = len >> PAGE_SHIFT;
+ argp->pagelist += pages;
+ argp->pagelen -= pages * PAGE_SIZE;
+ len -= pages * PAGE_SIZE;
+
+ argp->p = (__be32 *)page_address(argp->pagelist[0]);
+ argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);
}
- argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len);
- argp->p = (__be32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
- argp->rqstp->rq_vec[v].iov_len = len;
- write->wr_vlen = v+1;
+ argp->p += XDR_QUADLEN(len);
DECODE_TAIL;
}
@@ -1237,11 +1298,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
struct nfsd4_create_session *sess)
{
DECODE_HEAD;
-
u32 dummy;
- char *machine_name;
- int i;
- int nr_secflavs;
READ_BUF(16);
COPYMEM(&sess->clientid, 8);
@@ -1282,58 +1339,9 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
goto xdr_error;
}
- READ_BUF(8);
+ READ_BUF(4);
READ32(sess->callback_prog);
-
- /* callback_sec_params4 */
- READ32(nr_secflavs);
- for (i = 0; i < nr_secflavs; ++i) {
- READ_BUF(4);
- READ32(dummy);
- switch (dummy) {
- case RPC_AUTH_NULL:
- /* Nothing to read */
- break;
- case RPC_AUTH_UNIX:
- READ_BUF(8);
- /* stamp */
- READ32(dummy);
-
- /* machine name */
- READ32(dummy);
- READ_BUF(dummy);
- SAVEMEM(machine_name, dummy);
-
- /* uid, gid */
- READ_BUF(8);
- READ32(sess->uid);
- READ32(sess->gid);
-
- /* more gids */
- READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy * 4);
- break;
- case RPC_AUTH_GSS:
- dprintk("RPC_AUTH_GSS callback secflavor "
- "not supported!\n");
- READ_BUF(8);
- /* gcbp_service */
- READ32(dummy);
- /* gcbp_handle_from_server */
- READ32(dummy);
- READ_BUF(dummy);
- p += XDR_QUADLEN(dummy);
- /* gcbp_handle_from_client */
- READ_BUF(4);
- READ32(dummy);
- READ_BUF(dummy);
- break;
- default:
- dprintk("Illegal callback secflavor\n");
- return nfserr_inval;
- }
- }
+ nfsd4_decode_cb_sec(argp, &sess->cb_sec);
DECODE_TAIL;
}
@@ -1528,7 +1536,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
[OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp,
/* new operations for NFSv4.1 */
- [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl,
[OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
[OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
[OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
@@ -1568,12 +1576,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
bool cachethis = false;
int i;
- /*
- * XXX: According to spec, we should check the tag
- * for UTF-8 compliance. I'm postponing this for
- * now because it seems that some clients do use
- * binary tags.
- */
READ_BUF(4);
READ32(argp->taglen);
READ_BUF(argp->taglen + 8);
@@ -1603,38 +1605,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
op = &argp->ops[i];
op->replay = NULL;
- /*
- * We can't use READ_BUF() here because we need to handle
- * a missing opcode as an OP_WRITE + 1. So we need to check
- * to see if we're truly at the end of our buffer or if there
- * is another page we need to flip to.
- */
-
- if (argp->p == argp->end) {
- if (argp->pagelen < 4) {
- /* There isn't an opcode still on the wire */
- op->opnum = OP_WRITE + 1;
- op->status = nfserr_bad_xdr;
- argp->opcnt = i+1;
- break;
- }
-
- /*
- * False alarm. We just hit a page boundary, but there
- * is still data available. Move pointer across page
- * boundary. *snip from READ_BUF*
- */
- argp->p = page_address(argp->pagelist[0]);
- argp->pagelist++;
- if (argp->pagelen < PAGE_SIZE) {
- argp->end = argp->p + (argp->pagelen>>2);
- argp->pagelen = 0;
- } else {
- argp->end = argp->p + (PAGE_SIZE>>2);
- argp->pagelen -= PAGE_SIZE;
- }
- }
- op->opnum = ntohl(*argp->p++);
+ READ_BUF(4);
+ READ32(op->opnum);
if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
op->status = ops->decoders[op->opnum](argp, &op->u);
@@ -2014,6 +1986,22 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
return 0;
}
+
+static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
+{
+ struct path path = exp->ex_path;
+ int err;
+
+ path_get(&path);
+ while (follow_up(&path)) {
+ if (path.dentry != path.mnt->mnt_root)
+ break;
+ }
+ err = vfs_getattr(path.mnt, path.dentry, stat);
+ path_put(&path);
+ return err;
+}
+
/*
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
* ourselves.
@@ -2048,6 +2036,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
.mnt = exp->ex_path.mnt,
.dentry = dentry,
};
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
@@ -2208,7 +2197,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
if ((buflen -= 4) < 0)
goto out_resource;
- WRITE32(nfsd4_lease);
+ WRITE32(nn->nfsd4_lease);
}
if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
if ((buflen -= 4) < 0)
@@ -2430,18 +2419,8 @@ out_acl:
* and this is the root of a cross-mounted filesystem.
*/
if (ignore_crossmnt == 0 &&
- dentry == exp->ex_path.mnt->mnt_root) {
- struct path path = exp->ex_path;
- path_get(&path);
- while (follow_up(&path)) {
- if (path.dentry != path.mnt->mnt_root)
- break;
- }
- err = vfs_getattr(path.mnt, path.dentry, &stat);
- path_put(&path);
- if (err)
- goto out_nfserr;
- }
+ dentry == exp->ex_path.mnt->mnt_root)
+ get_parent_attributes(exp, &stat);
WRITE64(stat.ino);
}
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
@@ -2927,7 +2906,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_read *read)
{
u32 eof;
- int v, pn;
+ int v;
+ struct page *page;
unsigned long maxcount;
long len;
__be32 *p;
@@ -2946,11 +2926,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
len = maxcount;
v = 0;
while (len > 0) {
- pn = resp->rqstp->rq_resused++;
- resp->rqstp->rq_vec[v].iov_base =
- page_address(resp->rqstp->rq_respages[pn]);
+ page = *(resp->rqstp->rq_next_page);
+ if (!page) { /* ran out of pages */
+ maxcount -= len;
+ break;
+ }
+ resp->rqstp->rq_vec[v].iov_base = page_address(page);
resp->rqstp->rq_vec[v].iov_len =
len < PAGE_SIZE ? len : PAGE_SIZE;
+ resp->rqstp->rq_next_page++;
v++;
len -= PAGE_SIZE;
}
@@ -2996,8 +2980,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
return nfserr;
if (resp->xbuf->page_len)
return nfserr_resource;
+ if (!*resp->rqstp->rq_next_page)
+ return nfserr_resource;
- page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
+ page = page_address(*(resp->rqstp->rq_next_page++));
maxcount = PAGE_SIZE;
RESERVE_SPACE(4);
@@ -3045,6 +3031,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
return nfserr;
if (resp->xbuf->page_len)
return nfserr_resource;
+ if (!*resp->rqstp->rq_next_page)
+ return nfserr_resource;
RESERVE_SPACE(NFS4_VERIFIER_SIZE);
savep = p;
@@ -3071,7 +3059,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
goto err_no_verf;
}
- page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]);
+ page = page_address(*(resp->rqstp->rq_next_page++));
readdir->common.err = 0;
readdir->buflen = maxcount;
readdir->buffer = page;
@@ -3094,8 +3082,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
p = readdir->buffer;
*p++ = 0; /* no more entries */
*p++ = htonl(readdir->common.err == nfserr_eof);
- resp->xbuf->page_len = ((char*)p) - (char*)page_address(
- resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
+ resp->xbuf->page_len = ((char*)p) -
+ (char*)page_address(*(resp->rqstp->rq_next_page-1));
/* Use rest of head for padding and remaining ops: */
resp->xbuf->tail[0].iov_base = tailbase;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index dab350dfc376..74934284d9a7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -19,7 +19,7 @@
#include "idmap.h"
#include "nfsd.h"
#include "cache.h"
-#include "fault_inject.h"
+#include "state.h"
#include "netns.h"
/*
@@ -186,9 +186,6 @@ static struct file_operations supported_enctypes_ops = {
};
#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
-extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
-extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
-
static const struct file_operations pool_stats_operations = {
.open = nfsd_pool_stats_open,
.read = seq_read,
@@ -399,6 +396,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
int rv;
+ struct net *net = &init_net;
+
if (size > 0) {
int newthreads;
rv = get_int(&mesg, &newthreads);
@@ -406,11 +405,11 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
return rv;
if (newthreads < 0)
return -EINVAL;
- rv = nfsd_svc(newthreads);
+ rv = nfsd_svc(newthreads, net);
if (rv < 0)
return rv;
} else
- rv = nfsd_nrthreads();
+ rv = nfsd_nrthreads(net);
return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);
}
@@ -448,9 +447,10 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
int len;
int npools;
int *nthreads;
+ struct net *net = &init_net;
mutex_lock(&nfsd_mutex);
- npools = nfsd_nrpools();
+ npools = nfsd_nrpools(net);
if (npools == 0) {
/*
* NFS is shut down. The admin can start it by
@@ -478,12 +478,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
if (nthreads[i] < 0)
goto out_free;
}
- rv = nfsd_set_nrthreads(i, nthreads);
+ rv = nfsd_set_nrthreads(i, nthreads, net);
if (rv)
goto out_free;
}
- rv = nfsd_get_nrthreads(npools, nthreads);
+ rv = nfsd_get_nrthreads(npools, nthreads, net);
if (rv)
goto out_free;
@@ -510,11 +510,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
unsigned minor;
ssize_t tlen = 0;
char *sep;
+ struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (size>0) {
- if (nfsd_serv)
+ if (nn->nfsd_serv)
/* Cannot change versions without updating
- * nfsd_serv->sv_xdrsize, and reallocing
+ * nn->nfsd_serv->sv_xdrsize, and reallocing
* rq_argp and rq_resp
*/
return -EBUSY;
@@ -645,11 +647,13 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
* Zero-length write. Return a list of NFSD's current listener
* transports.
*/
-static ssize_t __write_ports_names(char *buf)
+static ssize_t __write_ports_names(char *buf, struct net *net)
{
- if (nfsd_serv == NULL)
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (nn->nfsd_serv == NULL)
return 0;
- return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
+ return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
}
/*
@@ -657,28 +661,28 @@ static ssize_t __write_ports_names(char *buf)
* a socket of a supported family/protocol, and we use it as an
* nfsd listener.
*/
-static ssize_t __write_ports_addfd(char *buf)
+static ssize_t __write_ports_addfd(char *buf, struct net *net)
{
char *mesg = buf;
int fd, err;
- struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
err = get_int(&mesg, &fd);
if (err != 0 || fd < 0)
return -EINVAL;
- err = nfsd_create_serv();
+ err = nfsd_create_serv(net);
if (err != 0)
return err;
- err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
+ err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
if (err < 0) {
nfsd_destroy(net);
return err;
}
/* Decrease the count, but don't shut down the service */
- nfsd_serv->sv_nrthreads--;
+ nn->nfsd_serv->sv_nrthreads--;
return err;
}
@@ -686,12 +690,12 @@ static ssize_t __write_ports_addfd(char *buf)
* A transport listener is added by writing it's transport name and
* a port number.
*/
-static ssize_t __write_ports_addxprt(char *buf)
+static ssize_t __write_ports_addxprt(char *buf, struct net *net)
{
char transport[16];
struct svc_xprt *xprt;
int port, err;
- struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (sscanf(buf, "%15s %5u", transport, &port) != 2)
return -EINVAL;
@@ -699,25 +703,25 @@ static ssize_t __write_ports_addxprt(char *buf)
if (port < 1 || port > USHRT_MAX)
return -EINVAL;
- err = nfsd_create_serv();
+ err = nfsd_create_serv(net);
if (err != 0)
return err;
- err = svc_create_xprt(nfsd_serv, transport, net,
+ err = svc_create_xprt(nn->nfsd_serv, transport, net,
PF_INET, port, SVC_SOCK_ANONYMOUS);
if (err < 0)
goto out_err;
- err = svc_create_xprt(nfsd_serv, transport, net,
+ err = svc_create_xprt(nn->nfsd_serv, transport, net,
PF_INET6, port, SVC_SOCK_ANONYMOUS);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_close;
/* Decrease the count, but don't shut down the service */
- nfsd_serv->sv_nrthreads--;
+ nn->nfsd_serv->sv_nrthreads--;
return 0;
out_close:
- xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port);
+ xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
if (xprt != NULL) {
svc_close_xprt(xprt);
svc_xprt_put(xprt);
@@ -727,16 +731,17 @@ out_err:
return err;
}
-static ssize_t __write_ports(struct file *file, char *buf, size_t size)
+static ssize_t __write_ports(struct file *file, char *buf, size_t size,
+ struct net *net)
{
if (size == 0)
- return __write_ports_names(buf);
+ return __write_ports_names(buf, net);
if (isdigit(buf[0]))
- return __write_ports_addfd(buf);
+ return __write_ports_addfd(buf, net);
if (isalpha(buf[0]))
- return __write_ports_addxprt(buf);
+ return __write_ports_addxprt(buf, net);
return -EINVAL;
}
@@ -787,9 +792,10 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
static ssize_t write_ports(struct file *file, char *buf, size_t size)
{
ssize_t rv;
+ struct net *net = &init_net;
mutex_lock(&nfsd_mutex);
- rv = __write_ports(file, buf, size);
+ rv = __write_ports(file, buf, size, net);
mutex_unlock(&nfsd_mutex);
return rv;
}
@@ -821,6 +827,9 @@ int nfsd_max_blksize;
static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
+ struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
if (size > 0) {
int bsize;
int rv = get_int(&mesg, &bsize);
@@ -835,7 +844,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
bsize = NFSSVC_MAXBLKSIZE;
bsize &= ~(1024-1);
mutex_lock(&nfsd_mutex);
- if (nfsd_serv) {
+ if (nn->nfsd_serv) {
mutex_unlock(&nfsd_mutex);
return -EBUSY;
}
@@ -848,13 +857,14 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
}
#ifdef CONFIG_NFSD_V4
-static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
+static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
+ time_t *time, struct nfsd_net *nn)
{
char *mesg = buf;
int rv, i;
if (size > 0) {
- if (nfsd_serv)
+ if (nn->nfsd_serv)
return -EBUSY;
rv = get_int(&mesg, &i);
if (rv)
@@ -879,12 +889,13 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, tim
return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
}
-static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time)
+static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
+ time_t *time, struct nfsd_net *nn)
{
ssize_t rv;
mutex_lock(&nfsd_mutex);
- rv = __nfsd4_write_time(file, buf, size, time);
+ rv = __nfsd4_write_time(file, buf, size, time, nn);
mutex_unlock(&nfsd_mutex);
return rv;
}
@@ -912,7 +923,8 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_
*/
static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
{
- return nfsd4_write_time(file, buf, size, &nfsd4_lease);
+ struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
}
/**
@@ -927,17 +939,19 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
*/
static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
{
- return nfsd4_write_time(file, buf, size, &nfsd4_grace);
+ struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
+ return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
}
-static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
+static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
+ struct nfsd_net *nn)
{
char *mesg = buf;
char *recdir;
int len, status;
if (size > 0) {
- if (nfsd_serv)
+ if (nn->nfsd_serv)
return -EBUSY;
if (size > PATH_MAX || buf[size-1] != '\n')
return -EINVAL;
@@ -981,9 +995,10 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
{
ssize_t rv;
+ struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
mutex_lock(&nfsd_mutex);
- rv = __write_recoverydir(file, buf, size);
+ rv = __write_recoverydir(file, buf, size, nn);
mutex_unlock(&nfsd_mutex);
return rv;
}
@@ -1063,6 +1078,7 @@ int nfsd_net_id;
static __net_init int nfsd_init_net(struct net *net)
{
int retval;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
retval = nfsd_export_init(net);
if (retval)
@@ -1070,6 +1086,8 @@ static __net_init int nfsd_init_net(struct net *net)
retval = nfsd_idmap_init(net);
if (retval)
goto out_idmap_error;
+ nn->nfsd4_lease = 90; /* default lease time */
+ nn->nfsd4_grace = 90;
return 0;
out_idmap_error:
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 80d5ce40aadb..de23db255c69 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -55,7 +55,6 @@ extern struct svc_version nfsd_version2, nfsd_version3,
nfsd_version4;
extern u32 nfsd_supported_minorversion;
extern struct mutex nfsd_mutex;
-extern struct svc_serv *nfsd_serv;
extern spinlock_t nfsd_drc_lock;
extern unsigned int nfsd_drc_max_mem;
extern unsigned int nfsd_drc_mem_used;
@@ -65,26 +64,17 @@ extern const struct seq_operations nfs_exports_op;
/*
* Function prototypes.
*/
-int nfsd_svc(int nrservs);
+int nfsd_svc(int nrservs, struct net *net);
int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
-int nfsd_nrthreads(void);
-int nfsd_nrpools(void);
-int nfsd_get_nrthreads(int n, int *);
-int nfsd_set_nrthreads(int n, int *);
+int nfsd_nrthreads(struct net *);
+int nfsd_nrpools(struct net *);
+int nfsd_get_nrthreads(int n, int *, struct net *);
+int nfsd_set_nrthreads(int n, int *, struct net *);
int nfsd_pool_stats_open(struct inode *, struct file *);
int nfsd_pool_stats_release(struct inode *, struct file *);
-static inline void nfsd_destroy(struct net *net)
-{
- int destroy = (nfsd_serv->sv_nrthreads == 1);
-
- if (destroy)
- svc_shutdown_net(nfsd_serv, net);
- svc_destroy(nfsd_serv);
- if (destroy)
- nfsd_serv = NULL;
-}
+void nfsd_destroy(struct net *net);
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
#ifdef CONFIG_NFSD_V2_ACL
@@ -103,7 +93,7 @@ enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
int nfsd_vers(int vers, enum vers_op change);
int nfsd_minorversion(u32 minorversion, enum vers_op change);
void nfsd_reset_versions(void);
-int nfsd_create_serv(void);
+int nfsd_create_serv(struct net *net);
extern int nfsd_max_blksize;
@@ -121,7 +111,9 @@ void nfs4_state_init(void);
int nfsd4_init_slabs(void);
void nfsd4_free_slabs(void);
int nfs4_state_start(void);
+int nfs4_state_start_net(struct net *net);
void nfs4_state_shutdown(void);
+void nfs4_state_shutdown_net(struct net *net);
void nfs4_reset_lease(time_t leasetime);
int nfs4_reset_recoverydir(char *recdir);
char * nfs4_recoverydir(void);
@@ -130,7 +122,9 @@ static inline void nfs4_state_init(void) { }
static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
static inline int nfs4_state_start(void) { return 0; }
+static inline int nfs4_state_start_net(struct net *net) { return 0; }
static inline void nfs4_state_shutdown(void) { }
+static inline void nfs4_state_shutdown_net(struct net *net) { }
static inline void nfs4_reset_lease(time_t leasetime) { }
static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
static inline char * nfs4_recoverydir(void) {return NULL; }
@@ -265,16 +259,8 @@ void nfsd_lockd_shutdown(void);
/* Check for dir entries '.' and '..' */
#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
-/*
- * Time of server startup
- */
-extern struct timeval nfssvc_boot;
-
#ifdef CONFIG_NFSD_V4
-extern time_t nfsd4_lease;
-extern time_t nfsd4_grace;
-
/* before processing a COMPOUND operation, we have to check that there
* is enough space in the buffer for XDR encode to succeed. otherwise,
* we might process an operation with side effects, and be unable to
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 032af381b3aa..814afaa4458a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -572,7 +572,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
if (inode)
_fh_update(fhp, exp, dentry);
- if (fhp->fh_handle.fh_fileid_type == 255) {
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
fh_put(fhp);
return nfserr_opnotsupp;
}
@@ -603,7 +603,7 @@ fh_update(struct svc_fh *fhp)
goto out;
_fh_update(fhp, fhp->fh_export, dentry);
- if (fhp->fh_handle.fh_fileid_type == 255)
+ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
return nfserr_opnotsupp;
}
out:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2013aa001dab..cee62ab9d4a3 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -11,7 +11,6 @@
#include <linux/module.h>
#include <linux/fs_struct.h>
#include <linux/swap.h>
-#include <linux/nsproxy.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svcsock.h>
@@ -22,19 +21,19 @@
#include "nfsd.h"
#include "cache.h"
#include "vfs.h"
+#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_SVC
extern struct svc_program nfsd_program;
static int nfsd(void *vrqstp);
-struct timeval nfssvc_boot;
/*
- * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
+ * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members
* of the svc_serv struct. In particular, ->sv_nrthreads but also to some
* extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
*
- * If (out side the lock) nfsd_serv is non-NULL, then it must point to a
+ * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a
* properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
* of nfsd threads must exist and each must listed in ->sp_all_threads in each
* entry of ->sv_pools[].
@@ -52,7 +51,6 @@ struct timeval nfssvc_boot;
* nfsd_versions
*/
DEFINE_MUTEX(nfsd_mutex);
-struct svc_serv *nfsd_serv;
/*
* nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
@@ -173,28 +171,32 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
*/
#define NFSD_MAXSERVS 8192
-int nfsd_nrthreads(void)
+int nfsd_nrthreads(struct net *net)
{
int rv = 0;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
mutex_lock(&nfsd_mutex);
- if (nfsd_serv)
- rv = nfsd_serv->sv_nrthreads;
+ if (nn->nfsd_serv)
+ rv = nn->nfsd_serv->sv_nrthreads;
mutex_unlock(&nfsd_mutex);
return rv;
}
-static int nfsd_init_socks(void)
+static int nfsd_init_socks(struct net *net)
{
int error;
- if (!list_empty(&nfsd_serv->sv_permsocks))
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (!list_empty(&nn->nfsd_serv->sv_permsocks))
return 0;
- error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, NFS_PORT,
+ error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
SVC_SOCK_DEFAULTS);
if (error < 0)
return error;
- error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, NFS_PORT,
+ error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
SVC_SOCK_DEFAULTS);
if (error < 0)
return error;
@@ -202,14 +204,15 @@ static int nfsd_init_socks(void)
return 0;
}
-static bool nfsd_up = false;
+static int nfsd_users = 0;
-static int nfsd_startup(int nrservs)
+static int nfsd_startup_generic(int nrservs)
{
int ret;
- if (nfsd_up)
+ if (nfsd_users++)
return 0;
+
/*
* Readahead param cache - will no-op if it already exists.
* (Note therefore results will be suboptimal if number of
@@ -218,43 +221,79 @@ static int nfsd_startup(int nrservs)
ret = nfsd_racache_init(2*nrservs);
if (ret)
return ret;
- ret = nfsd_init_socks();
+ ret = nfs4_state_start();
if (ret)
goto out_racache;
- ret = lockd_up(&init_net);
+ return 0;
+
+out_racache:
+ nfsd_racache_shutdown();
+ return ret;
+}
+
+static void nfsd_shutdown_generic(void)
+{
+ if (--nfsd_users)
+ return;
+
+ nfs4_state_shutdown();
+ nfsd_racache_shutdown();
+}
+
+static int nfsd_startup_net(int nrservs, struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int ret;
+
+ if (nn->nfsd_net_up)
+ return 0;
+
+ ret = nfsd_startup_generic(nrservs);
if (ret)
- goto out_racache;
- ret = nfs4_state_start();
+ return ret;
+ ret = nfsd_init_socks(net);
+ if (ret)
+ goto out_socks;
+ ret = lockd_up(net);
+ if (ret)
+ goto out_socks;
+ ret = nfs4_state_start_net(net);
if (ret)
goto out_lockd;
- nfsd_up = true;
+
+ nn->nfsd_net_up = true;
return 0;
+
out_lockd:
- lockd_down(&init_net);
-out_racache:
- nfsd_racache_shutdown();
+ lockd_down(net);
+out_socks:
+ nfsd_shutdown_generic();
return ret;
}
-static void nfsd_shutdown(void)
+static void nfsd_shutdown_net(struct net *net)
{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ nfs4_state_shutdown_net(net);
+ lockd_down(net);
+ nn->nfsd_net_up = false;
+ nfsd_shutdown_generic();
+}
+
+static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
/*
* write_ports can create the server without actually starting
* any threads--if we get shut down before any threads are
* started, then nfsd_last_thread will be run before any of this
* other initialization has been done.
*/
- if (!nfsd_up)
+ if (!nn->nfsd_net_up)
return;
- nfs4_state_shutdown();
- lockd_down(&init_net);
- nfsd_racache_shutdown();
- nfsd_up = false;
-}
-
-static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
-{
- nfsd_shutdown();
+ nfsd_shutdown_net(net);
svc_rpcb_cleanup(serv, net);
@@ -327,69 +366,84 @@ static int nfsd_get_default_max_blksize(void)
return ret;
}
-int nfsd_create_serv(void)
+int nfsd_create_serv(struct net *net)
{
int error;
- struct net *net = current->nsproxy->net_ns;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
WARN_ON(!mutex_is_locked(&nfsd_mutex));
- if (nfsd_serv) {
- svc_get(nfsd_serv);
+ if (nn->nfsd_serv) {
+ svc_get(nn->nfsd_serv);
return 0;
}
if (nfsd_max_blksize == 0)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions();
- nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+ nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
nfsd_last_thread, nfsd, THIS_MODULE);
- if (nfsd_serv == NULL)
+ if (nn->nfsd_serv == NULL)
return -ENOMEM;
- error = svc_bind(nfsd_serv, net);
+ error = svc_bind(nn->nfsd_serv, net);
if (error < 0) {
- svc_destroy(nfsd_serv);
+ svc_destroy(nn->nfsd_serv);
return error;
}
set_max_drc();
- do_gettimeofday(&nfssvc_boot); /* record boot time */
+ do_gettimeofday(&nn->nfssvc_boot); /* record boot time */
return 0;
}
-int nfsd_nrpools(void)
+int nfsd_nrpools(struct net *net)
{
- if (nfsd_serv == NULL)
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ if (nn->nfsd_serv == NULL)
return 0;
else
- return nfsd_serv->sv_nrpools;
+ return nn->nfsd_serv->sv_nrpools;
}
-int nfsd_get_nrthreads(int n, int *nthreads)
+int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
{
int i = 0;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- if (nfsd_serv != NULL) {
- for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++)
- nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads;
+ if (nn->nfsd_serv != NULL) {
+ for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
+ nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
}
return 0;
}
-int nfsd_set_nrthreads(int n, int *nthreads)
+void nfsd_destroy(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int destroy = (nn->nfsd_serv->sv_nrthreads == 1);
+
+ if (destroy)
+ svc_shutdown_net(nn->nfsd_serv, net);
+ svc_destroy(nn->nfsd_serv);
+ if (destroy)
+ nn->nfsd_serv = NULL;
+}
+
+int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
{
int i = 0;
int tot = 0;
int err = 0;
- struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
WARN_ON(!mutex_is_locked(&nfsd_mutex));
- if (nfsd_serv == NULL || n <= 0)
+ if (nn->nfsd_serv == NULL || n <= 0)
return 0;
- if (n > nfsd_serv->sv_nrpools)
- n = nfsd_serv->sv_nrpools;
+ if (n > nn->nfsd_serv->sv_nrpools)
+ n = nn->nfsd_serv->sv_nrpools;
/* enforce a global maximum number of threads */
tot = 0;
@@ -419,9 +473,9 @@ int nfsd_set_nrthreads(int n, int *nthreads)
nthreads[0] = 1;
/* apply the new numbers */
- svc_get(nfsd_serv);
+ svc_get(nn->nfsd_serv);
for (i = 0; i < n; i++) {
- err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i],
+ err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i],
nthreads[i]);
if (err)
break;
@@ -436,11 +490,11 @@ int nfsd_set_nrthreads(int n, int *nthreads)
* this is the first time nrservs is nonzero.
*/
int
-nfsd_svc(int nrservs)
+nfsd_svc(int nrservs, struct net *net)
{
int error;
bool nfsd_up_before;
- struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
mutex_lock(&nfsd_mutex);
dprintk("nfsd: creating service\n");
@@ -449,29 +503,29 @@ nfsd_svc(int nrservs)
if (nrservs > NFSD_MAXSERVS)
nrservs = NFSD_MAXSERVS;
error = 0;
- if (nrservs == 0 && nfsd_serv == NULL)
+ if (nrservs == 0 && nn->nfsd_serv == NULL)
goto out;
- error = nfsd_create_serv();
+ error = nfsd_create_serv(net);
if (error)
goto out;
- nfsd_up_before = nfsd_up;
+ nfsd_up_before = nn->nfsd_net_up;
- error = nfsd_startup(nrservs);
+ error = nfsd_startup_net(nrservs, net);
if (error)
goto out_destroy;
- error = svc_set_num_threads(nfsd_serv, NULL, nrservs);
+ error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
if (error)
goto out_shutdown;
- /* We are holding a reference to nfsd_serv which
+ /* We are holding a reference to nn->nfsd_serv which
* we don't want to count in the return value,
* so subtract 1
*/
- error = nfsd_serv->sv_nrthreads - 1;
+ error = nn->nfsd_serv->sv_nrthreads - 1;
out_shutdown:
if (error < 0 && !nfsd_up_before)
- nfsd_shutdown();
+ nfsd_shutdown_net(net);
out_destroy:
nfsd_destroy(net); /* Release server */
out:
@@ -487,6 +541,8 @@ static int
nfsd(void *vrqstp)
{
struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
+ struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
+ struct net *net = perm_sock->xpt_net;
int err;
/* Lock module and set up kernel thread */
@@ -551,7 +607,7 @@ out:
/* Release the thread */
svc_exit_thread(rqstp);
- nfsd_destroy(&init_net);
+ nfsd_destroy(net);
/* Release module */
mutex_unlock(&nfsd_mutex);
@@ -640,21 +696,24 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
}
/* Store reply in cache. */
- nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
+ nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
return 1;
}
int nfsd_pool_stats_open(struct inode *inode, struct file *file)
{
int ret;
+ struct net *net = &init_net;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
mutex_lock(&nfsd_mutex);
- if (nfsd_serv == NULL) {
+ if (nn->nfsd_serv == NULL) {
mutex_unlock(&nfsd_mutex);
return -ENODEV;
}
/* bump up the psudo refcount while traversing */
- svc_get(nfsd_serv);
- ret = svc_pool_stats_open(nfsd_serv, file);
+ svc_get(nn->nfsd_serv);
+ ret = svc_pool_stats_open(nn->nfsd_serv, file);
mutex_unlock(&nfsd_mutex);
return ret;
}
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 65ec595e2226..979b42106979 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -246,7 +246,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_readargs *args)
{
unsigned int len;
- int v,pn;
+ int v;
if (!(p = decode_fh(p, &args->fh)))
return 0;
@@ -262,8 +262,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
*/
v=0;
while (len > 0) {
- pn = rqstp->rq_resused++;
- rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]);
+ struct page *p = *(rqstp->rq_next_page++);
+
+ rqstp->rq_vec[v].iov_base = page_address(p);
rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
len -= rqstp->rq_vec[v].iov_len;
v++;
@@ -355,7 +356,7 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli
{
if (!(p = decode_fh(p, &args->fh)))
return 0;
- args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+ args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
}
@@ -396,7 +397,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
if (args->count > PAGE_SIZE)
args->count = PAGE_SIZE;
- args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]);
+ args->buffer = page_address(*(rqstp->rq_next_page++));
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e036894bce57..d1c229feed52 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -150,6 +150,12 @@ struct nfsd4_channel_attrs {
u32 rdma_attrs;
};
+struct nfsd4_cb_sec {
+ u32 flavor; /* (u32)(-1) used to mean "no valid flavor" */
+ u32 uid;
+ u32 gid;
+};
+
struct nfsd4_create_session {
clientid_t clientid;
struct nfs4_sessionid sessionid;
@@ -158,8 +164,12 @@ struct nfsd4_create_session {
struct nfsd4_channel_attrs fore_channel;
struct nfsd4_channel_attrs back_channel;
u32 callback_prog;
- u32 uid;
- u32 gid;
+ struct nfsd4_cb_sec cb_sec;
+};
+
+struct nfsd4_backchannel_ctl {
+ u32 bc_cb_program;
+ struct nfsd4_cb_sec bc_cb_sec;
};
struct nfsd4_bind_conn_to_session {
@@ -192,6 +202,7 @@ struct nfsd4_session {
struct nfs4_sessionid se_sessionid;
struct nfsd4_channel_attrs se_fchannel;
struct nfsd4_channel_attrs se_bchannel;
+ struct nfsd4_cb_sec se_cb_sec;
struct list_head se_conns;
u32 se_cb_prog;
u32 se_cb_seq_nr;
@@ -221,13 +232,12 @@ struct nfsd4_sessionid {
*/
struct nfs4_client {
struct list_head cl_idhash; /* hash by cl_clientid.id */
- struct list_head cl_strhash; /* hash by cl_name */
+ struct rb_node cl_namenode; /* link into by-name trees */
struct list_head cl_openowners;
struct idr cl_stateids; /* stateid lookup */
struct list_head cl_delegations;
struct list_head cl_lru; /* tail queue */
struct xdr_netobj cl_name; /* id generated by client */
- char cl_recdir[HEXDIR_LEN]; /* recovery dir */
nfs4_verifier cl_verifier; /* generated by client */
time_t cl_time; /* time of last lease renewal */
struct sockaddr_storage cl_addr; /* client ipaddress */
@@ -242,9 +252,11 @@ struct nfs4_client {
#define NFSD4_CLIENT_CB_KILL (1)
#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */
#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */
+#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */
#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
1 << NFSD4_CLIENT_CB_KILL)
unsigned long cl_flags;
+ struct rpc_cred *cl_cb_cred;
struct rpc_clnt *cl_cb_client;
u32 cl_cb_ident;
#define NFSD4_CB_UP 0
@@ -271,6 +283,7 @@ struct nfs4_client {
unsigned long cl_cb_slot_busy;
struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
/* wait here for slots */
+ struct net *net;
};
static inline void
@@ -292,6 +305,7 @@ is_client_expired(struct nfs4_client *clp)
*/
struct nfs4_client_reclaim {
struct list_head cr_strhash; /* hash by cr_name */
+ struct nfs4_client *cr_clp; /* pointer to associated clp */
char cr_recdir[HEXDIR_LEN]; /* recover dir */
};
@@ -452,25 +466,26 @@ extern __be32 nfs4_preprocess_stateid_op(struct net *net,
stateid_t *stateid, int flags, struct file **filp);
extern void nfs4_lock_state(void);
extern void nfs4_unlock_state(void);
-extern int nfs4_in_grace(void);
-extern void nfs4_release_reclaim(void);
-extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp);
-extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions);
+void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
+extern void nfs4_release_reclaim(struct nfsd_net *);
+extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
+ struct nfsd_net *nn);
+extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn);
extern void nfs4_free_openowner(struct nfs4_openowner *);
extern void nfs4_free_lockowner(struct nfs4_lockowner *);
extern int set_callback_cred(void);
+extern void nfsd4_init_callback(struct nfsd4_callback *);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
-extern void nfsd4_do_callback_rpc(struct work_struct *);
extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
extern int nfsd4_create_callback_queue(void);
extern void nfsd4_destroy_callback_queue(void);
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfs4_put_delegation(struct nfs4_delegation *dp);
-extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
-extern int nfs4_client_to_reclaim(const char *name);
-extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
+extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
+ struct nfsd_net *nn);
+extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
extern void release_session_client(struct nfsd4_session *);
extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
@@ -480,5 +495,28 @@ extern void nfsd4_client_tracking_exit(struct net *net);
extern void nfsd4_client_record_create(struct nfs4_client *clp);
extern void nfsd4_client_record_remove(struct nfs4_client *clp);
extern int nfsd4_client_record_check(struct nfs4_client *clp);
-extern void nfsd4_record_grace_done(struct net *net, time_t boot_time);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
+
+/* nfs fault injection functions */
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+int nfsd_fault_inject_init(void);
+void nfsd_fault_inject_cleanup(void);
+u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64));
+struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t);
+
+u64 nfsd_forget_client(struct nfs4_client *, u64);
+u64 nfsd_forget_client_locks(struct nfs4_client*, u64);
+u64 nfsd_forget_client_openowners(struct nfs4_client *, u64);
+u64 nfsd_forget_client_delegations(struct nfs4_client *, u64);
+u64 nfsd_recall_client_delegations(struct nfs4_client *, u64);
+
+u64 nfsd_print_client(struct nfs4_client *, u64);
+u64 nfsd_print_client_locks(struct nfs4_client *, u64);
+u64 nfsd_print_client_openowners(struct nfs4_client *, u64);
+u64 nfsd_print_client_delegations(struct nfs4_client *, u64);
+#else /* CONFIG_NFSD_FAULT_INJECTION */
+static inline int nfsd_fault_inject_init(void) { return 0; }
+static inline void nfsd_fault_inject_cleanup(void) {}
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
+
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c120b48ec305..f0a6d88d7fff 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -886,7 +886,7 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
{
struct svc_rqst *rqstp = sd->u.data;
- struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
+ struct page **pp = rqstp->rq_next_page;
struct page *page = buf->page;
size_t size;
@@ -894,17 +894,15 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
if (rqstp->rq_res.page_len == 0) {
get_page(page);
- put_page(*pp);
- *pp = page;
- rqstp->rq_resused++;
+ put_page(*rqstp->rq_next_page);
+ *(rqstp->rq_next_page++) = page;
rqstp->rq_res.page_base = buf->offset;
rqstp->rq_res.page_len = size;
} else if (page != pp[-1]) {
get_page(page);
- if (*pp)
- put_page(*pp);
- *pp = page;
- rqstp->rq_resused++;
+ if (*rqstp->rq_next_page)
+ put_page(*rqstp->rq_next_page);
+ *(rqstp->rq_next_page++) = page;
rqstp->rq_res.page_len += size;
} else
rqstp->rq_res.page_len += size;
@@ -936,7 +934,8 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
.u.data = rqstp,
};
- rqstp->rq_resused = 1;
+ WARN_ON_ONCE(rqstp->rq_next_page != rqstp->rq_respages + 1);
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
} else {
oldfs = get_fs();
@@ -1020,28 +1019,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
inode = dentry->d_inode;
exp = fhp->fh_export;
- /*
- * Request sync writes if
- * - the sync export option has been set, or
- * - the client requested O_SYNC behavior (NFSv3 feature).
- * - The file system doesn't support fsync().
- * When NFSv2 gathered writes have been configured for this volume,
- * flushing the data to disk is handled separately below.
- */
use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
- if (!file->f_op->fsync) {/* COMMIT3 cannot work */
- stable = 2;
- *stablep = 2; /* FILE_SYNC */
- }
-
if (!EX_ISSYNC(exp))
stable = 0;
- if (stable && !use_wgather) {
- spin_lock(&file->f_lock);
- file->f_flags |= O_SYNC;
- spin_unlock(&file->f_lock);
- }
/* Write the data. */
oldfs = get_fs(); set_fs(KERNEL_DS);
@@ -1057,8 +1038,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (inode->i_mode & (S_ISUID | S_ISGID))
kill_suid(dentry);
- if (stable && use_wgather)
- host_err = wait_for_concurrent_writes(file);
+ if (stable) {
+ if (use_wgather)
+ host_err = wait_for_concurrent_writes(file);
+ else
+ host_err = vfs_fsync_range(file, offset, offset+*cnt, 0);
+ }
out_nfserr:
dprintk("nfsd: write complete host_err=%d\n", host_err);
@@ -1485,13 +1470,19 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
case NFS3_CREATE_EXCLUSIVE:
if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
&& dchild->d_inode->i_atime.tv_sec == v_atime
- && dchild->d_inode->i_size == 0 )
+ && dchild->d_inode->i_size == 0 ) {
+ if (created)
+ *created = 1;
break;
+ }
case NFS4_CREATE_EXCLUSIVE4_1:
if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
&& dchild->d_inode->i_atime.tv_sec == v_atime
- && dchild->d_inode->i_size == 0 )
+ && dchild->d_inode->i_size == 0 ) {
+ if (created)
+ *created = 1;
goto set_attr;
+ }
/* fallthru */
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index acd127d4ee82..0889bfb43dc9 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -385,7 +385,8 @@ struct nfsd4_write {
u64 wr_offset; /* request */
u32 wr_stable_how; /* request */
u32 wr_buflen; /* request */
- int wr_vlen;
+ struct kvec wr_head;
+ struct page ** wr_pagelist; /* request */
u32 wr_bytes_written; /* response */
u32 wr_how_written; /* response */
@@ -462,6 +463,7 @@ struct nfsd4_op {
/* NFSv4.1 */
struct nfsd4_exchange_id exchange_id;
+ struct nfsd4_backchannel_ctl backchannel_ctl;
struct nfsd4_bind_conn_to_session bind_conn_to_session;
struct nfsd4_create_session create_session;
struct nfsd4_destroy_session destroy_session;
@@ -526,6 +528,14 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
|| nfsd4_is_solo_sequence(resp);
}
+static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
+{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+ return argp->opcnt == resp->opcnt;
+}
+
#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
static inline void
@@ -566,6 +576,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
struct nfsd4_sequence *seq);
extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
+extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
extern __be32 nfsd4_create_session(struct svc_rqst *,
struct nfsd4_compound_state *,
@@ -579,7 +590,7 @@ extern __be32 nfsd4_destroy_session(struct svc_rqst *,
extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
- struct nfsd4_open *open);
+ struct nfsd4_open *open, struct nfsd_net *nn);
extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
struct svc_fh *current_fh, struct nfsd4_open *open);
extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 16f35f7423c5..61946883025c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -167,7 +167,6 @@ const struct file_operations nilfs_file_operations = {
};
const struct inode_operations nilfs_file_inode_operations = {
- .truncate = nilfs_truncate,
.setattr = nilfs_setattr,
.permission = nilfs_permission,
.fiemap = nilfs_fiemap,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 4d31d2cca7fd..6b49f14eac8c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -213,6 +213,16 @@ static int nilfs_set_page_dirty(struct page *page)
return ret;
}
+void nilfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ nilfs_truncate(inode);
+ }
+}
+
static int nilfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -227,10 +237,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
err = block_write_begin(mapping, pos, len, flags, pagep,
nilfs_get_block);
if (unlikely(err)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
-
+ nilfs_write_failed(mapping, pos + len);
nilfs_transaction_abort(inode->i_sb);
}
return err;
@@ -259,6 +266,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_mapping->host;
ssize_t size;
@@ -278,7 +286,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t end = offset + iov_length(iov, nr_segs);
if (end > isize)
- vmtruncate(inode, isize);
+ nilfs_write_failed(mapping, end);
}
return size;
@@ -786,10 +794,8 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
inode_dio_wait(inode);
-
- err = vmtruncate(inode, iattr->ia_size);
- if (unlikely(err))
- goto out_err;
+ truncate_setsize(inode, iattr->ia_size);
+ nilfs_truncate(inode);
}
setattr_copy(inode, iattr);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 74cece80e9a3..9bc72dec3fa6 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -277,6 +277,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
extern void nilfs_truncate(struct inode *);
extern void nilfs_evict_inode(struct inode *);
extern int nilfs_setattr(struct dentry *, struct iattr *);
+extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
int nilfs_permission(struct inode *inode, int mask);
int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
extern int nilfs_inode_dirty(struct inode *);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index f1626f5011c5..ff00a0b7acb9 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -527,7 +527,8 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
if (unlikely(err)) {
loff_t isize = inode->i_size;
if (pos + blocksize > isize)
- vmtruncate(inode, isize);
+ nilfs_write_failed(inode->i_mapping,
+ pos + blocksize);
goto failed_inode;
}
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1ecf46448f85..5b2d4f0853ac 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1762,6 +1762,16 @@ err_out:
return err;
}
+static void ntfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ ntfs_truncate_vfs(inode);
+ }
+}
+
/**
* ntfs_file_buffered_write -
*
@@ -2022,8 +2032,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
* allocated space, which is not a disaster.
*/
i_size = i_size_read(vi);
- if (pos + bytes > i_size)
- vmtruncate(vi, i_size);
+ if (pos + bytes > i_size) {
+ ntfs_write_failed(mapping, pos + bytes);
+ }
break;
}
}
@@ -2227,7 +2238,6 @@ const struct file_operations ntfs_file_ops = {
const struct inode_operations ntfs_file_inode_ops = {
#ifdef NTFS_RW
- .truncate = ntfs_truncate_vfs,
.setattr = ntfs_setattr,
#endif /* NTFS_RW */
};
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 1d27331e6fc9..d3e118cc6ffa 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2866,9 +2866,11 @@ conv_err_out:
*
* See ntfs_truncate() description above for details.
*/
+#ifdef NTFS_RW
void ntfs_truncate_vfs(struct inode *vi) {
ntfs_truncate(vi);
}
+#endif
/**
* ntfs_setattr - called from notify_change() when an attribute is being changed
@@ -2914,8 +2916,10 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
NInoCompressed(ni) ?
"compressed" : "encrypted");
err = -EOPNOTSUPP;
- } else
- err = vmtruncate(vi, attr->ia_size);
+ } else {
+ truncate_setsize(vi, attr->ia_size);
+ ntfs_truncate_vfs(vi);
+ }
if (err || ia_valid == ATTR_SIZE)
goto out;
} else {
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index db29695f845c..76b6cfb579d7 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -316,6 +316,10 @@ static inline void ntfs_commit_inode(struct inode *vi)
return;
}
+#else
+
+static inline void ntfs_truncate_vfs(struct inode *vi) {}
+
#endif /* NTFS_RW */
#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index fe492e1a3cfc..37d313ede159 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1218,24 +1218,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- /*
- * This will intentionally not wind up calling truncate_setsize(),
- * since all the work for a size change has been done above.
- * Otherwise, we could get into problems with truncate as
- * ip_alloc_sem is used there to protect against i_size
- * changes.
- *
- * XXX: this means the conditional below can probably be removed.
- */
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- status = vmtruncate(inode, attr->ia_size);
- if (status) {
- mlog_errno(status);
- goto bail_commit;
- }
- }
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 77e3cb2962b4..e0d9b3e722bd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -306,6 +306,16 @@ omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
return mpage_writepages(mapping, wbc, omfs_get_block);
}
+static void omfs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ omfs_truncate(inode);
+ }
+}
+
static int omfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -314,11 +324,8 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep,
omfs_get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ omfs_write_failed(mapping, pos + len);
return ret;
}
@@ -350,9 +357,11 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+ truncate_setsize(inode, attr->ia_size);
+ omfs_truncate(inode);
}
setattr_copy(inode, attr);
@@ -362,7 +371,6 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
const struct inode_operations omfs_file_inops = {
.setattr = omfs_setattr,
- .truncate = omfs_truncate
};
const struct address_space_operations omfs_aops = {
diff --git a/fs/open.c b/fs/open.c
index 182d8667b7bd..9b33c0cbfacf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -61,33 +61,22 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
return ret;
}
-static long do_sys_truncate(const char __user *pathname, loff_t length)
+long vfs_truncate(struct path *path, loff_t length)
{
- struct path path;
struct inode *inode;
- int error;
-
- error = -EINVAL;
- if (length < 0) /* sorry, but loff_t says... */
- goto out;
+ long error;
- error = user_path(pathname, &path);
- if (error)
- goto out;
- inode = path.dentry->d_inode;
+ inode = path->dentry->d_inode;
/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
- error = -EISDIR;
if (S_ISDIR(inode->i_mode))
- goto dput_and_out;
-
- error = -EINVAL;
+ return -EISDIR;
if (!S_ISREG(inode->i_mode))
- goto dput_and_out;
+ return -EINVAL;
- error = mnt_want_write(path.mnt);
+ error = mnt_want_write(path->mnt);
if (error)
- goto dput_and_out;
+ goto out;
error = inode_permission(inode, MAY_WRITE);
if (error)
@@ -111,19 +100,40 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
error = locks_verify_truncate(inode, NULL, length);
if (!error)
- error = security_path_truncate(&path);
+ error = security_path_truncate(path);
if (!error)
- error = do_truncate(path.dentry, length, 0, NULL);
+ error = do_truncate(path->dentry, length, 0, NULL);
put_write_and_out:
put_write_access(inode);
mnt_drop_write_and_out:
- mnt_drop_write(path.mnt);
-dput_and_out:
- path_put(&path);
+ mnt_drop_write(path->mnt);
out:
return error;
}
+EXPORT_SYMBOL_GPL(vfs_truncate);
+
+static long do_sys_truncate(const char __user *pathname, loff_t length)
+{
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+ struct path path;
+ int error;
+
+ if (length < 0) /* sorry, but loff_t says... */
+ return -EINVAL;
+
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ if (!error) {
+ error = vfs_truncate(&path, length);
+ path_put(&path);
+ }
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+ return error;
+}
SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
@@ -306,6 +316,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
struct path path;
struct inode *inode;
int res;
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
return -EINVAL;
@@ -328,8 +339,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
}
old_cred = override_creds(override_cred);
-
- res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+retry:
+ res = user_path_at(dfd, filename, lookup_flags, &path);
if (res)
goto out;
@@ -364,6 +375,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
out_path_release:
path_put(&path);
+ if (retry_estale(res, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
revert_creds(old_cred);
put_cred(override_cred);
@@ -379,8 +394,9 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
struct path path;
int error;
-
- error = user_path_dir(filename, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+retry:
+ error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
if (error)
goto out;
@@ -392,6 +408,10 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
dput_and_out:
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
@@ -425,8 +445,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
struct path path;
int error;
-
- error = user_path_dir(filename, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+retry:
+ error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
if (error)
goto out;
@@ -445,6 +466,10 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
error = 0;
dput_and_out:
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
@@ -489,11 +514,16 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode
{
struct path path;
int error;
-
- error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+ error = user_path_at(dfd, filename, lookup_flags, &path);
if (!error) {
error = chmod_common(&path, mode);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
}
return error;
}
@@ -552,6 +582,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
+retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;
@@ -562,6 +593,10 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
mnt_drop_write(path.mnt);
out_release:
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5a5a0be40e40..9b43ff77a51e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -542,13 +542,6 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
- if (error)
- return error;
- }
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
return 0;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e659a0ff1da7..e064f562b1f7 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -261,16 +261,9 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
if (error)
return error;
- if ((iattr->ia_valid & ATTR_SIZE) &&
- iattr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, iattr->ia_size);
- if (error)
- return error;
- }
-
setattr_copy(inode, iattr);
mark_inode_dirty(inode);
-
+
de->uid = inode->i_uid;
de->gid = inode->i_gid;
de->mode = inode->i_mode;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 701580ddfcc3..1827d88ad58b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -736,13 +736,6 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
- if (error)
- return error;
- }
-
setattr_copy(inode, attr);
mark_inode_dirty(inode);
return 0;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 8375c922c0d5..50302d6f8895 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -126,7 +126,7 @@ static int reiserfs_file_open(struct inode *inode, struct file *file)
return err;
}
-static void reiserfs_vfs_truncate_file(struct inode *inode)
+void reiserfs_vfs_truncate_file(struct inode *inode)
{
mutex_lock(&(REISERFS_I(inode)->tailpack));
reiserfs_truncate_file(inode, 1);
@@ -312,7 +312,6 @@ const struct file_operations reiserfs_file_operations = {
};
const struct inode_operations reiserfs_file_inode_operations = {
- .truncate = reiserfs_vfs_truncate_file,
.setattr = reiserfs_setattr,
.setxattr = reiserfs_setxattr,
.getxattr = reiserfs_getxattr,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d83736fbc26c..95d7680ead47 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3085,8 +3085,10 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
loff_t isize = i_size_read(inode);
loff_t end = offset + iov_length(iov, nr_segs);
- if (end > isize)
- vmtruncate(inode, isize);
+ if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
+ truncate_setsize(inode, isize);
+ reiserfs_vfs_truncate_file(inode);
+ }
}
return ret;
@@ -3200,8 +3202,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
*/
reiserfs_write_unlock_once(inode->i_sb, depth);
if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode))
- error = vmtruncate(inode, attr->ia_size);
+ attr->ia_size != i_size_read(inode)) {
+ error = inode_newsize_ok(inode, attr->ia_size);
+ if (!error) {
+ truncate_setsize(inode, attr->ia_size);
+ reiserfs_vfs_truncate_file(inode);
+ }
+ }
if (!error) {
setattr_copy(inode, attr);
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 33215f57ea06..157e474ab303 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2455,6 +2455,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
*,
int count);
int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
+void reiserfs_vfs_truncate_file(struct inode *inode);
int reiserfs_commit_page(struct inode *inode, struct page *page,
unsigned from, unsigned to);
void reiserfs_flush_old_commits(struct super_block *);
diff --git a/fs/stat.c b/fs/stat.c
index eae494630a36..14f45459c83d 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -74,7 +74,7 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
{
struct path path;
int error = -EINVAL;
- int lookup_flags = 0;
+ unsigned int lookup_flags = 0;
if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH)) != 0)
@@ -84,13 +84,17 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
lookup_flags |= LOOKUP_FOLLOW;
if (flag & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
-
+retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;
error = vfs_getattr(path.mnt, path.dentry, stat);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
out:
return error;
}
@@ -296,11 +300,13 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
struct path path;
int error;
int empty = 0;
+ unsigned int lookup_flags = LOOKUP_EMPTY;
if (bufsiz <= 0)
return -EINVAL;
- error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty);
+retry:
+ error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
if (!error) {
struct inode *inode = path.dentry->d_inode;
@@ -314,6 +320,10 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
}
}
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
}
return error;
}
diff --git a/fs/statfs.c b/fs/statfs.c
index f8e832e6f0a2..c219e733f553 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -77,10 +77,17 @@ EXPORT_SYMBOL(vfs_statfs);
int user_statfs(const char __user *pathname, struct kstatfs *st)
{
struct path path;
- int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
+ int error;
+ unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (!error) {
error = vfs_statfs(&path, st);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
}
return error;
}
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 0a65939508e9..9d4dc6831792 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -41,9 +41,11 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
- error = vmtruncate(inode, attr->ia_size);
+ error = inode_newsize_ok(inode, attr->ia_size);
if (error)
return error;
+ truncate_setsize(inode, attr->ia_size);
+ sysv_truncate(inode);
}
setattr_copy(inode, attr);
@@ -52,7 +54,6 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
}
const struct inode_operations sysv_file_inode_operations = {
- .truncate = sysv_truncate,
.setattr = sysv_setattr,
.getattr = sysv_getattr,
};
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 90b54b438789..c1a591a4725b 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -464,6 +464,16 @@ int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, get_block);
}
+static void sysv_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size) {
+ truncate_pagecache(inode, to, inode->i_size);
+ sysv_truncate(inode);
+ }
+}
+
static int sysv_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -471,11 +481,8 @@ static int sysv_write_begin(struct file *file, struct address_space *mapping,
int ret;
ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ sysv_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index eb6d0b7dc879..ff24e4449ece 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -526,6 +526,14 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
return __block_write_begin(page, pos, len, ufs_getfrag_block);
}
+static void ufs_write_failed(struct address_space *mapping, loff_t to)
+{
+ struct inode *inode = mapping->host;
+
+ if (to > inode->i_size)
+ truncate_pagecache(inode, to, inode->i_size);
+}
+
static int ufs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -534,11 +542,8 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
ret = block_write_begin(mapping, pos, len, flags, pagep,
ufs_getfrag_block);
- if (unlikely(ret)) {
- loff_t isize = mapping->host->i_size;
- if (pos + len > isize)
- vmtruncate(mapping->host, isize);
- }
+ if (unlikely(ret))
+ ufs_write_failed(mapping, pos + len);
return ret;
}
diff --git a/fs/utimes.c b/fs/utimes.c
index bb0696a41735..f4fb7eca10e8 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -158,13 +158,17 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
if (!(flags & AT_SYMLINK_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
-
+retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
goto out;
error = utimes_common(&path, times);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
}
out:
diff --git a/fs/xattr.c b/fs/xattr.c
index e21c119f4f99..3377dff18404 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -370,8 +370,9 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
{
struct path path;
int error;
-
- error = user_path(pathname, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = mnt_want_write(path.mnt);
@@ -380,6 +381,10 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
mnt_drop_write(path.mnt);
}
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -389,8 +394,9 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
{
struct path path;
int error;
-
- error = user_lpath(pathname, &path);
+ unsigned int lookup_flags = 0;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = mnt_want_write(path.mnt);
@@ -399,6 +405,10 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
mnt_drop_write(path.mnt);
}
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -476,12 +486,17 @@ SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
{
struct path path;
ssize_t error;
-
- error = user_path(pathname, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = getxattr(path.dentry, name, value, size);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -490,12 +505,17 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
{
struct path path;
ssize_t error;
-
- error = user_lpath(pathname, &path);
+ unsigned int lookup_flags = 0;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = getxattr(path.dentry, name, value, size);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -556,12 +576,17 @@ SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
{
struct path path;
ssize_t error;
-
- error = user_path(pathname, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = listxattr(path.dentry, list, size);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -570,12 +595,17 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
{
struct path path;
ssize_t error;
-
- error = user_lpath(pathname, &path);
+ unsigned int lookup_flags = 0;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = listxattr(path.dentry, list, size);
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -615,8 +645,9 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
{
struct path path;
int error;
-
- error = user_path(pathname, &path);
+ unsigned int lookup_flags = LOOKUP_FOLLOW;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = mnt_want_write(path.mnt);
@@ -625,6 +656,10 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
mnt_drop_write(path.mnt);
}
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}
@@ -633,8 +668,9 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
{
struct path path;
int error;
-
- error = user_lpath(pathname, &path);
+ unsigned int lookup_flags = 0;
+retry:
+ error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
if (error)
return error;
error = mnt_want_write(path.mnt);
@@ -643,6 +679,10 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
mnt_drop_write(path.mnt);
}
path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
return error;
}