summaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2011-03-19 07:38:50 +0100
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2011-03-19 07:38:50 +0100
commit97eb3f24352ec6632c2127b35d8087d2a809a9b9 (patch)
tree722948059bbd325bbca232269490124231df80d4 /fs/ceph
parentInput: evdev - fix evdev_write return value on partial writes (diff)
parentMerge branch 'tsc2005' into next (diff)
downloadlinux-97eb3f24352ec6632c2127b35d8087d2a809a9b9.tar.xz
linux-97eb3f24352ec6632c2127b35d8087d2a809a9b9.zip
Merge branch 'next' into for-linus
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/Makefile23
-rw-r--r--fs/ceph/addr.c6
-rw-r--r--fs/ceph/caps.c60
-rw-r--r--fs/ceph/debugfs.c9
-rw-r--r--fs/ceph/dir.c71
-rw-r--r--fs/ceph/export.c2
-rw-r--r--fs/ceph/file.c65
-rw-r--r--fs/ceph/inode.c102
-rw-r--r--fs/ceph/ioctl.h2
-rw-r--r--fs/ceph/locks.c94
-rw-r--r--fs/ceph/mds_client.c107
-rw-r--r--fs/ceph/mds_client.h35
-rw-r--r--fs/ceph/super.c15
-rw-r--r--fs/ceph/super.h8
-rw-r--r--fs/ceph/xattr.c3
15 files changed, 396 insertions, 206 deletions
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 9e6c4f2e8ff1..bd352125e829 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -2,31 +2,10 @@
# Makefile for CEPH filesystem.
#
-ifneq ($(KERNELRELEASE),)
-
obj-$(CONFIG_CEPH_FS) += ceph.o
-ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
+ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
export.o caps.o snap.o xattr.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o
-else
-#Otherwise we were called directly from the command
-# line; invoke the kernel build system.
-
-KERNELDIR ?= /lib/modules/$(shell uname -r)/build
-PWD := $(shell pwd)
-
-default: all
-
-all:
- $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
-
-modules_install:
- $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
-
-clean:
- $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
-
-endif
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e9c874abc9e1..561438b6a50c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
page->index << PAGE_CACHE_SHIFT, &len,
ci->i_truncate_seq, ci->i_truncate_size,
- &page, 1);
+ &page, 1, 0);
if (err == -ENOENT)
err = 0;
if (err < 0) {
@@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
offset, &len,
ci->i_truncate_seq, ci->i_truncate_size,
- pages, nr_pages);
+ pages, nr_pages, 0);
if (rc == -ENOENT)
rc = 0;
if (rc < 0)
@@ -774,7 +774,7 @@ get_more_pages:
snapc, do_sync,
ci->i_truncate_seq,
ci->i_truncate_size,
- &inode->i_mtime, true, 1);
+ &inode->i_mtime, true, 1, 0);
max_pages = req->r_num_pages;
alloc_page_vec(fsc, req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 98ab13e2b71d..6b61ded701e1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
invalidating_gen == ci->i_rdcache_gen) {
/* success. */
dout("try_nonblocking_invalidate %p success\n", inode);
- ci->i_rdcache_gen = 0;
- ci->i_rdcache_revoking = 0;
+ /* save any racing async invalidate some trouble */
+ ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
return 0;
}
dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -1560,9 +1560,10 @@ retry_locked:
/* NOTE: no side-effects allowed, until we take s_mutex */
revoking = cap->implemented & ~cap->issued;
- if (revoking)
- dout(" mds%d revoking %s\n", cap->mds,
- ceph_cap_string(revoking));
+ dout(" mds%d cap %p issued %s implemented %s revoking %s\n",
+ cap->mds, cap, ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->implemented),
+ ceph_cap_string(revoking));
if (cap == ci->i_auth_cap &&
(cap->issued & CEPH_CAP_FILE_WR)) {
@@ -1658,6 +1659,8 @@ ack:
if (cap == ci->i_auth_cap && ci->i_dirty_caps)
flushing = __mark_caps_flushing(inode, session);
+ else
+ flushing = 0;
mds = cap->mds; /* remember mds, so we don't repeat */
sent++;
@@ -1940,6 +1943,35 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
}
}
+static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *cap;
+ int delayed = 0;
+
+ spin_lock(&inode->i_lock);
+ cap = ci->i_auth_cap;
+ dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode,
+ ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq);
+ __ceph_flush_snaps(ci, &session, 1);
+ if (ci->i_flushing_caps) {
+ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ ci->i_flushing_caps, NULL);
+ if (delayed) {
+ spin_lock(&inode->i_lock);
+ __cap_delay_requeue(mdsc, ci);
+ spin_unlock(&inode->i_lock);
+ }
+ } else {
+ spin_unlock(&inode->i_lock);
+ }
+}
+
/*
* Take references to capabilities we hold, so that we don't release
@@ -2273,8 +2305,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
- unsigned seq = le32_to_cpu(grant->seq);
- unsigned issue_seq = le32_to_cpu(grant->issue_seq);
+ int seq = le32_to_cpu(grant->seq);
int newcaps = le32_to_cpu(grant->caps);
int issued, implemented, used, wanted, dirty;
u64 size = le64_to_cpu(grant->size);
@@ -2286,8 +2317,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
int revoked_rdcache = 0;
int queue_invalidate = 0;
- dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n",
- inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps));
+ dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+ inode, cap, mds, seq, ceph_cap_string(newcaps));
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
inode->i_size);
@@ -2383,7 +2414,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
cap->seq = seq;
- cap->issue_seq = issue_seq;
/* file layout may have changed */
ci->i_layout = grant->layout;
@@ -2689,8 +2719,13 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
ceph_add_cap(inode, session, cap_id, -1,
issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
NULL /* no caps context */);
- try_flush_caps(inode, session, NULL);
+ kick_flushing_inode_caps(mdsc, session, inode);
up_read(&mdsc->snap_rwsem);
+
+ /* make sure we re-request max_size, if necessary */
+ spin_lock(&inode->i_lock);
+ ci->i_requested_max_size = 0;
+ spin_unlock(&inode->i_lock);
}
/*
@@ -2782,8 +2817,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_IMPORT:
handle_cap_import(mdsc, inode, h, session,
snaptrace, snaptrace_len);
- ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
- session);
+ ceph_check_caps(ceph_inode(inode), 0, session);
goto done_unlocked;
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 7ae1b3d55b58..08f65faac112 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -60,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p)
for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
req = rb_entry(rp, struct ceph_mds_request, r_node);
- if (req->r_request)
- seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
- else
+ if (req->r_request && req->r_session)
+ seq_printf(s, "%lld\tmds%d\t", req->r_tid,
+ req->r_session->s_mds);
+ else if (!req->r_request)
seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+ else
+ seq_printf(s, "%lld\t(no session)\t", req->r_tid);
seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e0a2dc6fcafc..0bc68de8edd7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -40,12 +40,13 @@ int ceph_init_dentry(struct dentry *dentry)
if (dentry->d_fsdata)
return 0;
- if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
- dentry->d_op = &ceph_dentry_ops;
+ if (dentry->d_parent == NULL || /* nfs fh_to_dentry */
+ ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+ d_set_d_op(dentry, &ceph_dentry_ops);
else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
- dentry->d_op = &ceph_snapdir_dentry_ops;
+ d_set_d_op(dentry, &ceph_snapdir_dentry_ops);
else
- dentry->d_op = &ceph_snap_dentry_ops;
+ d_set_d_op(dentry, &ceph_snap_dentry_ops);
di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
if (!di)
@@ -111,11 +112,11 @@ static int __dcache_readdir(struct file *filp,
dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
last);
- spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
/* start at beginning? */
- if (filp->f_pos == 2 || (last &&
- filp->f_pos < ceph_dentry(last)->offset)) {
+ if (filp->f_pos == 2 || last == NULL ||
+ filp->f_pos < ceph_dentry(last)->offset) {
if (list_empty(&parent->d_subdirs))
goto out_unlock;
p = parent->d_subdirs.prev;
@@ -135,6 +136,7 @@ more:
fi->at_end = 1;
goto out_unlock;
}
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
if (!d_unhashed(dentry) && dentry->d_inode &&
ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
@@ -144,13 +146,15 @@ more:
dentry->d_name.len, dentry->d_name.name, di->offset,
filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
!dentry->d_inode ? " null" : "");
+ spin_unlock(&dentry->d_lock);
p = p->prev;
dentry = list_entry(p, struct dentry, d_u.d_child);
di = ceph_dentry(dentry);
}
- atomic_inc(&dentry->d_count);
- spin_unlock(&dcache_lock);
+ dget_dlock(dentry);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&parent->d_lock);
dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -176,19 +180,19 @@ more:
filp->f_pos++;
- /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
+ /* make sure a dentry wasn't dropped while we didn't have parent lock */
if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
err = -EAGAIN;
goto out;
}
- spin_lock(&dcache_lock);
+ spin_lock(&parent->d_lock);
p = p->prev; /* advance to next dentry */
goto more;
out_unlock:
- spin_unlock(&dcache_lock);
+ spin_unlock(&parent->d_lock);
out:
if (last)
dput(last);
@@ -336,7 +340,10 @@ more:
if (req->r_reply_info.dir_end) {
kfree(fi->last_name);
fi->last_name = NULL;
- fi->next_offset = 2;
+ if (ceph_frag_is_rightmost(frag))
+ fi->next_offset = 2;
+ else
+ fi->next_offset = 0;
} else {
rinfo = &req->r_reply_info;
err = note_last_dentry(fi,
@@ -355,18 +362,22 @@ more:
u64 pos = ceph_make_fpos(frag, off);
struct ceph_mds_reply_inode *in =
rinfo->dir_in[off - fi->offset].in;
+ struct ceph_vino vino;
+ ino_t ino;
+
dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
off, off - fi->offset, rinfo->dir_nr, pos,
rinfo->dir_dname_len[off - fi->offset],
rinfo->dir_dname[off - fi->offset], in);
BUG_ON(!in);
ftype = le32_to_cpu(in->mode) >> 12;
+ vino.ino = le64_to_cpu(in->ino);
+ vino.snap = le64_to_cpu(in->snapid);
+ ino = ceph_vino_to_ino(vino);
if (filldir(dirent,
rinfo->dir_dname[off - fi->offset],
rinfo->dir_dname_len[off - fi->offset],
- pos,
- le64_to_cpu(in->ino),
- ftype) < 0) {
+ pos, ino, ftype) < 0) {
dout("filldir stopping us...\n");
return 0;
}
@@ -414,6 +425,7 @@ static void reset_readdir(struct ceph_file_info *fi)
fi->last_readdir = NULL;
}
kfree(fi->last_name);
+ fi->last_name = NULL;
fi->next_offset = 2; /* compensate for . and .. */
if (fi->dentry) {
dput(fi->dentry);
@@ -978,7 +990,12 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
*/
static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
- struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dir;
+
+ if (nd->flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ dir = dentry->d_parent->d_inode;
dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
@@ -1207,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn)
}
}
+/*
+ * Return name hash for a given dentry. This is dependent on
+ * the parent directory's hash function.
+ */
+unsigned ceph_dentry_hash(struct dentry *dn)
+{
+ struct inode *dir = dn->d_parent->d_inode;
+ struct ceph_inode_info *dci = ceph_inode(dir);
+
+ switch (dci->i_dir_layout.dl_dir_hash) {
+ case 0: /* for backward compat */
+ case CEPH_STR_HASH_LINUX:
+ return dn->d_name.hash;
+
+ default:
+ return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
+ dn->d_name.name, dn->d_name.len);
+ }
+}
+
const struct file_operations ceph_dir_fops = {
.read = ceph_read_dir,
.readdir = ceph_readdir,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 2297d9426992..e41056174bf8 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
dout("encode_fh %p connectable\n", dentry);
cfh->ino = ceph_ino(dentry->d_inode);
cfh->parent_ino = ceph_ino(parent->d_inode);
- cfh->parent_name_hash = parent->d_name.hash;
+ cfh->parent_name_hash = ceph_dentry_hash(parent);
*max_len = connected_handle_length;
type = 2;
} else if (*max_len >= handle_length) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e77c28cf3690..7d0e4a82d898 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
}
/*
- * No need to block if we have any caps. Update wanted set
+ * No need to block if we have caps on the auth MDS (for
+ * write) or any MDS (for read). Update wanted set
* asynchronously.
*/
spin_lock(&inode->i_lock);
- if (__ceph_is_any_real_caps(ci)) {
+ if (__ceph_is_any_real_caps(ci) &&
+ (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
int mds_wanted = __ceph_caps_mds_wanted(ci);
int issued = __ceph_caps_issued(ci, NULL);
@@ -280,11 +282,13 @@ int ceph_release(struct inode *inode, struct file *file)
static int striped_read(struct inode *inode,
u64 off, u64 len,
struct page **pages, int num_pages,
- int *checkeof)
+ int *checkeof, bool align_to_pages,
+ unsigned long buf_align)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 pos, this_len;
+ int io_align, page_align;
int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
int left, pages_left;
int read;
@@ -300,14 +304,19 @@ static int striped_read(struct inode *inode,
page_pos = pages;
pages_left = num_pages;
read = 0;
+ io_align = off & ~PAGE_MASK;
more:
+ if (align_to_pages)
+ page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+ else
+ page_align = pos & ~PAGE_MASK;
this_len = left;
ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
&ci->i_layout, pos, &this_len,
ci->i_truncate_seq,
ci->i_truncate_size,
- page_pos, pages_left);
+ page_pos, pages_left, page_align);
hit_stripe = this_len < left;
was_short = ret >= 0 && ret < this_len;
if (ret == -ENOENT)
@@ -368,32 +377,34 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
struct inode *inode = file->f_dentry->d_inode;
struct page **pages;
u64 off = *poff;
- int num_pages = calc_pages_for(off, len);
- int ret;
+ int num_pages, ret;
dout("sync_read on file %p %llu~%u %s\n", file, off, len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
if (file->f_flags & O_DIRECT) {
- pages = ceph_get_direct_page_vector(data, num_pages, off, len);
-
- /*
- * flush any page cache pages in this range. this
- * will make concurrent normal and O_DIRECT io slow,
- * but it will at least behave sensibly when they are
- * in sequence.
- */
+ num_pages = calc_pages_for((unsigned long)data, len);
+ pages = ceph_get_direct_page_vector(data, num_pages, true);
} else {
+ num_pages = calc_pages_for(off, len);
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
}
if (IS_ERR(pages))
return PTR_ERR(pages);
+ /*
+ * flush any page cache pages in this range. this
+ * will make concurrent normal and sync io slow,
+ * but it will at least behave sensibly when they are
+ * in sequence.
+ */
ret = filemap_write_and_wait(inode->i_mapping);
if (ret < 0)
goto done;
- ret = striped_read(inode, off, len, pages, num_pages, checkeof);
+ ret = striped_read(inode, off, len, pages, num_pages, checkeof,
+ file->f_flags & O_DIRECT,
+ (unsigned long)data & ~PAGE_MASK);
if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
@@ -402,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
done:
if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages);
+ ceph_put_page_vector(pages, num_pages, true);
else
ceph_release_page_vector(pages, num_pages);
dout("sync_read result %d\n", ret);
@@ -448,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
int flags;
int do_sync = 0;
int check_caps = 0;
+ int page_align, io_align;
+ unsigned long buf_align;
int ret;
struct timespec mtime = CURRENT_TIME;
@@ -462,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
else
pos = *offset;
+ io_align = pos & ~PAGE_MASK;
+ buf_align = (unsigned long)data & ~PAGE_MASK;
+
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
if (ret < 0)
return ret;
@@ -486,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
*/
more:
len = left;
+ if (file->f_flags & O_DIRECT) {
+ /* write from beginning of first page, regardless of
+ io alignment */
+ page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
+ num_pages = calc_pages_for((unsigned long)data, len);
+ } else {
+ page_align = pos & ~PAGE_MASK;
+ num_pages = calc_pages_for(pos, len);
+ }
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), pos, &len,
CEPH_OSD_OP_WRITE, flags,
ci->i_snap_realm->cached_context,
do_sync,
ci->i_truncate_seq, ci->i_truncate_size,
- &mtime, false, 2);
+ &mtime, false, 2, page_align);
if (!req)
return -ENOMEM;
- num_pages = calc_pages_for(pos, len);
-
if (file->f_flags & O_DIRECT) {
- pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
+ pages = ceph_get_direct_page_vector(data, num_pages, false);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
@@ -549,7 +572,7 @@ more:
}
if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages);
+ ceph_put_page_vector(pages, num_pages, false);
else if (file->f_flags & O_SYNC)
ceph_release_page_vector(pages, num_pages);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 1d6a45b5a04c..5625463aa479 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2,7 +2,6 @@
#include <linux/module.h>
#include <linux/fs.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/uaccess.h>
@@ -298,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_release_count = 0;
ci->i_symlink = NULL;
+ memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -369,6 +370,15 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
return &ci->vfs_inode;
}
+static void ceph_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ INIT_LIST_HEAD(&inode->i_dentry);
+ kmem_cache_free(ceph_inode_cachep, ci);
+}
+
void ceph_destroy_inode(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
@@ -408,7 +418,7 @@ void ceph_destroy_inode(struct inode *inode)
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
- kmem_cache_free(ceph_inode_cachep, ci);
+ call_rcu(&inode->i_rcu, ceph_i_callback);
}
@@ -471,7 +481,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
if (issued & (CEPH_CAP_FILE_EXCL|
CEPH_CAP_FILE_WR|
- CEPH_CAP_FILE_BUFFER)) {
+ CEPH_CAP_FILE_BUFFER|
+ CEPH_CAP_AUTH_EXCL|
+ CEPH_CAP_XATTR_EXCL)) {
if (timespec_compare(ctime, &inode->i_ctime) > 0) {
dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -511,7 +523,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
warn = 1;
}
} else {
- /* we have no write caps; whatever the MDS says is true */
+ /* we have no write|excl caps; whatever the MDS says is true */
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
inode->i_ctime = *ctime;
inode->i_mtime = *mtime;
@@ -567,12 +579,17 @@ static int fill_inode(struct inode *inode,
/*
* provided version will be odd if inode value is projected,
- * even if stable. skip the update if we have a newer info
- * (e.g., due to inode info racing form multiple MDSs), or if
- * we are getting projected (unstable) inode info.
+ * even if stable. skip the update if we have newer stable
+ * info (ours>=theirs, e.g. due to racing mds replies), unless
+ * we are getting projected (unstable) info (in which case the
+ * version is odd, and we want ours>theirs).
+ * us them
+ * 2 2 skip
+ * 3 2 skip
+ * 3 3 update
*/
if (le64_to_cpu(info->version) > 0 &&
- (ci->i_version & ~1) > le64_to_cpu(info->version))
+ (ci->i_version & ~1) >= le64_to_cpu(info->version))
goto no_change;
issued = __ceph_caps_issued(ci, &implemented);
@@ -606,7 +623,14 @@ static int fill_inode(struct inode *inode,
le32_to_cpu(info->time_warp_seq),
&ctime, &mtime, &atime);
- ci->i_max_size = le64_to_cpu(info->max_size);
+ /* only update max_size on auth cap */
+ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ ci->i_max_size != le64_to_cpu(info->max_size)) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size,
+ le64_to_cpu(info->max_size));
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ }
+
ci->i_layout = info->layout;
inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
@@ -667,6 +691,8 @@ static int fill_inode(struct inode *inode,
inode->i_op = &ceph_dir_iops;
inode->i_fop = &ceph_dir_fops;
+ ci->i_dir_layout = iinfo->dir_layout;
+
ci->i_files = le64_to_cpu(info->files);
ci->i_subdirs = le64_to_cpu(info->subdirs);
ci->i_rbytes = le64_to_cpu(info->rbytes);
@@ -684,10 +710,6 @@ static int fill_inode(struct inode *inode,
ci->i_ceph_flags |= CEPH_I_COMPLETE;
ci->i_max_offset = 2;
}
-
- /* it may be better to set st_size in getattr instead? */
- if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
- inode->i_size = ci->i_rbytes;
break;
default:
pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
@@ -828,13 +850,13 @@ static void ceph_set_dentry_offset(struct dentry *dn)
di->offset = ceph_inode(inode)->i_max_offset++;
spin_unlock(&inode->i_lock);
- spin_lock(&dcache_lock);
- spin_lock(&dn->d_lock);
+ spin_lock(&dir->d_lock);
+ spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
list_move(&dn->d_u.d_child, &dir->d_subdirs);
dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
dn->d_u.d_child.prev, dn->d_u.d_child.next);
spin_unlock(&dn->d_lock);
- spin_unlock(&dcache_lock);
+ spin_unlock(&dir->d_lock);
}
/*
@@ -866,8 +888,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
} else if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
"inode %p ino %llx.%llx\n",
- dn, atomic_read(&dn->d_count),
- realdn, atomic_read(&realdn->d_count),
+ dn, dn->d_count,
+ realdn, realdn->d_count,
realdn->d_inode, ceph_vinop(realdn->d_inode));
dput(dn);
dn = realdn;
@@ -1055,7 +1077,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
ininfo = rinfo->targeti.in;
vino.ino = le64_to_cpu(ininfo->ino);
vino.snap = le64_to_cpu(ininfo->snapid);
- if (!dn->d_inode) {
+ in = dn->d_inode;
+ if (!in) {
in = ceph_get_inode(sb, vino);
if (IS_ERR(in)) {
pr_err("fill_trace bad get_inode "
@@ -1217,11 +1240,11 @@ retry_lookup:
goto retry_lookup;
} else {
/* reorder parent's d_subdirs */
- spin_lock(&dcache_lock);
- spin_lock(&dn->d_lock);
+ spin_lock(&parent->d_lock);
+ spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
list_move(&dn->d_u.d_child, &parent->d_subdirs);
spin_unlock(&dn->d_lock);
- spin_unlock(&dcache_lock);
+ spin_unlock(&parent->d_lock);
}
di = dn->d_fsdata;
@@ -1386,11 +1409,8 @@ static void ceph_invalidate_work(struct work_struct *work)
spin_lock(&inode->i_lock);
dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking);
- if (ci->i_rdcache_gen == 0 ||
- ci->i_rdcache_revoking != ci->i_rdcache_gen) {
- BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
+ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
/* nevermind! */
- ci->i_rdcache_revoking = 0;
spin_unlock(&inode->i_lock);
goto out;
}
@@ -1400,15 +1420,16 @@ static void ceph_invalidate_work(struct work_struct *work)
ceph_invalidate_nondirty_pages(inode->i_mapping);
spin_lock(&inode->i_lock);
- if (orig_gen == ci->i_rdcache_gen) {
+ if (orig_gen == ci->i_rdcache_gen &&
+ orig_gen == ci->i_rdcache_revoking) {
dout("invalidate_pages %p gen %d successful\n", inode,
ci->i_rdcache_gen);
- ci->i_rdcache_gen = 0;
- ci->i_rdcache_revoking = 0;
+ ci->i_rdcache_revoking--;
check = 1;
} else {
- dout("invalidate_pages %p gen %d raced, gen now %d\n",
- inode, orig_gen, ci->i_rdcache_gen);
+ dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
+ inode, orig_gen, ci->i_rdcache_gen,
+ ci->i_rdcache_revoking);
}
spin_unlock(&inode->i_lock);
@@ -1739,7 +1760,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
return 0;
}
- dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
+ dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
return 0;
@@ -1760,12 +1781,17 @@ int ceph_do_getattr(struct inode *inode, int mask)
* Check inode permissions. We verify we have a valid value for
* the AUTH cap, then call the generic handler.
*/
-int ceph_permission(struct inode *inode, int mask)
+int ceph_permission(struct inode *inode, int mask, unsigned int flags)
{
- int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
+ int err;
+
+ if (flags & IPERM_FLAG_RCU)
+ return -ECHILD;
+
+ err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
if (!err)
- err = generic_permission(inode, mask, NULL);
+ err = generic_permission(inode, mask, flags, NULL);
return err;
}
@@ -1789,7 +1815,11 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
else
stat->dev = 0;
if (S_ISDIR(inode->i_mode)) {
- stat->size = ci->i_rbytes;
+ if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
+ RBYTES))
+ stat->size = ci->i_rbytes;
+ else
+ stat->size = ci->i_files + ci->i_subdirs;
stat->blocks = 0;
stat->blksize = 65536;
}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index a6ce54e94eb5..52e8fd74d450 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
#include <linux/ioctl.h>
#include <linux/types.h>
-#define CEPH_IOCTL_MAGIC 0x98
+#define CEPH_IOCTL_MAGIC 0x97
/* just use u64 to align sanely on all archs */
struct ceph_ioctl_layout {
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 40abde93c345..476b329867d4 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -11,40 +11,68 @@
* Implement fcntl and flock locking functions.
*/
static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
- u64 pid, u64 pid_ns,
- int cmd, u64 start, u64 length, u8 wait)
+ int cmd, u8 wait, struct file_lock *fl)
{
struct inode *inode = file->f_dentry->d_inode;
struct ceph_mds_client *mdsc =
ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
int err;
+ u64 length = 0;
req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_inode = igrab(inode);
+ /* mds requires start and length rather than start and end */
+ if (LLONG_MAX == fl->fl_end)
+ length = 0;
+ else
+ length = fl->fl_end - fl->fl_start + 1;
+
dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
"length: %llu, wait: %d, type`: %d", (int)lock_type,
- (int)operation, pid, start, length, wait, cmd);
+ (int)operation, (u64)fl->fl_pid, fl->fl_start,
+ length, wait, fl->fl_type);
+
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
- req->r_args.filelock_change.pid = cpu_to_le64(pid);
+ req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
/* This should be adjusted, but I'm not sure if
namespaces actually get id numbers*/
req->r_args.filelock_change.pid_namespace =
- cpu_to_le64((u64)pid_ns);
- req->r_args.filelock_change.start = cpu_to_le64(start);
+ cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
+ req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
err = ceph_mdsc_do_request(mdsc, inode, req);
+
+ if ( operation == CEPH_MDS_OP_GETFILELOCK){
+ fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+ if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
+ fl->fl_type = F_RDLCK;
+ else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
+ fl->fl_type = F_WRLCK;
+ else
+ fl->fl_type = F_UNLCK;
+
+ fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
+ length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
+ le64_to_cpu(req->r_reply_info.filelock_reply->length);
+ if (length >= 1)
+ fl->fl_end = length -1;
+ else
+ fl->fl_end = 0;
+
+ }
ceph_mdsc_put_request(req);
dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
- (int)operation, pid, start, length, wait, cmd, err);
+ "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type,
+ (int)operation, (u64)fl->fl_pid, fl->fl_start,
+ length, wait, fl->fl_type, err);
return err;
}
@@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
*/
int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
{
- u64 length;
u8 lock_cmd;
int err;
u8 wait = 0;
@@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
else
lock_cmd = CEPH_LOCK_UNLOCK;
- if (LLONG_MAX == fl->fl_end)
- length = 0;
- else
- length = fl->fl_end - fl->fl_start + 1;
-
- err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
- (u64)fl->fl_pid,
- (u64)(unsigned long)fl->fl_nspid,
- lock_cmd, fl->fl_start,
- length, wait);
+ err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
if (!err) {
- dout("mds locked, locking locally");
- err = posix_lock_file(file, fl, NULL);
- if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
- /* undo! This should only happen if the kernel detects
- * local deadlock. */
- ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
- (u64)fl->fl_pid,
- (u64)(unsigned long)fl->fl_nspid,
- CEPH_LOCK_UNLOCK, fl->fl_start,
- length, 0);
- dout("got %d on posix_lock_file, undid lock", err);
+ if ( op != CEPH_MDS_OP_GETFILELOCK ){
+ dout("mds locked, locking locally");
+ err = posix_lock_file(file, fl, NULL);
+ if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
+ /* undo! This should only happen if the kernel detects
+ * local deadlock. */
+ ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+ CEPH_LOCK_UNLOCK, 0, fl);
+ dout("got %d on posix_lock_file, undid lock", err);
+ }
}
+
} else {
dout("mds returned error code %d", err);
}
@@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
- u64 length;
u8 lock_cmd;
int err;
u8 wait = 1;
@@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
- /* mds requires start and length rather than start and end */
- if (LLONG_MAX == fl->fl_end)
- length = 0;
- else
- length = fl->fl_end - fl->fl_start + 1;
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
- file, (u64)fl->fl_pid,
- (u64)(unsigned long)fl->fl_nspid,
- lock_cmd, fl->fl_start,
- length, wait);
+ file, lock_cmd, wait, fl);
if (!err) {
err = flock_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
- file, (u64)fl->fl_pid,
- (u64)(unsigned long)fl->fl_nspid,
- CEPH_LOCK_UNLOCK, fl->fl_start,
- length, 0);
+ file, CEPH_LOCK_UNLOCK, 0, fl);
dout("got %d on flock_lock_file_wait, undid lock", err);
}
} else {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3142b15940c2..a1ee8fa3a8e7 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -6,7 +6,6 @@
#include <linux/sched.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
-#include <linux/smp_lock.h>
#include "super.h"
#include "mds_client.h"
@@ -61,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops;
* parse individual inode info
*/
static int parse_reply_info_in(void **p, void *end,
- struct ceph_mds_reply_info_in *info)
+ struct ceph_mds_reply_info_in *info,
+ int features)
{
int err = -EIO;
@@ -75,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end,
info->symlink = *p;
*p += info->symlink_len;
+ if (features & CEPH_FEATURE_DIRLAYOUTHASH)
+ ceph_decode_copy_safe(p, end, &info->dir_layout,
+ sizeof(info->dir_layout), bad);
+ else
+ memset(&info->dir_layout, 0, sizeof(info->dir_layout));
+
ceph_decode_32_safe(p, end, info->xattr_len, bad);
ceph_decode_need(p, end, info->xattr_len, bad);
info->xattr_data = *p;
@@ -89,12 +95,13 @@ bad:
* target inode.
*/
static int parse_reply_info_trace(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info)
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
{
int err;
if (info->head->is_dentry) {
- err = parse_reply_info_in(p, end, &info->diri);
+ err = parse_reply_info_in(p, end, &info->diri, features);
if (err < 0)
goto out_bad;
@@ -115,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end,
}
if (info->head->is_target) {
- err = parse_reply_info_in(p, end, &info->targeti);
+ err = parse_reply_info_in(p, end, &info->targeti, features);
if (err < 0)
goto out_bad;
}
@@ -135,7 +142,8 @@ out_bad:
* parse readdir results
*/
static int parse_reply_info_dir(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info)
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
{
u32 num, i = 0;
int err;
@@ -183,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,
*p += sizeof(struct ceph_mds_reply_lease);
/* inode */
- err = parse_reply_info_in(p, end, &info->dir_in[i]);
+ err = parse_reply_info_in(p, end, &info->dir_in[i], features);
if (err < 0)
goto out_bad;
i++;
@@ -203,10 +211,45 @@ out_bad:
}
/*
+ * parse fcntl F_GETLK results
+ */
+static int parse_reply_info_filelock(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
+{
+ if (*p + sizeof(*info->filelock_reply) > end)
+ goto bad;
+
+ info->filelock_reply = *p;
+ *p += sizeof(*info->filelock_reply);
+
+ if (unlikely(*p != end))
+ goto bad;
+ return 0;
+
+bad:
+ return -EIO;
+}
+
+/*
+ * parse extra results
+ */
+static int parse_reply_info_extra(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
+{
+ if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
+ return parse_reply_info_filelock(p, end, info, features);
+ else
+ return parse_reply_info_dir(p, end, info, features);
+}
+
+/*
* parse entire mds reply
*/
static int parse_reply_info(struct ceph_msg *msg,
- struct ceph_mds_reply_info_parsed *info)
+ struct ceph_mds_reply_info_parsed *info,
+ int features)
{
void *p, *end;
u32 len;
@@ -219,15 +262,15 @@ static int parse_reply_info(struct ceph_msg *msg,
/* trace */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
- err = parse_reply_info_trace(&p, p+len, info);
+ err = parse_reply_info_trace(&p, p+len, info, features);
if (err < 0)
goto out_bad;
}
- /* dir content */
+ /* extra */
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
- err = parse_reply_info_dir(&p, p+len, info);
+ err = parse_reply_info_extra(&p, p+len, info, features);
if (err < 0)
goto out_bad;
}
@@ -529,6 +572,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
ceph_mdsc_get_request(req);
__insert_request(mdsc, req);
+ req->r_uid = current_fsuid();
+ req->r_gid = current_fsgid();
+
if (dir) {
struct ceph_inode_info *ci = ceph_inode(dir);
@@ -620,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
} else {
/* dir + name */
inode = dir;
- hash = req->r_dentry->d_name.hash;
+ hash = ceph_dentry_hash(req->r_dentry);
is_hash = true;
}
}
@@ -647,9 +693,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
dout("choose_mds %p %llx.%llx "
"frag %u mds%d (%d/%d)\n",
inode, ceph_vinop(inode),
- frag.frag, frag.mds,
+ frag.frag, mds,
(int)r, frag.ndist);
- return mds;
+ if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+ CEPH_MDS_STATE_ACTIVE)
+ return mds;
}
/* since this file/dir wasn't known to be
@@ -662,7 +710,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
dout("choose_mds %p %llx.%llx "
"frag %u mds%d (auth)\n",
inode, ceph_vinop(inode), frag.frag, mds);
- return mds;
+ if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
+ CEPH_MDS_STATE_ACTIVE)
+ return mds;
}
}
}
@@ -1452,7 +1502,7 @@ retry:
*base = ceph_ino(temp->d_inode);
*plen = len;
dout("build_path on %p %d built %llx '%.*s'\n",
- dentry, atomic_read(&dentry->d_count), *base, len, path);
+ dentry, dentry->d_count, *base, len, path);
return path;
}
@@ -1588,8 +1638,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
head->op = cpu_to_le32(req->r_op);
- head->caller_uid = cpu_to_le32(current_fsuid());
- head->caller_gid = cpu_to_le32(current_fsgid());
+ head->caller_uid = cpu_to_le32(req->r_uid);
+ head->caller_gid = cpu_to_le32(req->r_gid);
head->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
@@ -1659,7 +1709,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
struct ceph_msg *msg;
int flags = 0;
- req->r_mds = mds;
req->r_attempts++;
if (req->r_inode) {
struct ceph_cap *cap =
@@ -1746,6 +1795,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
goto finish;
}
+ put_request_session(req);
+
mds = __choose_mds(mdsc, req);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
@@ -1763,6 +1814,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
goto finish;
}
}
+ req->r_session = get_session(session);
+
dout("do_request mds%d session %p state %s\n", mds, session,
session_state_name(session->s_state));
if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1775,7 +1828,6 @@ static int __do_request(struct ceph_mds_client *mdsc,
}
/* send request */
- req->r_session = get_session(session);
req->r_resend_mds = -1; /* forget any previous mds hint */
if (req->r_request_started == 0) /* note request start time */
@@ -1829,7 +1881,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
if (req->r_session &&
req->r_session->s_mds == mds) {
dout(" kicking tid %llu\n", req->r_tid);
- put_request_session(req);
__do_request(mdsc, req);
}
}
@@ -2022,8 +2073,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
goto out;
} else {
struct ceph_inode_info *ci = ceph_inode(req->r_inode);
- struct ceph_cap *cap =
- ceph_get_cap_for_mds(ci, req->r_mds);;
+ struct ceph_cap *cap = NULL;
+
+ if (req->r_session)
+ cap = ceph_get_cap_for_mds(ci,
+ req->r_session->s_mds);
dout("already using auth");
if ((!cap || cap != ci->i_auth_cap) ||
@@ -2067,12 +2121,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
dout("handle_reply tid %lld result %d\n", tid, result);
rinfo = &req->r_reply_info;
- err = parse_reply_info(msg, rinfo);
+ err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex);
if (err < 0) {
- pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
+ pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid);
ceph_msg_dump(msg);
goto out_err;
}
@@ -2092,7 +2146,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&req->r_fill_mutex);
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
- if (result == 0 && rinfo->dir_nr)
+ if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK &&
+ rinfo->dir_nr)
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d66d63c72355..4e3a9cc0bba6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -35,6 +35,7 @@ struct ceph_cap;
*/
struct ceph_mds_reply_info_in {
struct ceph_mds_reply_inode *in;
+ struct ceph_dir_layout dir_layout;
u32 symlink_len;
char *symlink;
u32 xattr_len;
@@ -42,26 +43,37 @@ struct ceph_mds_reply_info_in {
};
/*
- * parsed info about an mds reply, including information about the
- * target inode and/or its parent directory and dentry, and directory
- * contents (for readdir results).
+ * parsed info about an mds reply, including information about
+ * either: 1) the target inode and/or its parent directory and dentry,
+ * and directory contents (for readdir results), or
+ * 2) the file range lock info (for fcntl F_GETLK results).
*/
struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_head *head;
+ /* trace */
struct ceph_mds_reply_info_in diri, targeti;
struct ceph_mds_reply_dirfrag *dirfrag;
char *dname;
u32 dname_len;
struct ceph_mds_reply_lease *dlease;
- struct ceph_mds_reply_dirfrag *dir_dir;
- int dir_nr;
- char **dir_dname;
- u32 *dir_dname_len;
- struct ceph_mds_reply_lease **dir_dlease;
- struct ceph_mds_reply_info_in *dir_in;
- u8 dir_complete, dir_end;
+ /* extra */
+ union {
+ /* for fcntl F_GETLK results */
+ struct ceph_filelock *filelock_reply;
+
+ /* for readdir results */
+ struct {
+ struct ceph_mds_reply_dirfrag *dir_dir;
+ int dir_nr;
+ char **dir_dname;
+ u32 *dir_dname_len;
+ struct ceph_mds_reply_lease **dir_dlease;
+ struct ceph_mds_reply_info_in *dir_in;
+ u8 dir_complete, dir_end;
+ };
+ };
/* encoded blob describing snapshot contexts for certain
operations (e.g., open) */
@@ -154,7 +166,6 @@ struct ceph_mds_request {
struct ceph_mds_client *r_mdsc;
int r_op; /* mds op code */
- int r_mds;
/* operation on what? */
struct inode *r_inode; /* arg1 */
@@ -170,6 +181,8 @@ struct ceph_mds_request {
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
+ uid_t r_uid;
+ gid_t r_gid;
/* for choosing which mds to send this request to */
int r_direct_mode;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 08b460ae0539..9c5085465a63 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -290,6 +290,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+ fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
@@ -428,7 +430,8 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
+ fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
+ CEPH_FEATURE_DIRLAYOUTHASH;
fsc->client->monc.want_mdsmap = 1;
fsc->mount_options = fsopt;
@@ -443,13 +446,17 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail_client;
err = -ENOMEM;
- fsc->wb_wq = create_workqueue("ceph-writeback");
+ /*
+ * The number of concurrent works can be high but they don't need
+ * to be processed in parallel, limit concurrency.
+ */
+ fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
if (fsc->wb_wq == NULL)
goto fail_bdi;
- fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+ fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
if (fsc->pg_inv_wq == NULL)
goto fail_wb_wq;
- fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+ fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
if (fsc->trunc_wq == NULL)
goto fail_pg_inv_wq;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1886294e12f7..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -239,6 +239,7 @@ struct ceph_inode_info {
unsigned i_ceph_flags;
unsigned long i_release_count;
+ struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
char *i_symlink;
@@ -293,9 +294,7 @@ struct ceph_inode_info {
int i_rd_ref, i_rdcache_ref, i_wr_ref;
int i_wrbuffer_ref, i_wrbuffer_ref_head;
u32 i_shared_gen; /* increment each time we get FILE_SHARED */
- u32 i_rdcache_gen; /* we increment this each time we get
- FILE_CACHE. If it's non-zero, we
- _may_ have cached pages. */
+ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
struct list_head i_unsafe_writes; /* uncommitted sync writes */
@@ -667,7 +666,7 @@ extern void ceph_queue_invalidate(struct inode *inode);
extern void ceph_queue_writeback(struct inode *inode);
extern int ceph_do_getattr(struct inode *inode, int mask);
-extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_permission(struct inode *inode, int mask, unsigned int flags);
extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
@@ -770,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
extern void ceph_dentry_lru_touch(struct dentry *dn);
extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
+extern unsigned ceph_dentry_hash(struct dentry *dn);
/*
* our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 6e12a6ba5f79..8c9eba6ef9df 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -219,6 +219,7 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
struct rb_node **p;
struct rb_node *parent = NULL;
struct ceph_inode_xattr *xattr = NULL;
+ int name_len = strlen(name);
int c;
p = &ci->i_xattrs.index.rb_node;
@@ -226,6 +227,8 @@ static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
parent = *p;
xattr = rb_entry(parent, struct ceph_inode_xattr, node);
c = strncmp(name, xattr->name, xattr->name_len);
+ if (c == 0 && name_len > xattr->name_len)
+ c = 1;
if (c < 0)
p = &(*p)->rb_left;
else if (c > 0)