From 4918b6d140c4822201ebbe2f070875332aff337b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:26:07 -0700 Subject: ceph: add F_SYNC file flag to force sync (non-O_DIRECT) io This allows us to force IO through the sync path which you normally only get when multiple clients are reading/writing to the same file or by mounting with -o sync. Among other things, this lets test programs verify correctness with a single mount. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4698a5c553dc..44e4fe9fba02 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -643,7 +643,8 @@ again: if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) /* hmm, this isn't really async... */ ret = ceph_sync_read(filp, base, len, ppos, &checkeof); else @@ -720,7 +721,8 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { + (inode->i_sb->s_flags & MS_SYNCHRONOUS) || + (fi->flags & CEPH_F_SYNC)) { ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { -- cgit v1.2.3 From d8de9ab63a57326d21154c13c365f949f53ce8e1 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:27:34 -0700 Subject: ceph: avoid carrying Fw cap during write into page cache The generic_file_aio_write call may block on balance_dirty_pages while we flush data to the OSDs. If we hold a reference to the FILE_WR cap during that interval revocation by the MDS (e.g., to do a stat(2)) may be very slow. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 44e4fe9fba02..6c90cf090601 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -713,7 +713,7 @@ retry_snap: want = CEPH_CAP_FILE_BUFFER; ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); if (ret < 0) - goto out; + goto out_put; dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, @@ -726,8 +726,18 @@ retry_snap: ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + /* + * buffered write; drop Fw early to avoid slow + * revocation if we get stuck on balance_dirty_pages + */ + int dirty; + + spin_lock(&inode->i_lock); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + spin_unlock(&inode->i_lock); + ceph_put_cap_refs(ci, got); + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); if ((ret >= 0 || ret == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { @@ -735,7 +745,12 @@ retry_snap: if (err < 0) ret = err; } + + if (dirty) + __mark_inode_dirty(inode, dirty); + goto out; } + if (ret >= 0) { int dirty; spin_lock(&inode->i_lock); @@ -745,12 +760,13 @@ retry_snap: __mark_inode_dirty(inode, dirty); } -out: +out_put: dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, ceph_cap_string(got)); ceph_put_cap_refs(ci, got); +out: if (ret == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); -- cgit v1.2.3 From acda76578813ef893004ecad0e5ad2bb6039e5f7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:27:48 -0700 Subject: ceph: fix bad parent_inode calc in ceph_lookup_open We were always getting NULL here because the intent file f_dentry is always NULL at this point, which means we were always passing NULL to ceph_mdsc_do_request. In reality, this was fine, since this isn't currently ever a write operation that needs to get strung on the dir's unsafe list. Use the dir explicitly, and only pass it if this open has side-effects that a dir fsync should flush. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6c90cf090601..9b667e9abf4c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -223,7 +223,6 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct file *file = nd->intent.open.file; - struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); struct ceph_mds_request *req; int err; int flags = nd->intent.open.flags - 1; /* silly vfs! */ @@ -242,7 +241,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; } req->r_locked_dir = dir; /* caller holds dir->i_mutex */ - err = ceph_mdsc_do_request(mdsc, parent_inode, req); + err = ceph_mdsc_do_request(mdsc, + (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, + req); dentry = ceph_finish_lookup(req, dentry, err); if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); -- cgit v1.2.3 From 9bae113a085b790de384bf86f09e15b42a65a985 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:27:59 -0700 Subject: ceph: only link open operations to directory unsafe list if O_CREAT|O_TRUNC We only need to put these on the directory unsafe list if they have side effects that fsync(2) should flush out. Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9b667e9abf4c..e0115eb4e9ba 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -194,7 +194,8 @@ int ceph_open(struct inode *inode, struct file *file) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, parent_inode, req); + err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? + parent_inode : NULL, req); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); -- cgit v1.2.3 From 468640e32c7f6bfdaaa011095cc388786755d159 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:28:11 -0700 Subject: ceph: fix ceph_lookup_open intent usage We weren't properly calling lookup_instantiate_filp when setting up the lookup intent, which could lead to file leakage on errors. So: - use separate helper for the hidden snapdir translation, immediately following the mds request - use ceph_finish_lookup for the final dentry/return value dance in the exit path - lookup_instantiate_filp on success Reported-by: Al Viro Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/dir.c | 31 ++++++++++++++++++++----------- fs/ceph/file.c | 23 +++++++++++++++-------- fs/ceph/super.h | 2 ++ 3 files changed, 37 insertions(+), 19 deletions(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 53b441fe78f1..f39a409db0ea 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -482,18 +482,10 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin) } /* - * Process result of a lookup/open request. - * - * Mainly, make sure we return the final req->r_dentry (if it already - * existed) in place of the original VFS-provided dentry when they - * differ. - * - * Gracefully handle the case where the MDS replies with -ENOENT and - * no trace (which it may do, at its discretion, e.g., if it doesn't - * care to issue a lease on the negative dentry). + * Handle lookups for the hidden .snap directory. */ -struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, - struct dentry *dentry, int err) +int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *parent = dentry->d_parent->d_inode; @@ -510,7 +502,23 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, d_add(dentry, inode); err = 0; } + return err; +} +/* + * Figure out final result of a lookup/open request. + * + * Mainly, make sure we return the final req->r_dentry (if it already + * existed) in place of the original VFS-provided dentry when they + * differ. + * + * Gracefully handle the case where the MDS replies with -ENOENT and + * no trace (which it may do, at its discretion, e.g., if it doesn't + * care to issue a lease on the negative dentry). + */ +struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, + struct dentry *dentry, int err) +{ if (err == -ENOENT) { /* no trace? */ err = 0; @@ -605,6 +613,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_locked_dir = dir; err = ceph_mdsc_do_request(mdsc, NULL, req); + err = ceph_handle_snapdir(req, dentry, err); dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ dout("lookup result=%p\n", dentry); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e0115eb4e9ba..f34d47d66e7c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -223,8 +223,9 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct file *file = nd->intent.open.file; + struct file *file; struct ceph_mds_request *req; + struct dentry *ret; int err; int flags = nd->intent.open.flags - 1; /* silly vfs! */ @@ -245,15 +246,21 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); - dentry = ceph_finish_lookup(req, dentry, err); - if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) + err = ceph_handle_snapdir(req, dentry, err); + if (err) + goto out; + if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); - if (!err) - err = ceph_init_file(req->r_dentry->d_inode, file, - req->r_fmode); + if (err) + goto out; + file = lookup_instantiate_filp(nd, req->r_dentry, ceph_open); + if (IS_ERR(file)) + err = PTR_ERR(file); +out: + ret = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); - dout("ceph_lookup_open result=%p\n", dentry); - return dentry; + dout("ceph_lookup_open result=%p\n", ret); + return ret; } int ceph_release(struct inode *inode, struct file *file) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a8a273320241..c24891a5bec2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -791,6 +791,8 @@ extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, ceph_snapdir_dentry_ops; extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); +extern int ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); -- cgit v1.2.3 From 5f21c96dd5c615341963036ae8f5e4f5227a818d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 26 Jul 2011 11:30:29 -0700 Subject: ceph: protect access to d_parent d_parent is protected by d_lock: use it when looking up a dentry's parent directory inode. Also take a reference and drop it in the caller to avoid a use-after-free. Reported-by: Al Viro Reviewed-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/dir.c | 15 +++++++++++++++ fs/ceph/file.c | 8 +++++--- fs/ceph/inode.c | 4 +++- fs/ceph/ioctl.c | 4 +++- fs/ceph/super.h | 9 +-------- fs/ceph/xattr.c | 8 ++++++-- 6 files changed, 33 insertions(+), 15 deletions(-) (limited to 'fs/ceph/file.c') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 883c9546111d..ed296ec121d1 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -71,6 +71,21 @@ out_unlock: return 0; } +struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) +{ + struct inode *inode = NULL; + + if (!dentry) + return NULL; + + spin_lock(&dentry->d_lock); + if (dentry->d_parent) { + inode = dentry->d_parent->d_inode; + ihold(inode); + } + spin_unlock(&dentry->d_lock); + return inode; +} /* diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f34d47d66e7c..45fbd69daabe 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -122,7 +122,7 @@ int ceph_open(struct inode *inode, struct file *file) struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct ceph_file_info *cf = file->private_data; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode = NULL; int err; int flags, fmode, wanted; @@ -194,8 +194,10 @@ int ceph_open(struct inode *inode, struct file *file) req->r_inode = inode; ihold(inode); req->r_num_caps = 1; - err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? - parent_inode : NULL, req); + if (flags & (O_CREAT|O_TRUNC)) + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); + err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); if (!err) err = ceph_init_file(inode, file, req->r_fmode); ceph_mdsc_put_request(req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2717dc4e443c..a7db56f1523b 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1562,7 +1562,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; @@ -1745,7 +1745,9 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index a757a5680578..3b256b50f7d8 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -38,7 +38,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) static long ceph_ioctl_set_layout(struct file *file, void __user *arg) { struct inode *inode = file->f_dentry->d_inode; - struct inode *parent_inode = file->f_dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; struct ceph_ioctl_layout l; @@ -87,7 +87,9 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) req->r_args.setlayout.layout.fl_pg_preferred = cpu_to_le32(l.preferred_osd); + parent_inode = ceph_get_dentry_parent_inode(file->f_dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c24891a5bec2..c1eb9a014b74 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -801,6 +801,7 @@ extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern unsigned ceph_dentry_hash(struct dentry *dn); +extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); /* * our d_ops vary depending on whether the inode is live, @@ -823,14 +824,6 @@ extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, int p_locks, int f_locks); extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); -static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) -{ - if (dentry && dentry->d_parent) - return dentry->d_parent->d_inode; - - return NULL; -} - /* debugfs.c */ extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f42d730f1b66..96c6739a0280 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -629,7 +629,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *inode = dentry->d_inode; struct ceph_inode_info *ci = ceph_inode(inode); - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; int err; @@ -677,7 +677,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_data_len = size; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); @@ -788,7 +790,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = dentry->d_inode; - struct inode *parent_inode = dentry->d_parent->d_inode; + struct inode *parent_inode; struct ceph_mds_request *req; int err; @@ -802,7 +804,9 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); + parent_inode = ceph_get_dentry_parent_inode(dentry); err = ceph_mdsc_do_request(mdsc, parent_inode, req); + iput(parent_inode); ceph_mdsc_put_request(req); return err; } -- cgit v1.2.3