diff options
author | Steve French <sfrench@us.ibm.com> | 2006-01-12 23:47:08 +0100 |
---|---|---|
committer | Steve French <sfrench@us.ibm.com> | 2006-01-12 23:47:08 +0100 |
commit | 94bc2be31a01a3055ec94176e595dfe208e92d3b (patch) | |
tree | ebfbe81c6718a6390bfa1b99c6d228237d818576 /fs | |
parent | [CIFS] Allow local filesize for file that is open for write to be updated (diff) | |
parent | Merge git://git.kernel.org/pub/scm/linux/kernel/git/brodo/pcmcia-fixes-2.6 (diff) | |
download | linux-94bc2be31a01a3055ec94176e595dfe208e92d3b.tar.xz linux-94bc2be31a01a3055ec94176e595dfe208e92d3b.zip |
Merge with /pub/scm/linux/kernel/git/torvalds/linux-2.6.git
Signed-off-by: Steve French <sfrench@us.ibm.com>
Diffstat (limited to 'fs')
421 files changed, 54457 insertions, 7255 deletions
diff --git a/fs/9p/9p.c b/fs/9p/9p.c index e847f504a47c..1a6d08761f39 100644 --- a/fs/9p/9p.c +++ b/fs/9p/9p.c @@ -1,8 +1,9 @@ /* * linux/fs/9p/9p.c * - * This file contains functions 9P2000 functions + * This file contains functions to perform synchronous 9P calls * + * Copyright (C) 2004 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> * @@ -33,6 +34,7 @@ #include "debug.h" #include "v9fs.h" #include "9p.h" +#include "conv.h" #include "mux.h" /** @@ -46,16 +48,21 @@ int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, - char *version, struct v9fs_fcall **fcall) + char *version, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "msize: %d version: %s\n", msize, version); - msg.id = TVERSION; - msg.params.tversion.msize = msize; - msg.params.tversion.version = version; + tc = v9fs_create_tversion(msize, version); - return v9fs_mux_rpc(v9ses, &msg, fcall); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -71,19 +78,45 @@ v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, - u32 fid, u32 afid, struct v9fs_fcall **fcall) + u32 fid, u32 afid, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall* tc; dprintk(DEBUG_9P, "uname '%s' aname '%s' fid %d afid %d\n", uname, aname, fid, afid); - msg.id = TATTACH; - msg.params.tattach.fid = fid; - msg.params.tattach.afid = afid; - msg.params.tattach.uname = uname; - msg.params.tattach.aname = aname; - return v9fs_mux_rpc(v9ses, &msg, fcall); + tc = v9fs_create_tattach(fid, afid, uname, aname); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; +} + +static void v9fs_t_clunk_cb(void *a, struct 
v9fs_fcall *tc, + struct v9fs_fcall *rc, int err) +{ + int fid; + struct v9fs_session_info *v9ses; + + if (err) + return; + + fid = tc->params.tclunk.fid; + kfree(tc); + + if (!rc) + return; + + dprintk(DEBUG_9P, "tcall id %d rcall id %d\n", tc->id, rc->id); + v9ses = a; + if (rc->id == RCLUNK) + v9fs_put_idpool(fid, &v9ses->fidpool); + + kfree(rc); } /** @@ -95,16 +128,25 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, */ int -v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **fcall) +v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc, *rc; dprintk(DEBUG_9P, "fid %d\n", fid); - msg.id = TCLUNK; - msg.params.tclunk.fid = fid; - return v9fs_mux_rpc(v9ses, &msg, fcall); + rc = NULL; + tc = v9fs_create_tclunk(fid); + if (!IS_ERR(tc)) + ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); + else + ret = PTR_ERR(tc); + + if (ret) + dprintk(DEBUG_ERROR, "failed fid %d err %d\n", fid, ret); + + v9fs_t_clunk_cb(v9ses, tc, rc, ret); + return ret; } /** @@ -114,14 +156,21 @@ v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid, * */ -int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag) +int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; + + dprintk(DEBUG_9P, "oldtag %d\n", oldtag); + + tc = v9fs_create_tflush(oldtag); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, NULL); + kfree(tc); + } else + ret = PTR_ERR(tc); - dprintk(DEBUG_9P, "oldtag %d\n", tag); - msg.id = TFLUSH; - msg.params.tflush.oldtag = tag; - return v9fs_mux_rpc(v9ses, &msg, NULL); + return ret; } /** @@ -133,17 +182,22 @@ int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 tag) */ int -v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall) +v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; 
dprintk(DEBUG_9P, "fid %d\n", fid); - if (fcall) - *fcall = NULL; - msg.id = TSTAT; - msg.params.tstat.fid = fid; - return v9fs_mux_rpc(v9ses, &msg, fcall); + ret = -ENOMEM; + tc = v9fs_create_tstat(fid); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -157,16 +211,21 @@ v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **fcall) int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_stat *stat, struct v9fs_fcall **fcall) + struct v9fs_wstat *wstat, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; + + dprintk(DEBUG_9P, "fid %d\n", fid); - dprintk(DEBUG_9P, "fid %d length %d\n", fid, (int)stat->length); - msg.id = TWSTAT; - msg.params.twstat.fid = fid; - msg.params.twstat.stat = stat; + tc = v9fs_create_twstat(fid, wstat, v9ses->extended); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); - return v9fs_mux_rpc(v9ses, &msg, fcall); + return ret; } /** @@ -183,23 +242,27 @@ v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, - char *name, struct v9fs_fcall **fcall) + char *name, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; + int nwname; dprintk(DEBUG_9P, "fid %d newfid %d wname '%s'\n", fid, newfid, name); - msg.id = TWALK; - msg.params.twalk.fid = fid; - msg.params.twalk.newfid = newfid; - - if (name) { - msg.params.twalk.nwname = 1; - msg.params.twalk.wnames = &name; - } else { - msg.params.twalk.nwname = 0; - } - - return v9fs_mux_rpc(v9ses, &msg, fcall); + + if (name) + nwname = 1; + else + nwname = 0; + + tc = v9fs_create_twalk(fid, newfid, nwname, &name); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -214,19 +277,21 @@ v9fs_t_walk(struct 
v9fs_session_info *v9ses, u32 fid, u32 newfid, int v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, - struct v9fs_fcall **fcall) + struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; - long errorno = -1; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d mode %d\n", fid, mode); - msg.id = TOPEN; - msg.params.topen.fid = fid; - msg.params.topen.mode = mode; - errorno = v9fs_mux_rpc(v9ses, &msg, fcall); + tc = v9fs_create_topen(fid, mode); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); - return errorno; + return ret; } /** @@ -239,14 +304,21 @@ v9fs_t_open(struct v9fs_session_info *v9ses, u32 fid, u8 mode, int v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **fcall) + struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d\n", fid); - msg.id = TREMOVE; - msg.params.tremove.fid = fid; - return v9fs_mux_rpc(v9ses, &msg, fcall); + + tc = v9fs_create_tremove(fid); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -262,20 +334,22 @@ v9fs_t_remove(struct v9fs_session_info *v9ses, u32 fid, int v9fs_t_create(struct v9fs_session_info *v9ses, u32 fid, char *name, - u32 perm, u8 mode, struct v9fs_fcall **fcall) + u32 perm, u8 mode, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; + int ret; + struct v9fs_fcall *tc; dprintk(DEBUG_9P, "fid %d name '%s' perm %x mode %d\n", fid, name, perm, mode); - msg.id = TCREATE; - msg.params.tcreate.fid = fid; - msg.params.tcreate.name = name; - msg.params.tcreate.perm = perm; - msg.params.tcreate.mode = mode; + tc = v9fs_create_tcreate(fid, name, perm, mode); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, rcp); + kfree(tc); + } else + ret = PTR_ERR(tc); - return v9fs_mux_rpc(v9ses, &msg, fcall); + return ret; } /** @@ -290,31 +364,29 @@ v9fs_t_create(struct v9fs_session_info 
*v9ses, u32 fid, char *name, int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, - u32 count, struct v9fs_fcall **fcall) + u32 count, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; - struct v9fs_fcall *rc = NULL; - long errorno = -1; - - dprintk(DEBUG_9P, "fid %d offset 0x%lx count 0x%x\n", fid, - (long unsigned int)offset, count); - msg.id = TREAD; - msg.params.tread.fid = fid; - msg.params.tread.offset = offset; - msg.params.tread.count = count; - errorno = v9fs_mux_rpc(v9ses, &msg, &rc); - - if (!errorno) { - errorno = rc->params.rread.count; - dump_data(rc->params.rread.data, rc->params.rread.count); - } - - if (fcall) - *fcall = rc; - else - kfree(rc); - - return errorno; + int ret; + struct v9fs_fcall *tc, *rc; + + dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid, + (long long unsigned) offset, count); + + tc = v9fs_create_tread(fid, offset, count); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); + if (!ret) + ret = rc->params.rread.count; + if (rcp) + *rcp = rc; + else + kfree(rc); + + kfree(tc); + } else + ret = PTR_ERR(tc); + + return ret; } /** @@ -328,32 +400,30 @@ v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, */ int -v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, - u64 offset, u32 count, void *data, struct v9fs_fcall **fcall) +v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count, + const char __user *data, struct v9fs_fcall **rcp) { - struct v9fs_fcall msg; - struct v9fs_fcall *rc = NULL; - long errorno = -1; + int ret; + struct v9fs_fcall *tc, *rc; - dprintk(DEBUG_9P, "fid %d offset 0x%llx count 0x%x\n", fid, - (unsigned long long)offset, count); - dump_data(data, count); + dprintk(DEBUG_9P, "fid %d offset 0x%llux count 0x%x\n", fid, + (long long unsigned) offset, count); - msg.id = TWRITE; - msg.params.twrite.fid = fid; - msg.params.twrite.offset = offset; - msg.params.twrite.count = count; - msg.params.twrite.data = data; + tc = 
v9fs_create_twrite(fid, offset, count, data); + if (!IS_ERR(tc)) { + ret = v9fs_mux_rpc(v9ses->mux, tc, &rc); - errorno = v9fs_mux_rpc(v9ses, &msg, &rc); + if (!ret) + ret = rc->params.rwrite.count; + if (rcp) + *rcp = rc; + else + kfree(rc); - if (!errorno) - errorno = rc->params.rwrite.count; + kfree(tc); + } else + ret = PTR_ERR(tc); - if (fcall) - *fcall = rc; - else - kfree(rc); - - return errorno; + return ret; } + diff --git a/fs/9p/9p.h b/fs/9p/9p.h index f55424216be2..0cd374d94717 100644 --- a/fs/9p/9p.h +++ b/fs/9p/9p.h @@ -3,6 +3,7 @@ * * 9P protocol definitions. * + * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> * @@ -100,9 +101,18 @@ enum { V9FS_QTFILE = 0x00, }; +#define V9FS_NOTAG (u16)(~0) +#define V9FS_NOFID (u32)(~0) +#define V9FS_MAXWELEM 16 + /* ample room for Twrite/Rread header (iounit) */ #define V9FS_IOHDRSZ 24 +struct v9fs_str { + u16 len; + char *str; +}; + /* qids are the unique ID for a file (like an inode */ struct v9fs_qid { u8 type; @@ -120,6 +130,29 @@ struct v9fs_stat { u32 atime; u32 mtime; u64 length; + struct v9fs_str name; + struct v9fs_str uid; + struct v9fs_str gid; + struct v9fs_str muid; + struct v9fs_str extension; /* 9p2000.u extensions */ + u32 n_uid; /* 9p2000.u extensions */ + u32 n_gid; /* 9p2000.u extensions */ + u32 n_muid; /* 9p2000.u extensions */ +}; + +/* file metadata (stat) structure used to create Twstat message + The is similar to v9fs_stat, but the strings don't point to + the same memory block and should be freed separately +*/ +struct v9fs_wstat { + u16 size; + u16 type; + u32 dev; + struct v9fs_qid qid; + u32 mode; + u32 atime; + u32 mtime; + u64 length; char *name; char *uid; char *gid; @@ -128,25 +161,24 @@ struct v9fs_stat { u32 n_uid; /* 9p2000.u extensions */ u32 n_gid; /* 9p2000.u extensions */ u32 n_muid; /* 9p2000.u extensions */ - char data[0]; }; /* Structures for 
Protocol Operations */ struct Tversion { u32 msize; - char *version; + struct v9fs_str version; }; struct Rversion { u32 msize; - char *version; + struct v9fs_str version; }; struct Tauth { u32 afid; - char *uname; - char *aname; + struct v9fs_str uname; + struct v9fs_str aname; }; struct Rauth { @@ -154,12 +186,12 @@ struct Rauth { }; struct Rerror { - char *error; + struct v9fs_str error; u32 errno; /* 9p2000.u extension */ }; struct Tflush { - u32 oldtag; + u16 oldtag; }; struct Rflush { @@ -168,8 +200,8 @@ struct Rflush { struct Tattach { u32 fid; u32 afid; - char *uname; - char *aname; + struct v9fs_str uname; + struct v9fs_str aname; }; struct Rattach { @@ -179,13 +211,13 @@ struct Rattach { struct Twalk { u32 fid; u32 newfid; - u32 nwname; - char **wnames; + u16 nwname; + struct v9fs_str wnames[16]; }; struct Rwalk { - u32 nwqid; - struct v9fs_qid *wqids; + u16 nwqid; + struct v9fs_qid wqids[16]; }; struct Topen { @@ -200,7 +232,7 @@ struct Ropen { struct Tcreate { u32 fid; - char *name; + struct v9fs_str name; u32 perm; u8 mode; }; @@ -251,12 +283,12 @@ struct Tstat { }; struct Rstat { - struct v9fs_stat *stat; + struct v9fs_stat stat; }; struct Twstat { u32 fid; - struct v9fs_stat *stat; + struct v9fs_stat stat; }; struct Rwstat { @@ -271,6 +303,7 @@ struct v9fs_fcall { u32 size; u8 id; u16 tag; + void *sdata; union { struct Tversion tversion; @@ -303,7 +336,9 @@ struct v9fs_fcall { } params; }; -#define FCALL_ERROR(fcall) (fcall ? 
fcall->params.rerror.error : "") +#define PRINT_FCALL_ERROR(s, fcall) dprintk(DEBUG_ERROR, "%s: %.*s\n", s, \ + fcall?fcall->params.rerror.error.len:0, \ + fcall?fcall->params.rerror.error.str:""); int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, char *version, struct v9fs_fcall **rcall); @@ -311,8 +346,7 @@ int v9fs_t_version(struct v9fs_session_info *v9ses, u32 msize, int v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, u32 fid, u32 afid, struct v9fs_fcall **rcall); -int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_fcall **rcall); +int v9fs_t_clunk(struct v9fs_session_info *v9ses, u32 fid); int v9fs_t_flush(struct v9fs_session_info *v9ses, u16 oldtag); @@ -320,7 +354,7 @@ int v9fs_t_stat(struct v9fs_session_info *v9ses, u32 fid, struct v9fs_fcall **rcall); int v9fs_t_wstat(struct v9fs_session_info *v9ses, u32 fid, - struct v9fs_stat *stat, struct v9fs_fcall **rcall); + struct v9fs_wstat *wstat, struct v9fs_fcall **rcall); int v9fs_t_walk(struct v9fs_session_info *v9ses, u32 fid, u32 newfid, char *name, struct v9fs_fcall **rcall); @@ -338,4 +372,5 @@ int v9fs_t_read(struct v9fs_session_info *v9ses, u32 fid, u64 offset, u32 count, struct v9fs_fcall **rcall); int v9fs_t_write(struct v9fs_session_info *v9ses, u32 fid, u64 offset, - u32 count, void *data, struct v9fs_fcall **rcall); + u32 count, const char __user * data, + struct v9fs_fcall **rcall); diff --git a/fs/9p/Makefile b/fs/9p/Makefile index e4e4ffe5a7dc..3d023089707e 100644 --- a/fs/9p/Makefile +++ b/fs/9p/Makefile @@ -1,17 +1,17 @@ obj-$(CONFIG_9P_FS) := 9p2000.o 9p2000-objs := \ + trans_fd.o \ + trans_sock.o \ + mux.o \ + 9p.o \ + conv.o \ vfs_super.o \ vfs_inode.o \ vfs_file.o \ vfs_dir.o \ vfs_dentry.o \ error.o \ - mux.o \ - trans_fd.o \ - trans_sock.o \ - 9p.o \ - conv.o \ v9fs.o \ fid.o diff --git a/fs/9p/conv.c b/fs/9p/conv.c index 18121af99d3e..55ccfa10ee9e 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -30,7 +30,7 @@ #include 
<linux/errno.h> #include <linux/fs.h> #include <linux/idr.h> - +#include <asm/uaccess.h> #include "debug.h" #include "v9fs.h" #include "9p.h" @@ -58,12 +58,15 @@ static inline int buf_check_overflow(struct cbuf *buf) static inline int buf_check_size(struct cbuf *buf, int len) { - if (buf->p+len > buf->ep) { + if (buf->p + len > buf->ep) { if (buf->p < buf->ep) { - eprintk(KERN_ERR, "buffer overflow\n"); + eprintk(KERN_ERR, "buffer overflow: want %d has %d\n", + len, (int)(buf->ep - buf->p)); + dump_stack(); buf->p = buf->ep + 1; - return 0; } + + return 0; } return 1; @@ -127,14 +130,6 @@ static inline void buf_put_string(struct cbuf *buf, const char *s) buf_put_stringn(buf, s, strlen(s)); } -static inline void buf_put_data(struct cbuf *buf, void *data, u32 datalen) -{ - if (buf_check_size(buf, datalen)) { - memcpy(buf->p, data, datalen); - buf->p += datalen; - } -} - static inline u8 buf_get_int8(struct cbuf *buf) { u8 ret = 0; @@ -183,86 +178,37 @@ static inline u64 buf_get_int64(struct cbuf *buf) return ret; } -static inline int -buf_get_string(struct cbuf *buf, char *data, unsigned int datalen) -{ - u16 len = 0; - - len = buf_get_int16(buf); - if (!buf_check_overflow(buf) && buf_check_size(buf, len) && len+1>datalen) { - memcpy(data, buf->p, len); - data[len] = 0; - buf->p += len; - len++; - } - - return len; -} - -static inline char *buf_get_stringb(struct cbuf *buf, struct cbuf *sbuf) -{ - char *ret; - u16 len; - - ret = NULL; - len = buf_get_int16(buf); - - if (!buf_check_overflow(buf) && buf_check_size(buf, len) && - buf_check_size(sbuf, len+1)) { - - memcpy(sbuf->p, buf->p, len); - sbuf->p[len] = 0; - ret = sbuf->p; - buf->p += len; - sbuf->p += len + 1; - } - - return ret; -} - -static inline int buf_get_data(struct cbuf *buf, void *data, int datalen) +static inline void buf_get_str(struct cbuf *buf, struct v9fs_str *vstr) { - int ret = 0; - - if (buf_check_size(buf, datalen)) { - memcpy(data, buf->p, datalen); - buf->p += datalen; - ret = datalen; + 
vstr->len = buf_get_int16(buf); + if (!buf_check_overflow(buf) && buf_check_size(buf, vstr->len)) { + vstr->str = buf->p; + buf->p += vstr->len; + } else { + vstr->len = 0; + vstr->str = NULL; } - - return ret; } -static inline void *buf_get_datab(struct cbuf *buf, struct cbuf *dbuf, - int datalen) +static inline void buf_get_qid(struct cbuf *bufp, struct v9fs_qid *qid) { - char *ret = NULL; - int n = 0; - - if (buf_check_size(dbuf, datalen)) { - n = buf_get_data(buf, dbuf->p, datalen); - if (n > 0) { - ret = dbuf->p; - dbuf->p += n; - } - } - - return ret; + qid->type = buf_get_int8(bufp); + qid->version = buf_get_int32(bufp); + qid->path = buf_get_int64(bufp); } /** - * v9fs_size_stat - calculate the size of a variable length stat struct - * @v9ses: session information + * v9fs_size_wstat - calculate the size of a variable length stat struct * @stat: metadata (stat) structure + * @extended: non-zero if 9P2000.u * */ -static int v9fs_size_stat(struct v9fs_session_info *v9ses, - struct v9fs_stat *stat) +static int v9fs_size_wstat(struct v9fs_wstat *wstat, int extended) { int size = 0; - if (stat == NULL) { + if (wstat == NULL) { eprintk(KERN_ERR, "v9fs_size_stat: got a NULL stat pointer\n"); return 0; } @@ -279,82 +225,38 @@ static int v9fs_size_stat(struct v9fs_session_info *v9ses, 8 + /* length[8] */ 8; /* minimum sum of string lengths */ - if (stat->name) - size += strlen(stat->name); - if (stat->uid) - size += strlen(stat->uid); - if (stat->gid) - size += strlen(stat->gid); - if (stat->muid) - size += strlen(stat->muid); + if (wstat->name) + size += strlen(wstat->name); + if (wstat->uid) + size += strlen(wstat->uid); + if (wstat->gid) + size += strlen(wstat->gid); + if (wstat->muid) + size += strlen(wstat->muid); - if (v9ses->extended) { + if (extended) { size += 4 + /* n_uid[4] */ 4 + /* n_gid[4] */ 4 + /* n_muid[4] */ 2; /* string length of extension[4] */ - if (stat->extension) - size += strlen(stat->extension); + if (wstat->extension) + size += 
strlen(wstat->extension); } return size; } /** - * serialize_stat - safely format a stat structure for transmission - * @v9ses: session info - * @stat: metadata (stat) structure - * @bufp: buffer to serialize structure into - * - */ - -static int -serialize_stat(struct v9fs_session_info *v9ses, struct v9fs_stat *stat, - struct cbuf *bufp) -{ - buf_put_int16(bufp, stat->size); - buf_put_int16(bufp, stat->type); - buf_put_int32(bufp, stat->dev); - buf_put_int8(bufp, stat->qid.type); - buf_put_int32(bufp, stat->qid.version); - buf_put_int64(bufp, stat->qid.path); - buf_put_int32(bufp, stat->mode); - buf_put_int32(bufp, stat->atime); - buf_put_int32(bufp, stat->mtime); - buf_put_int64(bufp, stat->length); - - buf_put_string(bufp, stat->name); - buf_put_string(bufp, stat->uid); - buf_put_string(bufp, stat->gid); - buf_put_string(bufp, stat->muid); - - if (v9ses->extended) { - buf_put_string(bufp, stat->extension); - buf_put_int32(bufp, stat->n_uid); - buf_put_int32(bufp, stat->n_gid); - buf_put_int32(bufp, stat->n_muid); - } - - if (buf_check_overflow(bufp)) - return 0; - - return stat->size; -} - -/** - * deserialize_stat - safely decode a recieved metadata (stat) structure - * @v9ses: session info + * buf_get_stat - safely decode a recieved metadata (stat) structure * @bufp: buffer to deserialize * @stat: metadata (stat) structure - * @dbufp: buffer to deserialize variable strings into + * @extended: non-zero if 9P2000.u * */ -static inline int -deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp, - struct v9fs_stat *stat, struct cbuf *dbufp) +static inline void +buf_get_stat(struct cbuf *bufp, struct v9fs_stat *stat, int extended) { - stat->size = buf_get_int16(bufp); stat->type = buf_get_int16(bufp); stat->dev = buf_get_int32(bufp); @@ -365,282 +267,82 @@ deserialize_stat(struct v9fs_session_info *v9ses, struct cbuf *bufp, stat->atime = buf_get_int32(bufp); stat->mtime = buf_get_int32(bufp); stat->length = buf_get_int64(bufp); - stat->name = 
buf_get_stringb(bufp, dbufp); - stat->uid = buf_get_stringb(bufp, dbufp); - stat->gid = buf_get_stringb(bufp, dbufp); - stat->muid = buf_get_stringb(bufp, dbufp); + buf_get_str(bufp, &stat->name); + buf_get_str(bufp, &stat->uid); + buf_get_str(bufp, &stat->gid); + buf_get_str(bufp, &stat->muid); - if (v9ses->extended) { - stat->extension = buf_get_stringb(bufp, dbufp); + if (extended) { + buf_get_str(bufp, &stat->extension); stat->n_uid = buf_get_int32(bufp); stat->n_gid = buf_get_int32(bufp); stat->n_muid = buf_get_int32(bufp); } - - if (buf_check_overflow(bufp) || buf_check_overflow(dbufp)) - return 0; - - return stat->size + 2; -} - -/** - * deserialize_statb - wrapper for decoding a received metadata structure - * @v9ses: session info - * @bufp: buffer to deserialize - * @dbufp: buffer to deserialize variable strings into - * - */ - -static inline struct v9fs_stat *deserialize_statb(struct v9fs_session_info - *v9ses, struct cbuf *bufp, - struct cbuf *dbufp) -{ - struct v9fs_stat *ret = buf_alloc(dbufp, sizeof(struct v9fs_stat)); - - if (ret) { - int n = deserialize_stat(v9ses, bufp, ret, dbufp); - if (n <= 0) - return NULL; - } - - return ret; } /** * v9fs_deserialize_stat - decode a received metadata structure - * @v9ses: session info * @buf: buffer to deserialize * @buflen: length of received buffer * @stat: metadata structure to decode into - * @statlen: length of destination metadata structure + * @extended: non-zero if 9P2000.u * + * Note: stat will point to the buf region. 
*/ int -v9fs_deserialize_stat(struct v9fs_session_info *v9ses, void *buf, - u32 buflen, struct v9fs_stat *stat, u32 statlen) +v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, + int extended) { struct cbuf buffer; struct cbuf *bufp = &buffer; - struct cbuf dbuffer; - struct cbuf *dbufp = &dbuffer; + unsigned char *p; buf_init(bufp, buf, buflen); - buf_init(dbufp, (char *)stat + sizeof(struct v9fs_stat), - statlen - sizeof(struct v9fs_stat)); - - return deserialize_stat(v9ses, bufp, stat, dbufp); -} - -static inline int -v9fs_size_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall) -{ - int size = 4 + 1 + 2; /* size[4] msg[1] tag[2] */ - int i = 0; - - switch (fcall->id) { - default: - eprintk(KERN_ERR, "bad msg type %d\n", fcall->id); - return 0; - case TVERSION: /* msize[4] version[s] */ - size += 4 + 2 + strlen(fcall->params.tversion.version); - break; - case TAUTH: /* afid[4] uname[s] aname[s] */ - size += 4 + 2 + strlen(fcall->params.tauth.uname) + - 2 + strlen(fcall->params.tauth.aname); - break; - case TFLUSH: /* oldtag[2] */ - size += 2; - break; - case TATTACH: /* fid[4] afid[4] uname[s] aname[s] */ - size += 4 + 4 + 2 + strlen(fcall->params.tattach.uname) + - 2 + strlen(fcall->params.tattach.aname); - break; - case TWALK: /* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */ - size += 4 + 4 + 2; - /* now compute total for the array of names */ - for (i = 0; i < fcall->params.twalk.nwname; i++) - size += 2 + strlen(fcall->params.twalk.wnames[i]); - break; - case TOPEN: /* fid[4] mode[1] */ - size += 4 + 1; - break; - case TCREATE: /* fid[4] name[s] perm[4] mode[1] */ - size += 4 + 2 + strlen(fcall->params.tcreate.name) + 4 + 1; - break; - case TREAD: /* fid[4] offset[8] count[4] */ - size += 4 + 8 + 4; - break; - case TWRITE: /* fid[4] offset[8] count[4] data[count] */ - size += 4 + 8 + 4 + fcall->params.twrite.count; - break; - case TCLUNK: /* fid[4] */ - size += 4; - break; - case TREMOVE: /* fid[4] */ - size += 4; - break; - 
case TSTAT: /* fid[4] */ - size += 4; - break; - case TWSTAT: /* fid[4] stat[n] */ - fcall->params.twstat.stat->size = - v9fs_size_stat(v9ses, fcall->params.twstat.stat); - size += 4 + 2 + 2 + fcall->params.twstat.stat->size; - } - return size; -} - -/* - * v9fs_serialize_fcall - marshall fcall struct into a packet - * @v9ses: session information - * @fcall: structure to convert - * @data: buffer to serialize fcall into - * @datalen: length of buffer to serialize fcall into - * - */ - -int -v9fs_serialize_fcall(struct v9fs_session_info *v9ses, struct v9fs_fcall *fcall, - void *data, u32 datalen) -{ - int i = 0; - struct v9fs_stat *stat = NULL; - struct cbuf buffer; - struct cbuf *bufp = &buffer; - - buf_init(bufp, data, datalen); - - if (!fcall) { - eprintk(KERN_ERR, "no fcall\n"); - return -EINVAL; - } - - fcall->size = v9fs_size_fcall(v9ses, fcall); - - buf_put_int32(bufp, fcall->size); - buf_put_int8(bufp, fcall->id); - buf_put_int16(bufp, fcall->tag); - - dprintk(DEBUG_CONV, "size %d id %d tag %d\n", fcall->size, fcall->id, - fcall->tag); - - /* now encode it */ - switch (fcall->id) { - default: - eprintk(KERN_ERR, "bad msg type: %d\n", fcall->id); - return -EPROTO; - case TVERSION: - buf_put_int32(bufp, fcall->params.tversion.msize); - buf_put_string(bufp, fcall->params.tversion.version); - break; - case TAUTH: - buf_put_int32(bufp, fcall->params.tauth.afid); - buf_put_string(bufp, fcall->params.tauth.uname); - buf_put_string(bufp, fcall->params.tauth.aname); - break; - case TFLUSH: - buf_put_int16(bufp, fcall->params.tflush.oldtag); - break; - case TATTACH: - buf_put_int32(bufp, fcall->params.tattach.fid); - buf_put_int32(bufp, fcall->params.tattach.afid); - buf_put_string(bufp, fcall->params.tattach.uname); - buf_put_string(bufp, fcall->params.tattach.aname); - break; - case TWALK: - buf_put_int32(bufp, fcall->params.twalk.fid); - buf_put_int32(bufp, fcall->params.twalk.newfid); - buf_put_int16(bufp, fcall->params.twalk.nwname); - for (i = 0; i < 
fcall->params.twalk.nwname; i++) - buf_put_string(bufp, fcall->params.twalk.wnames[i]); - break; - case TOPEN: - buf_put_int32(bufp, fcall->params.topen.fid); - buf_put_int8(bufp, fcall->params.topen.mode); - break; - case TCREATE: - buf_put_int32(bufp, fcall->params.tcreate.fid); - buf_put_string(bufp, fcall->params.tcreate.name); - buf_put_int32(bufp, fcall->params.tcreate.perm); - buf_put_int8(bufp, fcall->params.tcreate.mode); - break; - case TREAD: - buf_put_int32(bufp, fcall->params.tread.fid); - buf_put_int64(bufp, fcall->params.tread.offset); - buf_put_int32(bufp, fcall->params.tread.count); - break; - case TWRITE: - buf_put_int32(bufp, fcall->params.twrite.fid); - buf_put_int64(bufp, fcall->params.twrite.offset); - buf_put_int32(bufp, fcall->params.twrite.count); - buf_put_data(bufp, fcall->params.twrite.data, - fcall->params.twrite.count); - break; - case TCLUNK: - buf_put_int32(bufp, fcall->params.tclunk.fid); - break; - case TREMOVE: - buf_put_int32(bufp, fcall->params.tremove.fid); - break; - case TSTAT: - buf_put_int32(bufp, fcall->params.tstat.fid); - break; - case TWSTAT: - buf_put_int32(bufp, fcall->params.twstat.fid); - stat = fcall->params.twstat.stat; - - buf_put_int16(bufp, stat->size + 2); - serialize_stat(v9ses, stat, bufp); - break; - } + p = bufp->p; + buf_get_stat(bufp, stat, extended); if (buf_check_overflow(bufp)) - return -EIO; - - return fcall->size; + return 0; + else + return bufp->p - p; } /** * deserialize_fcall - unmarshal a response - * @v9ses: session information - * @msgsize: size of rcall message * @buf: recieved buffer * @buflen: length of received buffer * @rcall: fcall structure to populate * @rcalllen: length of fcall structure to populate + * @extended: non-zero if 9P2000.u * */ int -v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, - void *buf, u32 buflen, struct v9fs_fcall *rcall, - int rcalllen) +v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, + int extended) { struct cbuf 
buffer; struct cbuf *bufp = &buffer; - struct cbuf dbuffer; - struct cbuf *dbufp = &dbuffer; int i = 0; buf_init(bufp, buf, buflen); - buf_init(dbufp, (char *)rcall + sizeof(struct v9fs_fcall), - rcalllen - sizeof(struct v9fs_fcall)); - rcall->size = msgsize; + rcall->size = buf_get_int32(bufp); rcall->id = buf_get_int8(bufp); rcall->tag = buf_get_int16(bufp); dprintk(DEBUG_CONV, "size %d id %d tag %d\n", rcall->size, rcall->id, rcall->tag); + switch (rcall->id) { default: eprintk(KERN_ERR, "unknown message type: %d\n", rcall->id); return -EPROTO; case RVERSION: rcall->params.rversion.msize = buf_get_int32(bufp); - rcall->params.rversion.version = buf_get_stringb(bufp, dbufp); + buf_get_str(bufp, &rcall->params.rversion.version); break; case RFLUSH: break; @@ -651,34 +353,27 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, break; case RWALK: rcall->params.rwalk.nwqid = buf_get_int16(bufp); - rcall->params.rwalk.wqids = buf_alloc(dbufp, - rcall->params.rwalk.nwqid * sizeof(struct v9fs_qid)); - if (rcall->params.rwalk.wqids) - for (i = 0; i < rcall->params.rwalk.nwqid; i++) { - rcall->params.rwalk.wqids[i].type = - buf_get_int8(bufp); - rcall->params.rwalk.wqids[i].version = - buf_get_int16(bufp); - rcall->params.rwalk.wqids[i].path = - buf_get_int64(bufp); - } + if (rcall->params.rwalk.nwqid > V9FS_MAXWELEM) { + eprintk(KERN_ERR, "Rwalk with more than %d qids: %d\n", + V9FS_MAXWELEM, rcall->params.rwalk.nwqid); + return -EPROTO; + } + + for (i = 0; i < rcall->params.rwalk.nwqid; i++) + buf_get_qid(bufp, &rcall->params.rwalk.wqids[i]); break; case ROPEN: - rcall->params.ropen.qid.type = buf_get_int8(bufp); - rcall->params.ropen.qid.version = buf_get_int32(bufp); - rcall->params.ropen.qid.path = buf_get_int64(bufp); + buf_get_qid(bufp, &rcall->params.ropen.qid); rcall->params.ropen.iounit = buf_get_int32(bufp); break; case RCREATE: - rcall->params.rcreate.qid.type = buf_get_int8(bufp); - rcall->params.rcreate.qid.version = buf_get_int32(bufp); - 
rcall->params.rcreate.qid.path = buf_get_int64(bufp); + buf_get_qid(bufp, &rcall->params.rcreate.qid); rcall->params.rcreate.iounit = buf_get_int32(bufp); break; case RREAD: rcall->params.rread.count = buf_get_int32(bufp); - rcall->params.rread.data = buf_get_datab(bufp, dbufp, - rcall->params.rread.count); + rcall->params.rread.data = bufp->p; + buf_check_size(bufp, rcall->params.rread.count); break; case RWRITE: rcall->params.rwrite.count = buf_get_int32(bufp); @@ -689,20 +384,443 @@ v9fs_deserialize_fcall(struct v9fs_session_info *v9ses, u32 msgsize, break; case RSTAT: buf_get_int16(bufp); - rcall->params.rstat.stat = - deserialize_statb(v9ses, bufp, dbufp); + buf_get_stat(bufp, &rcall->params.rstat.stat, extended); break; case RWSTAT: break; case RERROR: - rcall->params.rerror.error = buf_get_stringb(bufp, dbufp); - if (v9ses->extended) + buf_get_str(bufp, &rcall->params.rerror.error); + if (extended) rcall->params.rerror.errno = buf_get_int16(bufp); break; } - if (buf_check_overflow(bufp) || buf_check_overflow(dbufp)) + if (buf_check_overflow(bufp)) { + dprintk(DEBUG_ERROR, "buffer overflow\n"); return -EIO; + } + + return bufp->p - bufp->sp; +} + +static inline void v9fs_put_int8(struct cbuf *bufp, u8 val, u8 * p) +{ + *p = val; + buf_put_int8(bufp, val); +} + +static inline void v9fs_put_int16(struct cbuf *bufp, u16 val, u16 * p) +{ + *p = val; + buf_put_int16(bufp, val); +} + +static inline void v9fs_put_int32(struct cbuf *bufp, u32 val, u32 * p) +{ + *p = val; + buf_put_int32(bufp, val); +} + +static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p) +{ + *p = val; + buf_put_int64(bufp, val); +} - return rcall->size; +static inline void +v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str) +{ + if (data) { + str->len = strlen(data); + str->str = bufp->p; + } else { + str->len = 0; + str->str = NULL; + } + + buf_put_stringn(bufp, data, str->len); +} + +static inline int +v9fs_put_user_data(struct cbuf *bufp, const char __user * 
data, int count, + unsigned char **pdata) +{ + *pdata = buf_alloc(bufp, count); + return copy_from_user(*pdata, data, count); +} + +static void +v9fs_put_wstat(struct cbuf *bufp, struct v9fs_wstat *wstat, + struct v9fs_stat *stat, int statsz, int extended) +{ + v9fs_put_int16(bufp, statsz, &stat->size); + v9fs_put_int16(bufp, wstat->type, &stat->type); + v9fs_put_int32(bufp, wstat->dev, &stat->dev); + v9fs_put_int8(bufp, wstat->qid.type, &stat->qid.type); + v9fs_put_int32(bufp, wstat->qid.version, &stat->qid.version); + v9fs_put_int64(bufp, wstat->qid.path, &stat->qid.path); + v9fs_put_int32(bufp, wstat->mode, &stat->mode); + v9fs_put_int32(bufp, wstat->atime, &stat->atime); + v9fs_put_int32(bufp, wstat->mtime, &stat->mtime); + v9fs_put_int64(bufp, wstat->length, &stat->length); + + v9fs_put_str(bufp, wstat->name, &stat->name); + v9fs_put_str(bufp, wstat->uid, &stat->uid); + v9fs_put_str(bufp, wstat->gid, &stat->gid); + v9fs_put_str(bufp, wstat->muid, &stat->muid); + + if (extended) { + v9fs_put_str(bufp, wstat->extension, &stat->extension); + v9fs_put_int32(bufp, wstat->n_uid, &stat->n_uid); + v9fs_put_int32(bufp, wstat->n_gid, &stat->n_gid); + v9fs_put_int32(bufp, wstat->n_muid, &stat->n_muid); + } +} + +static struct v9fs_fcall * +v9fs_create_common(struct cbuf *bufp, u32 size, u8 id) +{ + struct v9fs_fcall *fc; + + size += 4 + 1 + 2; /* size[4] id[1] tag[2] */ + fc = kmalloc(sizeof(struct v9fs_fcall) + size, GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->sdata = (char *)fc + sizeof(*fc); + + buf_init(bufp, (char *)fc->sdata, size); + v9fs_put_int32(bufp, size, &fc->size); + v9fs_put_int8(bufp, id, &fc->id); + v9fs_put_int16(bufp, V9FS_NOTAG, &fc->tag); + + return fc; +} + +void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag) +{ + fc->tag = tag; + *(__le16 *) (fc->sdata + 5) = cpu_to_le16(tag); +} + +struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = 
&buffer; + + size = 4 + 2 + strlen(version); /* msize[4] version[s] */ + fc = v9fs_create_common(bufp, size, TVERSION); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, msize, &fc->params.tversion.msize); + v9fs_put_str(bufp, version, &fc->params.tversion.version); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tauth(u32 afid, char *uname, char *aname) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 2 + strlen(uname) + 2 + strlen(aname); /* afid[4] uname[s] aname[s] */ + fc = v9fs_create_common(bufp, size, TAUTH); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, afid, &fc->params.tauth.afid); + v9fs_put_str(bufp, uname, &fc->params.tauth.uname); + v9fs_put_str(bufp, aname, &fc->params.tauth.aname); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall * +v9fs_create_tattach(u32 fid, u32 afid, char *uname, char *aname) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 4 + 2 + strlen(uname) + 2 + strlen(aname); /* fid[4] afid[4] uname[s] aname[s] */ + fc = v9fs_create_common(bufp, size, TATTACH); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tattach.fid); + v9fs_put_int32(bufp, afid, &fc->params.tattach.afid); + v9fs_put_str(bufp, uname, &fc->params.tattach.uname); + v9fs_put_str(bufp, aname, &fc->params.tattach.aname); + + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tflush(u16 oldtag) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 2; /* oldtag[2] */ + fc = v9fs_create_common(bufp, size, TFLUSH); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int16(bufp, oldtag, &fc->params.tflush.oldtag); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return 
fc; +} + +struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname, + char **wnames) +{ + int i, size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + if (nwname > V9FS_MAXWELEM) { + dprintk(DEBUG_ERROR, "nwname > %d\n", V9FS_MAXWELEM); + return NULL; + } + + size = 4 + 4 + 2; /* fid[4] newfid[4] nwname[2] ... */ + for (i = 0; i < nwname; i++) { + size += 2 + strlen(wnames[i]); /* wname[s] */ + } + + fc = v9fs_create_common(bufp, size, TWALK); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.twalk.fid); + v9fs_put_int32(bufp, newfid, &fc->params.twalk.newfid); + v9fs_put_int16(bufp, nwname, &fc->params.twalk.nwname); + for (i = 0; i < nwname; i++) { + v9fs_put_str(bufp, wnames[i], &fc->params.twalk.wnames[i]); + } + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 1; /* fid[4] mode[1] */ + fc = v9fs_create_common(bufp, size, TOPEN); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.topen.fid); + v9fs_put_int8(bufp, mode, &fc->params.topen.mode); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 2 + strlen(name) + 4 + 1; /* fid[4] name[s] perm[4] mode[1] */ + fc = v9fs_create_common(bufp, size, TCREATE); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tcreate.fid); + v9fs_put_str(bufp, name, &fc->params.tcreate.name); + v9fs_put_int32(bufp, perm, &fc->params.tcreate.perm); + v9fs_put_int8(bufp, mode, &fc->params.tcreate.mode); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = 
ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 8 + 4; /* fid[4] offset[8] count[4] */ + fc = v9fs_create_common(bufp, size, TREAD); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tread.fid); + v9fs_put_int64(bufp, offset, &fc->params.tread.offset); + v9fs_put_int32(bufp, count, &fc->params.tread.count); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count, + const char __user * data) +{ + int size, err; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4 + 8 + 4 + count; /* fid[4] offset[8] count[4] data[count] */ + fc = v9fs_create_common(bufp, size, TWRITE); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.twrite.fid); + v9fs_put_int64(bufp, offset, &fc->params.twrite.offset); + v9fs_put_int32(bufp, count, &fc->params.twrite.count); + err = v9fs_put_user_data(bufp, data, count, &fc->params.twrite.data); + if (err) { + kfree(fc); + fc = ERR_PTR(err); + } + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tclunk(u32 fid) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4; /* fid[4] */ + fc = v9fs_create_common(bufp, size, TCLUNK); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tclunk.fid); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tremove(u32 fid) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4; /* fid[4] */ + fc = v9fs_create_common(bufp, size, TREMOVE); + if 
(IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tremove.fid); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_tstat(u32 fid) +{ + int size; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + size = 4; /* fid[4] */ + fc = v9fs_create_common(bufp, size, TSTAT); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.tstat.fid); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; +} + +struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat, + int extended) +{ + int size, statsz; + struct v9fs_fcall *fc; + struct cbuf buffer; + struct cbuf *bufp = &buffer; + + statsz = v9fs_size_wstat(wstat, extended); + size = 4 + 2 + 2 + statsz; /* fid[4] stat[n] */ + fc = v9fs_create_common(bufp, size, TWSTAT); + if (IS_ERR(fc)) + goto error; + + v9fs_put_int32(bufp, fid, &fc->params.twstat.fid); + buf_put_int16(bufp, statsz + 2); + v9fs_put_wstat(bufp, wstat, &fc->params.twstat.stat, statsz, extended); + + if (buf_check_overflow(bufp)) { + kfree(fc); + fc = ERR_PTR(-ENOMEM); + } + error: + return fc; } diff --git a/fs/9p/conv.h b/fs/9p/conv.h index ee849613c61a..26a736e4a2e7 100644 --- a/fs/9p/conv.h +++ b/fs/9p/conv.h @@ -1,8 +1,9 @@ /* * linux/fs/9p/conv.h * - * 9P protocol conversion definitions + * 9P protocol conversion definitions. 
* + * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> * @@ -24,13 +25,27 @@ * */ -int v9fs_deserialize_stat(struct v9fs_session_info *, void *buf, - u32 buflen, struct v9fs_stat *stat, u32 statlen); -int v9fs_serialize_fcall(struct v9fs_session_info *, struct v9fs_fcall *tcall, - void *buf, u32 buflen); -int v9fs_deserialize_fcall(struct v9fs_session_info *, u32 msglen, - void *buf, u32 buflen, struct v9fs_fcall *rcall, - int rcalllen); +int v9fs_deserialize_stat(void *buf, u32 buflen, struct v9fs_stat *stat, + int extended); +int v9fs_deserialize_fcall(void *buf, u32 buflen, struct v9fs_fcall *rcall, + int extended); -/* this one is actually in error.c right now */ -int v9fs_errstr2errno(char *errstr); +void v9fs_set_tag(struct v9fs_fcall *fc, u16 tag); + +struct v9fs_fcall *v9fs_create_tversion(u32 msize, char *version); +struct v9fs_fcall *v9fs_create_tauth(u32 afid, char *uname, char *aname); +struct v9fs_fcall *v9fs_create_tattach(u32 fid, u32 afid, char *uname, + char *aname); +struct v9fs_fcall *v9fs_create_tflush(u16 oldtag); +struct v9fs_fcall *v9fs_create_twalk(u32 fid, u32 newfid, u16 nwname, + char **wnames); +struct v9fs_fcall *v9fs_create_topen(u32 fid, u8 mode); +struct v9fs_fcall *v9fs_create_tcreate(u32 fid, char *name, u32 perm, u8 mode); +struct v9fs_fcall *v9fs_create_tread(u32 fid, u64 offset, u32 count); +struct v9fs_fcall *v9fs_create_twrite(u32 fid, u64 offset, u32 count, + const char __user *data); +struct v9fs_fcall *v9fs_create_tclunk(u32 fid); +struct v9fs_fcall *v9fs_create_tremove(u32 fid); +struct v9fs_fcall *v9fs_create_tstat(u32 fid); +struct v9fs_fcall *v9fs_create_twstat(u32 fid, struct v9fs_wstat *wstat, + int extended); diff --git a/fs/9p/debug.h b/fs/9p/debug.h index 4445f06919d9..fe551032788b 100644 --- a/fs/9p/debug.h +++ b/fs/9p/debug.h @@ -51,16 +51,23 @@ do { \ #if DEBUG_DUMP_PKT static inline 
void dump_data(const unsigned char *data, unsigned int datalen) { - int i, j; - int len = datalen; + int i, n; + char buf[5*8]; - printk(KERN_DEBUG "data "); - for (i = 0; i < len; i += 4) { - for (j = 0; (j < 4) && (i + j < len); j++) - printk(KERN_DEBUG "%02x", data[i + j]); - printk(KERN_DEBUG " "); + n = 0; + i = 0; + while (i < datalen) { + n += snprintf(buf+n, sizeof(buf)-n, "%02x", data[i++]); + if (i%4 == 0) + n += snprintf(buf+n, sizeof(buf)-n, " "); + + if (i%16 == 0) { + dprintk(DEBUG_ERROR, "%s\n", buf); + n = 0; + } } - printk(KERN_DEBUG "\n"); + + dprintk(DEBUG_ERROR, "%s\n", buf); } #else /* DEBUG_DUMP_PKT */ static inline void dump_data(const unsigned char *data, unsigned int datalen) diff --git a/fs/9p/error.c b/fs/9p/error.c index 834cb179e388..e4b6f8f38b6f 100644 --- a/fs/9p/error.c +++ b/fs/9p/error.c @@ -33,7 +33,6 @@ #include <linux/list.h> #include <linux/jhash.h> -#include <linux/string.h> #include "debug.h" #include "error.h" @@ -55,7 +54,8 @@ int v9fs_error_init(void) /* load initial error map into hash table */ for (c = errmap; c->name != NULL; c++) { - bucket = jhash(c->name, strlen(c->name), 0) % ERRHASHSZ; + c->namelen = strlen(c->name); + bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ; INIT_HLIST_NODE(&c->list); hlist_add_head(&c->list, &hash_errmap[bucket]); } @@ -69,15 +69,15 @@ int v9fs_error_init(void) * */ -int v9fs_errstr2errno(char *errstr) +int v9fs_errstr2errno(char *errstr, int len) { int errno = 0; struct hlist_node *p = NULL; struct errormap *c = NULL; - int bucket = jhash(errstr, strlen(errstr), 0) % ERRHASHSZ; + int bucket = jhash(errstr, len, 0) % ERRHASHSZ; hlist_for_each_entry(c, p, &hash_errmap[bucket], list) { - if (!strcmp(c->name, errstr)) { + if (c->namelen==len && !memcmp(c->name, errstr, len)) { errno = c->val; break; } diff --git a/fs/9p/error.h b/fs/9p/error.h index 78f89acf7c9a..a9794e85fe51 100644 --- a/fs/9p/error.h +++ b/fs/9p/error.h @@ -36,6 +36,7 @@ struct errormap { char *name; int val; + int 
namelen; struct hlist_node list; }; @@ -175,4 +176,3 @@ static struct errormap errmap[] = { }; extern int v9fs_error_init(void); -extern int v9fs_errstr2errno(char *errstr); diff --git a/fs/9p/fid.c b/fs/9p/fid.c index d95f8626d170..eda449778fa5 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -31,9 +31,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "transport.h" -#include "mux.h" -#include "conv.h" #include "fid.h" /** @@ -164,7 +161,7 @@ static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry) return v9fs_fid_create(dentry, v9ses, fidnum, 0); clunk_fid: - v9fs_t_clunk(v9ses, fidnum, NULL); + v9fs_t_clunk(v9ses, fidnum); return ERR_PTR(err); } diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 8835b576f744..945cb368d451 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -4,7 +4,7 @@ * Protocol Multiplexer * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> - * Copyright (C) 2004 by Latchesar Ionkov <lucho@ionkov.net> + * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -28,448 +28,943 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/poll.h> #include <linux/kthread.h> #include <linux/idr.h> #include "debug.h" #include "v9fs.h" #include "9p.h" -#include "transport.h" #include "conv.h" +#include "transport.h" #include "mux.h" +#define ERREQFLUSH 1 +#define SCHED_TIMEOUT 10 +#define MAXPOLLWADDR 2 + +enum { + Rworksched = 1, /* read work scheduled or running */ + Rpending = 2, /* can read */ + Wworksched = 4, /* write work scheduled or running */ + Wpending = 8, /* can write */ +}; + +struct v9fs_mux_poll_task; + +struct v9fs_req { + int tag; + struct v9fs_fcall *tcall; + struct v9fs_fcall *rcall; + int err; + v9fs_mux_req_callback cb; + void *cba; + struct list_head req_list; +}; + +struct v9fs_mux_data { + spinlock_t lock; + struct 
list_head mux_list; + struct v9fs_mux_poll_task *poll_task; + int msize; + unsigned char *extended; + struct v9fs_transport *trans; + struct v9fs_idpool tidpool; + int err; + wait_queue_head_t equeue; + struct list_head req_list; + struct list_head unsent_req_list; + struct v9fs_fcall *rcall; + int rpos; + char *rbuf; + int wpos; + int wsize; + char *wbuf; + wait_queue_t poll_wait[MAXPOLLWADDR]; + wait_queue_head_t *poll_waddr[MAXPOLLWADDR]; + poll_table pt; + struct work_struct rq; + struct work_struct wq; + unsigned long wsched; +}; + +struct v9fs_mux_poll_task { + struct task_struct *task; + struct list_head mux_list; + int muxnum; +}; + +struct v9fs_mux_rpc { + struct v9fs_mux_data *m; + struct v9fs_req *req; + int err; + struct v9fs_fcall *rcall; + wait_queue_head_t wqueue; +}; + +static int v9fs_poll_proc(void *); +static void v9fs_read_work(void *); +static void v9fs_write_work(void *); +static void v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address, + poll_table * p); +static u16 v9fs_mux_get_tag(struct v9fs_mux_data *); +static void v9fs_mux_put_tag(struct v9fs_mux_data *, u16); + +static DECLARE_MUTEX(v9fs_mux_task_lock); +static struct workqueue_struct *v9fs_mux_wq; + +static int v9fs_mux_num; +static int v9fs_mux_poll_task_num; +static struct v9fs_mux_poll_task v9fs_mux_poll_tasks[100]; + +int v9fs_mux_global_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) + v9fs_mux_poll_tasks[i].task = NULL; + + v9fs_mux_wq = create_workqueue("v9fs"); + if (!v9fs_mux_wq) + return -ENOMEM; + + return 0; +} + +void v9fs_mux_global_exit(void) +{ + destroy_workqueue(v9fs_mux_wq); +} + /** - * dprintcond - print condition of session info - * @v9ses: session info structure - * @req: RPC request structure + * v9fs_mux_calc_poll_procs - calculates the number of polling procs + * based on the number of mounted v9fs filesystems. * + * The current implementation returns sqrt of the number of mounts. 
*/ +inline int v9fs_mux_calc_poll_procs(int muxnum) +{ + int n; + + if (v9fs_mux_poll_task_num) + n = muxnum / v9fs_mux_poll_task_num + + (muxnum % v9fs_mux_poll_task_num ? 1 : 0); + else + n = 1; + + if (n > ARRAY_SIZE(v9fs_mux_poll_tasks)) + n = ARRAY_SIZE(v9fs_mux_poll_tasks); + + return n; +} -static inline int -dprintcond(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req) +static int v9fs_mux_poll_start(struct v9fs_mux_data *m) { - dprintk(DEBUG_MUX, "condition: %d, %p\n", v9ses->transport->status, - req->rcall); + int i, n; + struct v9fs_mux_poll_task *vpt, *vptlast; + struct task_struct *pproc; + + dprintk(DEBUG_MUX, "mux %p muxnum %d procnum %d\n", m, v9fs_mux_num, + v9fs_mux_poll_task_num); + up(&v9fs_mux_task_lock); + + n = v9fs_mux_calc_poll_procs(v9fs_mux_num + 1); + if (n > v9fs_mux_poll_task_num) { + for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) { + if (v9fs_mux_poll_tasks[i].task == NULL) { + vpt = &v9fs_mux_poll_tasks[i]; + dprintk(DEBUG_MUX, "create proc %p\n", vpt); + pproc = kthread_create(v9fs_poll_proc, vpt, + "v9fs-poll"); + + if (!IS_ERR(pproc)) { + vpt->task = pproc; + INIT_LIST_HEAD(&vpt->mux_list); + vpt->muxnum = 0; + v9fs_mux_poll_task_num++; + wake_up_process(vpt->task); + } + break; + } + } + + if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) + dprintk(DEBUG_ERROR, "warning: no free poll slots\n"); + } + + n = (v9fs_mux_num + 1) / v9fs_mux_poll_task_num + + ((v9fs_mux_num + 1) % v9fs_mux_poll_task_num ? 
1 : 0); + + vptlast = NULL; + for (i = 0; i < ARRAY_SIZE(v9fs_mux_poll_tasks); i++) { + vpt = &v9fs_mux_poll_tasks[i]; + if (vpt->task != NULL) { + vptlast = vpt; + if (vpt->muxnum < n) { + dprintk(DEBUG_MUX, "put in proc %d\n", i); + list_add(&m->mux_list, &vpt->mux_list); + vpt->muxnum++; + m->poll_task = vpt; + memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); + init_poll_funcptr(&m->pt, v9fs_pollwait); + break; + } + } + } + + if (i >= ARRAY_SIZE(v9fs_mux_poll_tasks)) { + if (vptlast == NULL) + return -ENOMEM; + + dprintk(DEBUG_MUX, "put in proc %d\n", i); + list_add(&m->mux_list, &vptlast->mux_list); + vptlast->muxnum++; + m->poll_task = vptlast; + memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); + init_poll_funcptr(&m->pt, v9fs_pollwait); + } + + v9fs_mux_num++; + down(&v9fs_mux_task_lock); + return 0; } +static void v9fs_mux_poll_stop(struct v9fs_mux_data *m) +{ + int i; + struct v9fs_mux_poll_task *vpt; + + up(&v9fs_mux_task_lock); + vpt = m->poll_task; + list_del(&m->mux_list); + for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) { + if (m->poll_waddr[i] != NULL) { + remove_wait_queue(m->poll_waddr[i], &m->poll_wait[i]); + m->poll_waddr[i] = NULL; + } + } + vpt->muxnum--; + if (!vpt->muxnum) { + dprintk(DEBUG_MUX, "destroy proc %p\n", vpt); + send_sig(SIGKILL, vpt->task, 1); + vpt->task = NULL; + v9fs_mux_poll_task_num--; + } + v9fs_mux_num--; + down(&v9fs_mux_task_lock); +} + /** - * xread - force read of a certain number of bytes - * @v9ses: session info structure - * @ptr: pointer to buffer - * @sz: number of bytes to read + * v9fs_mux_init - allocate and initialize the per-session mux data + * Creates the polling task if this is the first session. 
* - * Chuck Cranor CS-533 project1 + * @trans - transport structure + * @msize - maximum message size + * @extended - pointer to the extended flag */ - -static int xread(struct v9fs_session_info *v9ses, void *ptr, unsigned long sz) +struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, + unsigned char *extended) { - int rd = 0; - int ret = 0; - while (rd < sz) { - ret = v9ses->transport->read(v9ses->transport, ptr, sz - rd); - if (ret <= 0) { - dprintk(DEBUG_ERROR, "xread errno %d\n", ret); - return ret; + int i, n; + struct v9fs_mux_data *m, *mtmp; + + dprintk(DEBUG_MUX, "transport %p msize %d\n", trans, msize); + m = kmalloc(sizeof(struct v9fs_mux_data), GFP_KERNEL); + if (!m) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&m->lock); + INIT_LIST_HEAD(&m->mux_list); + m->msize = msize; + m->extended = extended; + m->trans = trans; + idr_init(&m->tidpool.pool); + init_MUTEX(&m->tidpool.lock); + m->err = 0; + init_waitqueue_head(&m->equeue); + INIT_LIST_HEAD(&m->req_list); + INIT_LIST_HEAD(&m->unsent_req_list); + m->rcall = NULL; + m->rpos = 0; + m->rbuf = NULL; + m->wpos = m->wsize = 0; + m->wbuf = NULL; + INIT_WORK(&m->rq, v9fs_read_work, m); + INIT_WORK(&m->wq, v9fs_write_work, m); + m->wsched = 0; + memset(&m->poll_waddr, 0, sizeof(m->poll_waddr)); + m->poll_task = NULL; + n = v9fs_mux_poll_start(m); + if (n) + return ERR_PTR(n); + + n = trans->poll(trans, &m->pt); + if (n & POLLIN) { + dprintk(DEBUG_MUX, "mux %p can read\n", m); + set_bit(Rpending, &m->wsched); + } + + if (n & POLLOUT) { + dprintk(DEBUG_MUX, "mux %p can write\n", m); + set_bit(Wpending, &m->wsched); + } + + for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) { + if (IS_ERR(m->poll_waddr[i])) { + v9fs_mux_poll_stop(m); + mtmp = (void *)m->poll_waddr; /* the error code */ + kfree(m); + m = mtmp; + break; } - rd += ret; - ptr += ret; } - return (rd); + + return m; } /** - * read_message - read a full 9P2000 fcall packet - * @v9ses: session info structure - * @rcall: fcall 
structure to read into - * @rcalllen: size of fcall buffer - * + * v9fs_mux_destroy - cancels all pending requests and frees mux resources */ +void v9fs_mux_destroy(struct v9fs_mux_data *m) +{ + dprintk(DEBUG_MUX, "mux %p prev %p next %p\n", m, + m->mux_list.prev, m->mux_list.next); + v9fs_mux_cancel(m, -ECONNRESET); + + if (!list_empty(&m->req_list)) { + /* wait until all processes waiting on this session exit */ + dprintk(DEBUG_MUX, "mux %p waiting for empty request queue\n", + m); + wait_event_timeout(m->equeue, (list_empty(&m->req_list)), 5000); + dprintk(DEBUG_MUX, "mux %p request queue empty: %d\n", m, + list_empty(&m->req_list)); + } + + v9fs_mux_poll_stop(m); + m->trans = NULL; + + kfree(m); +} -static int -read_message(struct v9fs_session_info *v9ses, - struct v9fs_fcall *rcall, int rcalllen) +/** + * v9fs_pollwait - called by files poll operation to add v9fs-poll task + * to files wait queue + */ +static void +v9fs_pollwait(struct file *filp, wait_queue_head_t * wait_address, + poll_table * p) { - unsigned char buf[4]; - void *data; - int size = 0; - int res = 0; - - res = xread(v9ses, buf, sizeof(buf)); - if (res < 0) { - dprintk(DEBUG_ERROR, - "Reading of count field failed returned: %d\n", res); - return res; + int i; + struct v9fs_mux_data *m; + + m = container_of(p, struct v9fs_mux_data, pt); + for(i = 0; i < ARRAY_SIZE(m->poll_waddr); i++) + if (m->poll_waddr[i] == NULL) + break; + + if (i >= ARRAY_SIZE(m->poll_waddr)) { + dprintk(DEBUG_ERROR, "not enough wait_address slots\n"); + return; } - if (res < 4) { - dprintk(DEBUG_ERROR, - "Reading of count field failed returned: %d\n", res); - return -EIO; + m->poll_waddr[i] = wait_address; + + if (!wait_address) { + dprintk(DEBUG_ERROR, "no wait_address\n"); + m->poll_waddr[i] = ERR_PTR(-EIO); + return; } - size = buf[0] | (buf[1] << 8) | (buf[2] << 16) | (buf[3] << 24); - dprintk(DEBUG_MUX, "got a packet count: %d\n", size); + init_waitqueue_entry(&m->poll_wait[i], m->poll_task->task); + 
add_wait_queue(wait_address, &m->poll_wait[i]); +} + +/** + * v9fs_poll_mux - polls a mux and schedules read or write works if necessary + */ +static inline void v9fs_poll_mux(struct v9fs_mux_data *m) +{ + int n; - /* adjust for the four bytes of size */ - size -= 4; + if (m->err < 0) + return; - if (size > v9ses->maxdata) { - dprintk(DEBUG_ERROR, "packet too big: %d\n", size); - return -E2BIG; + n = m->trans->poll(m->trans, NULL); + if (n < 0 || n & (POLLERR | POLLHUP | POLLNVAL)) { + dprintk(DEBUG_MUX, "error mux %p err %d\n", m, n); + if (n >= 0) + n = -ECONNRESET; + v9fs_mux_cancel(m, n); } - data = kmalloc(size, GFP_KERNEL); - if (!data) { - eprintk(KERN_WARNING, "out of memory\n"); - return -ENOMEM; + if (n & POLLIN) { + set_bit(Rpending, &m->wsched); + dprintk(DEBUG_MUX, "mux %p can read\n", m); + if (!test_and_set_bit(Rworksched, &m->wsched)) { + dprintk(DEBUG_MUX, "schedule read work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->rq); + } } - res = xread(v9ses, data, size); - if (res < size) { - dprintk(DEBUG_ERROR, "Reading of fcall failed returned: %d\n", - res); - kfree(data); - return res; + if (n & POLLOUT) { + set_bit(Wpending, &m->wsched); + dprintk(DEBUG_MUX, "mux %p can write\n", m); + if ((m->wsize || !list_empty(&m->unsent_req_list)) + && !test_and_set_bit(Wworksched, &m->wsched)) { + dprintk(DEBUG_MUX, "schedule write work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->wq); + } } +} + +/** + * v9fs_poll_proc - polls all v9fs transports for new events and queues + * the appropriate work to the work queue + */ +static int v9fs_poll_proc(void *a) +{ + struct v9fs_mux_data *m, *mtmp; + struct v9fs_mux_poll_task *vpt; - /* we now have an in-memory string that is the reply. - * deserialize it. There is very little to go wrong at this point - * save for v9fs_alloc errors. 
- */ - res = v9fs_deserialize_fcall(v9ses, size, data, v9ses->maxdata, - rcall, rcalllen); + vpt = a; + dprintk(DEBUG_MUX, "start %p %p\n", current, vpt); + allow_signal(SIGKILL); + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + if (signal_pending(current)) + break; - kfree(data); + list_for_each_entry_safe(m, mtmp, &vpt->mux_list, mux_list) { + v9fs_poll_mux(m); + } - if (res < 0) - return res; + dprintk(DEBUG_MUX, "sleeping...\n"); + schedule_timeout(SCHED_TIMEOUT * HZ); + } + __set_current_state(TASK_RUNNING); + dprintk(DEBUG_MUX, "finish\n"); return 0; } /** - * v9fs_recv - receive an RPC response for a particular tag - * @v9ses: session info structure - * @req: RPC request structure - * + * v9fs_write_work - called when a transport can send some data */ - -static int v9fs_recv(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req) +static void v9fs_write_work(void *a) { - int ret = 0; + int n, err; + struct v9fs_mux_data *m; + struct v9fs_req *req; - dprintk(DEBUG_MUX, "waiting for response: %d\n", req->tcall->tag); - ret = wait_event_interruptible(v9ses->read_wait, - ((v9ses->transport->status != Connected) || - (req->rcall != 0) || (req->err < 0) || - dprintcond(v9ses, req))); + m = a; - dprintk(DEBUG_MUX, "got it: rcall %p\n", req->rcall); + if (m->err < 0) { + clear_bit(Wworksched, &m->wsched); + return; + } - spin_lock(&v9ses->muxlock); - list_del(&req->next); - spin_unlock(&v9ses->muxlock); + if (!m->wsize) { + if (list_empty(&m->unsent_req_list)) { + clear_bit(Wworksched, &m->wsched); + return; + } - if (req->err < 0) - return req->err; + spin_lock(&m->lock); + req = + list_entry(m->unsent_req_list.next, struct v9fs_req, + req_list); + list_move_tail(&req->req_list, &m->req_list); + m->wbuf = req->tcall->sdata; + m->wsize = req->tcall->size; + m->wpos = 0; + dump_data(m->wbuf, m->wsize); + spin_unlock(&m->lock); + } - if (v9ses->transport->status == Disconnected) - return -ECONNRESET; + dprintk(DEBUG_MUX, "mux %p pos %d 
size %d\n", m, m->wpos, m->wsize); + clear_bit(Wpending, &m->wsched); + err = m->trans->write(m->trans, m->wbuf + m->wpos, m->wsize - m->wpos); + dprintk(DEBUG_MUX, "mux %p sent %d bytes\n", m, err); + if (err == -EAGAIN) { + clear_bit(Wworksched, &m->wsched); + return; + } - return ret; -} + if (err <= 0) + goto error; -/** - * v9fs_send - send a 9P request - * @v9ses: session info structure - * @req: RPC request to send - * - */ + m->wpos += err; + if (m->wpos == m->wsize) + m->wpos = m->wsize = 0; + + if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) { + if (test_and_clear_bit(Wpending, &m->wsched)) + n = POLLOUT; + else + n = m->trans->poll(m->trans, NULL); + + if (n & POLLOUT) { + dprintk(DEBUG_MUX, "schedule write work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->wq); + } else + clear_bit(Wworksched, &m->wsched); + } else + clear_bit(Wworksched, &m->wsched); + + return; -static int v9fs_send(struct v9fs_session_info *v9ses, struct v9fs_rpcreq *req) + error: + v9fs_mux_cancel(m, err); + clear_bit(Wworksched, &m->wsched); +} + +static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) { - int ret = -1; - void *data = NULL; - struct v9fs_fcall *tcall = req->tcall; + int ecode, tag; + struct v9fs_str *ename; - data = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL); - if (!data) - return -ENOMEM; + tag = req->tag; + if (req->rcall->id == RERROR && !req->err) { + ecode = req->rcall->params.rerror.errno; + ename = &req->rcall->params.rerror.error; - tcall->size = 0; /* enforce size recalculation */ - ret = - v9fs_serialize_fcall(v9ses, tcall, data, - v9ses->maxdata + V9FS_IOHDRSZ); - if (ret < 0) - goto free_data; + dprintk(DEBUG_MUX, "Rerror %.*s\n", ename->len, ename->str); - spin_lock(&v9ses->muxlock); - list_add(&req->next, &v9ses->mux_fcalls); - spin_unlock(&v9ses->muxlock); + if (*m->extended) + req->err = -ecode; - dprintk(DEBUG_MUX, "sending message: tag %d size %d\n", tcall->tag, - tcall->size); - ret = 
v9ses->transport->write(v9ses->transport, data, tcall->size); + if (!req->err) { + req->err = v9fs_errstr2errno(ename->str, ename->len); - if (ret != tcall->size) { - spin_lock(&v9ses->muxlock); - list_del(&req->next); - kfree(req->rcall); + if (!req->err) { /* string match failed */ + PRINT_FCALL_ERROR("unknown error", req->rcall); + } + + if (!req->err) + req->err = -ESERVERFAULT; + } + } else if (req->tcall && req->rcall->id != req->tcall->id + 1) { + dprintk(DEBUG_ERROR, "fcall mismatch: expected %d, got %d\n", + req->tcall->id + 1, req->rcall->id); + if (!req->err) + req->err = -EIO; + } - spin_unlock(&v9ses->muxlock); - if (ret >= 0) - ret = -EREMOTEIO; + if (req->cb && req->err != ERREQFLUSH) { + dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n", + req->tcall, req->rcall); + + (*req->cb) (req->cba, req->tcall, req->rcall, req->err); + req->cb = NULL; } else - ret = 0; + kfree(req->rcall); - free_data: - kfree(data); - return ret; + v9fs_mux_put_tag(m, tag); + + wake_up(&m->equeue); + kfree(req); } /** - * v9fs_mux_rpc - send a request, receive a response - * @v9ses: session info structure - * @tcall: fcall to send - * @rcall: buffer to place response into - * + * v9fs_read_work - called when there is some data to be read from a transport */ - -long -v9fs_mux_rpc(struct v9fs_session_info *v9ses, struct v9fs_fcall *tcall, - struct v9fs_fcall **rcall) +static void v9fs_read_work(void *a) { - int tid = -1; - struct v9fs_fcall *fcall = NULL; - struct v9fs_rpcreq req; - int ret = -1; - - if (!v9ses) - return -EINVAL; - - if (!v9ses->transport || v9ses->transport->status != Connected) - return -EIO; + int n, err; + struct v9fs_mux_data *m; + struct v9fs_req *req, *rptr, *rreq; + struct v9fs_fcall *rcall; + char *rbuf; + + m = a; + + if (m->err < 0) + return; + + rcall = NULL; + dprintk(DEBUG_MUX, "start mux %p pos %d\n", m, m->rpos); + + if (!m->rcall) { + m->rcall = + kmalloc(sizeof(struct v9fs_fcall) + m->msize, GFP_KERNEL); + if (!m->rcall) { + err = 
-ENOMEM; + goto error; + } - if (rcall) - *rcall = NULL; + m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall); + m->rpos = 0; + } - if (tcall->id != TVERSION) { - tid = v9fs_get_idpool(&v9ses->tidpool); - if (tid < 0) - return -ENOMEM; + clear_bit(Rpending, &m->wsched); + err = m->trans->read(m->trans, m->rbuf + m->rpos, m->msize - m->rpos); + dprintk(DEBUG_MUX, "mux %p got %d bytes\n", m, err); + if (err == -EAGAIN) { + clear_bit(Rworksched, &m->wsched); + return; } - tcall->tag = tid; + if (err <= 0) + goto error; - req.tcall = tcall; - req.err = 0; - req.rcall = NULL; + m->rpos += err; + while (m->rpos > 4) { + n = le32_to_cpu(*(__le32 *) m->rbuf); + if (n >= m->msize) { + dprintk(DEBUG_ERROR, + "requested packet size too big: %d\n", n); + err = -EIO; + goto error; + } - ret = v9fs_send(v9ses, &req); + if (m->rpos < n) + break; - if (ret < 0) { - if (tcall->id != TVERSION) - v9fs_put_idpool(tid, &v9ses->tidpool); - dprintk(DEBUG_MUX, "error %d\n", ret); - return ret; - } + dump_data(m->rbuf, n); + err = + v9fs_deserialize_fcall(m->rbuf, n, m->rcall, *m->extended); + if (err < 0) { + goto error; + } + + rcall = m->rcall; + rbuf = m->rbuf; + if (m->rpos > n) { + m->rcall = kmalloc(sizeof(struct v9fs_fcall) + m->msize, + GFP_KERNEL); + if (!m->rcall) { + err = -ENOMEM; + goto error; + } - ret = v9fs_recv(v9ses, &req); - - fcall = req.rcall; - - dprintk(DEBUG_MUX, "received: tag=%x, ret=%d\n", tcall->tag, ret); - if (ret == -ERESTARTSYS) { - if (v9ses->transport->status != Disconnected - && tcall->id != TFLUSH) { - unsigned long flags; - - dprintk(DEBUG_MUX, "flushing the tag: %d\n", - tcall->tag); - clear_thread_flag(TIF_SIGPENDING); - v9fs_t_flush(v9ses, tcall->tag); - spin_lock_irqsave(¤t->sighand->siglock, flags); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, - flags); - dprintk(DEBUG_MUX, "flushing done\n"); + m->rbuf = (char *)m->rcall + sizeof(struct v9fs_fcall); + memmove(m->rbuf, rbuf + n, m->rpos - n); + m->rpos -= n; + } else { 
+ m->rcall = NULL; + m->rbuf = NULL; + m->rpos = 0; } - goto release_req; - } else if (ret < 0) - goto release_req; - - if (!fcall) - ret = -EIO; - else { - if (fcall->id == RERROR) { - ret = v9fs_errstr2errno(fcall->params.rerror.error); - if (ret == 0) { /* string match failed */ - if (fcall->params.rerror.errno) - ret = -(fcall->params.rerror.errno); - else - ret = -ESERVERFAULT; + dprintk(DEBUG_MUX, "mux %p fcall id %d tag %d\n", m, rcall->id, + rcall->tag); + + req = NULL; + spin_lock(&m->lock); + list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) { + if (rreq->tag == rcall->tag) { + req = rreq; + req->rcall = rcall; + list_del(&req->req_list); + spin_unlock(&m->lock); + process_request(m, req); + break; } - } else if (fcall->id != tcall->id + 1) { - dprintk(DEBUG_ERROR, - "fcall mismatch: expected %d, got %d\n", - tcall->id + 1, fcall->id); - ret = -EIO; + + } + + if (!req) { + spin_unlock(&m->lock); + if (err >= 0 && rcall->id != RFLUSH) + dprintk(DEBUG_ERROR, + "unexpected response mux %p id %d tag %d\n", + m, rcall->id, rcall->tag); + kfree(rcall); } } - release_req: - if (tcall->id != TVERSION) - v9fs_put_idpool(tid, &v9ses->tidpool); - if (rcall) - *rcall = fcall; - else - kfree(fcall); + if (!list_empty(&m->req_list)) { + if (test_and_clear_bit(Rpending, &m->wsched)) + n = POLLIN; + else + n = m->trans->poll(m->trans, NULL); + + if (n & POLLIN) { + dprintk(DEBUG_MUX, "schedule read work mux %p\n", m); + queue_work(v9fs_mux_wq, &m->rq); + } else + clear_bit(Rworksched, &m->wsched); + } else + clear_bit(Rworksched, &m->wsched); + + return; - return ret; + error: + v9fs_mux_cancel(m, err); + clear_bit(Rworksched, &m->wsched); } /** - * v9fs_mux_cancel_requests - cancels all pending requests + * v9fs_send_request - send 9P request + * The function can sleep until the request is scheduled for sending. + * The function can be interrupted. Return from the function is not + * a guarantee that the request is sent succesfully. 
Can return errors + * that can be retrieved by PTR_ERR macros. * - * @v9ses: session info structure - * @err: error code to return to the requests + * @m: mux data + * @tc: request to be sent + * @cb: callback function to call when response is received + * @cba: parameter to pass to the callback function */ -void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err) +static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, + struct v9fs_fcall *tc, + v9fs_mux_req_callback cb, void *cba) { - struct v9fs_rpcreq *rptr; - struct v9fs_rpcreq *rreq; + int n; + struct v9fs_req *req; - dprintk(DEBUG_MUX, " %d\n", err); - spin_lock(&v9ses->muxlock); - list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) { - rreq->err = err; - } - spin_unlock(&v9ses->muxlock); - wake_up_all(&v9ses->read_wait); -} + dprintk(DEBUG_MUX, "mux %p task %p tcall %p id %d\n", m, current, + tc, tc->id); + if (m->err < 0) + return ERR_PTR(m->err); -/** - * v9fs_recvproc - kproc to handle demultiplexing responses - * @data: session info structure - * - */ + req = kmalloc(sizeof(struct v9fs_req), GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); -static int v9fs_recvproc(void *data) -{ - struct v9fs_session_info *v9ses = (struct v9fs_session_info *)data; - struct v9fs_fcall *rcall = NULL; - struct v9fs_rpcreq *rptr; - struct v9fs_rpcreq *req; - struct v9fs_rpcreq *rreq; - int err = 0; + if (tc->id == TVERSION) + n = V9FS_NOTAG; + else + n = v9fs_mux_get_tag(m); - allow_signal(SIGKILL); - set_current_state(TASK_INTERRUPTIBLE); - complete(&v9ses->proccmpl); - while (!kthread_should_stop() && err >= 0) { - req = rptr = rreq = NULL; - - rcall = kmalloc(v9ses->maxdata + V9FS_IOHDRSZ, GFP_KERNEL); - if (!rcall) { - eprintk(KERN_ERR, "no memory for buffers\n"); - break; - } + if (n < 0) + return ERR_PTR(-ENOMEM); - err = read_message(v9ses, rcall, v9ses->maxdata + V9FS_IOHDRSZ); - spin_lock(&v9ses->muxlock); - if (err < 0) { - list_for_each_entry_safe(rreq, rptr, 
&v9ses->mux_fcalls, next) { - rreq->err = err; - } - if(err != -ERESTARTSYS) - eprintk(KERN_ERR, - "Transport error while reading message %d\n", err); - } else { - list_for_each_entry_safe(rreq, rptr, &v9ses->mux_fcalls, next) { - if (rreq->tcall->tag == rcall->tag) { - req = rreq; - req->rcall = rcall; - break; - } - } - } + v9fs_set_tag(tc, n); - if (req && (req->tcall->id == TFLUSH)) { - struct v9fs_rpcreq *treq = NULL; - list_for_each_entry_safe(treq, rptr, &v9ses->mux_fcalls, next) { - if (treq->tcall->tag == - req->tcall->params.tflush.oldtag) { - list_del(&rptr->next); - kfree(treq->rcall); - break; - } + req->tag = n; + req->tcall = tc; + req->rcall = NULL; + req->err = 0; + req->cb = cb; + req->cba = cba; + + spin_lock(&m->lock); + list_add_tail(&req->req_list, &m->unsent_req_list); + spin_unlock(&m->lock); + + if (test_and_clear_bit(Wpending, &m->wsched)) + n = POLLOUT; + else + n = m->trans->poll(m->trans, NULL); + + if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched)) + queue_work(v9fs_mux_wq, &m->wq); + + return req; +} + +static inline void +v9fs_mux_flush_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, + int err) +{ + v9fs_mux_req_callback cb; + int tag; + struct v9fs_mux_data *m; + struct v9fs_req *req, *rptr; + + m = a; + dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m, tc, + rc, err, tc->params.tflush.oldtag); + + spin_lock(&m->lock); + cb = NULL; + tag = tc->params.tflush.oldtag; + list_for_each_entry_safe(req, rptr, &m->req_list, req_list) { + if (req->tag == tag) { + list_del(&req->req_list); + if (req->cb) { + cb = req->cb; + req->cb = NULL; + spin_unlock(&m->lock); + (*cb) (req->cba, req->tcall, req->rcall, + req->err); } + kfree(req); + wake_up(&m->equeue); + break; } + } - spin_unlock(&v9ses->muxlock); + if (!cb) + spin_unlock(&m->lock); - if (!req) { - if (err >= 0) - dprintk(DEBUG_ERROR, - "unexpected response: id %d tag %d\n", - rcall->id, rcall->tag); + v9fs_mux_put_tag(m, tag); + kfree(tc); + kfree(rc); 
+} - kfree(rcall); - } +static void +v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req) +{ + struct v9fs_fcall *fc; - wake_up_all(&v9ses->read_wait); - set_current_state(TASK_INTERRUPTIBLE); + dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag); + + fc = v9fs_create_tflush(req->tag); + v9fs_send_request(m, fc, v9fs_mux_flush_cb, m); +} + +static void +v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err) +{ + struct v9fs_mux_rpc *r; + + if (err == ERREQFLUSH) { + dprintk(DEBUG_MUX, "err req flush\n"); + return; } - v9ses->transport->close(v9ses->transport); + r = a; + dprintk(DEBUG_MUX, "mux %p req %p tc %p rc %p err %d\n", r->m, r->req, + tc, rc, err); + r->rcall = rc; + r->err = err; + wake_up(&r->wqueue); +} - /* Inform all pending processes about the failure */ - wake_up_all(&v9ses->read_wait); +/** + * v9fs_mux_rpc - sends 9P request and waits until a response is available. + * The function can be interrupted. + * @m: mux data + * @tc: request to be sent + * @rc: pointer where a pointer to the response is stored + */ +int +v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, + struct v9fs_fcall **rc) +{ + int err; + unsigned long flags; + struct v9fs_req *req; + struct v9fs_mux_rpc r; + + r.err = 0; + r.rcall = NULL; + r.m = m; + init_waitqueue_head(&r.wqueue); + + if (rc) + *rc = NULL; + + req = v9fs_send_request(m, tc, v9fs_mux_rpc_cb, &r); + if (IS_ERR(req)) { + err = PTR_ERR(req); + dprintk(DEBUG_MUX, "error %d\n", err); + return PTR_ERR(req); + } - if (signal_pending(current)) - complete(&v9ses->proccmpl); + r.req = req; + dprintk(DEBUG_MUX, "mux %p tc %p tag %d rpc %p req %p\n", m, tc, + req->tag, &r, req); + err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0); + if (r.err < 0) + err = r.err; + + if (err == -ERESTARTSYS && m->trans->status == Connected && m->err == 0) { + spin_lock(&m->lock); + req->tcall = NULL; + req->err = ERREQFLUSH; + spin_unlock(&m->lock); + + 
clear_thread_flag(TIF_SIGPENDING); + v9fs_mux_flush_request(m, req); + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } - dprintk(DEBUG_MUX, "recvproc: end\n"); - v9ses->recvproc = NULL; + if (!err) { + if (r.rcall) + dprintk(DEBUG_MUX, "got response id %d tag %d\n", + r.rcall->id, r.rcall->tag); + + if (rc) + *rc = r.rcall; + else + kfree(r.rcall); + } else { + kfree(r.rcall); + dprintk(DEBUG_MUX, "got error %d\n", err); + if (err > 0) + err = -EIO; + } - return err >= 0; + return err; } /** - * v9fs_mux_init - initialize multiplexer (spawn kproc) - * @v9ses: session info structure - * @dev_name: mount device information (to create unique kproc) - * + * v9fs_mux_rpcnb - sends 9P request without waiting for response. + * @m: mux data + * @tc: request to be sent + * @cb: callback function to be called when response arrives + * @cba: value to pass to the callback function */ +int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc, + v9fs_mux_req_callback cb, void *a) +{ + int err; + struct v9fs_req *req; + + req = v9fs_send_request(m, tc, cb, a); + if (IS_ERR(req)) { + err = PTR_ERR(req); + dprintk(DEBUG_MUX, "error %d\n", err); + return PTR_ERR(req); + } + + dprintk(DEBUG_MUX, "mux %p tc %p tag %d\n", m, tc, req->tag); + return 0; +} -int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name) +/** + * v9fs_mux_cancel - cancel all pending requests with error + * @m: mux data + * @err: error code + */ +void v9fs_mux_cancel(struct v9fs_mux_data *m, int err) { - char procname[60]; - - strncpy(procname, dev_name, sizeof(procname)); - procname[sizeof(procname) - 1] = 0; - - init_waitqueue_head(&v9ses->read_wait); - init_completion(&v9ses->fcread); - init_completion(&v9ses->proccmpl); - spin_lock_init(&v9ses->muxlock); - INIT_LIST_HEAD(&v9ses->mux_fcalls); - v9ses->recvproc = NULL; - v9ses->curfcall = NULL; - - v9ses->recvproc = kthread_create(v9fs_recvproc, v9ses, - 
"v9fs_recvproc %s", procname); - - if (IS_ERR(v9ses->recvproc)) { - eprintk(KERN_ERR, "cannot create receiving thread\n"); - v9fs_session_close(v9ses); - return -ECONNABORTED; + struct v9fs_req *req, *rtmp; + LIST_HEAD(cancel_list); + + dprintk(DEBUG_MUX, "mux %p err %d\n", m, err); + m->err = err; + spin_lock(&m->lock); + list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { + list_move(&req->req_list, &cancel_list); } + spin_unlock(&m->lock); - wake_up_process(v9ses->recvproc); - wait_for_completion(&v9ses->proccmpl); + list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { + list_del(&req->req_list); + if (!req->err) + req->err = err; - return 0; + if (req->cb) + (*req->cb) (req->cba, req->tcall, req->rcall, req->err); + else + kfree(req->rcall); + + kfree(req); + } + + wake_up(&m->equeue); +} + +static u16 v9fs_mux_get_tag(struct v9fs_mux_data *m) +{ + int tag; + + tag = v9fs_get_idpool(&m->tidpool); + if (tag < 0) + return V9FS_NOTAG; + else + return (u16) tag; +} + +static void v9fs_mux_put_tag(struct v9fs_mux_data *m, u16 tag) +{ + if (tag != V9FS_NOTAG && v9fs_check_idpool(tag, &m->tidpool)) + v9fs_put_idpool(tag, &m->tidpool); } diff --git a/fs/9p/mux.h b/fs/9p/mux.h index 4994cb10badf..9473b84f24b2 100644 --- a/fs/9p/mux.h +++ b/fs/9p/mux.h @@ -3,6 +3,7 @@ * * Multiplexer Definitions * + * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * * This program is free software; you can redistribute it and/or modify @@ -23,19 +24,35 @@ * */ -/* structure to manage each RPC transaction */ +struct v9fs_mux_data; -struct v9fs_rpcreq { - struct v9fs_fcall *tcall; - struct v9fs_fcall *rcall; - int err; /* error code if response failed */ +/** + * v9fs_mux_req_callback - callback function that is called when the + * response of a request is received. The callback is called from + * a workqueue and shouldn't block. 
+ * + * @a - the pointer that was specified when the request was send to be + * passed to the callback + * @tc - request call + * @rc - response call + * @err - error code (non-zero if error occured) + */ +typedef void (*v9fs_mux_req_callback)(void *a, struct v9fs_fcall *tc, + struct v9fs_fcall *rc, int err); + +int v9fs_mux_global_init(void); +void v9fs_mux_global_exit(void); - /* XXX - could we put scatter/gather buffers here? */ +struct v9fs_mux_data *v9fs_mux_init(struct v9fs_transport *trans, int msize, + unsigned char *extended); +void v9fs_mux_destroy(struct v9fs_mux_data *); - struct list_head next; -}; +int v9fs_mux_send(struct v9fs_mux_data *m, struct v9fs_fcall *tc); +struct v9fs_fcall *v9fs_mux_recv(struct v9fs_mux_data *m); +int v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, struct v9fs_fcall **rc); +int v9fs_mux_rpcnb(struct v9fs_mux_data *m, struct v9fs_fcall *tc, + v9fs_mux_req_callback cb, void *a); -int v9fs_mux_init(struct v9fs_session_info *v9ses, const char *dev_name); -long v9fs_mux_rpc(struct v9fs_session_info *v9ses, - struct v9fs_fcall *tcall, struct v9fs_fcall **rcall); -void v9fs_mux_cancel_requests(struct v9fs_session_info *v9ses, int err); +void v9fs_mux_flush(struct v9fs_mux_data *m, int sendflush); +void v9fs_mux_cancel(struct v9fs_mux_data *m, int err); +int v9fs_errstr2errno(char *errstr, int len); diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c index 63b58ce98ff4..1a28ef97a3d1 100644 --- a/fs/9p/trans_fd.c +++ b/fs/9p/trans_fd.c @@ -3,6 +3,7 @@ * * File Descriptor Transport Layer * + * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com> * * This program is free software; you can redistribute it and/or modify @@ -106,9 +107,6 @@ v9fs_fd_init(struct v9fs_session_info *v9ses, const char *addr, char *data) return -ENOPROTOOPT; } - sema_init(&trans->writelock, 1); - sema_init(&trans->readlock, 1); - ts = kmalloc(sizeof(struct v9fs_trans_fd), GFP_KERNEL); 
if (!ts) @@ -148,12 +146,12 @@ static void v9fs_fd_close(struct v9fs_transport *trans) if (!trans) return; - trans->status = Disconnected; - ts = trans->priv; + ts = xchg(&trans->priv, NULL); if (!ts) return; + trans->status = Disconnected; if (ts->in_file) fput(ts->in_file); @@ -163,10 +161,55 @@ static void v9fs_fd_close(struct v9fs_transport *trans) kfree(ts); } +static unsigned int +v9fs_fd_poll(struct v9fs_transport *trans, struct poll_table_struct *pt) +{ + int ret, n; + struct v9fs_trans_fd *ts; + mm_segment_t oldfs; + + if (!trans) + return -EIO; + + ts = trans->priv; + if (trans->status != Connected || !ts) + return -EIO; + + oldfs = get_fs(); + set_fs(get_ds()); + + if (!ts->in_file->f_op || !ts->in_file->f_op->poll) { + ret = -EIO; + goto end; + } + + ret = ts->in_file->f_op->poll(ts->in_file, pt); + + if (ts->out_file != ts->in_file) { + if (!ts->out_file->f_op || !ts->out_file->f_op->poll) { + ret = -EIO; + goto end; + } + + n = ts->out_file->f_op->poll(ts->out_file, pt); + + ret &= ~POLLOUT; + n &= ~POLLIN; + + ret |= n; + } + +end: + set_fs(oldfs); + return ret; +} + + struct v9fs_transport v9fs_trans_fd = { .init = v9fs_fd_init, .write = v9fs_fd_send, .read = v9fs_fd_recv, .close = v9fs_fd_close, + .poll = v9fs_fd_poll, }; diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c index a93c2bf94c33..44e830697acb 100644 --- a/fs/9p/trans_sock.c +++ b/fs/9p/trans_sock.c @@ -3,6 +3,7 @@ * * Socket Transport Layer * + * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> * Copyright (C) 1995, 1996 by Olaf Kirch <okir@monad.swb.de> @@ -26,6 +27,7 @@ */ #include <linux/config.h> +#include <linux/in.h> #include <linux/module.h> #include <linux/net.h> #include <linux/ipv6.h> @@ -35,6 +37,7 @@ #include <asm/uaccess.h> #include <linux/inet.h> #include <linux/idr.h> +#include <linux/file.h> #include "debug.h" #include "v9fs.h" @@ 
-44,6 +47,7 @@ struct v9fs_trans_sock { struct socket *s; + struct file *filp; }; /** @@ -56,41 +60,26 @@ struct v9fs_trans_sock { static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len) { - struct msghdr msg; - struct kvec iov; - int result; - mm_segment_t oldfs; - struct v9fs_trans_sock *ts = trans ? trans->priv : NULL; + int ret; + struct v9fs_trans_sock *ts; - if (trans->status == Disconnected) + if (!trans || trans->status == Disconnected) { + dprintk(DEBUG_ERROR, "disconnected ...\n"); return -EREMOTEIO; + } - result = -EINVAL; - - oldfs = get_fs(); - set_fs(get_ds()); - - iov.iov_base = v; - iov.iov_len = len; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_iovlen = 1; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_flags = MSG_NOSIGNAL; + ts = trans->priv; - result = kernel_recvmsg(ts->s, &msg, &iov, 1, len, 0); + if (!(ts->filp->f_flags & O_NONBLOCK)) + dprintk(DEBUG_ERROR, "blocking read ...\n"); - dprintk(DEBUG_TRANS, "socket state %d\n", ts->s->state); - set_fs(oldfs); - - if (result <= 0) { - if (result != -ERESTARTSYS) + ret = kernel_read(ts->filp, ts->filp->f_pos, v, len); + if (ret <= 0) { + if (ret != -ERESTARTSYS && ret != -EAGAIN) trans->status = Disconnected; } - return result; + return ret; } /** @@ -103,40 +92,72 @@ static int v9fs_sock_recv(struct v9fs_transport *trans, void *v, int len) static int v9fs_sock_send(struct v9fs_transport *trans, void *v, int len) { - struct kvec iov; - struct msghdr msg; - int result = -1; + int ret; mm_segment_t oldfs; - struct v9fs_trans_sock *ts = trans ? 
trans->priv : NULL; + struct v9fs_trans_sock *ts; - dprintk(DEBUG_TRANS, "Sending packet size %d (%x)\n", len, len); - dump_data(v, len); + if (!trans || trans->status == Disconnected) { + dprintk(DEBUG_ERROR, "disconnected ...\n"); + return -EREMOTEIO; + } + + ts = trans->priv; + if (!ts) { + dprintk(DEBUG_ERROR, "no transport ...\n"); + return -EREMOTEIO; + } - down(&trans->writelock); + if (!(ts->filp->f_flags & O_NONBLOCK)) + dprintk(DEBUG_ERROR, "blocking write ...\n"); oldfs = get_fs(); set_fs(get_ds()); - iov.iov_base = v; - iov.iov_len = len; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_iovlen = 1; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_flags = MSG_NOSIGNAL; - result = kernel_sendmsg(ts->s, &msg, &iov, 1, len); + ret = vfs_write(ts->filp, (void __user *)v, len, &ts->filp->f_pos); set_fs(oldfs); - if (result < 0) { - if (result != -ERESTARTSYS) + if (ret < 0) { + if (ret != -ERESTARTSYS) trans->status = Disconnected; } - up(&trans->writelock); - return result; + return ret; +} + +static unsigned int v9fs_sock_poll(struct v9fs_transport *trans, + struct poll_table_struct *pt) { + + int ret; + struct v9fs_trans_sock *ts; + mm_segment_t oldfs; + + if (!trans) { + dprintk(DEBUG_ERROR, "no transport\n"); + return -EIO; + } + + ts = trans->priv; + if (trans->status != Connected || !ts) { + dprintk(DEBUG_ERROR, "transport disconnected: %d\n", trans->status); + return -EIO; + } + + oldfs = get_fs(); + set_fs(get_ds()); + + if (!ts->filp->f_op || !ts->filp->f_op->poll) { + dprintk(DEBUG_ERROR, "no poll operation\n"); + ret = -EIO; + goto end; + } + + ret = ts->filp->f_op->poll(ts->filp, pt); + +end: + set_fs(oldfs); + return ret; } + /** * v9fs_tcp_init - initialize TCP socket * @v9ses: session information @@ -153,9 +174,9 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data) int rc = 0; struct v9fs_trans_sock *ts = NULL; struct v9fs_transport *trans = v9ses->transport; + int fd; - 
sema_init(&trans->writelock, 1); - sema_init(&trans->readlock, 1); + trans->status = Disconnected; ts = kmalloc(sizeof(struct v9fs_trans_sock), GFP_KERNEL); @@ -164,6 +185,7 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data) trans->priv = ts; ts->s = NULL; + ts->filp = NULL; if (!addr) return -EINVAL; @@ -184,7 +206,18 @@ v9fs_tcp_init(struct v9fs_session_info *v9ses, const char *addr, char *data) return rc; } csocket->sk->sk_allocation = GFP_NOIO; + + fd = sock_map_fd(csocket); + if (fd < 0) { + sock_release(csocket); + kfree(ts); + trans->priv = NULL; + return fd; + } + ts->s = csocket; + ts->filp = fget(fd); + ts->filp->f_flags |= O_NONBLOCK; trans->status = Connected; return 0; @@ -202,7 +235,7 @@ static int v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, char *data) { - int rc; + int rc, fd; struct socket *csocket; struct sockaddr_un sun_server; struct v9fs_transport *trans; @@ -212,6 +245,8 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, csocket = NULL; trans = v9ses->transport; + trans->status = Disconnected; + if (strlen(dev_name) > UNIX_PATH_MAX) { eprintk(KERN_ERR, "v9fs_trans_unix: address too long: %s\n", dev_name); @@ -224,9 +259,7 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, trans->priv = ts; ts->s = NULL; - - sema_init(&trans->writelock, 1); - sema_init(&trans->readlock, 1); + ts->filp = NULL; sun_server.sun_family = PF_UNIX; strcpy(sun_server.sun_path, dev_name); @@ -240,7 +273,18 @@ v9fs_unix_init(struct v9fs_session_info *v9ses, const char *dev_name, return rc; } csocket->sk->sk_allocation = GFP_NOIO; + + fd = sock_map_fd(csocket); + if (fd < 0) { + sock_release(csocket); + kfree(ts); + trans->priv = NULL; + return fd; + } + ts->s = csocket; + ts->filp = fget(fd); + ts->filp->f_flags |= O_NONBLOCK; trans->status = Connected; return 0; @@ -261,12 +305,11 @@ static void v9fs_sock_close(struct v9fs_transport *trans) ts = trans->priv; - if ((ts) && 
(ts->s)) { - dprintk(DEBUG_TRANS, "closing the socket %p\n", ts->s); - sock_release(ts->s); + if ((ts) && (ts->filp)) { + fput(ts->filp); + ts->filp = NULL; ts->s = NULL; trans->status = Disconnected; - dprintk(DEBUG_TRANS, "socket closed\n"); } kfree(ts); @@ -279,6 +322,7 @@ struct v9fs_transport v9fs_trans_tcp = { .write = v9fs_sock_send, .read = v9fs_sock_recv, .close = v9fs_sock_close, + .poll = v9fs_sock_poll, }; struct v9fs_transport v9fs_trans_unix = { @@ -286,4 +330,5 @@ struct v9fs_transport v9fs_trans_unix = { .write = v9fs_sock_send, .read = v9fs_sock_recv, .close = v9fs_sock_close, + .poll = v9fs_sock_poll, }; diff --git a/fs/9p/transport.h b/fs/9p/transport.h index 9e9cd418efd5..91fcdb94b361 100644 --- a/fs/9p/transport.h +++ b/fs/9p/transport.h @@ -3,6 +3,7 @@ * * Transport Definition * + * Copyright (C) 2005 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * * This program is free software; you can redistribute it and/or modify @@ -31,14 +32,13 @@ enum v9fs_transport_status { struct v9fs_transport { enum v9fs_transport_status status; - struct semaphore writelock; - struct semaphore readlock; void *priv; int (*init) (struct v9fs_session_info *, const char *, char *); int (*write) (struct v9fs_transport *, void *, int); int (*read) (struct v9fs_transport *, void *, int); void (*close) (struct v9fs_transport *); + unsigned int (*poll)(struct v9fs_transport *, struct poll_table_struct *); }; extern struct v9fs_transport v9fs_trans_tcp; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 418c3743fdee..5250c428fc1f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -37,7 +37,6 @@ #include "v9fs_vfs.h" #include "transport.h" #include "mux.h" -#include "conv.h" /* TODO: sysfs or debugfs interface */ int v9fs_debug_level = 0; /* feature-rific global debug level */ @@ -213,7 +212,8 @@ retry: return -1; } - error = idr_get_new(&p->pool, NULL, &i); + /* no need to store exactly p, we just need something non-null */ + 
error = idr_get_new(&p->pool, p, &i); up(&p->lock); if (error == -EAGAIN) @@ -243,6 +243,16 @@ void v9fs_put_idpool(int id, struct v9fs_idpool *p) } /** + * v9fs_check_idpool - check if the specified id is available + * @id - id to check + * @p - pool + */ +int v9fs_check_idpool(int id, struct v9fs_idpool *p) +{ + return idr_find(&p->pool, id) != NULL; +} + +/** * v9fs_session_init - initialize session * @v9ses: session information structure * @dev_name: device being mounted @@ -259,6 +269,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses, int n = 0; int newfid = -1; int retval = -EINVAL; + struct v9fs_str *version; v9ses->name = __getname(); if (!v9ses->name) @@ -281,9 +292,6 @@ v9fs_session_init(struct v9fs_session_info *v9ses, /* id pools that are session-dependent: FIDs and TIDs */ idr_init(&v9ses->fidpool.pool); init_MUTEX(&v9ses->fidpool.lock); - idr_init(&v9ses->tidpool.pool); - init_MUTEX(&v9ses->tidpool.lock); - switch (v9ses->proto) { case PROTO_TCP: @@ -320,7 +328,12 @@ v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->shutdown = 0; v9ses->session_hung = 0; - if ((retval = v9fs_mux_init(v9ses, dev_name)) < 0) { + v9ses->mux = v9fs_mux_init(v9ses->transport, v9ses->maxdata + V9FS_IOHDRSZ, + &v9ses->extended); + + if (IS_ERR(v9ses->mux)) { + retval = PTR_ERR(v9ses->mux); + v9ses->mux = NULL; dprintk(DEBUG_ERROR, "problem initializing mux\n"); goto SessCleanUp; } @@ -339,13 +352,16 @@ v9fs_session_init(struct v9fs_session_info *v9ses, goto FreeFcall; } - /* Really should check for 9P1 and report error */ - if (!strcmp(fcall->params.rversion.version, "9P2000.u")) { + version = &fcall->params.rversion.version; + if (version->len==8 && !memcmp(version->str, "9P2000.u", 8)) { dprintk(DEBUG_9P, "9P2000 UNIX extensions enabled\n"); v9ses->extended = 1; - } else { + } else if (version->len==6 && !memcmp(version->str, "9P2000", 6)) { dprintk(DEBUG_9P, "9P2000 legacy mode enabled\n"); v9ses->extended = 0; + } else { + retval = -EREMOTEIO; + goto 
FreeFcall; } n = fcall->params.rversion.msize; @@ -381,7 +397,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses, } if (v9ses->afid != ~0) { - if (v9fs_t_clunk(v9ses, v9ses->afid, NULL)) + if (v9fs_t_clunk(v9ses, v9ses->afid)) dprintk(DEBUG_ERROR, "clunk failed\n"); } @@ -403,13 +419,16 @@ v9fs_session_init(struct v9fs_session_info *v9ses, void v9fs_session_close(struct v9fs_session_info *v9ses) { - if (v9ses->recvproc) { - send_sig(SIGKILL, v9ses->recvproc, 1); - wait_for_completion(&v9ses->proccmpl); + if (v9ses->mux) { + v9fs_mux_destroy(v9ses->mux); + v9ses->mux = NULL; } - if (v9ses->transport) + if (v9ses->transport) { v9ses->transport->close(v9ses->transport); + kfree(v9ses->transport); + v9ses->transport = NULL; + } __putname(v9ses->name); __putname(v9ses->remotename); @@ -420,8 +439,9 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) * and cancel all pending requests. */ void v9fs_session_cancel(struct v9fs_session_info *v9ses) { + dprintk(DEBUG_ERROR, "cancel session %p\n", v9ses); v9ses->transport->status = Disconnected; - v9fs_mux_cancel_requests(v9ses, -EIO); + v9fs_mux_cancel(v9ses->mux, -EIO); } extern int v9fs_error_init(void); @@ -433,11 +453,17 @@ extern int v9fs_error_init(void); static int __init init_v9fs(void) { + int ret; + v9fs_error_init(); printk(KERN_INFO "Installing v9fs 9P2000 file system support\n"); - return register_filesystem(&v9fs_fs_type); + ret = v9fs_mux_global_init(); + if (!ret) + ret = register_filesystem(&v9fs_fs_type); + + return ret; } /** @@ -447,6 +473,7 @@ static int __init init_v9fs(void) static void __exit exit_v9fs(void) { + v9fs_mux_global_exit(); unregister_filesystem(&v9fs_fs_type); } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 45dcef42bdd6..f337da7a0eec 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -57,24 +57,14 @@ struct v9fs_session_info { /* book keeping */ struct v9fs_idpool fidpool; /* The FID pool for file descriptors */ - struct v9fs_idpool tidpool; /* The TID pool for transactions ids 
*/ - /* transport information */ struct v9fs_transport *transport; + struct v9fs_mux_data *mux; int inprogress; /* session in progress => true */ int shutdown; /* session shutting down. no more attaches. */ unsigned char session_hung; - - /* mux private data */ - struct v9fs_fcall *curfcall; - wait_queue_head_t read_wait; - struct completion fcread; - struct completion proccmpl; - struct task_struct *recvproc; - - spinlock_t muxlock; - struct list_head mux_fcalls; + struct dentry *debugfs_dir; }; /* possible values of ->proto */ @@ -84,11 +74,14 @@ enum { PROTO_FD, }; +extern struct dentry *v9fs_debugfs_root; + int v9fs_session_init(struct v9fs_session_info *, const char *, char *); struct v9fs_session_info *v9fs_inode2v9ses(struct inode *); void v9fs_session_close(struct v9fs_session_info *v9ses); int v9fs_get_idpool(struct v9fs_idpool *p); void v9fs_put_idpool(int id, struct v9fs_idpool *p); +int v9fs_check_idpool(int id, struct v9fs_idpool *p); void v9fs_session_cancel(struct v9fs_session_info *v9ses); #define V9FS_MAGIC 0x01021997 diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 2f2cea7ee3e7..c78502ad00ed 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -45,9 +45,8 @@ extern struct dentry_operations v9fs_dentry_operations; struct inode *v9fs_get_inode(struct super_block *sb, int mode); ino_t v9fs_qid2ino(struct v9fs_qid *qid); -void v9fs_mistat2inode(struct v9fs_stat *, struct inode *, - struct super_block *); +void v9fs_stat2inode(struct v9fs_stat *, struct inode *, struct super_block *); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); -void v9fs_inode2mistat(struct inode *inode, struct v9fs_stat *mistat); +void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat); void v9fs_dentry_release(struct dentry *); diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index a6aa947de0f9..2dd806dac9f1 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -40,7 +40,6 @@ 
#include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "conv.h" #include "fid.h" /** @@ -95,24 +94,22 @@ static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd) void v9fs_dentry_release(struct dentry *dentry) { + int err; + dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); if (dentry->d_fsdata != NULL) { struct list_head *fid_list = dentry->d_fsdata; struct v9fs_fid *temp = NULL; struct v9fs_fid *current_fid = NULL; - struct v9fs_fcall *fcall = NULL; list_for_each_entry_safe(current_fid, temp, fid_list, list) { - if (v9fs_t_clunk - (current_fid->v9ses, current_fid->fid, &fcall)) - dprintk(DEBUG_ERROR, "clunk failed: %s\n", - FCALL_ERROR(fcall)); + err = v9fs_t_clunk(current_fid->v9ses, current_fid->fid); - v9fs_put_idpool(current_fid->fid, - &current_fid->v9ses->fidpool); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk failed: %d name %s\n", + err, dentry->d_iname); - kfree(fcall); v9fs_fid_destroy(current_fid); } diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 57a43b8feef5..ae6d032b9b59 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -37,8 +37,8 @@ #include "debug.h" #include "v9fs.h" #include "9p.h" -#include "v9fs_vfs.h" #include "conv.h" +#include "v9fs_vfs.h" #include "fid.h" /** @@ -74,20 +74,16 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = filp->f_dentry->d_inode; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); struct v9fs_fid *file = filp->private_data; - unsigned int i, n; + unsigned int i, n, s; int fid = -1; int ret = 0; - struct v9fs_stat *mi = NULL; + struct v9fs_stat stat; int over = 0; dprintk(DEBUG_VFS, "name %s\n", filp->f_dentry->d_name.name); fid = file->fid; - mi = kmalloc(v9ses->maxdata, GFP_KERNEL); - if (!mi) - return -ENOMEM; - if (file->rdir_fcall && (filp->f_pos != file->rdir_pos)) { kfree(file->rdir_fcall); file->rdir_fcall = NULL; @@ -97,20 +93,20 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, 
filldir_t filldir) n = file->rdir_fcall->params.rread.count; i = file->rdir_fpos; while (i < n) { - int s = v9fs_deserialize_stat(v9ses, - file->rdir_fcall->params.rread.data + i, - n - i, mi, v9ses->maxdata); + s = v9fs_deserialize_stat( + file->rdir_fcall->params.rread.data + i, + n - i, &stat, v9ses->extended); if (s == 0) { dprintk(DEBUG_ERROR, - "error while deserializing mistat\n"); + "error while deserializing stat\n"); ret = -EIO; goto FreeStructs; } - over = filldir(dirent, mi->name, strlen(mi->name), - filp->f_pos, v9fs_qid2ino(&mi->qid), - dt_type(mi)); + over = filldir(dirent, stat.name.str, stat.name.len, + filp->f_pos, v9fs_qid2ino(&stat.qid), + dt_type(&stat)); if (over) { file->rdir_fpos = i; @@ -130,7 +126,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) while (!over) { ret = v9fs_t_read(v9ses, fid, filp->f_pos, - v9ses->maxdata-V9FS_IOHDRSZ, &fcall); + v9ses->maxdata-V9FS_IOHDRSZ, &fcall); if (ret < 0) { dprintk(DEBUG_ERROR, "error while reading: %d: %p\n", ret, fcall); @@ -141,19 +137,18 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) n = ret; i = 0; while (i < n) { - int s = v9fs_deserialize_stat(v9ses, - fcall->params.rread.data + i, n - i, mi, - v9ses->maxdata); + s = v9fs_deserialize_stat(fcall->params.rread.data + i, + n - i, &stat, v9ses->extended); if (s == 0) { dprintk(DEBUG_ERROR, - "error while deserializing mistat\n"); + "error while deserializing stat\n"); return -EIO; } - over = filldir(dirent, mi->name, strlen(mi->name), - filp->f_pos, v9fs_qid2ino(&mi->qid), - dt_type(mi)); + over = filldir(dirent, stat.name.str, stat.name.len, + filp->f_pos, v9fs_qid2ino(&stat.qid), + dt_type(&stat)); if (over) { file->rdir_fcall = fcall; @@ -172,7 +167,6 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir) FreeStructs: kfree(fcall); - kfree(mi); return ret; } @@ -193,18 +187,15 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) 
fid->fid); fidnum = fid->fid; - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); if (fidnum >= 0) { dprintk(DEBUG_VFS, "fidopen: %d v9f->fid: %d\n", fid->fidopen, fid->fid); - if (v9fs_t_clunk(v9ses, fidnum, NULL)) + if (v9fs_t_clunk(v9ses, fidnum)) dprintk(DEBUG_ERROR, "clunk failed\n"); - v9fs_put_idpool(fid->fid, &v9ses->fidpool); - kfree(fid->rdir_fcall); kfree(fid); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 89c849da8504..6852f0eb96ed 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -32,6 +32,7 @@ #include <linux/string.h> #include <linux/smp_lock.h> #include <linux/inet.h> +#include <linux/version.h> #include <linux/list.h> #include <asm/uaccess.h> #include <linux/idr.h> @@ -117,9 +118,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) result = v9fs_t_open(v9ses, newfid, open_mode, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, - "open failed, open_mode 0x%x: %s\n", open_mode, - FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("open failed", fcall); kfree(fcall); return result; } @@ -165,8 +164,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) return -ENOLCK; if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); invalidate_inode_pages(&inode->i_data); } @@ -257,7 +255,6 @@ v9fs_file_write(struct file *filp, const char __user * data, int result = -EIO; int rsize = 0; int total = 0; - char *buf; dprintk(DEBUG_VFS, "data %p count %d offset %x\n", data, (int)count, (int)*offset); @@ -265,28 +262,14 @@ v9fs_file_write(struct file *filp, const char __user * data, if (v9fid->iounit != 0 && rsize > v9fid->iounit) rsize = v9fid->iounit; - buf = kmalloc(v9ses->maxdata - V9FS_IOHDRSZ, GFP_KERNEL); - if (!buf) - return -ENOMEM; - do { if (count < rsize) rsize = count; - result = copy_from_user(buf, data, 
rsize); - if (result) { - dprintk(DEBUG_ERROR, "Problem copying from user\n"); - kfree(buf); - return -EFAULT; - } - - dump_data(buf, rsize); - result = v9fs_t_write(v9ses, fid, *offset, rsize, buf, &fcall); + result = v9fs_t_write(v9ses, fid, *offset, rsize, data, &fcall); if (result < 0) { - eprintk(KERN_ERR, "error while writing: %s(%d)\n", - FCALL_ERROR(fcall), result); + PRINT_FCALL_ERROR("error while writing", fcall); kfree(fcall); - kfree(buf); return result; } else *offset += result; @@ -306,7 +289,6 @@ v9fs_file_write(struct file *filp, const char __user * data, total += result; } while (count); - kfree(buf); return total; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 0ea965c3bb7d..a17b28854288 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -40,7 +40,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "conv.h" #include "fid.h" static struct inode_operations v9fs_dir_inode_operations; @@ -127,100 +126,32 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) } /** - * v9fs_blank_mistat - helper function to setup a 9P stat structure + * v9fs_blank_wstat - helper function to setup a 9P stat structure * @v9ses: 9P session info (for determining extended mode) - * @mistat: structure to initialize + * @wstat: structure to initialize * */ static void -v9fs_blank_mistat(struct v9fs_session_info *v9ses, struct v9fs_stat *mistat) +v9fs_blank_wstat(struct v9fs_wstat *wstat) { - mistat->type = ~0; - mistat->dev = ~0; - mistat->qid.type = ~0; - mistat->qid.version = ~0; - *((long long *)&mistat->qid.path) = ~0; - mistat->mode = ~0; - mistat->atime = ~0; - mistat->mtime = ~0; - mistat->length = ~0; - mistat->name = mistat->data; - mistat->uid = mistat->data; - mistat->gid = mistat->data; - mistat->muid = mistat->data; - if (v9ses->extended) { - mistat->n_uid = ~0; - mistat->n_gid = ~0; - mistat->n_muid = ~0; - mistat->extension = mistat->data; - } - *mistat->data = 0; -} - -/** - * v9fs_mistat2unix - convert 
mistat to unix stat - * @mistat: Plan 9 metadata (mistat) structure - * @buf: unix metadata (stat) structure to populate - * @sb: superblock - * - */ - -static void -v9fs_mistat2unix(struct v9fs_stat *mistat, struct stat *buf, - struct super_block *sb) -{ - struct v9fs_session_info *v9ses = sb ? sb->s_fs_info : NULL; - - buf->st_nlink = 1; - - buf->st_atime = mistat->atime; - buf->st_mtime = mistat->mtime; - buf->st_ctime = mistat->mtime; - - buf->st_uid = (unsigned short)-1; - buf->st_gid = (unsigned short)-1; - - if (v9ses && v9ses->extended) { - /* TODO: string to uid mapping via user-space daemon */ - if (mistat->n_uid != -1) - sscanf(mistat->uid, "%x", (unsigned int *)&buf->st_uid); - - if (mistat->n_gid != -1) - sscanf(mistat->gid, "%x", (unsigned int *)&buf->st_gid); - } - - if (buf->st_uid == (unsigned short)-1) - buf->st_uid = v9ses->uid; - if (buf->st_gid == (unsigned short)-1) - buf->st_gid = v9ses->gid; - - buf->st_mode = p9mode2unixmode(v9ses, mistat->mode); - if ((S_ISBLK(buf->st_mode)) || (S_ISCHR(buf->st_mode))) { - char type = 0; - int major = -1; - int minor = -1; - sscanf(mistat->extension, "%c %u %u", &type, &major, &minor); - switch (type) { - case 'c': - buf->st_mode &= ~S_IFBLK; - buf->st_mode |= S_IFCHR; - break; - case 'b': - break; - default: - dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n", - type, mistat->extension); - }; - buf->st_rdev = MKDEV(major, minor); - } else - buf->st_rdev = 0; - - buf->st_size = mistat->length; - - buf->st_blksize = sb->s_blocksize; - buf->st_blocks = - (buf->st_size + buf->st_blksize - 1) >> sb->s_blocksize_bits; + wstat->type = ~0; + wstat->dev = ~0; + wstat->qid.type = ~0; + wstat->qid.version = ~0; + *((long long *)&wstat->qid.path) = ~0; + wstat->mode = ~0; + wstat->atime = ~0; + wstat->mtime = ~0; + wstat->length = ~0; + wstat->name = NULL; + wstat->uid = NULL; + wstat->gid = NULL; + wstat->muid = NULL; + wstat->n_uid = ~0; + wstat->n_gid = ~0; + wstat->n_muid = ~0; + wstat->extension = NULL; } 
/** @@ -312,12 +243,12 @@ v9fs_create(struct inode *dir, struct inode *file_inode = NULL; struct v9fs_fcall *fcall = NULL; struct v9fs_qid qid; - struct stat newstat; int dirfidnum = -1; long newfid = -1; int result = 0; unsigned int iounit = 0; int wfidno = -1; + int err; perm = unixmode2p9mode(v9ses, perm); @@ -349,57 +280,64 @@ v9fs_create(struct inode *dir, result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("clone error", fcall); v9fs_put_idpool(newfid, &v9ses->fidpool); newfid = -1; goto CleanUpFid; } kfree(fcall); + fcall = NULL; result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name, perm, open_mode, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "create fails: %s(%d)\n", - FCALL_ERROR(fcall), result); - + PRINT_FCALL_ERROR("create fails", fcall); goto CleanUpFid; } iounit = fcall->params.rcreate.iounit; qid = fcall->params.rcreate.qid; kfree(fcall); + fcall = NULL; - fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1); - dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate); - if (!fid) { - result = -ENOMEM; - goto CleanUpFid; - } + if (!(perm&V9FS_DMDIR)) { + fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1); + dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate); + if (!fid) { + result = -ENOMEM; + goto CleanUpFid; + } - fid->qid = qid; - fid->iounit = iounit; + fid->qid = qid; + fid->iounit = iounit; + } else { + err = v9fs_t_clunk(v9ses, newfid); + newfid = -1; + if (err < 0) + dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err); + } /* walk to the newly created file and put the fid in the dentry */ wfidno = v9fs_get_idpool(&v9ses->fidpool); - if (newfid < 0) { + if (wfidno < 0) { eprintk(KERN_WARNING, "no free fids available\n"); return -ENOSPC; } result = v9fs_t_walk(v9ses, dirfidnum, wfidno, - (char *) file_dentry->d_name.name, NULL); + (char *) file_dentry->d_name.name, &fcall); if (result < 0) { - 
dprintk(DEBUG_ERROR, "clone error: %s\n", FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("clone error", fcall); v9fs_put_idpool(wfidno, &v9ses->fidpool); wfidno = -1; goto CleanUpFid; } + kfree(fcall); + fcall = NULL; if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) { - if (!v9fs_t_clunk(v9ses, newfid, &fcall)) { - v9fs_put_idpool(wfidno, &v9ses->fidpool); - } + v9fs_put_idpool(wfidno, &v9ses->fidpool); goto CleanUpFid; } @@ -409,62 +347,43 @@ v9fs_create(struct inode *dir, (perm & V9FS_DMDEVICE)) return 0; - result = v9fs_t_stat(v9ses, newfid, &fcall); + result = v9fs_t_stat(v9ses, wfidno, &fcall); if (result < 0) { - dprintk(DEBUG_ERROR, "stat error: %s(%d)\n", FCALL_ERROR(fcall), - result); + PRINT_FCALL_ERROR("stat error", fcall); goto CleanUpFid; } - v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb); - file_inode = v9fs_get_inode(sb, newstat.st_mode); + file_inode = v9fs_get_inode(sb, + p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode)); + if ((!file_inode) || IS_ERR(file_inode)) { dprintk(DEBUG_ERROR, "create inode failed\n"); result = -EBADF; goto CleanUpFid; } - v9fs_mistat2inode(fcall->params.rstat.stat, file_inode, sb); + v9fs_stat2inode(&fcall->params.rstat.stat, file_inode, sb); kfree(fcall); fcall = NULL; file_dentry->d_op = &v9fs_dentry_operations; d_instantiate(file_dentry, file_inode); - if (perm & V9FS_DMDIR) { - if (!v9fs_t_clunk(v9ses, newfid, &fcall)) - v9fs_put_idpool(newfid, &v9ses->fidpool); - else - dprintk(DEBUG_ERROR, "clunk for mkdir failed: %s\n", - FCALL_ERROR(fcall)); - kfree(fcall); - fid->fidopen = 0; - fid->fidcreate = 0; - d_drop(file_dentry); - } - return 0; CleanUpFid: kfree(fcall); + fcall = NULL; if (newfid >= 0) { - if (!v9fs_t_clunk(v9ses, newfid, &fcall)) - v9fs_put_idpool(newfid, &v9ses->fidpool); - else - dprintk(DEBUG_ERROR, "clunk failed: %s\n", - FCALL_ERROR(fcall)); - - kfree(fcall); + err = v9fs_t_clunk(v9ses, newfid); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); } if (wfidno >= 0) { - if 
(!v9fs_t_clunk(v9ses, wfidno, &fcall)) - v9fs_put_idpool(wfidno, &v9ses->fidpool); - else - dprintk(DEBUG_ERROR, "clunk failed: %s\n", - FCALL_ERROR(fcall)); - - kfree(fcall); + err = v9fs_t_clunk(v9ses, wfidno); + if (err < 0) + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); } return result; } @@ -509,10 +428,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) } result = v9fs_t_remove(v9ses, fid, &fcall); - if (result < 0) - dprintk(DEBUG_ERROR, "remove of file fails: %s(%d)\n", - FCALL_ERROR(fcall), result); - else { + if (result < 0) { + PRINT_FCALL_ERROR("remove fails", fcall); + } else { v9fs_put_idpool(fid, &v9ses->fidpool); v9fs_fid_destroy(v9fid); } @@ -567,7 +485,6 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct v9fs_fid *fid; struct inode *inode; struct v9fs_fcall *fcall = NULL; - struct stat newstat; int dirfidnum = -1; int newfid = -1; int result = 0; @@ -620,8 +537,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; } - v9fs_mistat2unix(fcall->params.rstat.stat, &newstat, sb); - inode = v9fs_get_inode(sb, newstat.st_mode); + inode = v9fs_get_inode(sb, p9mode2unixmode(v9ses, + fcall->params.rstat.stat.mode)); if (IS_ERR(inode) && (PTR_ERR(inode) == -ENOSPC)) { eprintk(KERN_WARNING, "inode alloc failes, returns %ld\n", @@ -631,7 +548,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; } - inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat->qid); + inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid); fid = v9fs_fid_create(dentry, v9ses, newfid, 0); if (fid == NULL) { @@ -640,10 +557,10 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, goto FreeFcall; } - fid->qid = fcall->params.rstat.stat->qid; + fid->qid = fcall->params.rstat.stat.qid; dentry->d_op = &v9fs_dentry_operations; - v9fs_mistat2inode(fcall->params.rstat.stat, inode, inode->i_sb); + 
v9fs_stat2inode(&fcall->params.rstat.stat, inode, inode->i_sb); d_add(dentry, inode); kfree(fcall); @@ -699,7 +616,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, v9fs_fid_lookup(old_dentry->d_parent); struct v9fs_fid *newdirfid = v9fs_fid_lookup(new_dentry->d_parent); - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); + struct v9fs_wstat wstat; struct v9fs_fcall *fcall = NULL; int fid = -1; int olddirfidnum = -1; @@ -708,9 +625,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, dprintk(DEBUG_VFS, "\n"); - if (!mistat) - return -ENOMEM; - if ((!oldfid) || (!olddirfid) || (!newdirfid)) { dprintk(DEBUG_ERROR, "problem with arguments\n"); return -EBADF; @@ -734,33 +648,22 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto FreeFcallnBail; } - v9fs_blank_mistat(v9ses, mistat); + v9fs_blank_wstat(&wstat); + wstat.muid = v9ses->name; + wstat.name = (char *) new_dentry->d_name.name; - strcpy(mistat->data + 1, v9ses->name); - mistat->name = mistat->data + 1 + strlen(v9ses->name); - - if (new_dentry->d_name.len > - (v9ses->maxdata - strlen(v9ses->name) - sizeof(struct v9fs_stat))) { - dprintk(DEBUG_ERROR, "new name too long\n"); - goto FreeFcallnBail; - } - - strcpy(mistat->name, new_dentry->d_name.name); - retval = v9fs_t_wstat(v9ses, fid, mistat, &fcall); + retval = v9fs_t_wstat(v9ses, fid, &wstat, &fcall); FreeFcallnBail: - kfree(mistat); - if (retval < 0) - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("wstat error", fcall); kfree(fcall); return retval; } /** - * v9fs_vfs_getattr - retreive file metadata + * v9fs_vfs_getattr - retrieve file metadata * @mnt - mount information * @dentry - file to get attributes on * @stat - metadata structure to populate @@ -788,7 +691,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, if (err < 0) dprintk(DEBUG_ERROR, "stat error\n"); else { - v9fs_mistat2inode(fcall->params.rstat.stat, 
dentry->d_inode, + v9fs_stat2inode(&fcall->params.rstat.stat, dentry->d_inode, dentry->d_inode->i_sb); generic_fillattr(dentry->d_inode, stat); } @@ -809,57 +712,44 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dentry->d_inode); struct v9fs_fid *fid = v9fs_fid_lookup(dentry); struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); + struct v9fs_wstat wstat; int res = -EPERM; dprintk(DEBUG_VFS, "\n"); - if (!mistat) - return -ENOMEM; - if (!fid) { dprintk(DEBUG_ERROR, "Couldn't find fid associated with dentry\n"); return -EBADF; } - v9fs_blank_mistat(v9ses, mistat); + v9fs_blank_wstat(&wstat); if (iattr->ia_valid & ATTR_MODE) - mistat->mode = unixmode2p9mode(v9ses, iattr->ia_mode); + wstat.mode = unixmode2p9mode(v9ses, iattr->ia_mode); if (iattr->ia_valid & ATTR_MTIME) - mistat->mtime = iattr->ia_mtime.tv_sec; + wstat.mtime = iattr->ia_mtime.tv_sec; if (iattr->ia_valid & ATTR_ATIME) - mistat->atime = iattr->ia_atime.tv_sec; + wstat.atime = iattr->ia_atime.tv_sec; if (iattr->ia_valid & ATTR_SIZE) - mistat->length = iattr->ia_size; + wstat.length = iattr->ia_size; if (v9ses->extended) { - char *ptr = mistat->data+1; - - if (iattr->ia_valid & ATTR_UID) { - mistat->uid = ptr; - ptr += 1+sprintf(ptr, "%08x", iattr->ia_uid); - mistat->n_uid = iattr->ia_uid; - } + if (iattr->ia_valid & ATTR_UID) + wstat.n_uid = iattr->ia_uid; - if (iattr->ia_valid & ATTR_GID) { - mistat->gid = ptr; - ptr += 1+sprintf(ptr, "%08x", iattr->ia_gid); - mistat->n_gid = iattr->ia_gid; - } + if (iattr->ia_valid & ATTR_GID) + wstat.n_gid = iattr->ia_gid; } - res = v9fs_t_wstat(v9ses, fid->fid, mistat, &fcall); + res = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); if (res < 0) - dprintk(DEBUG_ERROR, "wstat error: %s\n", FCALL_ERROR(fcall)); + PRINT_FCALL_ERROR("wstat error", fcall); - kfree(mistat); kfree(fcall); - if (res >= 0) res = inode_setattr(dentry->d_inode, iattr); @@ 
-867,51 +757,47 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) } /** - * v9fs_mistat2inode - populate an inode structure with mistat info - * @mistat: Plan 9 metadata (mistat) structure + * v9fs_stat2inode - populate an inode structure with mistat info + * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate * @sb: superblock of filesystem * */ void -v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode, - struct super_block *sb) +v9fs_stat2inode(struct v9fs_stat *stat, struct inode *inode, + struct super_block *sb) { + int n; + char ext[32]; struct v9fs_session_info *v9ses = sb->s_fs_info; inode->i_nlink = 1; - inode->i_atime.tv_sec = mistat->atime; - inode->i_mtime.tv_sec = mistat->mtime; - inode->i_ctime.tv_sec = mistat->mtime; + inode->i_atime.tv_sec = stat->atime; + inode->i_mtime.tv_sec = stat->mtime; + inode->i_ctime.tv_sec = stat->mtime; - inode->i_uid = -1; - inode->i_gid = -1; + inode->i_uid = v9ses->uid; + inode->i_gid = v9ses->gid; if (v9ses->extended) { - /* TODO: string to uid mapping via user-space daemon */ - inode->i_uid = mistat->n_uid; - inode->i_gid = mistat->n_gid; - - if (mistat->n_uid == -1) - sscanf(mistat->uid, "%x", &inode->i_uid); - - if (mistat->n_gid == -1) - sscanf(mistat->gid, "%x", &inode->i_gid); + inode->i_uid = stat->n_uid; + inode->i_gid = stat->n_gid; } - if (inode->i_uid == -1) - inode->i_uid = v9ses->uid; - if (inode->i_gid == -1) - inode->i_gid = v9ses->gid; - - inode->i_mode = p9mode2unixmode(v9ses, mistat->mode); + inode->i_mode = p9mode2unixmode(v9ses, stat->mode); if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { char type = 0; int major = -1; int minor = -1; - sscanf(mistat->extension, "%c %u %u", &type, &major, &minor); + + n = stat->extension.len; + if (n > sizeof(ext)-1) + n = sizeof(ext)-1; + memmove(ext, stat->extension.str, n); + ext[n] = 0; + sscanf(ext, "%c %u %u", &type, &major, &minor); switch (type) { case 'c': inode->i_mode &= ~S_IFBLK; @@ 
-920,14 +806,14 @@ v9fs_mistat2inode(struct v9fs_stat *mistat, struct inode *inode, case 'b': break; default: - dprintk(DEBUG_ERROR, "Unknown special type %c (%s)\n", - type, mistat->extension); + dprintk(DEBUG_ERROR, "Unknown special type %c (%.*s)\n", + type, stat->extension.len, stat->extension.str); }; inode->i_rdev = MKDEV(major, minor); } else inode->i_rdev = 0; - inode->i_size = mistat->length; + inode->i_size = stat->length; inode->i_blksize = sb->s_blocksize; inode->i_blocks = @@ -955,71 +841,6 @@ ino_t v9fs_qid2ino(struct v9fs_qid *qid) } /** - * v9fs_vfs_symlink - helper function to create symlinks - * @dir: directory inode containing symlink - * @dentry: dentry for symlink - * @symname: symlink data - * - * See 9P2000.u RFC for more information - * - */ - -static int -v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) -{ - int retval = -EPERM; - struct v9fs_fid *newfid; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); - - dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, - symname); - - if (!mistat) - return -ENOMEM; - - if (!v9ses->extended) { - dprintk(DEBUG_ERROR, "not extended\n"); - goto FreeFcall; - } - - /* issue a create */ - retval = v9fs_create(dir, dentry, S_IFLNK, 0); - if (retval != 0) - goto FreeFcall; - - newfid = v9fs_fid_lookup(dentry); - - /* issue a twstat */ - v9fs_blank_mistat(v9ses, mistat); - strcpy(mistat->data + 1, symname); - mistat->extension = mistat->data + 1; - retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall); - if (retval < 0) { - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); - goto FreeFcall; - } - - kfree(fcall); - - if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n", - FCALL_ERROR(fcall)); - goto FreeFcall; - } - - d_drop(dentry); /* FID - will this also clunk? 
*/ - - FreeFcall: - kfree(mistat); - kfree(fcall); - - return retval; -} - -/** * v9fs_readlink - read a symlink's location (internal version) * @dentry: dentry for symlink * @buffer: buffer to load symlink location into @@ -1058,16 +879,17 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) if (!fcall) return -EIO; - if (!(fcall->params.rstat.stat->mode & V9FS_DMSYMLINK)) { + if (!(fcall->params.rstat.stat.mode & V9FS_DMSYMLINK)) { retval = -EINVAL; goto FreeFcall; } /* copy extension buffer into buffer */ - if (strlen(fcall->params.rstat.stat->extension) < buflen) - buflen = strlen(fcall->params.rstat.stat->extension); + if (fcall->params.rstat.stat.extension.len < buflen) + buflen = fcall->params.rstat.stat.extension.len; - memcpy(buffer, fcall->params.rstat.stat->extension, buflen + 1); + memcpy(buffer, fcall->params.rstat.stat.extension.str, buflen - 1); + buffer[buflen-1] = 0; retval = buflen; @@ -1157,6 +979,77 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void __putname(s); } +static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, + int mode, const char *extension) +{ + int err, retval; + struct v9fs_session_info *v9ses; + struct v9fs_fcall *fcall; + struct v9fs_fid *fid; + struct v9fs_wstat wstat; + + v9ses = v9fs_inode2v9ses(dir); + retval = -EPERM; + fcall = NULL; + + if (!v9ses->extended) { + dprintk(DEBUG_ERROR, "not extended\n"); + goto free_mem; + } + + /* issue a create */ + retval = v9fs_create(dir, dentry, mode, 0); + if (retval != 0) + goto free_mem; + + fid = v9fs_fid_get_created(dentry); + if (!fid) { + dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n"); + goto free_mem; + } + + /* issue a Twstat */ + v9fs_blank_wstat(&wstat); + wstat.muid = v9ses->name; + wstat.extension = (char *) extension; + retval = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); + if (retval < 0) { + PRINT_FCALL_ERROR("wstat error", fcall); + goto free_mem; + } + + err = v9fs_t_clunk(v9ses, 
fid->fid); + if (err < 0) { + dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); + goto free_mem; + } + + d_drop(dentry); /* FID - will this also clunk? */ + +free_mem: + kfree(fcall); + return retval; +} + +/** + * v9fs_vfs_symlink - helper function to create symlinks + * @dir: directory inode containing symlink + * @dentry: dentry for symlink + * @symname: symlink data + * + * See 9P2000.u RFC for more information + * + */ + +static int +v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +{ + dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, + symname); + + return v9fs_vfs_mkspecial(dir, dentry, S_IFLNK, symname); +} + /** * v9fs_vfs_link - create a hardlink * @old_dentry: dentry for file to link to @@ -1173,64 +1066,24 @@ static int v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - int retval = -EPERM; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); - struct v9fs_fid *oldfid = v9fs_fid_lookup(old_dentry); - struct v9fs_fid *newfid = NULL; - char *symname = __getname(); + int retval; + struct v9fs_fid *oldfid; + char *name; dprintk(DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, old_dentry->d_name.name); - if (!v9ses->extended) { - dprintk(DEBUG_ERROR, "not extended\n"); - goto FreeMem; - } - - /* get fid of old_dentry */ - sprintf(symname, "hardlink(%d)\n", oldfid->fid); - - /* issue a create */ - retval = v9fs_create(dir, dentry, V9FS_DMLINK, 0); - if (retval != 0) - goto FreeMem; - - newfid = v9fs_fid_lookup(dentry); - if (!newfid) { - dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n"); - goto FreeMem; - } - - /* issue a twstat */ - v9fs_blank_mistat(v9ses, mistat); - strcpy(mistat->data + 1, symname); - mistat->extension = mistat->data + 1; - retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall); - if (retval < 0) { - dprintk(DEBUG_ERROR, 
"v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); - goto FreeMem; - } - - kfree(fcall); - - if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n", - FCALL_ERROR(fcall)); - goto FreeMem; + oldfid = v9fs_fid_lookup(old_dentry); + if (!oldfid) { + dprintk(DEBUG_ERROR, "can't find oldfid\n"); + return -EPERM; } - d_drop(dentry); /* FID - will this also clunk? */ - - kfree(fcall); - fcall = NULL; + name = __getname(); + sprintf(name, "hardlink(%d)\n", oldfid->fid); + retval = v9fs_vfs_mkspecial(dir, dentry, V9FS_DMLINK, name); + __putname(name); - FreeMem: - kfree(mistat); - kfree(fcall); - __putname(symname); return retval; } @@ -1246,82 +1099,30 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, static int v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) { - int retval = -EPERM; - struct v9fs_fid *newfid; - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct v9fs_fcall *fcall = NULL; - struct v9fs_stat *mistat = kmalloc(v9ses->maxdata, GFP_KERNEL); - char *symname = __getname(); + int retval; + char *name; dprintk(DEBUG_VFS, " %lu,%s mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); - if (!mistat) - return -ENOMEM; - - if (!new_valid_dev(rdev)) { - retval = -EINVAL; - goto FreeMem; - } - - if (!v9ses->extended) { - dprintk(DEBUG_ERROR, "not extended\n"); - goto FreeMem; - } - - /* issue a create */ - retval = v9fs_create(dir, dentry, mode, 0); - - if (retval != 0) - goto FreeMem; - - newfid = v9fs_fid_lookup(dentry); - if (!newfid) { - dprintk(DEBUG_ERROR, "coudn't resove fid from dentry\n"); - retval = -EINVAL; - goto FreeMem; - } + if (!new_valid_dev(rdev)) + return -EINVAL; + name = __getname(); /* build extension */ if (S_ISBLK(mode)) - sprintf(symname, "b %u %u", MAJOR(rdev), MINOR(rdev)); + sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISCHR(mode)) - sprintf(symname, "c %u %u", MAJOR(rdev), 
MINOR(rdev)); + sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISFIFO(mode)) - ; /* DO NOTHING */ + *name = 0; else { - retval = -EINVAL; - goto FreeMem; - } - - if (!S_ISFIFO(mode)) { - /* issue a twstat */ - v9fs_blank_mistat(v9ses, mistat); - strcpy(mistat->data + 1, symname); - mistat->extension = mistat->data + 1; - retval = v9fs_t_wstat(v9ses, newfid->fid, mistat, &fcall); - if (retval < 0) { - dprintk(DEBUG_ERROR, "v9fs_t_wstat error: %s\n", - FCALL_ERROR(fcall)); - goto FreeMem; - } + __putname(name); + return -EINVAL; } - /* need to update dcache so we show up */ - kfree(fcall); - - if (v9fs_t_clunk(v9ses, newfid->fid, &fcall)) { - dprintk(DEBUG_ERROR, "clunk for symlink failed: %s\n", - FCALL_ERROR(fcall)); - goto FreeMem; - } - - d_drop(dentry); /* FID - will this also clunk? */ - - FreeMem: - kfree(mistat); - kfree(fcall); - __putname(symname); + retval = v9fs_vfs_mkspecial(dir, dentry, mode, name); + __putname(name); return retval; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 82c5b0084079..2c4fa75be025 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -44,7 +44,6 @@ #include "v9fs.h" #include "9p.h" #include "v9fs_vfs.h" -#include "conv.h" #include "fid.h" static void v9fs_clear_inode(struct inode *); @@ -92,7 +91,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_op = &v9fs_super_ops; sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | - MS_NODIRATIME | MS_NOATIME; + MS_NOATIME; } /** @@ -123,12 +122,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type dprintk(DEBUG_VFS, " \n"); - v9ses = kcalloc(1, sizeof(struct v9fs_session_info), GFP_KERNEL); + v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return ERR_PTR(-ENOMEM); if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); + kfree(v9ses); return ERR_PTR(newfid); } @@ -157,7 +157,7 @@ static struct super_block 
*v9fs_get_sb(struct file_system_type stat_result = v9fs_t_stat(v9ses, newfid, &fcall); if (stat_result < 0) { dprintk(DEBUG_ERROR, "stat error\n"); - v9fs_t_clunk(v9ses, newfid, NULL); + v9fs_t_clunk(v9ses, newfid); v9fs_put_idpool(newfid, &v9ses->fidpool); } else { /* Setup the Root Inode */ @@ -167,10 +167,10 @@ static struct super_block *v9fs_get_sb(struct file_system_type goto put_back_sb; } - root_fid->qid = fcall->params.rstat.stat->qid; + root_fid->qid = fcall->params.rstat.stat.qid; root->d_inode->i_ino = - v9fs_qid2ino(&fcall->params.rstat.stat->qid); - v9fs_mistat2inode(fcall->params.rstat.stat, root->d_inode, sb); + v9fs_qid2ino(&fcall->params.rstat.stat.qid); + v9fs_stat2inode(&fcall->params.rstat.stat, root->d_inode, sb); } kfree(fcall); diff --git a/fs/Kconfig b/fs/Kconfig index d5255e627b5f..ef78e3a42d32 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -70,6 +70,7 @@ config FS_XIP config EXT3_FS tristate "Ext3 journalling file system support" + select JBD help This is the journaling version of the Second extended file system (often called ext3), the de facto standard Linux file system @@ -138,23 +139,20 @@ config EXT3_FS_SECURITY extended attributes for file security labels, say N. config JBD -# CONFIG_JBD could be its own option (even modular), but until there are -# other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS -# dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS tristate - default EXT3_FS help This is a generic journaling layer for block devices. It is - currently used by the ext3 file system, but it could also be used to - add journal support to other file systems or block devices such as - RAID or LVM. + currently used by the ext3 and OCFS2 file systems, but it could + also be used to add journal support to other file systems or block + devices such as RAID or LVM. - If you are using the ext3 file system, you need to say Y here. 
If - you are not using ext3 then you will probably want to say N. + If you are using the ext3 or OCFS2 file systems, you need to + say Y here. If you are not using ext3 OCFS2 then you will probably + want to say N. To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 into the kernel, you cannot - compile this code as a module. + called jbd. If you are compiling ext3 or OCFS2 into the kernel, + you cannot compile this code as a module. config JBD_DEBUG bool "JBD (ext3) debugging support" @@ -326,6 +324,38 @@ config FS_POSIX_ACL source "fs/xfs/Kconfig" +config OCFS2_FS + tristate "OCFS2 file system support (EXPERIMENTAL)" + depends on NET && EXPERIMENTAL + select CONFIGFS_FS + select JBD + select CRC32 + select INET + help + OCFS2 is a general purpose extent based shared disk cluster file + system with many similarities to ext3. It supports 64 bit inode + numbers, and has automatically extending metadata groups which may + also make it attractive for non-clustered use. + + You'll want to install the ocfs2-tools package in order to at least + get "mount.ocfs2". + + Project web page: http://oss.oracle.com/projects/ocfs2 + Tools web page: http://oss.oracle.com/projects/ocfs2-tools + OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ + + Note: Features which OCFS2 does not support yet: + - extended attributes + - shared writeable mmap + - loopback is supported, but data written will not + be cluster coherent. 
+ - quotas + - cluster aware flock + - Directory change notification (F_NOTIFY) + - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) + - POSIX ACLs + - readpages / writepages (not user visible) + config MINIX_FS tristate "Minix fs support" help @@ -768,7 +798,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" - depends on PROC_FS && EMBEDDED && EXPERIMENTAL && CRASH_DUMP + depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP help Exports the dump image of crashed kernel in ELF format. @@ -841,6 +871,20 @@ config RELAYFS_FS If unsure, say N. +config CONFIGFS_FS + tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + configfs is a ram-based filesystem that provides the converse + of sysfs's functionality. Where sysfs is a filesystem-based + view of kernel objects, configfs is a filesystem-based manager + of kernel objects, or config_items. + + Both sysfs and configfs can and should exist together on the + same system. One is not a replacement for the other. + + If unsure, say N. 
+ endmenu menu "Miscellaneous filesystems" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 175b2e8177c1..f3d3d81eb7e9 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -1,6 +1,6 @@ config BINFMT_ELF bool "Kernel support for ELF binaries" - depends on MMU + depends on MMU && (BROKEN || !FRV) default y ---help--- ELF (Executable and Linkable Format) is a format for libraries and diff --git a/fs/Makefile b/fs/Makefile index 4c2655759078..1db711319c80 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -10,11 +10,11 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ - ioprio.o pnode.o + ioprio.o pnode.o drop_caches.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_EPOLL) += eventpoll.o -obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o nfsd-$(CONFIG_NFSD) := nfsctl.o obj-y += $(nfsd-y) $(nfsd-m) @@ -101,3 +101,5 @@ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ +obj-$(CONFIG_CONFIGFS_FS) += configfs/ +obj-$(CONFIG_OCFS2_FS) += ocfs2/ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 9ebe881c6786..44d439cb69f4 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -244,10 +244,10 @@ affs_put_inode(struct inode *inode) pr_debug("AFFS: put_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); affs_free_prealloc(inode); if (atomic_read(&inode->i_count) == 1) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (inode->i_size != AFFS_I(inode)->mmu_private) affs_truncate(inode); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } } diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 0a57fd7c726f..9eef6bf156ab 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -118,7 +118,7 @@ static int kafscmd(void *arg) 
_SRXAFSCM_xxxx_t func; int die; - printk("kAFS: Started kafscmd %d\n", current->pid); + printk(KERN_INFO "kAFS: Started kafscmd %d\n", current->pid); daemonize("kafscmd"); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 6682d6d7f294..5c61c24dab2a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -137,7 +137,7 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page) #endif /* determine how many magic numbers there should be in this page */ - latter = dir->i_size - (page->index << PAGE_CACHE_SHIFT); + latter = dir->i_size - page_offset(page); if (latter >= PAGE_SIZE) qty = PAGE_SIZE; else diff --git a/fs/afs/volume.h b/fs/afs/volume.h index 1e691889c4c9..bfdcf19ba3f3 100644 --- a/fs/afs/volume.h +++ b/fs/afs/volume.h @@ -18,8 +18,6 @@ #include "kafsasyncd.h" #include "cache.h" -#define __packed __attribute__((packed)) - typedef enum { AFS_VLUPD_SLEEP, /* sleeping waiting for update timer to fire */ AFS_VLUPD_PENDING, /* on pending queue */ @@ -115,7 +113,7 @@ struct afs_volume struct cachefs_cookie *cache; /* caching cookie */ #endif afs_volid_t vid; /* volume ID */ - afs_voltype_t __packed type; /* type of volume */ + afs_voltype_t type; /* type of volume */ char type_force; /* force volume type (suppress R/O -> R/W) */ unsigned short nservers; /* number of server slots filled */ unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ @@ -29,7 +29,6 @@ #include <linux/highmem.h> #include <linux/workqueue.h> #include <linux/security.h> -#include <linux/rcuref.h> #include <asm/kmap_types.h> #include <asm/uaccess.h> @@ -514,7 +513,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) /* Must be done under the lock to serialise against cancellation. * Call this aio_fput as it duplicates fput via the fput_work. 
*/ - if (unlikely(rcuref_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); diff --git a/fs/attr.c b/fs/attr.c index 67bcd9b14ea5..97de94670878 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -10,11 +10,11 @@ #include <linux/mm.h> #include <linux/string.h> #include <linux/smp_lock.h> +#include <linux/capability.h> #include <linux/fsnotify.h> #include <linux/fcntl.h> #include <linux/quotaops.h> #include <linux/security.h> -#include <linux/time.h> /* Taken over from the old code... */ @@ -67,20 +67,12 @@ EXPORT_SYMBOL(inode_change_ok); int inode_setattr(struct inode * inode, struct iattr * attr) { unsigned int ia_valid = attr->ia_valid; - int error = 0; - - if (ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { - error = vmtruncate(inode, attr->ia_size); - if (error || (ia_valid == ATTR_SIZE)) - goto out; - } else { - /* - * We skipped the truncate but must still update - * timestamps - */ - ia_valid |= ATTR_MTIME|ATTR_CTIME; - } + + if (ia_valid & ATTR_SIZE && + attr->ia_size != i_size_read(inode)) { + int error = vmtruncate(inode, attr->ia_size); + if (error) + return error; } if (ia_valid & ATTR_UID) @@ -104,8 +96,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr) inode->i_mode = mode; } mark_inode_dirty(inode); -out: - return error; + + return 0; } EXPORT_SYMBOL(inode_setattr); diff --git a/fs/autofs/root.c b/fs/autofs/root.c index a1ab1c0ed215..870e2cf33016 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -10,6 +10,7 @@ * * ------------------------------------------------------------------------- */ +#include <linux/capability.h> #include <linux/errno.h> #include <linux/stat.h> #include <linux/param.h> @@ -229,9 +230,9 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr dentry->d_flags |= DCACHE_AUTOFS_PENDING; d_add(dentry, NULL); - up(&dir->i_sem); + 
mutex_unlock(&dir->i_mutex); autofs_revalidate(dentry, nd); - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); /* * If we are still pending, check if we had to handle diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index fca83e28edcf..385bed09b0d8 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -209,7 +209,7 @@ static inline int simple_empty_nolock(struct dentry *dentry) struct dentry *child; int ret = 0; - list_for_each_entry(child, &dentry->d_subdirs, d_child) + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) if (simple_positive(child)) goto out; ret = 1; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index feb6ac427d05..dc39589df165 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -105,7 +105,7 @@ repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - give up */ if (!simple_positive(dentry)) { @@ -138,7 +138,7 @@ resume: } if (this_parent != top) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -163,7 +163,7 @@ repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - give up */ if (!simple_positive(dentry)) { @@ -199,7 +199,7 @@ cont: } if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -238,7 +238,7 @@ static struct dentry *autofs4_expire(struct super_block *sb, /* On exit from the loop expire is set to a dgot dentry * to expire or it's NULL */ while ( next != &root->d_subdirs ) { - struct dentry *dentry = 
list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - give up */ if ( !simple_positive(dentry) ) { @@ -302,7 +302,7 @@ next: expired, (int)expired->d_name.len, expired->d_name.name); spin_lock(&dcache_lock); list_del(&expired->d_parent->d_subdirs); - list_add(&expired->d_parent->d_subdirs, &expired->d_child); + list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); return expired; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 818b37be5153..2d3082854a29 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -91,7 +91,7 @@ repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { - struct dentry *dentry = list_entry(next, struct dentry, d_child); + struct dentry *dentry = list_entry(next, struct dentry, d_u.d_child); /* Negative dentry - don`t care */ if (!simple_positive(dentry)) { @@ -117,7 +117,7 @@ resume: if (this_parent != sbi->root) { struct dentry *dentry = this_parent; - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; spin_unlock(&dcache_lock); DPRINTK("parent dentry %p %.*s", diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2a771ec66956..e93a7ae467c9 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -12,6 +12,7 @@ * * ------------------------------------------------------------------------- */ +#include <linux/capability.h> #include <linux/errno.h> #include <linux/stat.h> #include <linux/param.h> @@ -86,7 +87,7 @@ static int autofs4_root_readdir(struct file *file, void *dirent, /* Update usage from here to top of tree, so that scan of top-level directories will give a useful result */ -static void autofs4_update_usage(struct dentry *dentry) +static void autofs4_update_usage(struct vfsmount *mnt, struct dentry *dentry) { struct dentry *top = dentry->d_sb->s_root; @@ -95,7 +96,7 @@ static void 
autofs4_update_usage(struct dentry *dentry) struct autofs_info *ino = autofs4_dentry_ino(dentry); if (ino) { - update_atime(dentry->d_inode); + touch_atime(mnt, dentry); ino->last_used = jiffies; } } @@ -143,7 +144,8 @@ static int autofs4_dcache_readdir(struct file * filp, void * dirent, filldir_t f } while(1) { - struct dentry *de = list_entry(list, struct dentry, d_child); + struct dentry *de = list_entry(list, + struct dentry, d_u.d_child); if (!d_unhashed(de) && de->d_inode) { spin_unlock(&dcache_lock); @@ -288,10 +290,10 @@ out: return autofs4_dcache_readdir(file, dirent, filldir); } -static int try_to_fill_dentry(struct dentry *dentry, - struct super_block *sb, - struct autofs_sb_info *sbi, int flags) +static int try_to_fill_dentry(struct vfsmount *mnt, struct dentry *dentry, int flags) { + struct super_block *sb = mnt->mnt_sb; + struct autofs_sb_info *sbi = autofs4_sbi(sb); struct autofs_info *de_info = autofs4_dentry_ino(dentry); int status = 0; @@ -366,7 +368,7 @@ static int try_to_fill_dentry(struct dentry *dentry, /* We don't update the usages for the autofs daemon itself, this is necessary for recursive autofs mounts */ if (!autofs4_oz_mode(sbi)) - autofs4_update_usage(dentry); + autofs4_update_usage(mnt, dentry); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; @@ -391,7 +393,7 @@ static int autofs4_revalidate(struct dentry * dentry, struct nameidata *nd) /* Pending dentry */ if (autofs4_ispending(dentry)) { if (!oz_mode) - status = try_to_fill_dentry(dentry, dir->i_sb, sbi, flags); + status = try_to_fill_dentry(nd->mnt, dentry, flags); return status; } @@ -408,14 +410,14 @@ static int autofs4_revalidate(struct dentry * dentry, struct nameidata *nd) dentry, dentry->d_name.len, dentry->d_name.name); spin_unlock(&dcache_lock); if (!oz_mode) - status = try_to_fill_dentry(dentry, dir->i_sb, sbi, flags); + status = try_to_fill_dentry(nd->mnt, dentry, flags); return status; } spin_unlock(&dcache_lock); /* Update the usage list */ if 
(!oz_mode) - autofs4_update_usage(dentry); + autofs4_update_usage(nd->mnt, dentry); return 1; } @@ -488,9 +490,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s d_add(dentry, NULL); if (dentry->d_op && dentry->d_op->d_revalidate) { - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); (dentry->d_op->d_revalidate)(dentry, nd); - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); } /* diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 72011826f0cb..f312103434d4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -33,8 +33,6 @@ static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file); -extern void dump_thread(struct pt_regs *, struct user *); - static struct linux_binfmt aout_format = { .module = THIS_MODULE, .load_binary = load_aout_binary, diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f36f2210204f..f979ebbce49c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -58,7 +58,7 @@ extern int dump_fpu (struct pt_regs *, elf_fpregset_t *); * If we don't support core dumping, then supply a NULL so we * don't even try. 
*/ -#ifdef USE_ELF_CORE_DUMP +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file); #else #define elf_core_dump NULL @@ -288,11 +288,17 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) { unsigned long map_addr; + unsigned long pageoffset = ELF_PAGEOFFSET(eppnt->p_vaddr); down_write(&current->mm->mmap_sem); - map_addr = do_mmap(filep, ELF_PAGESTART(addr), - eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr), prot, type, - eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr)); + /* mmap() will return -EINVAL if given a zero size, but a + * segment with zero filesize is perfectly valid */ + if (eppnt->p_filesz + pageoffset) + map_addr = do_mmap(filep, ELF_PAGESTART(addr), + eppnt->p_filesz + pageoffset, prot, type, + eppnt->p_offset - pageoffset); + else + map_addr = ELF_PAGESTART(addr); up_write(&current->mm->mmap_sem); return(map_addr); } @@ -616,7 +622,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto out_free_file; retval = -ENOMEM; - elf_interpreter = (char *) kmalloc(elf_ppnt->p_filesz, + elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL); if (!elf_interpreter) goto out_free_file; @@ -1107,7 +1113,7 @@ out: * Note that some platforms still use traditional core dumps and not * the ELF core dump. Each platform can select it as appropriate. 
*/ -#ifdef USE_ELF_CORE_DUMP +#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) /* * ELF core dumper @@ -1628,17 +1634,17 @@ static int elf_core_dump(long signr, struct pt_regs * regs, struct file * file) ELF_CORE_WRITE_EXTRA_DATA; #endif - if ((off_t) file->f_pos != offset) { + if ((off_t)file->f_pos != offset) { /* Sanity check */ - printk("elf_core_dump: file->f_pos (%ld) != offset (%ld)\n", - (off_t) file->f_pos, offset); + printk(KERN_WARNING "elf_core_dump: file->f_pos (%ld) != offset (%ld)\n", + (off_t)file->f_pos, offset); } end_coredump: set_fs(fs); cleanup: - while(!list_empty(&thread_list)) { + while (!list_empty(&thread_list)) { struct list_head *tmp = thread_list.next; list_del(tmp); kfree(list_entry(tmp, struct elf_thread_status, list)); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index e0344f69c79d..5b3076e8ee90 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -187,7 +187,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs goto error; /* read the name of the interpreter into memory */ - interpreter_name = (char *) kmalloc(phdr->p_filesz, GFP_KERNEL); + interpreter_name = kmalloc(phdr->p_filesz, GFP_KERNEL); if (!interpreter_name) goto error; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 9d6625829b99..108d56bbd0d0 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -77,8 +77,6 @@ static int load_flat_shared_library(int id, struct lib_info *p); static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); static int flat_core_dump(long signr, struct pt_regs * regs, struct file *file); -extern void dump_thread(struct pt_regs *, struct user *); - static struct linux_binfmt flat_format = { .module = THIS_MODULE, .load_binary = load_flat_binary, @@ -444,19 +442,22 @@ static int load_flat_file(struct linux_binprm * bprm, flags = ntohl(hdr->flags); rev = ntohl(hdr->rev); - if (flags & FLAT_FLAG_KTRACE) - printk("BINFMT_FLAT: Loading file: %s\n", 
bprm->filename); - - if (strncmp(hdr->magic, "bFLT", 4) || - (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION)) { + if (strncmp(hdr->magic, "bFLT", 4)) { /* * because a lot of people do not manage to produce good * flat binaries, we leave this printk to help them realise * the problem. We only print the error if its not a script file */ if (strncmp(hdr->magic, "#!", 2)) - printk("BINFMT_FLAT: bad magic/rev (0x%x, need 0x%x)\n", - rev, (int) FLAT_VERSION); + printk("BINFMT_FLAT: bad header magic\n"); + return -ENOEXEC; + } + + if (flags & FLAT_FLAG_KTRACE) + printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename); + + if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) { + printk("BINFMT_FLAT: bad flat file version 0x%x (supported 0x%x and 0x%x)\n", rev, FLAT_VERSION, OLD_FLAT_VERSION); return -ENOEXEC; } diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 2568eb41cb3a..9ccc7d8275b8 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -588,11 +588,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 2: set_bit(Enabled, &e->flags); break; case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root); - down(&root->d_inode->i_sem); + mutex_lock(&root->d_inode->i_mutex); kill_node(e); - up(&root->d_inode->i_sem); + mutex_unlock(&root->d_inode->i_mutex); dput(root); break; default: return res; @@ -622,7 +622,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - down(&root->d_inode->i_sem); + mutex_lock(&root->d_inode->i_mutex); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -658,7 +658,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - up(&root->d_inode->i_sem); + mutex_unlock(&root->d_inode->i_mutex); dput(root); if (err) { @@ -703,12 +703,12 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer, case 1: enabled = 0; break; 
case 2: enabled = 1; break; case 3: root = dget(file->f_vfsmnt->mnt_sb->s_root); - down(&root->d_inode->i_sem); + mutex_lock(&root->d_inode->i_mutex); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - up(&root->d_inode->i_sem); + mutex_unlock(&root->d_inode->i_mutex); dput(root); default: return res; } @@ -126,6 +126,7 @@ static void bio_fs_destructor(struct bio *bio) inline void bio_init(struct bio *bio) { bio->bi_next = NULL; + bio->bi_bdev = NULL; bio->bi_flags = 1 << BIO_UPTODATE; bio->bi_rw = 0; bio->bi_vcnt = 0; @@ -313,7 +314,8 @@ int bio_get_nr_vecs(struct block_device *bdev) } static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset) + *page, unsigned int len, unsigned int offset, + unsigned short max_sectors) { int retried_segments = 0; struct bio_vec *bvec; @@ -324,10 +326,31 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page if (unlikely(bio_flagged(bio, BIO_CLONED))) return 0; - if (bio->bi_vcnt >= bio->bi_max_vecs) + if (((bio->bi_size + len) >> 9) > max_sectors) return 0; - if (((bio->bi_size + len) >> 9) > q->max_sectors) + /* + * For filesystems with a blocksize smaller than the pagesize + * we will often be called with the same page as last time and + * a consecutive offset. Optimize this special case. 
+ */ + if (bio->bi_vcnt > 0) { + struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (page == prev->bv_page && + offset == prev->bv_offset + prev->bv_len) { + prev->bv_len += len; + if (q->merge_bvec_fn && + q->merge_bvec_fn(q, bio, prev) < len) { + prev->bv_len -= len; + return 0; + } + + goto done; + } + } + + if (bio->bi_vcnt >= bio->bi_max_vecs) return 0; /* @@ -381,11 +404,31 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page bio->bi_vcnt++; bio->bi_phys_segments++; bio->bi_hw_segments++; + done: bio->bi_size += len; return len; } /** + * bio_add_pc_page - attempt to add page to bio + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block + * device limitations. The target block device must allow bio's + * smaller than PAGE_SIZE, so it is always possible to add a single + * page to an empty bio. This should only be used by REQ_PC bios. 
+ */ +int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors); +} + +/** * bio_add_page - attempt to add page to bio * @bio: destination bio * @page: page to add @@ -401,8 +444,8 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - return __bio_add_page(bdev_get_queue(bio->bi_bdev), bio, page, - len, offset); + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + return __bio_add_page(q, bio, page, len, offset, q->max_sectors); } struct bio_map_data { @@ -514,7 +557,7 @@ struct bio *bio_copy_user(request_queue_t *q, unsigned long uaddr, break; } - if (__bio_add_page(q, bio, page, bytes, 0) < bytes) { + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) { ret = -EINVAL; break; } @@ -628,7 +671,8 @@ static struct bio *__bio_map_user_iov(request_queue_t *q, /* * sorry... 
*/ - if (__bio_add_page(q, bio, pages[j], bytes, offset) < bytes) + if (bio_add_pc_page(q, bio, pages[j], bytes, offset) < + bytes) break; len -= bytes; @@ -801,8 +845,8 @@ static struct bio *__bio_map_kern(request_queue_t *q, void *data, if (bytes > len) bytes = len; - if (__bio_add_page(q, bio, virt_to_page(data), bytes, - offset) < bytes) + if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, + offset) < bytes) break; data += bytes; @@ -1228,6 +1272,7 @@ EXPORT_SYMBOL(bio_clone); EXPORT_SYMBOL(bio_phys_segments); EXPORT_SYMBOL(bio_hw_segments); EXPORT_SYMBOL(bio_add_page); +EXPORT_SYMBOL(bio_add_pc_page); EXPORT_SYMBOL(bio_get_nr_vecs); EXPORT_SYMBOL(bio_map_user); EXPORT_SYMBOL(bio_unmap_user); diff --git a/fs/block_dev.c b/fs/block_dev.c index e0df94c37b7e..6e50346fb1ee 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -202,7 +202,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) loff_t size; loff_t retval; - down(&bd_inode->i_sem); + mutex_lock(&bd_inode->i_mutex); size = i_size_read(bd_inode); switch (origin) { @@ -219,7 +219,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) } retval = offset; } - up(&bd_inode->i_sem); + mutex_unlock(&bd_inode->i_mutex); return retval; } diff --git a/fs/buffer.c b/fs/buffer.c index 5287be18633b..b9bb7ad6897b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -26,6 +26,7 @@ #include <linux/percpu.h> #include <linux/slab.h> #include <linux/smp_lock.h> +#include <linux/capability.h> #include <linux/blkdev.h> #include <linux/file.h> #include <linux/quotaops.h> @@ -153,14 +154,8 @@ int sync_blockdev(struct block_device *bdev) { int ret = 0; - if (bdev) { - int err; - - ret = filemap_fdatawrite(bdev->bd_inode->i_mapping); - err = filemap_fdatawait(bdev->bd_inode->i_mapping); - if (!ret) - ret = err; - } + if (bdev) + ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); return ret; } EXPORT_SYMBOL(sync_blockdev); @@ -358,11 +353,11 @@ static long do_fsync(unsigned int fd, 
int datasync) * We need to protect against concurrent writers, * which could cause livelocks in fsync_buffers_list */ - down(&mapping->host->i_sem); + mutex_lock(&mapping->host->i_mutex); err = file->f_op->fsync(file, file->f_dentry, datasync); if (!ret) ret = err; - up(&mapping->host->i_sem); + mutex_unlock(&mapping->host->i_mutex); err = filemap_fdatawait(mapping); if (!ret) ret = err; @@ -1768,7 +1763,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * handle that here by just cleaning them. */ - block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); head = page_buffers(page); bh = head; @@ -2160,11 +2155,12 @@ int block_read_full_page(struct page *page, get_block_t *get_block) * truncates. Uses prepare/commit_write to allow the filesystem to * deal with the hole. */ -int generic_cont_expand(struct inode *inode, loff_t size) +static int __generic_cont_expand(struct inode *inode, loff_t size, + pgoff_t index, unsigned int offset) { struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long index, offset, limit; + unsigned long limit; int err; err = -EFBIG; @@ -2176,24 +2172,24 @@ int generic_cont_expand(struct inode *inode, loff_t size) if (size > inode->i_sb->s_maxbytes) goto out; - offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ - - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - offset++; - } - index = size >> PAGE_CACHE_SHIFT; err = -ENOMEM; page = grab_cache_page(mapping, index); if (!page) goto out; err = mapping->a_ops->prepare_write(NULL, page, offset, offset); - if (!err) { - err = mapping->a_ops->commit_write(NULL, page, offset, offset); + if (err) { + /* + * ->prepare_write() may have instantiated a few blocks + * outside i_size. 
Trim these off again. + */ + unlock_page(page); + page_cache_release(page); + vmtruncate(inode, inode->i_size); + goto out; } + + err = mapping->a_ops->commit_write(NULL, page, offset, offset); + unlock_page(page); page_cache_release(page); if (err > 0) @@ -2202,6 +2198,36 @@ out: return err; } +int generic_cont_expand(struct inode *inode, loff_t size) +{ + pgoff_t index; + unsigned int offset; + + offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ + + /* ugh. in prepare/commit_write, if from==to==start of block, we + ** skip the prepare. make sure we never send an offset for the start + ** of a block + */ + if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { + /* caller must handle this extra byte. */ + offset++; + } + index = size >> PAGE_CACHE_SHIFT; + + return __generic_cont_expand(inode, size, index, offset); +} + +int generic_cont_expand_simple(struct inode *inode, loff_t size) +{ + loff_t pos = size - 1; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; + + /* prepare/commit_write can handle even if from==to==start of block. */ + return __generic_cont_expand(inode, size, index, offset); +} + /* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. @@ -2313,7 +2339,7 @@ int generic_commit_write(struct file *file, struct page *page, __block_commit_write(inode,page,from,to); /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_sem. + * cannot change under us because we hold i_mutex. 
*/ if (pos > inode->i_size) { i_size_write(inode, pos); @@ -2610,7 +2636,7 @@ int block_truncate_page(struct address_space *mapping, pgoff_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); unsigned blocksize; - pgoff_t iblock; + sector_t iblock; unsigned length, pos; struct inode *inode = mapping->host; struct page *page; @@ -2626,7 +2652,7 @@ int block_truncate_page(struct address_space *mapping, return 0; length = blocksize - length; - iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); page = grab_cache_page(mapping, index); err = -ENOMEM; @@ -3145,6 +3171,7 @@ EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand); +EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(ll_rw_block); diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h index decd138f14d4..da2ad5b451ac 100644 --- a/fs/cifs/cifs_uniupr.h +++ b/fs/cifs/cifs_uniupr.h @@ -242,7 +242,7 @@ static signed char UniCaseRangeLff20[27] = { /* * Lower Case Range */ -const static struct UniCaseRange CifsUniLowerRange[] = { +static const struct UniCaseRange CifsUniLowerRange[] = { 0x0380, 0x03ab, UniCaseRangeL0380, 0x0400, 0x042f, UniCaseRangeL0400, 0x0490, 0x04cb, UniCaseRangeL0490, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 136af8a08f45..79eeccd0437f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -874,9 +874,9 @@ static int cifs_oplock_thread(void * dummyarg) DeleteOplockQEntry(oplock_item); /* can not grab inode sem here since it would deadlock when oplock received on delete - since vfs_unlink holds the i_sem across + since vfs_unlink holds the i_mutex across the call */ - /* down(&inode->i_sem);*/ + /* mutex_lock(&inode->i_mutex);*/ if (S_ISREG(inode->i_mode)) { rc = filemap_fdatawrite(inode->i_mapping); 
if(CIFS_I(inode)->clientCanCacheRead == 0) { @@ -885,7 +885,7 @@ static int cifs_oplock_thread(void * dummyarg) } } else rc = 0; - /* up(&inode->i_sem);*/ + /* mutex_unlock(&inode->i_mutex);*/ if (rc) CIFS_I(inode)->write_behind_rc = rc; cFYI(1,("Oplock flush inode %p rc %d",inode,rc)); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 670ec1e84da0..378095a442d0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -127,8 +127,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, if (file->f_dentry->d_inode->i_mapping) { /* BB no need to lock inode until after invalidate since namei code should already have it locked? */ - filemap_fdatawrite(file->f_dentry->d_inode->i_mapping); - filemap_fdatawait(file->f_dentry->d_inode->i_mapping); + filemap_write_and_wait(file->f_dentry->d_inode->i_mapping); } cFYI(1, ("invalidating remote inode since open detected it " "changed")); @@ -419,8 +418,7 @@ static int cifs_reopen_file(struct inode *inode, struct file *file, pCifsInode = CIFS_I(inode); if (pCifsInode) { if (can_flush) { - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); /* temporarily disable caching while we go to server to get inode info */ pCifsInode->clientCanCacheAll = FALSE; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f65310cc60a1..59359911f481 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1041,9 +1041,9 @@ int cifs_revalidate(struct dentry *direntry) } /* can not grab this sem since kernel filesys locking documentation - indicates i_sem may be taken by the kernel on lookup and rename - which could deadlock if we grab the i_sem here as well */ -/* down(&direntry->d_inode->i_sem);*/ + indicates i_mutex may be taken by the kernel on lookup and rename + which could deadlock if we grab the i_mutex here as well */ +/* mutex_lock(&direntry->d_inode->i_mutex);*/ /* need to write out dirty pages here */ if (direntry->d_inode->i_mapping) { /* do we 
need to lock inode until after invalidate completes @@ -1067,7 +1067,7 @@ int cifs_revalidate(struct dentry *direntry) } } } -/* up(&direntry->d_inode->i_sem); */ +/* mutex_unlock(&direntry->d_inode->i_mutex); */ kfree(full_path); FreeXid(xid); @@ -1149,8 +1149,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) /* BB check if we need to refresh inode from server now ? BB */ /* need to flush data before changing file size on server */ - filemap_fdatawrite(direntry->d_inode->i_mapping); - filemap_fdatawait(direntry->d_inode->i_mapping); + filemap_write_and_wait(direntry->d_inode->i_mapping); if (attrs->ia_valid & ATTR_SIZE) { /* To avoid spurious oplock breaks from server, in the case of diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 80072fd9b7fa..c607d923350a 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -93,7 +93,7 @@ static void coda_flag_children(struct dentry *parent, int flag) spin_lock(&dcache_lock); list_for_each(child, &parent->d_subdirs) { - de = list_entry(child, struct dentry, d_child); + de = list_entry(child, struct dentry, d_u.d_child); /* don't know what to do with negative dentries */ if ( ! 
de->d_inode ) continue; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 2391766e9c7c..8f1a517f8b4e 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -453,7 +453,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir) coda_vfs_stat.readdir++; host_inode = host_file->f_dentry->d_inode; - down(&host_inode->i_sem); + mutex_lock(&host_inode->i_mutex); host_file->f_pos = coda_file->f_pos; if (!host_file->f_op->readdir) { @@ -475,7 +475,7 @@ int coda_readdir(struct file *coda_file, void *dirent, filldir_t filldir) } out: coda_file->f_pos = host_file->f_pos; - up(&host_inode->i_sem); + mutex_unlock(&host_inode->i_mutex); return ret; } diff --git a/fs/coda/file.c b/fs/coda/file.c index e6bc022568f3..30b4630bd735 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -77,14 +77,14 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo return -EINVAL; host_inode = host_file->f_dentry->d_inode; - down(&coda_inode->i_sem); + mutex_lock(&coda_inode->i_mutex); ret = host_file->f_op->write(host_file, buf, count, ppos); coda_inode->i_size = host_inode->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; - up(&coda_inode->i_sem); + mutex_unlock(&coda_inode->i_mutex); return ret; } @@ -272,9 +272,9 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) if (host_file->f_op && host_file->f_op->fsync) { host_dentry = host_file->f_dentry; host_inode = host_dentry->d_inode; - down(&host_inode->i_sem); + mutex_lock(&host_inode->i_mutex); err = host_file->f_op->fsync(host_file, host_dentry, datasync); - up(&host_inode->i_sem); + mutex_unlock(&host_inode->i_mutex); } if ( !err && !datasync ) { diff --git a/fs/compat.c b/fs/compat.c index 818634120b69..271b75d1597f 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -494,9 +494,21 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, ret = sys_fcntl(fd, cmd, (unsigned long)&f); 
set_fs(old_fs); if (cmd == F_GETLK && ret == 0) { - if ((f.l_start >= COMPAT_OFF_T_MAX) || - ((f.l_start + f.l_len) > COMPAT_OFF_T_MAX)) + /* GETLK was successful and we need to return the data... + * but it needs to fit in the compat structure. + * l_start shouldn't be too big, unless the original + * start + end is greater than COMPAT_OFF_T_MAX, in which + * case the app was asking for trouble, so we return + * -EOVERFLOW in that case. + * l_len could be too big, in which case we just truncate it, + * and only allow the app to see that part of the conflicting + * lock that might make sense to it anyway + */ + + if (f.l_start > COMPAT_OFF_T_MAX) ret = -EOVERFLOW; + if (f.l_len > COMPAT_OFF_T_MAX) + f.l_len = COMPAT_OFF_T_MAX; if (ret == 0) ret = put_compat_flock(&f, compat_ptr(arg)); } @@ -515,9 +527,11 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd, (unsigned long)&f); set_fs(old_fs); if (cmd == F_GETLK64 && ret == 0) { - if ((f.l_start >= COMPAT_LOFF_T_MAX) || - ((f.l_start + f.l_len) > COMPAT_LOFF_T_MAX)) + /* need to return lock information - see above for commentary */ + if (f.l_start > COMPAT_LOFF_T_MAX) ret = -EOVERFLOW; + if (f.l_len > COMPAT_LOFF_T_MAX) + f.l_len = COMPAT_LOFF_T_MAX; if (ret == 0) ret = put_compat_flock64(&f, compat_ptr(arg)); } @@ -1170,7 +1184,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, } ret = rw_verify_area(type, file, pos, tot_len); - if (ret) + if (ret < 0) goto out; fnv = NULL; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 43a2508ac696..5dd0207ffd46 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -10,11 +10,11 @@ * ioctls. */ -#ifdef INCLUDES #include <linux/config.h> #include <linux/types.h> #include <linux/compat.h> #include <linux/kernel.h> +#include <linux/capability.h> #include <linux/compiler.h> #include <linux/sched.h> #include <linux/smp.h> @@ -81,13 +81,9 @@ #include <linux/capi.h> #include <scsi/scsi.h> -/* Ugly hack.
*/ -#undef __KERNEL__ #include <scsi/scsi_ioctl.h> -#define __KERNEL__ #include <scsi/sg.h> -#include <asm/types.h> #include <asm/uaccess.h> #include <linux/ethtool.h> #include <linux/mii.h> @@ -95,7 +91,6 @@ #include <linux/watchdog.h> #include <linux/dm-ioctl.h> -#include <asm/module.h> #include <linux/soundcard.h> #include <linux/lp.h> #include <linux/ppdev.h> @@ -127,11 +122,7 @@ #include <linux/dvb/dmx.h> #include <linux/dvb/frontend.h> #include <linux/dvb/video.h> - -#undef INCLUDES -#endif - -#ifdef CODE +#include <linux/lp.h> /* Aiee. Someone does not find a difference between int and long */ #define EXT2_IOC32_GETFLAGS _IOR('f', 1, int) @@ -148,6 +139,12 @@ #define EXT2_IOC32_GETVERSION _IOR('v', 1, int) #define EXT2_IOC32_SETVERSION _IOW('v', 2, int) +static int do_ioctl32_pointer(unsigned int fd, unsigned int cmd, + unsigned long arg, struct file *f) +{ + return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg)); +} + static int w_long(unsigned int fd, unsigned int cmd, unsigned long arg) { mm_segment_t old_fs = get_fs(); @@ -207,244 +204,6 @@ static int do_ext3_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) return sys_ioctl(fd, cmd, (unsigned long)compat_ptr(arg)); } -struct video_tuner32 { - compat_int_t tuner; - char name[32]; - compat_ulong_t rangelow, rangehigh; - u32 flags; /* It is really u32 in videodev.h */ - u16 mode, signal; -}; - -static int get_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up) -{ - int i; - - if(get_user(kp->tuner, &up->tuner)) - return -EFAULT; - for(i = 0; i < 32; i++) - __get_user(kp->name[i], &up->name[i]); - __get_user(kp->rangelow, &up->rangelow); - __get_user(kp->rangehigh, &up->rangehigh); - __get_user(kp->flags, &up->flags); - __get_user(kp->mode, &up->mode); - __get_user(kp->signal, &up->signal); - return 0; -} - -static int put_video_tuner32(struct video_tuner *kp, struct video_tuner32 __user *up) -{ - int i; - - if(put_user(kp->tuner, &up->tuner)) - return -EFAULT; - for(i = 0; i 
< 32; i++) - __put_user(kp->name[i], &up->name[i]); - __put_user(kp->rangelow, &up->rangelow); - __put_user(kp->rangehigh, &up->rangehigh); - __put_user(kp->flags, &up->flags); - __put_user(kp->mode, &up->mode); - __put_user(kp->signal, &up->signal); - return 0; -} - -struct video_buffer32 { - compat_caddr_t base; - compat_int_t height, width, depth, bytesperline; -}; - -static int get_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up) -{ - u32 tmp; - - if (get_user(tmp, &up->base)) - return -EFAULT; - - /* This is actually a physical address stored - * as a void pointer. - */ - kp->base = (void *)(unsigned long) tmp; - - __get_user(kp->height, &up->height); - __get_user(kp->width, &up->width); - __get_user(kp->depth, &up->depth); - __get_user(kp->bytesperline, &up->bytesperline); - return 0; -} - -static int put_video_buffer32(struct video_buffer *kp, struct video_buffer32 __user *up) -{ - u32 tmp = (u32)((unsigned long)kp->base); - - if(put_user(tmp, &up->base)) - return -EFAULT; - __put_user(kp->height, &up->height); - __put_user(kp->width, &up->width); - __put_user(kp->depth, &up->depth); - __put_user(kp->bytesperline, &up->bytesperline); - return 0; -} - -struct video_clip32 { - s32 x, y, width, height; /* Its really s32 in videodev.h */ - compat_caddr_t next; -}; - -struct video_window32 { - u32 x, y, width, height, chromakey, flags; - compat_caddr_t clips; - compat_int_t clipcount; -}; - -/* You get back everything except the clips... 
*/ -static int put_video_window32(struct video_window *kp, struct video_window32 __user *up) -{ - if(put_user(kp->x, &up->x)) - return -EFAULT; - __put_user(kp->y, &up->y); - __put_user(kp->width, &up->width); - __put_user(kp->height, &up->height); - __put_user(kp->chromakey, &up->chromakey); - __put_user(kp->flags, &up->flags); - __put_user(kp->clipcount, &up->clipcount); - return 0; -} - -#define VIDIOCGTUNER32 _IOWR('v',4, struct video_tuner32) -#define VIDIOCSTUNER32 _IOW('v',5, struct video_tuner32) -#define VIDIOCGWIN32 _IOR('v',9, struct video_window32) -#define VIDIOCSWIN32 _IOW('v',10, struct video_window32) -#define VIDIOCGFBUF32 _IOR('v',11, struct video_buffer32) -#define VIDIOCSFBUF32 _IOW('v',12, struct video_buffer32) -#define VIDIOCGFREQ32 _IOR('v',14, u32) -#define VIDIOCSFREQ32 _IOW('v',15, u32) - -enum { - MaxClips = (~0U-sizeof(struct video_window))/sizeof(struct video_clip) -}; - -static int do_set_window(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct video_window32 __user *up = compat_ptr(arg); - struct video_window __user *vw; - struct video_clip __user *p; - int nclips; - u32 n; - - if (get_user(nclips, &up->clipcount)) - return -EFAULT; - - /* Peculiar interface... */ - if (nclips < 0) - nclips = VIDEO_CLIPMAP_SIZE; - - if (nclips > MaxClips) - return -ENOMEM; - - vw = compat_alloc_user_space(sizeof(struct video_window) + - nclips * sizeof(struct video_clip)); - - p = nclips ? 
(struct video_clip __user *)(vw + 1) : NULL; - - if (get_user(n, &up->x) || put_user(n, &vw->x) || - get_user(n, &up->y) || put_user(n, &vw->y) || - get_user(n, &up->width) || put_user(n, &vw->width) || - get_user(n, &up->height) || put_user(n, &vw->height) || - get_user(n, &up->chromakey) || put_user(n, &vw->chromakey) || - get_user(n, &up->flags) || put_user(n, &vw->flags) || - get_user(n, &up->clipcount) || put_user(n, &vw->clipcount) || - get_user(n, &up->clips) || put_user(p, &vw->clips)) - return -EFAULT; - - if (nclips) { - struct video_clip32 __user *u = compat_ptr(n); - int i; - if (!u) - return -EINVAL; - for (i = 0; i < nclips; i++, u++, p++) { - s32 v; - if (get_user(v, &u->x) || - put_user(v, &p->x) || - get_user(v, &u->y) || - put_user(v, &p->y) || - get_user(v, &u->width) || - put_user(v, &p->width) || - get_user(v, &u->height) || - put_user(v, &p->height) || - put_user(NULL, &p->next)) - return -EFAULT; - } - } - - return sys_ioctl(fd, VIDIOCSWIN, (unsigned long)p); -} - -static int do_video_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - union { - struct video_tuner vt; - struct video_buffer vb; - struct video_window vw; - unsigned long vx; - } karg; - mm_segment_t old_fs = get_fs(); - void __user *up = compat_ptr(arg); - int err = 0; - - /* First, convert the command. 
*/ - switch(cmd) { - case VIDIOCGTUNER32: cmd = VIDIOCGTUNER; break; - case VIDIOCSTUNER32: cmd = VIDIOCSTUNER; break; - case VIDIOCGWIN32: cmd = VIDIOCGWIN; break; - case VIDIOCGFBUF32: cmd = VIDIOCGFBUF; break; - case VIDIOCSFBUF32: cmd = VIDIOCSFBUF; break; - case VIDIOCGFREQ32: cmd = VIDIOCGFREQ; break; - case VIDIOCSFREQ32: cmd = VIDIOCSFREQ; break; - }; - - switch(cmd) { - case VIDIOCSTUNER: - case VIDIOCGTUNER: - err = get_video_tuner32(&karg.vt, up); - break; - - case VIDIOCSFBUF: - err = get_video_buffer32(&karg.vb, up); - break; - - case VIDIOCSFREQ: - err = get_user(karg.vx, (u32 __user *)up); - break; - }; - if(err) - goto out; - - set_fs(KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long)&karg); - set_fs(old_fs); - - if(err == 0) { - switch(cmd) { - case VIDIOCGTUNER: - err = put_video_tuner32(&karg.vt, up); - break; - - case VIDIOCGWIN: - err = put_video_window32(&karg.vw, up); - break; - - case VIDIOCGFBUF: - err = put_video_buffer32(&karg.vb, up); - break; - - case VIDIOCGFREQ: - err = put_user(((u32)karg.vx), (u32 __user *)up); - break; - }; - } -out: - return err; -} - struct compat_dmx_event { dmx_event_t event; compat_time_t timeStamp; @@ -1158,6 +917,40 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) return err; } +struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ + char req_state; + char orphan; + char sg_io_owned; + char problem; + int pack_id; + compat_uptr_t usr_ptr; + unsigned int duration; + int unused; +}; + +static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + int err, i; + sg_req_info_t *r; + struct compat_sg_req_info *o = (struct compat_sg_req_info *)arg; + r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); + err = sys_ioctl(fd,cmd,(unsigned long)r); + if (err < 0) + return err; + for (i = 0; i < SG_MAX_QUEUE; i++) { + void __user *ptr; + int d; + + if (copy_in_user(o + i, r + i, offsetof(sg_req_info_t, usr_ptr)) || + get_user(ptr, 
&r[i].usr_ptr) || + get_user(d, &r[i].duration) || + put_user((u32)(unsigned long)(ptr), &o[i].usr_ptr) || + put_user(d, &o[i].duration)) + return -EFAULT; + } + return err; +} + struct sock_fprog32 { unsigned short len; compat_caddr_t filter; @@ -2713,6 +2506,49 @@ static int old_bridge_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg return -EINVAL; } +#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) +#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) +#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) +#define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) + +static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) +{ + mm_segment_t oldfs = get_fs(); + compat_ulong_t val32; + unsigned long kval; + int ret; + + switch (cmd) { + case RTC_IRQP_READ32: + case RTC_EPOCH_READ32: + set_fs(KERNEL_DS); + ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ? + RTC_IRQP_READ : RTC_EPOCH_READ, + (unsigned long)&kval); + set_fs(oldfs); + if (ret) + return ret; + val32 = kval; + return put_user(val32, (unsigned int __user *)arg); + case RTC_IRQP_SET32: + case RTC_EPOCH_SET32: + ret = get_user(val32, (unsigned int __user *)arg); + if (ret) + return ret; + kval = val32; + + set_fs(KERNEL_DS); + ret = sys_ioctl(fd, (cmd == RTC_IRQP_SET32) ? 
+ RTC_IRQP_SET : RTC_EPOCH_SET, + (unsigned long)&kval); + set_fs(oldfs); + return ret; + default: + /* unreached */ + return -ENOIOCTLCMD; + } +} + #if defined(CONFIG_NCP_FS) || defined(CONFIG_NCP_FS_MODULE) struct ncp_ioctl_request_32 { u32 function; @@ -2900,10 +2736,34 @@ static int do_ncp_setprivatedata(unsigned int fd, unsigned int cmd, unsigned lon } #endif -#undef CODE -#endif +static int +lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + struct compat_timeval *tc = (struct compat_timeval *)arg; + struct timeval *tn = compat_alloc_user_space(sizeof(struct timeval)); + struct timeval ts; + if (get_user(ts.tv_sec, &tc->tv_sec) || + get_user(ts.tv_usec, &tc->tv_usec) || + put_user(ts.tv_sec, &tn->tv_sec) || + put_user(ts.tv_usec, &tn->tv_usec)) + return -EFAULT; + return sys_ioctl(fd, cmd, (unsigned long)tn); +} + +#define HANDLE_IOCTL(cmd,handler) \ + { (cmd), (ioctl_trans_handler_t)(handler) }, + +/* pointer to compatible structure or no argument */ +#define COMPATIBLE_IOCTL(cmd) \ + { (cmd), do_ioctl32_pointer }, + +/* argument is an unsigned long integer, not a pointer */ +#define ULONG_IOCTL(cmd) \ + { (cmd), (ioctl_trans_handler_t)sys_ioctl }, -#ifdef DECLARES + +struct ioctl_trans ioctl_start[] = { +#include <linux/compat_ioctl.h> HANDLE_IOCTL(MEMREADOOB32, mtd_rw_oob) HANDLE_IOCTL(MEMWRITEOOB32, mtd_rw_oob) #ifdef CONFIG_NET @@ -2983,6 +2843,7 @@ HANDLE_IOCTL(FDPOLLDRVSTAT32, fd_ioctl_trans) HANDLE_IOCTL(FDGETFDCSTAT32, fd_ioctl_trans) HANDLE_IOCTL(FDWERRORGET32, fd_ioctl_trans) HANDLE_IOCTL(SG_IO,sg_ioctl_trans) +HANDLE_IOCTL(SG_GET_REQUEST_TABLE, sg_grt_trans) HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans) HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans) HANDLE_IOCTL(PPPIOCSPASS32, ppp_sock_fprog_ioctl_trans) @@ -3015,14 +2876,6 @@ COMPATIBLE_IOCTL(EXT3_IOC_GROUP_ADD) #ifdef CONFIG_JBD_DEBUG HANDLE_IOCTL(EXT3_IOC32_WAIT_FOR_READONLY, do_ext3_ioctl) #endif -HANDLE_IOCTL(VIDIOCGTUNER32, do_video_ioctl) 
-HANDLE_IOCTL(VIDIOCSTUNER32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCGWIN32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCSWIN32, do_set_window) -HANDLE_IOCTL(VIDIOCGFBUF32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCSFBUF32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCGFREQ32, do_video_ioctl) -HANDLE_IOCTL(VIDIOCSFREQ32, do_video_ioctl) /* One SMB ioctl needs translations. */ #define SMB_IOC_GETMOUNTUID_32 _IOR('u', 1, compat_uid_t) HANDLE_IOCTL(SMB_IOC_GETMOUNTUID_32, do_smb_getmountuid) @@ -3104,6 +2957,10 @@ HANDLE_IOCTL(SIOCSIWENCODE, do_wireless_ioctl) HANDLE_IOCTL(SIOCGIWENCODE, do_wireless_ioctl) HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl) HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl) +HANDLE_IOCTL(RTC_IRQP_READ32, rtc_ioctl) +HANDLE_IOCTL(RTC_IRQP_SET32, rtc_ioctl) +HANDLE_IOCTL(RTC_EPOCH_READ32, rtc_ioctl) +HANDLE_IOCTL(RTC_EPOCH_SET32, rtc_ioctl) #if defined(CONFIG_NCP_FS) || defined(CONFIG_NCP_FS_MODULE) HANDLE_IOCTL(NCP_IOC_NCPREQUEST_32, do_ncp_ncprequest) @@ -3121,5 +2978,19 @@ HANDLE_IOCTL(VIDEO_GET_EVENT, do_video_get_event) HANDLE_IOCTL(VIDEO_STILLPICTURE, do_video_stillpicture) HANDLE_IOCTL(VIDEO_SET_SPU_PALETTE, do_video_set_spu_palette) -#undef DECLARES -#endif +/* parport */ +COMPATIBLE_IOCTL(LPTIME) +COMPATIBLE_IOCTL(LPCHAR) +COMPATIBLE_IOCTL(LPABORTOPEN) +COMPATIBLE_IOCTL(LPCAREFUL) +COMPATIBLE_IOCTL(LPWAIT) +COMPATIBLE_IOCTL(LPSETIRQ) +COMPATIBLE_IOCTL(LPGETSTATUS) +COMPATIBLE_IOCTL(LPGETSTATUS) +COMPATIBLE_IOCTL(LPRESET) +/*LPGETSTATS not implemented, but no kernels seem to compile it in anyways*/ +COMPATIBLE_IOCTL(LPGETFLAGS) +HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans) +}; + +int ioctl_table_size = ARRAY_SIZE(ioctl_start); diff --git a/fs/configfs/Makefile b/fs/configfs/Makefile new file mode 100644 index 000000000000..00ffb278e98c --- /dev/null +++ b/fs/configfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the configfs virtual filesystem +# + +obj-$(CONFIG_CONFIGFS_FS) += configfs.o + +configfs-objs := inode.o file.o dir.o symlink.o mount.o item.o diff --git 
a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h new file mode 100644 index 000000000000..8899d9c5f6bf --- /dev/null +++ b/fs/configfs/configfs_internal.h @@ -0,0 +1,142 @@ +/* -*- mode: c; c-basic-offset:8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * configfs_internal.h - Internal stuff for configfs + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved.
+ */ + +#include <linux/slab.h> +#include <linux/list.h> + +struct configfs_dirent { + atomic_t s_count; + struct list_head s_sibling; + struct list_head s_children; + struct list_head s_links; + void * s_element; + int s_type; + umode_t s_mode; + struct dentry * s_dentry; +}; + +#define CONFIGFS_ROOT 0x0001 +#define CONFIGFS_DIR 0x0002 +#define CONFIGFS_ITEM_ATTR 0x0004 +#define CONFIGFS_ITEM_LINK 0x0020 +#define CONFIGFS_USET_DIR 0x0040 +#define CONFIGFS_USET_DEFAULT 0x0080 +#define CONFIGFS_USET_DROPPING 0x0100 +#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) + +extern struct vfsmount * configfs_mount; + +extern int configfs_is_root(struct config_item *item); + +extern struct inode * configfs_new_inode(mode_t mode); +extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); + +extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); +extern int configfs_make_dirent(struct configfs_dirent *, + struct dentry *, void *, umode_t, int); + +extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int); +extern void configfs_hash_and_remove(struct dentry * dir, const char * name); + +extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); +extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); + +extern int configfs_pin_fs(void); +extern void configfs_release_fs(void); + +extern struct rw_semaphore configfs_rename_sem; +extern struct super_block * configfs_sb; +extern struct file_operations configfs_dir_operations; +extern struct file_operations configfs_file_operations; +extern struct file_operations bin_fops; +extern struct inode_operations configfs_dir_inode_operations; +extern struct inode_operations configfs_symlink_inode_operations; + +extern int configfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname); +extern int configfs_unlink(struct inode *dir, struct dentry *dentry); + +struct configfs_symlink { + 
struct list_head sl_list; + struct config_item *sl_target; +}; + +extern int configfs_create_link(struct configfs_symlink *sl, + struct dentry *parent, + struct dentry *dentry); + +static inline struct config_item * to_item(struct dentry * dentry) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + return ((struct config_item *) sd->s_element); +} + +static inline struct configfs_attribute * to_attr(struct dentry * dentry) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + return ((struct configfs_attribute *) sd->s_element); +} + +static inline struct config_item *configfs_get_config_item(struct dentry *dentry) +{ + struct config_item * item = NULL; + + spin_lock(&dcache_lock); + if (!d_unhashed(dentry)) { + struct configfs_dirent * sd = dentry->d_fsdata; + if (sd->s_type & CONFIGFS_ITEM_LINK) { + struct configfs_symlink * sl = sd->s_element; + item = config_item_get(sl->sl_target); + } else + item = config_item_get(sd->s_element); + } + spin_unlock(&dcache_lock); + + return item; +} + +static inline void release_configfs_dirent(struct configfs_dirent * sd) +{ + if (!(sd->s_type & CONFIGFS_ROOT)) + kfree(sd); +} + +static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) +{ + if (sd) { + WARN_ON(!atomic_read(&sd->s_count)); + atomic_inc(&sd->s_count); + } + return sd; +} + +static inline void configfs_put(struct configfs_dirent * sd) +{ + WARN_ON(!atomic_read(&sd->s_count)); + if (atomic_dec_and_test(&sd->s_count)) + release_configfs_dirent(sd); +} + diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c new file mode 100644 index 000000000000..b668ec61527e --- /dev/null +++ b/fs/configfs/dir.c @@ -0,0 +1,1102 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c - Operations for configfs directories. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#undef DEBUG + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/module.h> +#include <linux/slab.h> + +#include <linux/configfs.h> +#include "configfs_internal.h" + +DECLARE_RWSEM(configfs_rename_sem); + +static void configfs_d_iput(struct dentry * dentry, + struct inode * inode) +{ + struct configfs_dirent * sd = dentry->d_fsdata; + + if (sd) { + BUG_ON(sd->s_dentry != dentry); + sd->s_dentry = NULL; + configfs_put(sd); + } + iput(inode); +} + +/* + * We _must_ delete our dentries on last dput, as the chain-to-parent + * behavior is required to clear the parents of default_groups.
+ */ +static int configfs_d_delete(struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations configfs_dentry_ops = { + .d_iput = configfs_d_iput, + /* simple_delete_dentry() isn't exported */ + .d_delete = configfs_d_delete, +}; + +/* + * Allocates a new configfs_dirent and links it to the parent configfs_dirent + */ +static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * parent_sd, + void * element) +{ + struct configfs_dirent * sd; + + sd = kmalloc(sizeof(*sd), GFP_KERNEL); + if (!sd) + return NULL; + + memset(sd, 0, sizeof(*sd)); + atomic_set(&sd->s_count, 1); + INIT_LIST_HEAD(&sd->s_links); + INIT_LIST_HEAD(&sd->s_children); + list_add(&sd->s_sibling, &parent_sd->s_children); + sd->s_element = element; + + return sd; +} + +int configfs_make_dirent(struct configfs_dirent * parent_sd, + struct dentry * dentry, void * element, + umode_t mode, int type) +{ + struct configfs_dirent * sd; + + sd = configfs_new_dirent(parent_sd, element); + if (!sd) + return -ENOMEM; + + sd->s_mode = mode; + sd->s_type = type; + sd->s_dentry = dentry; + if (dentry) { + dentry->d_fsdata = configfs_get(sd); + dentry->d_op = &configfs_dentry_ops; + } + + return 0; +} + +static int init_dir(struct inode * inode) +{ + inode->i_op = &configfs_dir_inode_operations; + inode->i_fop = &configfs_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ + inode->i_nlink++; + return 0; +} + +static int init_file(struct inode * inode) +{ + inode->i_size = PAGE_SIZE; + inode->i_fop = &configfs_file_operations; + return 0; +} + +static int init_symlink(struct inode * inode) +{ + inode->i_op = &configfs_symlink_inode_operations; + return 0; +} + +static int create_dir(struct config_item * k, struct dentry * p, + struct dentry * d) +{ + int error; + umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; + + error = configfs_create(d, mode, init_dir); + if (!error) { + error = configfs_make_dirent(p->d_fsdata, d, k, mode, + CONFIGFS_DIR); + if (!error) { + p->d_inode->i_nlink++; + (d)->d_op = &configfs_dentry_ops; + } + } + return error; +} + + +/** + * configfs_create_dir - create a directory for a config_item. + * @item: config_item we're creating directory for. + * @dentry: config_item's dentry. + */ + +static int configfs_create_dir(struct config_item * item, struct dentry *dentry) +{ + struct dentry * parent; + int error = 0; + + BUG_ON(!item); + + if (item->ci_parent) + parent = item->ci_parent->ci_dentry; + else if (configfs_mount && configfs_mount->mnt_sb) + parent = configfs_mount->mnt_sb->s_root; + else + return -EFAULT; + + error = create_dir(item,parent,dentry); + if (!error) + item->ci_dentry = dentry; + return error; +} + +int configfs_create_link(struct configfs_symlink *sl, + struct dentry *parent, + struct dentry *dentry) +{ + int err = 0; + umode_t mode = S_IFLNK | S_IRWXUGO; + + err = configfs_create(dentry, mode, init_symlink); + if (!err) { + err = configfs_make_dirent(parent->d_fsdata, dentry, sl, + mode, CONFIGFS_ITEM_LINK); + if (!err) + dentry->d_op = &configfs_dentry_ops; + } + return err; +} + +static void remove_dir(struct dentry * d) +{ + struct dentry * parent = dget(d->d_parent); + struct configfs_dirent * sd; + + sd = d->d_fsdata; + list_del_init(&sd->s_sibling); + configfs_put(sd); + if (d->d_inode) + simple_rmdir(parent->d_inode,d); + + pr_debug(" o %s removing done
(%d)\n",d->d_name.name, + atomic_read(&d->d_count)); + + dput(parent); +} + +/** + * configfs_remove_dir - remove a config_item's directory. + * @item: config_item we're removing. + * + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be configfs_rmdir() below, instead of calling separately. + */ + +static void configfs_remove_dir(struct config_item * item) +{ + struct dentry * dentry = dget(item->ci_dentry); + + if (!dentry) + return; + + remove_dir(dentry); + /** + * Drop reference from dget() on entrance. + */ + dput(dentry); +} + + +/* attaches attribute's configfs_dirent to the dentry corresponding to the + * attribute file + */ +static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry) +{ + struct configfs_attribute * attr = sd->s_element; + int error; + + error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file); + if (error) + return error; + + dentry->d_op = &configfs_dentry_ops; + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; + d_rehash(dentry); + + return 0; +} + +static struct dentry * configfs_lookup(struct inode *dir, + struct dentry *dentry, + struct nameidata *nd) +{ + struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; + struct configfs_dirent * sd; + int found = 0; + int err = 0; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_type & CONFIGFS_NOT_PINNED) { + const unsigned char * name = configfs_get_name(sd); + + if (strcmp(name, dentry->d_name.name)) + continue; + + found = 1; + err = configfs_attach_attr(sd, dentry); + break; + } + } + + if (!found) { + /* + * If it doesn't exist and it isn't a NOT_PINNED item, + * it must be negative. + */ + return simple_lookup(dir, dentry, nd); + } + + return ERR_PTR(err); +} + +/* + * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are + * attributes and are removed by rmdir().
We recurse, taking i_mutex + * on all children that are candidates for default detach. If the + * result is clean, then configfs_detach_group() will handle dropping + * i_mutex. If there is an error, the caller will clean up the i_mutex + * holders via configfs_detach_rollback(). + */ +static int configfs_detach_prep(struct dentry *dentry) +{ + struct configfs_dirent *parent_sd = dentry->d_fsdata; + struct configfs_dirent *sd; + int ret; + + ret = -EBUSY; + if (!list_empty(&parent_sd->s_links)) + goto out; + + ret = 0; + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_type & CONFIGFS_NOT_PINNED) + continue; + if (sd->s_type & CONFIGFS_USET_DEFAULT) { + mutex_lock(&sd->s_dentry->d_inode->i_mutex); + /* Mark that we've taken i_mutex */ + sd->s_type |= CONFIGFS_USET_DROPPING; + + ret = configfs_detach_prep(sd->s_dentry); + if (!ret) + continue; + } else + ret = -ENOTEMPTY; + + break; + } + +out: + return ret; +} + +/* + * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is + * set. 
+ */ +static void configfs_detach_rollback(struct dentry *dentry) +{ + struct configfs_dirent *parent_sd = dentry->d_fsdata; + struct configfs_dirent *sd; + + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (sd->s_type & CONFIGFS_USET_DEFAULT) { + configfs_detach_rollback(sd->s_dentry); + + if (sd->s_type & CONFIGFS_USET_DROPPING) { + sd->s_type &= ~CONFIGFS_USET_DROPPING; + mutex_unlock(&sd->s_dentry->d_inode->i_mutex); + } + } + } +} + +static void detach_attrs(struct config_item * item) +{ + struct dentry * dentry = dget(item->ci_dentry); + struct configfs_dirent * parent_sd; + struct configfs_dirent * sd, * tmp; + + if (!dentry) + return; + + pr_debug("configfs %s: dropping attrs for dir\n", + dentry->d_name.name); + + parent_sd = dentry->d_fsdata; + list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED)) + continue; + list_del_init(&sd->s_sibling); + configfs_drop_dentry(sd, dentry); + configfs_put(sd); + } + + /** + * Drop reference from dget() on entrance. 
+ */ + dput(dentry); +} + +static int populate_attrs(struct config_item *item) +{ + struct config_item_type *t = item->ci_type; + struct configfs_attribute *attr; + int error = 0; + int i; + + if (!t) + return -EINVAL; + if (t->ct_attrs) { + for (i = 0; (attr = t->ct_attrs[i]) != NULL; i++) { + if ((error = configfs_create_file(item, attr))) + break; + } + } + + if (error) + detach_attrs(item); + + return error; +} + +static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry); +static void configfs_detach_group(struct config_item *item); + +static void detach_groups(struct config_group *group) +{ + struct dentry * dentry = dget(group->cg_item.ci_dentry); + struct dentry *child; + struct configfs_dirent *parent_sd; + struct configfs_dirent *sd, *tmp; + + if (!dentry) + return; + + parent_sd = dentry->d_fsdata; + list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { + if (!sd->s_element || + !(sd->s_type & CONFIGFS_USET_DEFAULT)) + continue; + + child = sd->s_dentry; + + configfs_detach_group(sd->s_element); + child->d_inode->i_flags |= S_DEAD; + + /* + * From rmdir/unregister, a configfs_detach_prep() pass + * has taken our i_mutex for us. Drop it. + * From mkdir/register cleanup, there is no sem held. + */ + if (sd->s_type & CONFIGFS_USET_DROPPING) + mutex_unlock(&child->d_inode->i_mutex); + + d_delete(child); + dput(child); + } + + /** + * Drop reference from dget() on entrance. + */ + dput(dentry); +} + +/* + * This fakes mkdir(2) on a default_groups[] entry. It + * creates a dentry, attachs it, and then does fixup + * on the sd->s_type. + * + * We could, perhaps, tweak our parent's ->mkdir for a minute and + * try using vfs_mkdir. Just a thought. 
+ */ +static int create_default_group(struct config_group *parent_group, + struct config_group *group) +{ + int ret; + struct qstr name; + struct configfs_dirent *sd; + /* We trust the caller holds a reference to parent */ + struct dentry *child, *parent = parent_group->cg_item.ci_dentry; + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; + name.name = group->cg_item.ci_name; + name.len = strlen(name.name); + name.hash = full_name_hash(name.name, name.len); + + ret = -ENOMEM; + child = d_alloc(parent, &name); + if (child) { + d_add(child, NULL); + + ret = configfs_attach_group(&parent_group->cg_item, + &group->cg_item, child); + if (!ret) { + sd = child->d_fsdata; + sd->s_type |= CONFIGFS_USET_DEFAULT; + } else { + d_delete(child); + dput(child); + } + } + + return ret; +} + +static int populate_groups(struct config_group *group) +{ + struct config_group *new_group; + struct dentry *dentry = group->cg_item.ci_dentry; + int ret = 0; + int i; + + if (group && group->default_groups) { + /* FYI, we're faking mkdir here + * I'm not sure we need this semaphore, as we're called + * from our parent's mkdir. That holds our parent's + * i_mutex, so afaik lookup cannot continue through our + * parent to find us, let alone mess with our tree. + * That said, taking our i_mutex is closer to mkdir + * emulation, and shouldn't hurt. */ + mutex_lock(&dentry->d_inode->i_mutex); + + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + + ret = create_default_group(group, new_group); + if (ret) + break; + } + + mutex_unlock(&dentry->d_inode->i_mutex); + } + + if (ret) + detach_groups(group); + + return ret; +} + +/* + * All of link_obj/unlink_obj/link_group/unlink_group require that + * subsys->su_sem is held. 
+ */ + +static void unlink_obj(struct config_item *item) +{ + struct config_group *group; + + group = item->ci_group; + if (group) { + list_del_init(&item->ci_entry); + + item->ci_group = NULL; + item->ci_parent = NULL; + config_item_put(item); + + config_group_put(group); + } +} + +static void link_obj(struct config_item *parent_item, struct config_item *item) +{ + /* Parent seems redundant with group, but it makes certain + * traversals much nicer. */ + item->ci_parent = parent_item; + item->ci_group = config_group_get(to_config_group(parent_item)); + list_add_tail(&item->ci_entry, &item->ci_group->cg_children); + + config_item_get(item); +} + +static void unlink_group(struct config_group *group) +{ + int i; + struct config_group *new_group; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + unlink_group(new_group); + } + } + + group->cg_subsys = NULL; + unlink_obj(&group->cg_item); +} + +static void link_group(struct config_group *parent_group, struct config_group *group) +{ + int i; + struct config_group *new_group; + struct configfs_subsystem *subsys = NULL; /* gcc is a turd */ + + link_obj(&parent_group->cg_item, &group->cg_item); + + if (parent_group->cg_subsys) + subsys = parent_group->cg_subsys; + else if (configfs_is_root(&parent_group->cg_item)) + subsys = to_configfs_subsystem(group); + else + BUG(); + group->cg_subsys = subsys; + + if (group->default_groups) { + for (i = 0; group->default_groups[i]; i++) { + new_group = group->default_groups[i]; + link_group(group, new_group); + } + } +} + +/* + * The goal is that configfs_attach_item() (and + * configfs_attach_group()) can be called from either the VFS or this + * module. That is, they assume that the items have been created, + * the dentry allocated, and the dcache is all ready to go. + * + * If they fail, they must clean up after themselves as if they + * had never been called. 
The caller (VFS or local function) will + * handle cleaning up the dcache bits. + * + * configfs_detach_group() and configfs_detach_item() behave similarly on + * the way out. They assume that the proper semaphores are held, they + * clean up the configfs items, and they expect their callers will + * handle the dcache bits. + */ +static int configfs_attach_item(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + int ret; + + ret = configfs_create_dir(item, dentry); + if (!ret) { + ret = populate_attrs(item); + if (ret) { + configfs_remove_dir(item); + d_delete(dentry); + } + } + + return ret; +} + +static void configfs_detach_item(struct config_item *item) +{ + detach_attrs(item); + configfs_remove_dir(item); +} + +static int configfs_attach_group(struct config_item *parent_item, + struct config_item *item, + struct dentry *dentry) +{ + int ret; + struct configfs_dirent *sd; + + ret = configfs_attach_item(parent_item, item, dentry); + if (!ret) { + sd = dentry->d_fsdata; + sd->s_type |= CONFIGFS_USET_DIR; + + ret = populate_groups(to_config_group(item)); + if (ret) { + configfs_detach_item(item); + d_delete(dentry); + } + } + + return ret; +} + +static void configfs_detach_group(struct config_item *item) +{ + detach_groups(to_config_group(item)); + configfs_detach_item(item); +} + +/* + * Drop the initial reference from make_item()/make_group() + * This function assumes that reference is held on item + * and that item holds a valid reference to the parent. Also, it + * assumes the caller has validated ci_type. 
+ */ +static void client_drop_item(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + if (type->ct_group_ops && type->ct_group_ops->drop_item) + type->ct_group_ops->drop_item(to_config_group(parent_item), + item); + else + config_item_put(item); +} + + +static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int ret; + struct config_group *group; + struct config_item *item; + struct config_item *parent_item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; + struct config_item_type *type; + struct module *owner; + char *name; + + if (dentry->d_parent == configfs_sb->s_root) + return -EPERM; + + sd = dentry->d_parent->d_fsdata; + if (!(sd->s_type & CONFIGFS_USET_DIR)) + return -EPERM; + + parent_item = configfs_get_config_item(dentry->d_parent); + type = parent_item->ci_type; + subsys = to_config_group(parent_item)->cg_subsys; + BUG_ON(!subsys); + + if (!type || !type->ct_group_ops || + (!type->ct_group_ops->make_group && + !type->ct_group_ops->make_item)) { + config_item_put(parent_item); + return -EPERM; /* What lack-of-mkdir returns */ + } + + name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); + if (!name) { + config_item_put(parent_item); + return -ENOMEM; + } + snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); + + down(&subsys->su_sem); + group = NULL; + item = NULL; + if (type->ct_group_ops->make_group) { + group = type->ct_group_ops->make_group(to_config_group(parent_item), name); + if (group) { + link_group(to_config_group(parent_item), group); + item = &group->cg_item; + } + } else { + item = type->ct_group_ops->make_item(to_config_group(parent_item), name); + if (item) + link_obj(parent_item, item); + } + up(&subsys->su_sem); + + kfree(name); + if (!item) { + config_item_put(parent_item); + return -ENOMEM; + } + + ret = -EINVAL; + type = item->ci_type; + if (type) { + owner = type->ct_owner; + if 
(try_module_get(owner)) { + if (group) { + ret = configfs_attach_group(parent_item, + item, + dentry); + } else { + ret = configfs_attach_item(parent_item, + item, + dentry); + } + + if (ret) { + down(&subsys->su_sem); + if (group) + unlink_group(group); + else + unlink_obj(item); + client_drop_item(parent_item, item); + up(&subsys->su_sem); + + config_item_put(parent_item); + module_put(owner); + } + } + } + + return ret; +} + +static int configfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct config_item *parent_item; + struct config_item *item; + struct configfs_subsystem *subsys; + struct configfs_dirent *sd; + struct module *owner = NULL; + int ret; + + if (dentry->d_parent == configfs_sb->s_root) + return -EPERM; + + sd = dentry->d_fsdata; + if (sd->s_type & CONFIGFS_USET_DEFAULT) + return -EPERM; + + parent_item = configfs_get_config_item(dentry->d_parent); + subsys = to_config_group(parent_item)->cg_subsys; + BUG_ON(!subsys); + + if (!parent_item->ci_type) { + config_item_put(parent_item); + return -EINVAL; + } + + ret = configfs_detach_prep(dentry); + if (ret) { + configfs_detach_rollback(dentry); + config_item_put(parent_item); + return ret; + } + + item = configfs_get_config_item(dentry); + + /* Drop reference from above, item already holds one. 
*/ + config_item_put(parent_item); + + if (item->ci_type) + owner = item->ci_type->ct_owner; + + if (sd->s_type & CONFIGFS_USET_DIR) { + configfs_detach_group(item); + + down(&subsys->su_sem); + unlink_group(to_config_group(item)); + } else { + configfs_detach_item(item); + + down(&subsys->su_sem); + unlink_obj(item); + } + + client_drop_item(parent_item, item); + up(&subsys->su_sem); + + /* Drop our reference from above */ + config_item_put(item); + + module_put(owner); + + return 0; +} + +struct inode_operations configfs_dir_inode_operations = { + .mkdir = configfs_mkdir, + .rmdir = configfs_rmdir, + .symlink = configfs_symlink, + .unlink = configfs_unlink, + .lookup = configfs_lookup, +}; + +#if 0 +int configfs_rename_dir(struct config_item * item, const char *new_name) +{ + int error = 0; + struct dentry * new_dentry, * parent; + + if (!strcmp(config_item_name(item), new_name)) + return -EINVAL; + + if (!item->parent) + return -EINVAL; + + down_write(&configfs_rename_sem); + parent = item->parent->dentry; + + mutex_lock(&parent->d_inode->i_mutex); + + new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); + if (!IS_ERR(new_dentry)) { + if (!new_dentry->d_inode) { + error = config_item_set_name(item, "%s", new_name); + if (!error) { + d_add(new_dentry, NULL); + d_move(item->dentry, new_dentry); + } + else + d_delete(new_dentry); + } else + error = -EEXIST; + dput(new_dentry); + } + mutex_unlock(&parent->d_inode->i_mutex); + up_write(&configfs_rename_sem); + + return error; +} +#endif + +static int configfs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry * dentry = file->f_dentry; + struct configfs_dirent * parent_sd = dentry->d_fsdata; + + mutex_lock(&dentry->d_inode->i_mutex); + file->private_data = configfs_new_dirent(parent_sd, NULL); + mutex_unlock(&dentry->d_inode->i_mutex); + + return file->private_data ? 
0 : -ENOMEM; + +} + +static int configfs_dir_close(struct inode *inode, struct file *file) +{ + struct dentry * dentry = file->f_dentry; + struct configfs_dirent * cursor = file->private_data; + + mutex_lock(&dentry->d_inode->i_mutex); + list_del_init(&cursor->s_sibling); + mutex_unlock(&dentry->d_inode->i_mutex); + + release_configfs_dirent(cursor); + + return 0; +} + +/* Relationship between s_mode and the DT_xxx types */ +static inline unsigned char dt_type(struct configfs_dirent *sd) +{ + return (sd->s_mode >> 12) & 15; +} + +static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_dentry; + struct configfs_dirent * parent_sd = dentry->d_fsdata; + struct configfs_dirent *cursor = filp->private_data; + struct list_head *p, *q = &cursor->s_sibling; + ino_t ino; + int i = filp->f_pos; + + switch (i) { + case 0: + ino = dentry->d_inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; + i++; + /* fallthrough */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + break; + filp->f_pos++; + i++; + /* fallthrough */ + default: + if (filp->f_pos == 2) { + list_del(q); + list_add(q, &parent_sd->s_children); + } + for (p=q->next; p!= &parent_sd->s_children; p=p->next) { + struct configfs_dirent *next; + const char * name; + int len; + + next = list_entry(p, struct configfs_dirent, + s_sibling); + if (!next->s_element) + continue; + + name = configfs_get_name(next); + len = strlen(name); + if (next->s_dentry) + ino = next->s_dentry->d_inode->i_ino; + else + ino = iunique(configfs_sb, 2); + + if (filldir(dirent, name, len, filp->f_pos, ino, + dt_type(next)) < 0) + return 0; + + list_del(q); + list_add(q, p); + p = q; + filp->f_pos++; + } + } + return 0; +} + +static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin) +{ + struct dentry * dentry = file->f_dentry; + + mutex_lock(&dentry->d_inode->i_mutex); + switch 
(origin) { + case 1: + offset += file->f_pos; + case 0: + if (offset >= 0) + break; + default: + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + if (file->f_pos >= 2) { + struct configfs_dirent *sd = dentry->d_fsdata; + struct configfs_dirent *cursor = file->private_data; + struct list_head *p; + loff_t n = file->f_pos - 2; + + list_del(&cursor->s_sibling); + p = sd->s_children.next; + while (n && p != &sd->s_children) { + struct configfs_dirent *next; + next = list_entry(p, struct configfs_dirent, + s_sibling); + if (next->s_element) + n--; + p = p->next; + } + list_add_tail(&cursor->s_sibling, p); + } + } + mutex_unlock(&dentry->d_inode->i_mutex); + return offset; +} + +struct file_operations configfs_dir_operations = { + .open = configfs_dir_open, + .release = configfs_dir_close, + .llseek = configfs_dir_lseek, + .read = generic_read_dir, + .readdir = configfs_readdir, +}; + +int configfs_register_subsystem(struct configfs_subsystem *subsys) +{ + int err; + struct config_group *group = &subsys->su_group; + struct qstr name; + struct dentry *dentry; + struct configfs_dirent *sd; + + err = configfs_pin_fs(); + if (err) + return err; + + if (!group->cg_item.ci_name) + group->cg_item.ci_name = group->cg_item.ci_namebuf; + + sd = configfs_sb->s_root->d_fsdata; + link_group(to_config_group(sd->s_element), group); + + mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); + + name.name = group->cg_item.ci_name; + name.len = strlen(name.name); + name.hash = full_name_hash(name.name, name.len); + + err = -ENOMEM; + dentry = d_alloc(configfs_sb->s_root, &name); + if (!dentry) + goto out_release; + + d_add(dentry, NULL); + + err = configfs_attach_group(sd->s_element, &group->cg_item, + dentry); + if (!err) + dentry = NULL; + else + d_delete(dentry); + + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); + + if (dentry) { + dput(dentry); +out_release: + unlink_group(group); + 
configfs_release_fs(); + } + + return err; +} + +void configfs_unregister_subsystem(struct configfs_subsystem *subsys) +{ + struct config_group *group = &subsys->su_group; + struct dentry *dentry = group->cg_item.ci_dentry; + + if (dentry->d_parent != configfs_sb->s_root) { + printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); + return; + } + + mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); + mutex_lock(&dentry->d_inode->i_mutex); + if (configfs_detach_prep(dentry)) { + printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); + } + configfs_detach_group(&group->cg_item); + dentry->d_inode->i_flags |= S_DEAD; + mutex_unlock(&dentry->d_inode->i_mutex); + + d_delete(dentry); + + mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex); + + dput(dentry); + + unlink_group(group); + configfs_release_fs(); +} + +EXPORT_SYMBOL(configfs_register_subsystem); +EXPORT_SYMBOL(configfs_unregister_subsystem); diff --git a/fs/configfs/file.c b/fs/configfs/file.c new file mode 100644 index 000000000000..c26cd61f13af --- /dev/null +++ b/fs/configfs/file.c @@ -0,0 +1,360 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c - operations for regular (text) files. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + */ + +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/dnotify.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include <asm/semaphore.h> + +#include <linux/configfs.h> +#include "configfs_internal.h" + + +struct configfs_buffer { + size_t count; + loff_t pos; + char * page; + struct configfs_item_operations * ops; + struct semaphore sem; + int needs_read_fill; +}; + + +/** + * fill_read_buffer - allocate and fill buffer from item. + * @dentry: dentry pointer. + * @buffer: data buffer for file. + * + * Allocate @buffer->page, if it hasn't been already, then call the + * config_item's show() method to fill the buffer with this attribute's + * data. + * This is called only once, on the file's first read. + */ +static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buffer) +{ + struct configfs_attribute * attr = to_attr(dentry); + struct config_item * item = to_item(dentry->d_parent); + struct configfs_item_operations * ops = buffer->ops; + int ret = 0; + ssize_t count; + + if (!buffer->page) + buffer->page = (char *) get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + count = ops->show_attribute(item,attr,buffer->page); + buffer->needs_read_fill = 0; + BUG_ON(count > (ssize_t)PAGE_SIZE); + if (count >= 0) + buffer->count = count; + else + ret = count; + return ret; +} + + +/** + * flush_read_buffer - push buffer to userspace. + * @buffer: data buffer for file. + * @userbuf: user-passed buffer. + * @count: number of bytes requested. + * @ppos: file position. + * + * Copy the buffer we filled in fill_read_buffer() to userspace. + * This is done at the reader's leisure, copying and advancing + * the amount they specify each time. + * This may be called continuously until the buffer is empty. 
+ */ +static int flush_read_buffer(struct configfs_buffer * buffer, char __user * buf, + size_t count, loff_t * ppos) +{ + int error; + + if (*ppos > buffer->count) + return 0; + + if (count > (buffer->count - *ppos)) + count = buffer->count - *ppos; + + error = copy_to_user(buf,buffer->page + *ppos,count); + if (!error) + *ppos += count; + return error ? -EFAULT : count; +} + +/** + * configfs_read_file - read an attribute. + * @file: file pointer. + * @buf: buffer to fill. + * @count: number of bytes to read. + * @ppos: starting offset in file. + * + * Userspace wants to read an attribute file. The attribute descriptor + * is in the file's ->d_fsdata. The target item is in the directory's + * ->d_fsdata. + * + * We call fill_read_buffer() to allocate and fill the buffer from the + * item's show() method exactly once (if the read is happening from + * the beginning of the file). That should fill the entire buffer with + * all the data the item has to offer for that attribute. + * We then call flush_read_buffer() to copy the buffer to userspace + * in the increments specified. + */ + +static ssize_t +configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct configfs_buffer * buffer = file->private_data; + ssize_t retval = 0; + + down(&buffer->sem); + if (buffer->needs_read_fill) { + if ((retval = fill_read_buffer(file->f_dentry,buffer))) + goto out; + } + pr_debug("%s: count = %d, ppos = %lld, buf = %s\n", + __FUNCTION__,count,*ppos,buffer->page); + retval = flush_read_buffer(buffer,buf,count,ppos); +out: + up(&buffer->sem); + return retval; +} + + +/** + * fill_write_buffer - copy buffer from userspace. + * @buffer: data buffer for file. + * @userbuf: data from user. + * @count: number of bytes in @userbuf. + * + * Allocate @buffer->page if it hasn't been already, then + * copy the user-supplied buffer into it. 
+ */ + +static int +fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size_t count) +{ + int error; + + if (!buffer->page) + buffer->page = (char *)get_zeroed_page(GFP_KERNEL); + if (!buffer->page) + return -ENOMEM; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + error = copy_from_user(buffer->page,buf,count); + buffer->needs_read_fill = 1; + return error ? -EFAULT : count; +} + + +/** + * flush_write_buffer - push buffer to config_item. + * @file: file pointer. + * @buffer: data buffer for file. + * + * Get the correct pointers for the config_item and the attribute we're + * dealing with, then call the store() method for the attribute, + * passing the buffer that we acquired in fill_write_buffer(). + */ + +static int +flush_write_buffer(struct dentry * dentry, struct configfs_buffer * buffer, size_t count) +{ + struct configfs_attribute * attr = to_attr(dentry); + struct config_item * item = to_item(dentry->d_parent); + struct configfs_item_operations * ops = buffer->ops; + + return ops->store_attribute(item,attr,buffer->page,count); +} + + +/** + * configfs_write_file - write an attribute. + * @file: file pointer + * @buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Similar to configfs_read_file(), though working in the opposite direction. + * We allocate and fill the data from the user in fill_write_buffer(), + * then push it to the config_item in flush_write_buffer(). + * There is no easy way for us to know if userspace is only doing a partial + * write, so we don't support them. We expect the entire buffer to come + * on the first write. + * Hint: if you're writing a value, first read the file, modify only the + * the value you're changing, then write entire buffer back. 
+ */ + +static ssize_t +configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +{ + struct configfs_buffer * buffer = file->private_data; + + down(&buffer->sem); + count = fill_write_buffer(buffer,buf,count); + if (count > 0) + count = flush_write_buffer(file->f_dentry,buffer,count); + if (count > 0) + *ppos += count; + up(&buffer->sem); + return count; +} + +static int check_perm(struct inode * inode, struct file * file) +{ + struct config_item *item = configfs_get_config_item(file->f_dentry->d_parent); + struct configfs_attribute * attr = to_attr(file->f_dentry); + struct configfs_buffer * buffer; + struct configfs_item_operations * ops = NULL; + int error = 0; + + if (!item || !attr) + goto Einval; + + /* Grab the module reference for this attribute if we have one */ + if (!try_module_get(attr->ca_owner)) { + error = -ENODEV; + goto Done; + } + + if (item->ci_type) + ops = item->ci_type->ct_item_ops; + else + goto Eaccess; + + /* File needs write support. + * The inode's perms must say it's ok, + * and we must have a store method. + */ + if (file->f_mode & FMODE_WRITE) { + + if (!(inode->i_mode & S_IWUGO) || !ops->store_attribute) + goto Eaccess; + + } + + /* File needs read support. + * The inode's perms must say it's ok, and we there + * must be a show method for it. + */ + if (file->f_mode & FMODE_READ) { + if (!(inode->i_mode & S_IRUGO) || !ops->show_attribute) + goto Eaccess; + } + + /* No error? Great, allocate a buffer for the file, and store it + * it in file->private_data for easy access. 
+ */ + buffer = kmalloc(sizeof(struct configfs_buffer),GFP_KERNEL); + if (buffer) { + memset(buffer,0,sizeof(struct configfs_buffer)); + init_MUTEX(&buffer->sem); + buffer->needs_read_fill = 1; + buffer->ops = ops; + file->private_data = buffer; + } else + error = -ENOMEM; + goto Done; + + Einval: + error = -EINVAL; + goto Done; + Eaccess: + error = -EACCES; + module_put(attr->ca_owner); + Done: + if (error && item) + config_item_put(item); + return error; +} + +static int configfs_open_file(struct inode * inode, struct file * filp) +{ + return check_perm(inode,filp); +} + +static int configfs_release(struct inode * inode, struct file * filp) +{ + struct config_item * item = to_item(filp->f_dentry->d_parent); + struct configfs_attribute * attr = to_attr(filp->f_dentry); + struct module * owner = attr->ca_owner; + struct configfs_buffer * buffer = filp->private_data; + + if (item) + config_item_put(item); + /* After this point, attr should not be accessed. */ + module_put(owner); + + if (buffer) { + if (buffer->page) + free_page((unsigned long)buffer->page); + kfree(buffer); + } + return 0; +} + +struct file_operations configfs_file_operations = { + .read = configfs_read_file, + .write = configfs_write_file, + .llseek = generic_file_llseek, + .open = configfs_open_file, + .release = configfs_release, +}; + + +int configfs_add_file(struct dentry * dir, const struct configfs_attribute * attr, int type) +{ + struct configfs_dirent * parent_sd = dir->d_fsdata; + umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; + int error = 0; + + mutex_lock(&dir->d_inode->i_mutex); + error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); + mutex_unlock(&dir->d_inode->i_mutex); + + return error; +} + + +/** + * configfs_create_file - create an attribute file for an item. + * @item: item we're creating for. + * @attr: atrribute descriptor. 
+ */ + +int configfs_create_file(struct config_item * item, const struct configfs_attribute * attr) +{ + BUG_ON(!item || !item->ci_dentry || !attr); + + return configfs_add_file(item->ci_dentry, attr, + CONFIGFS_ITEM_ATTR); +} + diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c new file mode 100644 index 000000000000..6577c588de9d --- /dev/null +++ b/fs/configfs/inode.c @@ -0,0 +1,162 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c - basic inode and dentry operations. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on sysfs: + * sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel + * + * configfs Copyright (C) 2005 Oracle. All rights reserved. + * + * Please see Documentation/filesystems/configfs.txt for more information. 
+ */ + +#undef DEBUG + +#include <linux/pagemap.h> +#include <linux/namei.h> +#include <linux/backing-dev.h> + +#include <linux/configfs.h> +#include "configfs_internal.h" + +extern struct super_block * configfs_sb; + +static struct address_space_operations configfs_aops = { + .readpage = simple_readpage, + .prepare_write = simple_prepare_write, + .commit_write = simple_commit_write +}; + +static struct backing_dev_info configfs_backing_dev_info = { + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, +}; + +struct inode * configfs_new_inode(mode_t mode) +{ + struct inode * inode = new_inode(configfs_sb); + if (inode) { + inode->i_mode = mode; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mapping->a_ops = &configfs_aops; + inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; + } + return inode; +} + +int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode *)) +{ + int error = 0; + struct inode * inode = NULL; + if (dentry) { + if (!dentry->d_inode) { + if ((inode = configfs_new_inode(mode))) { + if (dentry->d_parent && dentry->d_parent->d_inode) { + struct inode *p_inode = dentry->d_parent->d_inode; + p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; + } + goto Proceed; + } + else + error = -ENOMEM; + } else + error = -EEXIST; + } else + error = -ENOENT; + goto Done; + + Proceed: + if (init) + error = init(inode); + if (!error) { + d_instantiate(dentry, inode); + if (S_ISDIR(mode) || S_ISLNK(mode)) + dget(dentry); /* pin link and directory dentries in core */ + } else + iput(inode); + Done: + return error; +} + +/* + * Get the name for corresponding element represented by the given configfs_dirent + */ +const unsigned char * configfs_get_name(struct configfs_dirent *sd) +{ + struct attribute * attr; + + if (!sd || !sd->s_element) + BUG(); + + /* These 
always have a dentry, so use that */ + if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) + return sd->s_dentry->d_name.name; + + if (sd->s_type & CONFIGFS_ITEM_ATTR) { + attr = sd->s_element; + return attr->name; + } + return NULL; +} + + +/* + * Unhashes the dentry corresponding to given configfs_dirent + * Called with parent inode's i_mutex held. + */ +void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) +{ + struct dentry * dentry = sd->s_dentry; + + if (dentry) { + spin_lock(&dcache_lock); + if (!(d_unhashed(dentry) && dentry->d_inode)) { + dget_locked(dentry); + __d_drop(dentry); + spin_unlock(&dcache_lock); + simple_unlink(parent->d_inode, dentry); + } else + spin_unlock(&dcache_lock); + } +} + +void configfs_hash_and_remove(struct dentry * dir, const char * name) +{ + struct configfs_dirent * sd; + struct configfs_dirent * parent_sd = dir->d_fsdata; + + mutex_lock(&dir->d_inode->i_mutex); + list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { + if (!sd->s_element) + continue; + if (!strcmp(configfs_get_name(sd), name)) { + list_del_init(&sd->s_sibling); + configfs_drop_dentry(sd, dir); + configfs_put(sd); + break; + } + } + mutex_unlock(&dir->d_inode->i_mutex); +} + + diff --git a/fs/configfs/item.c b/fs/configfs/item.c new file mode 100644 index 000000000000..e07485ac50ad --- /dev/null +++ b/fs/configfs/item.c @@ -0,0 +1,227 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * item.c - library routines for handling generic config items + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on kobject:
+ * 	kobject is Copyright (c) 2002-2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ *
+ * Please see the file Documentation/filesystems/configfs.txt for
+ * critical information about using the config_item interface.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+
+/* Map a ci_entry list node back to the config_item that embeds it. */
+static inline struct config_item * to_item(struct list_head * entry)
+{
+	return container_of(entry,struct config_item,ci_entry);
+}
+
+/* Evil kernel */
+static void config_item_release(struct kref *kref);
+
+/**
+ *	config_item_init - initialize item.
+ *	@item:	item in question.
+ *
+ *	Initializes the reference count and the sibling list head.
+ */
+void config_item_init(struct config_item * item)
+{
+	kref_init(&item->ci_kref);
+	INIT_LIST_HEAD(&item->ci_entry);
+}
+
+/**
+ *	config_item_set_name - Set the name of an item
+ *	@item:	item.
+ *	@fmt:	printf-style format string for the name, followed by
+ *		its arguments.
+ *
+ *	If the formatted name needs CONFIGFS_ITEM_NAME_LEN bytes or more,
+ *	then use a dynamically allocated string that @item->ci_name points
+ *	to.  Otherwise, use the static @item->ci_namebuf array.
+ *
+ *	Returns 0 on success, -ENOMEM if the allocation fails, or -EFAULT
+ *	if the second formatting pass still does not fit.
+ */
+
+int config_item_set_name(struct config_item * item, const char * fmt, ...)
+{
+	int error = 0;
+	int limit = CONFIGFS_ITEM_NAME_LEN;
+	int need;
+	va_list args;
+	char * name;
+
+	/*
+	 * First, try the static array
+	 */
+	va_start(args,fmt);
+	need = vsnprintf(item->ci_namebuf,limit,fmt,args);
+	va_end(args);
+	if (need < limit)
+		name = item->ci_namebuf;
+	else {
+		/*
+		 * Need more space? Allocate it and try again
+		 */
+		limit = need + 1;
+		name = kmalloc(limit,GFP_KERNEL);
+		if (!name) {
+			error = -ENOMEM;
+			goto Done;
+		}
+		va_start(args,fmt);
+		need = vsnprintf(name,limit,fmt,args);
+		va_end(args);
+
+		/* Still? Give up. */
+		if (need >= limit) {
+			kfree(name);
+			error = -EFAULT;
+			goto Done;
+		}
+	}
+
+	/* Free the old name, if necessary. */
+	if (item->ci_name && item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+
+	/* Now, set the new name */
+	item->ci_name = name;
+ Done:
+	return error;
+}
+
+EXPORT_SYMBOL(config_item_set_name);
+
+/*
+ * Name @item, set its type and initialize it.
+ *
+ * @name is a plain string, not a format: it is passed through "%s" so
+ * that any '%' it contains is copied literally instead of being
+ * interpreted as a conversion by config_item_set_name().
+ */
+void config_item_init_type_name(struct config_item *item,
+				const char *name,
+				struct config_item_type *type)
+{
+	config_item_set_name(item, "%s", name);
+	item->ci_type = type;
+	config_item_init(item);
+}
+EXPORT_SYMBOL(config_item_init_type_name);
+
+/*
+ * Group counterpart of config_item_init_type_name(); @name is likewise
+ * treated as a literal string.
+ */
+void config_group_init_type_name(struct config_group *group, const char *name,
+			 struct config_item_type *type)
+{
+	config_item_set_name(&group->cg_item, "%s", name);
+	group->cg_item.ci_type = type;
+	config_group_init(group);
+}
+EXPORT_SYMBOL(config_group_init_type_name);
+
+/* Take a reference on @item (NULL is tolerated) and return it. */
+struct config_item * config_item_get(struct config_item * item)
+{
+	if (item)
+		kref_get(&item->ci_kref);
+	return item;
+}
+
+/**
+ *	config_item_cleanup - free config_item resources.
+ *	@item:	item.
+ */
+
+void config_item_cleanup(struct config_item * item)
+{
+	/*
+	 * Read everything needed from @item up front: these values are
+	 * still used after ->release() has run (presumably because
+	 * release may free @item -- confirm against the item types).
+	 */
+	struct config_item_type * t = item->ci_type;
+	struct config_group * s = item->ci_group;
+	struct config_item * parent = item->ci_parent;
+
+	pr_debug("config_item %s: cleaning up\n",config_item_name(item));
+	/* Only a dynamically allocated name needs freeing */
+	if (item->ci_name != item->ci_namebuf)
+		kfree(item->ci_name);
+	item->ci_name = NULL;
+	if (t && t->ct_item_ops && t->ct_item_ops->release)
+		t->ct_item_ops->release(item);
+	if (s)
+		config_group_put(s);
+	if (parent)
+		config_item_put(parent);
+}
+
+/* kref release callback: recover the item from its kref and clean it up. */
+static void config_item_release(struct kref *kref)
+{
+	config_item_cleanup(container_of(kref, struct config_item, ci_kref));
+}
+
+/**
+ *	config_item_put - decrement refcount for item.
+ *	@item:	item.  May be NULL, in which case nothing happens.
+ *
+ *	Decrement the refcount, and if 0, call config_item_cleanup().
+ */
+void config_item_put(struct config_item * item)
+{
+	if (item)
+		kref_put(&item->ci_kref, config_item_release);
+}
+
+
+/**
+ *	config_group_init - initialize a group for use
+ *	@group:	group
+ *
+ *	Initializes the embedded item and the empty children list.
+ */
+
+void config_group_init(struct config_group *group)
+{
+	config_item_init(&group->cg_item);
+	INIT_LIST_HEAD(&group->cg_children);
+}
+
+
+/**
+ *	config_group_find_obj - search for item in group.
+ *	@group:	group we're looking in.
+ *	@name:	item's name.
+ *
+ *	Iterate over @group->cg_children, looking for a matching
+ *	config_item.  If matching item is found take a reference and
+ *	return the item; otherwise return NULL.
+ *
+ *	NOTE: no lock is taken here (see the XXX below); callers
+ *	presumably have to serialize against list changes themselves.
+ */
+
+struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+{
+	struct list_head * entry;
+	struct config_item * ret = NULL;
+
+        /* XXX LOCKING! */
+	list_for_each(entry,&group->cg_children) {
+		struct config_item * item = to_item(entry);
+		/* items without a name can never match */
+		if (config_item_name(item) &&
+                    !strcmp(config_item_name(item), name)) {
+			ret = config_item_get(item);
+			break;
+		}
+	}
+	return ret;
+}
+
+
+EXPORT_SYMBOL(config_item_init);
+EXPORT_SYMBOL(config_group_init);
+EXPORT_SYMBOL(config_item_get);
+EXPORT_SYMBOL(config_item_put);
+
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
new file mode 100644
index 000000000000..1a2f6f6a4d91
--- /dev/null
+++ b/fs/configfs/mount.c
@@ -0,0 +1,159 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * mount.c - operations for initializing and mounting configfs.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/* Random magic number */
+#define CONFIGFS_MAGIC 0x62656570
+
+struct vfsmount * configfs_mount = NULL;
+struct super_block * configfs_sb = NULL;
+static int configfs_mnt_count = 0;
+
+static struct super_operations configfs_ops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+};
+
+/* The group backing the filesystem root; its name lives in ci_namebuf. */
+static struct config_group configfs_root_group = {
+	.cg_item = {
+		.ci_namebuf	= "root",
+		.ci_name	= configfs_root_group.cg_item.ci_namebuf,
+	},
+};
+
+/* Returns non-zero iff @item is the item embedded in the root group. */
+int configfs_is_root(struct config_item *item)
+{
+	return item == &configfs_root_group.cg_item;
+}
+
+/* The dirent attached to the root dentry (see configfs_fill_super()). */
+static struct configfs_dirent configfs_root = {
+	.s_sibling	= LIST_HEAD_INIT(configfs_root.s_sibling),
+	.s_children	= LIST_HEAD_INIT(configfs_root.s_children),
+	.s_element	= &configfs_root_group.cg_item,
+	.s_type		= CONFIGFS_ROOT,
+};
+
+/*
+ * Fill in the superblock: set block size, magic and ops, publish it in
+ * configfs_sb, then build the root directory inode and dentry and tie
+ * them to configfs_root_group / configfs_root.
+ *
+ * Returns 0 on success or -ENOMEM if the root inode or dentry cannot
+ * be allocated.
+ */
+static int configfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = CONFIGFS_MAGIC;
+	sb->s_op = &configfs_ops;
+	/* must be set before configfs_new_inode(), which allocates on it */
+	configfs_sb = sb;
+
+	inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO);
+	if (inode) {
+		inode->i_op = &configfs_dir_inode_operations;
+		inode->i_fop = &configfs_dir_operations;
+		/* directory inodes start off with i_nlink == 2 (for "." entry) */
+		inode->i_nlink++;
+	} else {
+		pr_debug("configfs: could not get root inode\n");
+		return -ENOMEM;
+	}
+
+	root = d_alloc_root(inode);
+	if (!root) {
+		pr_debug("%s: could not get root dentry!\n",__FUNCTION__);
+		iput(inode);
+		return -ENOMEM;
+	}
+	config_group_init(&configfs_root_group);
+	configfs_root_group.cg_item.ci_dentry = root;
+	root->d_fsdata = &configfs_root;
+	sb->s_root = root;
+	return 0;
+}
+
+/* get_sb hook: one shared superblock, filled by configfs_fill_super(). */
+static struct super_block *configfs_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data)
+{
+	return get_sb_single(fs_type, flags, data, configfs_fill_super);
+}
+
+static struct file_system_type configfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "configfs",
+	.get_sb		= configfs_get_sb,
+	.kill_sb	= kill_litter_super,
+};
+
+/* Pin the internal configfs mount; paired with configfs_release_fs(). */
+int configfs_pin_fs(void)
+{
+	return simple_pin_fs("configfs", &configfs_mount,
+			     &configfs_mnt_count);
+}
+
+/* Drop a reference taken by configfs_pin_fs(). */
+void configfs_release_fs(void)
+{
+	simple_release_fs(&configfs_mount, &configfs_mnt_count);
+}
+
+
+static decl_subsys(config, NULL, NULL);
+
+/*
+ * Module init: register the "config" subsystem under kernel_subsys,
+ * then the filesystem type.  The subsystem is unregistered again if
+ * filesystem registration fails.
+ */
+static int __init configfs_init(void)
+{
+	int err;
+
+	kset_set_kset_s(&config_subsys, kernel_subsys);
+	err = subsystem_register(&config_subsys);
+	if (err)
+		return err;
+
+	err = register_filesystem(&configfs_fs_type);
+	if (err) {
+		printk(KERN_ERR "configfs: Unable to register filesystem!\n");
+		subsystem_unregister(&config_subsys);
+	}
+
+	return err;
+}
+
+/* Module exit: undo configfs_init() in reverse order. */
+static void __exit configfs_exit(void)
+{
+	unregister_filesystem(&configfs_fs_type);
+	subsystem_unregister(&config_subsys);
+}
+
+MODULE_AUTHOR("Oracle");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("0.0.1");
+MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration.");
+
+module_init(configfs_init);
+module_exit(configfs_exit);
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
new file mode 100644
index 000000000000..50f5840521a9
--- /dev/null
+++ b/fs/configfs/symlink.c
@@ -0,0 +1,281 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim:
noexpandtab sw=8 ts=8 sts=0:
+ *
+ * symlink.c - operations for configfs symlinks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Based on sysfs:
+ * 	sysfs is Copyright (C) 2001, 2002, 2003 Patrick Mochel
+ *
+ * configfs Copyright (C) 2005 Oracle.  All rights reserved.
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+
+#include <linux/configfs.h>
+#include "configfs_internal.h"
+
+/* Number of path components from @item up to (not including) the root. */
+static int item_depth(struct config_item * item)
+{
+	struct config_item * p = item;
+	int depth = 0;
+	do { depth++; } while ((p = p->ci_parent) && !configfs_is_root(p));
+	return depth;
+}
+
+/*
+ * Bytes needed for the path of @item below the root: each component's
+ * name plus one separator, plus one extra byte.
+ */
+static int item_path_length(struct config_item * item)
+{
+	struct config_item * p = item;
+	int length = 1;
+	do {
+		length += strlen(config_item_name(p)) + 1;
+		p = p->ci_parent;
+	} while (p && !configfs_is_root(p));
+	return length;
+}
+
+/*
+ * Write the "/"-separated path of @item into @buffer back to front,
+ * ending just before offset @length - 1.  Nothing is NUL-terminated
+ * here (strncpy copies exactly the name bytes); the caller supplies a
+ * zeroed buffer.
+ */
+static void fill_item_path(struct config_item * item, char * buffer, int length)
+{
+	struct config_item * p;
+
+	--length;
+	for (p = item; p && !configfs_is_root(p); p = p->ci_parent) {
+		int cur = strlen(config_item_name(p));
+
+		/* back up enough to print this bus id with '/' */
+		length -= cur;
+		strncpy(buffer + length,config_item_name(p),cur);
+		*(buffer + --length) = '/';
+	}
+}
+
+/*
+ * Allocate a configfs_symlink pointing at @item, hang it on the target's
+ * s_links list and create the link dentry under @parent_item.
+ * On failure everything done here is undone.  Returns 0 or a negative errno.
+ */
+static int create_link(struct config_item *parent_item,
+		       struct config_item *item,
+		       struct dentry *dentry)
+{
+	struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	int ret;
+
+	ret = -ENOMEM;
+	sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
+	if (sl) {
+		sl->sl_target = config_item_get(item);
+		/* FIXME: needs a lock, I'd bet */
+		list_add(&sl->sl_list, &target_sd->s_links);
+		ret = configfs_create_link(sl, parent_item->ci_dentry,
+					   dentry);
+		if (ret) {
+			list_del_init(&sl->sl_list);
+			config_item_put(item);
+			kfree(sl);
+		}
+	}
+
+	return ret;
+}
+
+
+/*
+ * Resolve @symname and return the config_item behind it in @target.
+ *
+ * On success (0) the caller owns both the reference on *@target and the
+ * looked-up path in @nd, and must drop them with config_item_put() and
+ * path_release().  On every failure nothing is left held: the path is
+ * released here, including when the target is not on configfs (-EPERM),
+ * a branch that previously leaked it.
+ */
+static int get_target(const char *symname, struct nameidata *nd,
+		      struct config_item **target)
+{
+	int ret;
+
+	ret = path_lookup(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, nd);
+	if (!ret) {
+		if (nd->dentry->d_sb == configfs_sb) {
+			*target = configfs_get_config_item(nd->dentry);
+			if (!*target) {
+				ret = -ENOENT;
+				path_release(nd);
+			}
+		} else {
+			ret = -EPERM;
+			path_release(nd);
+		}
+	}
+
+	return ret;
+}
+
+
+/*
+ * symlink(2) handler.  Links are refused directly under the root and for
+ * parents whose type has no ->allow_link; otherwise the target is looked
+ * up, the parent's ->allow_link is consulted and the link is created.
+ */
+int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	int ret;
+	struct nameidata nd;
+	struct config_item *parent_item;
+	struct config_item *target_item;
+	struct config_item_type *type;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (dentry->d_parent == configfs_sb->s_root)
+		goto out;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	if (!type || !type->ct_item_ops ||
+	    !type->ct_item_ops->allow_link)
+		goto out_put;
+
+	ret = get_target(symname, &nd, &target_item);
+	if (ret)
+		goto out_put;
+
+	ret = type->ct_item_ops->allow_link(parent_item, target_item);
+	if (!ret)
+		ret = create_link(parent_item, target_item, dentry);
+
+	/* drop what get_target() handed us */
+	config_item_put(target_item);
+	path_release(&nd);
+
+out_put:
+	config_item_put(parent_item);
+
+out:
+	return ret;
+}
+
+/*
+ * unlink(2) handler; only item links may be removed (-EPERM otherwise).
+ */
+int configfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct configfs_dirent *sd = dentry->d_fsdata;
+	struct configfs_symlink *sl;
+	struct config_item *parent_item;
+	struct config_item_type *type;
+	int ret;
+
+	ret = -EPERM;  /* What lack-of-symlink returns */
+	if (!(sd->s_type & CONFIGFS_ITEM_LINK))
+		goto out;
+
+	if (dentry->d_parent == configfs_sb->s_root)
+		BUG();
+
+	sl = sd->s_element;
+
+	parent_item = configfs_get_config_item(dentry->d_parent);
+	type = parent_item->ci_type;
+
+	list_del_init(&sd->s_sibling);
+	configfs_drop_dentry(sd, dentry->d_parent);
+	dput(dentry);
+	configfs_put(sd);
+
+	/*
+	 * drop_link() must be called before
+	 * list_del_init(&sl->sl_list), so that the order of
+	 * drop_link(this, target) and drop_item(target) is preserved.
+	 */
+	if (type && type->ct_item_ops &&
+	    type->ct_item_ops->drop_link)
+		type->ct_item_ops->drop_link(parent_item,
+					       sl->sl_target);
+
+	/* FIXME: Needs lock */
+	list_del_init(&sl->sl_list);
+
+	/* Put reference from create_link() */
+	config_item_put(sl->sl_target);
+	kfree(sl);
+
+	config_item_put(parent_item);
+
+	ret = 0;
+
+out:
+	return ret;
+}
+
+/*
+ * Build the relative path from @item to @target in @path: one "../" per
+ * level of @item, followed by @target's path below the root.
+ * Returns -ENAMETOOLONG if the result would exceed PATH_MAX.
+ */
+static int configfs_get_target_path(struct config_item * item, struct config_item * target,
+				   char *path)
+{
+	char * s;
+	int depth, size;
+
+	depth = item_depth(item);
+	size = item_path_length(target) + depth * 3 - 1;
+	if (size > PATH_MAX)
+		return -ENAMETOOLONG;
+
+	pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size);
+
+	for (s = path; depth--; s += 3)
+		strcpy(s,"../");
+
+	fill_item_path(target, path, size);
+	pr_debug("%s: path = '%s'\n", __FUNCTION__, path);
+
+	return 0;
+}
+
+/*
+ * Compute the link text for @dentry into @path, holding
+ * configfs_rename_sem for reading while the path is built.
+ */
+static int configfs_getlink(struct dentry *dentry, char * path)
+{
+	struct config_item *item, *target_item;
+	int error = 0;
+
+	item = configfs_get_config_item(dentry->d_parent);
+	if (!item)
+		return -EINVAL;
+
+	target_item = configfs_get_config_item(dentry);
+	if (!target_item) {
+		config_item_put(item);
+		return -EINVAL;
+	}
+
+	down_read(&configfs_rename_sem);
+	error = configfs_get_target_path(item, target_item, path);
+	up_read(&configfs_rename_sem);
+
+	config_item_put(item);
+	config_item_put(target_item);
+	return error;
+
+}
+
+/*
+ * ->follow_link: build the link text in a freshly zeroed page.  On
+ * success the page is returned as the cookie and freed later by
+ * configfs_put_link().  On failure the cookie is NULL, so the page has
+ * to be freed here or it would leak.
+ */
+static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int error = -ENOMEM;
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+	if (page) {
+		error = configfs_getlink(dentry, (char *)page);
+		if (!error) {
+			nd_set_link(nd, (char *)page);
+			return (void *)page;
+		}
+		free_page(page);
+	}
+
+	nd_set_link(nd, ERR_PTR(error));
+	return NULL;
+}
+
+/* ->put_link: free the page handed out by configfs_follow_link(). */
+static void configfs_put_link(struct dentry *dentry, struct nameidata *nd,
+			      void *cookie)
+{
+	if (cookie) {
+		unsigned long page = (unsigned long)cookie;
+		free_page(page);
+	}
+}
+
+struct inode_operations configfs_symlink_inode_operations = {
+	.follow_link = configfs_follow_link,
+	.readlink = generic_readlink,
+	.put_link = configfs_put_link,
+};
+
diff --git a/fs/dcache.c b/fs/dcache.c index 17e439138681..134d6775183f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -71,7 +71,7 @@ struct dentry_stat_t dentry_stat = { static void d_callback(struct rcu_head *head) { - struct dentry * dentry = container_of(head, struct dentry, d_rcu); + struct dentry * dentry = container_of(head, struct dentry, d_u.d_rcu); if (dname_external(dentry)) kfree(dentry->d_name.name); @@ -86,7 +86,7 @@ static void d_free(struct dentry *dentry) { if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - call_rcu(&dentry->d_rcu, d_callback); + call_rcu(&dentry->d_u.d_rcu, d_callback); } /* @@ -193,7 +193,7 @@ kill_it: { list_del(&dentry->d_lru); dentry_stat.nr_unused--; } - list_del(&dentry->d_child); + list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ /*drops the locks, at that point nobody can reach this dentry */ dentry_iput(dentry); @@ -367,7 +367,7 @@ static inline void prune_one_dentry(struct dentry * dentry) struct dentry * parent; __d_drop(dentry); - list_del(&dentry->d_child); + list_del(&dentry->d_u.d_child); dentry_stat.nr_dentry--; /* For d_free, below */ dentry_iput(dentry); parent = dentry->d_parent; @@ -518,7 +518,7 
@@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; /* Have we found a mount point ? */ if (d_mountpoint(dentry)) @@ -532,7 +532,7 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; goto resume; } @@ -569,7 +569,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; if (!list_empty(&dentry->d_lru)) { @@ -610,7 +610,7 @@ dentry->d_parent->d_name.name, dentry->d_name.name, found); * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - next = this_parent->d_child.next; + next = this_parent->d_u.d_child.next; this_parent = this_parent->d_parent; #ifdef DCACHE_DEBUG printk(KERN_DEBUG "select_parent: ascending to %s/%s, found=%d\n", @@ -753,12 +753,12 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_parent = dget(parent); dentry->d_sb = parent->d_sb; } else { - INIT_LIST_HEAD(&dentry->d_child); + INIT_LIST_HEAD(&dentry->d_u.d_child); } spin_lock(&dcache_lock); if (parent) - list_add(&dentry->d_child, &parent->d_subdirs); + list_add(&dentry->d_u.d_child, &parent->d_subdirs); dentry_stat.nr_dentry++; spin_unlock(&dcache_lock); @@ -808,10 +808,14 @@ void d_instantiate(struct dentry *entry, struct inode * inode) * * Fill in inode information in the entry. On success, it returns NULL. * If an unhashed alias of "entry" already exists, then we return the - * aliased dentry instead. + * aliased dentry instead and drop one reference to inode. 
* * Note that in order to avoid conflicts with rename() etc, the caller * had better be holding the parent directory semaphore. + * + * This also assumes that the inode count has been incremented + * (or otherwise set) by the caller to indicate that it is now + * in use by the dcache. */ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) { @@ -838,6 +842,7 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode) dget_locked(alias); spin_unlock(&dcache_lock); BUG_ON(!d_unhashed(alias)); + iput(inode); return alias; } list_add(&entry->d_alias, &inode->i_dentry); @@ -1310,8 +1315,8 @@ already_unhashed: /* Unhash the target: dput() will then get rid of it */ __d_drop(target); - list_del(&dentry->d_child); - list_del(&target->d_child); + list_del(&dentry->d_u.d_child); + list_del(&target->d_u.d_child); /* Switch the names.. */ switch_names(dentry, target); @@ -1322,15 +1327,15 @@ already_unhashed: if (IS_ROOT(dentry)) { dentry->d_parent = target->d_parent; target->d_parent = target; - INIT_LIST_HEAD(&target->d_child); + INIT_LIST_HEAD(&target->d_u.d_child); } else { do_switch(dentry->d_parent, target->d_parent); /* And add them back to the (new) parent lists */ - list_add(&target->d_child, &target->d_parent->d_subdirs); + list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); } - list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); + list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); write_sequnlock(&rename_lock); @@ -1568,7 +1573,7 @@ repeat: resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; - struct dentry *dentry = list_entry(tmp, struct dentry, d_child); + struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child); next = tmp->next; if (d_unhashed(dentry)||!dentry->d_inode) continue; @@ -1579,7 +1584,7 @@ resume: atomic_dec(&dentry->d_count); } if (this_parent != root) { - next = 
this_parent->d_child.next; + next = this_parent->d_u.d_child.next; atomic_dec(&this_parent->d_count); this_parent = this_parent->d_parent; goto resume; diff --git a/fs/dcookies.c b/fs/dcookies.c index 02aa0ddc582a..f8274a8f83bd 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -18,6 +18,7 @@ #include <linux/slab.h> #include <linux/list.h> #include <linux/mount.h> +#include <linux/capability.h> #include <linux/dcache.h> #include <linux/mm.h> #include <linux/errno.h> diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index a86ac4aeaedb..d4f1a2cddd47 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -146,7 +146,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, } *dentry = NULL; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry)) { if ((mode & S_IFMT) == S_IFDIR) @@ -155,7 +155,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode, error = debugfs_create(parent->d_inode, *dentry, mode); } else error = PTR_ERR(dentry); - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); return error; } @@ -273,7 +273,7 @@ void debugfs_remove(struct dentry *dentry) if (!parent || !parent->d_inode) return; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); if (debugfs_positive(dentry)) { if (dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode)) @@ -283,7 +283,7 @@ void debugfs_remove(struct dentry *dentry) dput(dentry); } } - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } EXPORT_SYMBOL_GPL(debugfs_remove); diff --git a/fs/devfs/base.c b/fs/devfs/base.c index 1274422a5384..b621521e09d4 100644 --- a/fs/devfs/base.c +++ b/fs/devfs/base.c @@ -2162,27 +2162,27 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd) * * make sure that * d_instantiate always runs under lock - * we release i_sem 
lock before going to sleep + * we release i_mutex lock before going to sleep * * unfortunately sometimes d_revalidate is called with - * and sometimes without i_sem lock held. The following checks + * and sometimes without i_mutex lock held. The following checks * attempt to deduce when we need to add (and drop resp.) lock * here. This relies on current (2.6.2) calling coventions: * - * lookup_hash is always run under i_sem and is passing NULL + * lookup_hash is always run under i_mutex and is passing NULL * as nd * - * open(...,O_CREATE,...) calls _lookup_hash under i_sem + * open(...,O_CREATE,...) calls _lookup_hash under i_mutex * and sets flags to LOOKUP_OPEN|LOOKUP_CREATE * * all other invocations of ->d_revalidate seem to happen - * outside of i_sem + * outside of i_mutex */ need_lock = nd && (!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT)); if (need_lock) - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); if (is_devfsd_or_child(fs_info)) { devfs_handle_t de = lookup_info->de; @@ -2221,9 +2221,9 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd) add_wait_queue(&lookup_info->wait_queue, &wait); read_unlock(&parent->u.dir.lock); /* at this point it is always (hopefully) locked */ - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); schedule(); - down(&dir->i_sem); + mutex_lock(&dir->i_mutex); /* * This does not need nor should remove wait from wait_queue. * Wait queue head is never reused - nothing is ever added to it @@ -2238,7 +2238,7 @@ static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd) out: if (need_lock) - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); return 1; } /* End Function devfs_d_revalidate_wait */ @@ -2284,9 +2284,9 @@ static struct dentry *devfs_lookup(struct inode *dir, struct dentry *dentry, /* Unlock directory semaphore, which will release any waiters. 
They will get the hashed dentry, and may be forced to wait for revalidation */ - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); wait_for_devfsd_finished(fs_info); /* If I'm not devfsd, must wait */ - down(&dir->i_sem); /* Grab it again because them's the rules */ + mutex_lock(&dir->i_mutex); /* Grab it again because them's the rules */ de = lookup_info.de; /* If someone else has been so kind as to make the inode, we go home early */ diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index f2be44d4491f..bfb8a230bac9 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -130,7 +130,7 @@ static struct dentry *get_node(int num) { char s[12]; struct dentry *root = devpts_root; - down(&root->d_inode->i_sem); + mutex_lock(&root->d_inode->i_mutex); return lookup_one_len(s, root, sprintf(s, "%d", num)); } @@ -161,7 +161,7 @@ int devpts_pty_new(struct tty_struct *tty) if (!IS_ERR(dentry) && !dentry->d_inode) d_instantiate(dentry, inode); - up(&devpts_root->d_inode->i_sem); + mutex_unlock(&devpts_root->d_inode->i_mutex); return 0; } @@ -178,7 +178,7 @@ struct tty_struct *devpts_get_tty(int number) dput(dentry); } - up(&devpts_root->d_inode->i_sem); + mutex_unlock(&devpts_root->d_inode->i_mutex); return tty; } @@ -196,7 +196,7 @@ void devpts_pty_kill(int number) } dput(dentry); } - up(&devpts_root->d_inode->i_sem); + mutex_unlock(&devpts_root->d_inode->i_mutex); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 3931e7f1e6bf..30dbbd1df511 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -56,7 +56,7 @@ * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. * This determines whether we need to do the fancy locking which prevents * direct-IO from being able to read uninitialised disk blocks. 
If its zero - * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_sem is + * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is * not held for the entire direct write (taken briefly, initially, during a * direct read though, but its never held for the duration of a direct-IO). */ @@ -930,7 +930,7 @@ out: } /* - * Releases both i_sem and i_alloc_sem + * Releases both i_mutex and i_alloc_sem */ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, @@ -1062,11 +1062,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, /* * All block lookups have been performed. For READ requests - * we can let i_sem go now that its achieved its purpose + * we can let i_mutex go now that its achieved its purpose * of protecting us from looking up uninitialized blocks. */ if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) - up(&dio->inode->i_sem); + mutex_unlock(&dio->inode->i_mutex); /* * OK, all BIOs are submitted, so we can decrement bio_count to truly @@ -1145,18 +1145,18 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * The locking rules are governed by the dio_lock_type parameter. * * DIO_NO_LOCKING (no locking, for raw block device access) - * For writes, i_sem is not held on entry; it is never taken. + * For writes, i_mutex is not held on entry; it is never taken. * * DIO_LOCKING (simple locking for regular files) - * For writes we are called under i_sem and return with i_sem held, even though + * For writes we are called under i_mutex and return with i_mutex held, even though * it is internally dropped. - * For reads, i_sem is not held on entry, but it is taken and dropped before + * For reads, i_mutex is not held on entry, but it is taken and dropped before * returning. 
* * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of * uninitialised data, allowing parallel direct readers and writers) - * For writes we are called without i_sem, return without it, never touch it. - * For reads, i_sem is held on entry and will be released before returning. + * For writes we are called without i_mutex, return without it, never touch it. + * For reads, i_mutex is held on entry and will be released before returning. * * Additional i_alloc_sem locking requirements described inline below. */ @@ -1214,11 +1214,11 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * For block device access DIO_NO_LOCKING is used, * neither readers nor writers do any locking at all * For regular files using DIO_LOCKING, - * readers need to grab i_sem and i_alloc_sem - * writers need to grab i_alloc_sem only (i_sem is already held) + * readers need to grab i_mutex and i_alloc_sem + * writers need to grab i_alloc_sem only (i_mutex is already held) * For regular files using DIO_OWN_LOCKING, * neither readers nor writers take any locks here - * (i_sem is already held and release for writers here) + * (i_mutex is already held and release for writers here) */ dio->lock_type = dio_lock_type; if (dio_lock_type != DIO_NO_LOCKING) { @@ -1228,7 +1228,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, mapping = iocb->ki_filp->f_mapping; if (dio_lock_type != DIO_OWN_LOCKING) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); reader_with_isem = 1; } @@ -1240,7 +1240,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } if (dio_lock_type == DIO_OWN_LOCKING) { - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); reader_with_isem = 0; } } @@ -1266,7 +1266,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, out: if (reader_with_isem) - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (rw & WRITE) current->flags &= ~PF_SYNCWRITE; return retval; diff --git 
a/fs/dquot.c b/fs/dquot.c index 2a62b3dc20ec..1966c890b48d 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -77,6 +77,7 @@ #include <linux/kmod.h> #include <linux/namei.h> #include <linux/buffer_head.h> +#include <linux/capability.h> #include <linux/quotaops.h> #include <asm/uaccess.h> @@ -100,7 +101,7 @@ * operation is just reading pointers from inode (or not using them at all) the * read lock is enough. If pointers are altered function must hold write lock * (these locking rules also apply for S_NOQUOTA flag in the inode - note that - * for altering the flag i_sem is also needed). If operation is holding + * for altering the flag i_mutex is also needed). If operation is holding * reference to dquot in other way (e.g. quotactl ops) it must be guarded by * dqonoff_sem. * This locking assures that: @@ -117,9 +118,9 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * i_sem > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem > + * i_mutex > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem > * > dquot->dq_lock > dqio_sem - * i_sem on quota files is special (it's below dqio_sem) + * i_mutex on quota files is special (it's below dqio_sem) */ static DEFINE_SPINLOCK(dq_list_lock); @@ -1369,11 +1370,11 @@ int vfs_quota_off(struct super_block *sb, int type) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_enabled(sb, cnt)) { - down(&toputinode[cnt]->i_sem); + mutex_lock(&toputinode[cnt]->i_mutex); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, 0); - up(&toputinode[cnt]->i_sem); + mutex_unlock(&toputinode[cnt]->i_mutex); mark_inode_dirty(toputinode[cnt]); iput(toputinode[cnt]); } @@ -1417,7 +1418,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) write_inode_now(inode, 1); /* And now flush the block cache so that kernel sees the changes */ invalidate_bdev(sb->s_bdev, 0); - 
down(&inode->i_sem); + mutex_lock(&inode->i_mutex); down(&dqopt->dqonoff_sem); if (sb_has_quota_enabled(sb, type)) { error = -EBUSY; @@ -1449,7 +1450,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) goto out_file_init; } up(&dqopt->dqio_sem); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); set_enable_flags(dqopt, type); add_dquot_ref(sb, type); @@ -1470,7 +1471,7 @@ out_lock: inode->i_flags |= oldflags; up_write(&dqopt->dqptr_sem); } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out_fmt: put_quota_format(fmt); diff --git a/fs/drop_caches.c b/fs/drop_caches.c new file mode 100644 index 000000000000..4e4762389bdc --- /dev/null +++ b/fs/drop_caches.c @@ -0,0 +1,68 @@ +/* + * Implement the manual drop-all-pagecache function + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/writeback.h> +#include <linux/sysctl.h> +#include <linux/gfp.h> + +/* A global variable is a bit ugly, but it keeps the code simple */ +int sysctl_drop_caches; + +static void drop_pagecache_sb(struct super_block *sb) +{ + struct inode *inode; + + spin_lock(&inode_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (inode->i_state & (I_FREEING|I_WILL_FREE)) + continue; + invalidate_inode_pages(inode->i_mapping); + } + spin_unlock(&inode_lock); +} + +void drop_pagecache(void) +{ + struct super_block *sb; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) + drop_pagecache_sb(sb); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); +} + +void drop_slab(void) +{ + int nr_objects; + + do { + nr_objects = shrink_slab(1000, GFP_KERNEL, 1000); + } while (nr_objects > 10); +} + +int drop_caches_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t 
*ppos) +{ + proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (write) { + if (sysctl_drop_caches & 1) + drop_pagecache(); + if (sysctl_drop_caches & 2) + drop_slab(); + } + return 0; +} diff --git a/fs/exec.c b/fs/exec.c index 22533cce0611..b5bcf1aae0ab 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -324,7 +324,7 @@ void install_arg_page(struct vm_area_struct *vma, lru_cache_add_active(page); set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); - page_add_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address); pte_unmap_unlock(pte, ptl); /* no need for flush_tlb */ @@ -632,10 +632,10 @@ static inline int de_thread(struct task_struct *tsk) * synchronize with any firing (by calling del_timer_sync) * before we can safely let the old group leader die. */ - sig->real_timer.data = (unsigned long)current; + sig->real_timer.data = current; spin_unlock_irq(lock); - if (del_timer_sync(&sig->real_timer)) - add_timer(&sig->real_timer); + if (hrtimer_cancel(&sig->real_timer)) + hrtimer_restart(&sig->real_timer); spin_lock_irq(lock); } while (atomic_read(&sig->count) > count) { @@ -760,7 +760,7 @@ no_thread_group: spin_lock(&oldsighand->siglock); spin_lock(&newsighand->siglock); - current->sighand = newsighand; + rcu_assign_pointer(current->sighand, newsighand); recalc_sigpending(); spin_unlock(&newsighand->siglock); @@ -768,7 +768,7 @@ no_thread_group: write_unlock_irq(&tasklist_lock); if (atomic_dec_and_test(&oldsighand->count)) - kmem_cache_free(sighand_cachep, oldsighand); + sighand_free(oldsighand); } BUG_ON(!thread_group_leader(current)); @@ -1462,6 +1462,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { current->signal->flags = SIGNAL_GROUP_EXIT; current->signal->group_exit_code = exit_code; + current->signal->group_stop_count = 0; retval = 0; } spin_unlock_irq(¤t->sighand->siglock); @@ -1477,7 +1478,6 @@ int do_coredump(long 
signr, int exit_code, struct pt_regs * regs) * Clear any false indication of pending signals that might * be seen by the filesystem code called to write the core file. */ - current->signal->group_stop_count = 0; clear_thread_flag(TIF_SIGPENDING); if (current->signal->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) @@ -1505,7 +1505,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) goto close_fail; if (!file->f_op->write) goto close_fail; - if (do_truncate(file->f_dentry, 0, file) != 0) + if (do_truncate(file->f_dentry, 0, 0, file) != 0) goto close_fail; retval = binfmt->core_dump(signr, regs, file); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index c49d6254379a..5bfe40085fbc 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -177,9 +177,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, struct dentry *ppd; struct dentry *npd; - down(&pd->d_inode->i_sem); + mutex_lock(&pd->d_inode->i_mutex); ppd = CALL(nops,get_parent)(pd); - up(&pd->d_inode->i_sem); + mutex_unlock(&pd->d_inode->i_mutex); if (IS_ERR(ppd)) { err = PTR_ERR(ppd); @@ -201,9 +201,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, break; } dprintk("find_exported_dentry: found name: %s\n", nbuf); - down(&ppd->d_inode->i_sem); + mutex_lock(&ppd->d_inode->i_mutex); npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); - up(&ppd->d_inode->i_sem); + mutex_unlock(&ppd->d_inode->i_mutex); if (IS_ERR(npd)) { err = PTR_ERR(npd); dprintk("find_exported_dentry: lookup failed: %d\n", err); @@ -242,9 +242,9 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, struct dentry *nresult; err = CALL(nops,get_name)(target_dir, nbuf, result); if (!err) { - down(&target_dir->d_inode->i_sem); + mutex_lock(&target_dir->d_inode->i_mutex); nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); - up(&target_dir->d_inode->i_sem); + mutex_unlock(&target_dir->d_inode->i_mutex); if (!IS_ERR(nresult)) { if (nresult->d_inode) { 
dput(result); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 6af2f4130290..35acc43b897f 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -4,6 +4,7 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ +#include <linux/capability.h> #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> @@ -149,7 +150,7 @@ ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl, } /* - * inode->i_sem: don't care + * inode->i_mutex: don't care */ static struct posix_acl * ext2_get_acl(struct inode *inode, int type) @@ -211,7 +212,7 @@ ext2_get_acl(struct inode *inode, int type) } /* - * inode->i_sem: down + * inode->i_mutex: down */ static int ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) @@ -301,8 +302,8 @@ ext2_permission(struct inode *inode, int mask, struct nameidata *nd) /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * - * dir->i_sem: down - * inode->i_sem: up (access to inode is still exclusive) + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ int ext2_init_acl(struct inode *inode, struct inode *dir) @@ -361,7 +362,7 @@ cleanup: * for directories) are added. There are no more bits available in the * file mode. 
* - * inode->i_sem: down + * inode->i_mutex: down */ int ext2_acl_chmod(struct inode *inode) diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index bb6908066494..2c00953d4b0b 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -16,6 +16,7 @@ #include <linux/quotaops.h> #include <linux/sched.h> #include <linux/buffer_head.h> +#include <linux/capability.h> /* * balloc.c contains the blocks allocation and deallocation routines diff --git a/fs/ext2/bitmap.c b/fs/ext2/bitmap.c index 20145b74623f..e9983a0dd396 100644 --- a/fs/ext2/bitmap.c +++ b/fs/ext2/bitmap.c @@ -7,8 +7,12 @@ * Universite Pierre et Marie Curie (Paris VI) */ +#ifdef EXT2FS_DEBUG + #include <linux/buffer_head.h> +#include "ext2.h" + static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars) @@ -23,3 +27,6 @@ unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars) nibblemap[(map->b_data[i] >> 4) & 0xf]; return (sum); } + +#endif /* EXT2FS_DEBUG */ + diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 5b5f52876b42..7442bdd1267a 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -592,7 +592,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent) goto fail; } kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr, 0, chunk_size); + memset(kaddr, 0, chunk_size); de = (struct ext2_dir_entry_2 *)kaddr; de->name_len = 1; de->rec_len = cpu_to_le16(EXT2_DIR_REC_LEN(1)); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e977f8566d14..00de0a7312a2 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -53,7 +53,7 @@ struct ext2_inode_info { #ifdef CONFIG_EXT2_FS_XATTR /* * Extended attributes can be read independently of the main file - * data. Taking i_sem even when reading would cause contention + * data. 
Taking i_mutex even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 709d8676b962..3ca9afdf713d 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -8,6 +8,7 @@ */ #include "ext2.h" +#include <linux/capability.h> #include <linux/time.h> #include <linux/sched.h> #include <asm/current.h> diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 522fa70dd8ea..8d6819846fc9 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1152,7 +1152,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -1189,7 +1189,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 0099462d4271..a2ca3107d475 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -325,7 +325,7 @@ cleanup: /* * Inode operation listxattr() * - * dentry->d_inode->i_sem: don't care + * dentry->d_inode->i_mutex: don't care */ ssize_t ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -389,10 +389,6 @@ ext2_xattr_set(struct inode *inode, int name_index, const char *name, ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", name_index, name, value, (long)value_len); - if (IS_RDONLY(inode)) - return -EROFS; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; if (value == NULL) value_len = 0; if (name == NULL) diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 52b30ee6a25f..f28a6a499c96 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include 
<linux/string.h> +#include <linux/capability.h> #include <linux/fs.h> #include <linux/smp_lock.h> #include <linux/ext2_fs.h> @@ -38,8 +39,6 @@ ext2_xattr_trusted_get(struct inode *inode, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -50,8 +49,6 @@ ext2_xattr_trusted_set(struct inode *inode, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 0c03ea131a94..f383e7c3a7b5 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -35,16 +35,10 @@ static int ext2_xattr_user_get(struct inode *inode, const char *name, void *buffer, size_t size) { - int error; - if (strcmp(name, "") == 0) return -EINVAL; if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - error = permission(inode, MAY_READ, NULL); - if (error) - return error; - return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size); } @@ -52,18 +46,10 @@ static int ext2_xattr_user_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int error; - if (strcmp(name, "") == 0) return -EINVAL; if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - if ( !S_ISREG(inode->i_mode) && - (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) - return -EPERM; - error = permission(inode, MAY_WRITE, NULL); - if (error) - return error; return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, value, size, flags); diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 3ac38266fc9e..47a9da2dfb4f 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -7,6 +7,7 @@ #include <linux/init.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/capability.h> #include <linux/fs.h> #include <linux/ext3_jbd.h> #include 
<linux/ext3_fs.h> @@ -152,7 +153,7 @@ ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl, /* * Inode operation get_posix_acl(). * - * inode->i_sem: don't care + * inode->i_mutex: don't care */ static struct posix_acl * ext3_get_acl(struct inode *inode, int type) @@ -216,7 +217,7 @@ ext3_get_acl(struct inode *inode, int type) /* * Set the access or default ACL of an inode. * - * inode->i_sem: down unless called from ext3_new_inode + * inode->i_mutex: down unless called from ext3_new_inode */ static int ext3_set_acl(handle_t *handle, struct inode *inode, int type, @@ -306,8 +307,8 @@ ext3_permission(struct inode *inode, int mask, struct nameidata *nd) /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. * - * dir->i_sem: down - * inode->i_sem: up (access to inode is still exclusive) + * dir->i_mutex: down + * inode->i_mutex: up (access to inode is still exclusive) */ int ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) @@ -368,7 +369,7 @@ cleanup: * for directories) are added. There are no more bits available in the * file mode. 
* - * inode->i_sem: down + * inode->i_mutex: down */ int ext3_acl_chmod(struct inode *inode) diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index ae1148c24c53..6250fcdf14a1 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -13,6 +13,7 @@ #include <linux/config.h> #include <linux/time.h> +#include <linux/capability.h> #include <linux/fs.h> #include <linux/jbd.h> #include <linux/ext3_fs.h> @@ -20,8 +21,6 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> -#include "bitmap.h" - /* * balloc.c contains the blocks allocation and deallocation routines */ diff --git a/fs/ext3/bitmap.c b/fs/ext3/bitmap.c index 5b4ba3e246e6..cb16b4c5d5df 100644 --- a/fs/ext3/bitmap.c +++ b/fs/ext3/bitmap.c @@ -7,8 +7,11 @@ * Universite Pierre et Marie Curie (Paris VI) */ +#ifdef EXT3FS_DEBUG + #include <linux/buffer_head.h> -#include "bitmap.h" + +#include "ext3_fs.h" static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; @@ -24,3 +27,6 @@ unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) nibblemap[(map->b_data[i] >> 4) & 0xf]; return (sum); } + +#endif /* EXT3FS_DEBUG */ + diff --git a/fs/ext3/bitmap.h b/fs/ext3/bitmap.h deleted file mode 100644 index 6ee503a6bb4e..000000000000 --- a/fs/ext3/bitmap.h +++ /dev/null @@ -1,8 +0,0 @@ -/* linux/fs/ext3/bitmap.c - * - * Copyright (C) 2005 Simtec Electronics - * Ben Dooks <ben@simtec.co.uk> - * -*/ - -extern unsigned long ext3_count_free (struct buffer_head *, unsigned int ); diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 9e4a24376210..dc826464f313 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -26,7 +26,6 @@ #include <asm/byteorder.h> -#include "bitmap.h" #include "xattr.h" #include "acl.h" @@ -651,7 +650,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { ext3_warning(sb, __FUNCTION__, - "bad orphan ino %lu! e2fsck was run?\n", ino); + "bad orphan ino %lu! 
e2fsck was run?", ino); goto out; } @@ -660,7 +659,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) bitmap_bh = read_inode_bitmap(sb, block_group); if (!bitmap_bh) { ext3_warning(sb, __FUNCTION__, - "inode bitmap error for orphan %lu\n", ino); + "inode bitmap error for orphan %lu", ino); goto out; } @@ -672,7 +671,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) !(inode = iget(sb, ino)) || is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { ext3_warning(sb, __FUNCTION__, - "bad orphan inode %lu! e2fsck was run?\n", ino); + "bad orphan inode %lu! e2fsck was run?", ino); printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, ext3_test_bit(bit, bitmap_bh->b_data)); diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 706d68608381..556cd5510078 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/jbd.h> +#include <linux/capability.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> #include <linux/time.h> diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index b3c690a3b54a..af193a304ee5 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1476,7 +1476,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { ext3_warning(sb, __FUNCTION__, - "Directory index full!\n"); + "Directory index full!"); err = -ENOSPC; goto cleanup; } diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 6104ad310507..1041dab6de2f 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -31,7 +31,7 @@ static int verify_group_input(struct super_block *sb, unsigned start = le32_to_cpu(es->s_blocks_count); unsigned end = start + input->blocks_count; unsigned group = input->group; - unsigned itend = input->inode_table + EXT3_SB(sb)->s_itb_per_group; + unsigned itend = input->inode_table + sbi->s_itb_per_group; unsigned overhead = 
ext3_bg_has_super(sb, group) ? (1 + ext3_bg_num_gdb(sb, group) + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; @@ -340,7 +340,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ ext3_warning(sb, __FUNCTION__, - "reserved GDT %ld missing grp %d (%ld)\n", + "reserved GDT %ld missing grp %d (%ld)", blk, grp, grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); return -EINVAL; @@ -393,7 +393,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, if (EXT3_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { ext3_warning(sb, __FUNCTION__, - "won't resize using backup superblock at %llu\n", + "won't resize using backup superblock at %llu", (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); return -EPERM; } @@ -417,7 +417,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__u32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { ext3_warning(sb, __FUNCTION__, - "new group %u GDT block %lu not reserved\n", + "new group %u GDT block %lu not reserved", input->group, gdblock); err = -EINVAL; goto exit_dind; @@ -540,7 +540,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { ext3_warning(sb, __FUNCTION__, - "reserved block %lu not at offset %ld\n", + "reserved block %lu not at offset %ld", blk, (long)(data - (__u32 *)dind->b_data)); err = -EINVAL; goto exit_bh; @@ -683,7 +683,7 @@ exit_err: if (err) { ext3_warning(sb, __FUNCTION__, "can't update backup for group %d (err %d), " - "forcing fsck on next reboot\n", group, err); + "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT3_VALID_FS; sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS); mark_buffer_dirty(sbi->s_sbh); @@ -722,7 +722,7 @@ int ext3_group_add(struct super_block *sb, 
struct ext3_new_group_data *input) if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { ext3_warning(sb, __FUNCTION__, - "Can't resize non-sparse filesystem further\n"); + "Can't resize non-sparse filesystem further"); return -EPERM; } @@ -730,13 +730,13 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_RESIZE_INODE)){ ext3_warning(sb, __FUNCTION__, - "No reserved GDT blocks, can't resize\n"); + "No reserved GDT blocks, can't resize"); return -EPERM; } inode = iget(sb, EXT3_RESIZE_INO); if (!inode || is_bad_inode(inode)) { ext3_warning(sb, __FUNCTION__, - "Error opening resize inode\n"); + "Error opening resize inode"); iput(inode); return -ENOENT; } @@ -764,9 +764,9 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) } lock_super(sb); - if (input->group != EXT3_SB(sb)->s_groups_count) { + if (input->group != sbi->s_groups_count) { ext3_warning(sb, __FUNCTION__, - "multiple resizers run on filesystem!\n"); + "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_journal; } @@ -799,7 +799,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. * - * The key field here is EXT3_SB(sb)->s_groups_count: as long as + * The key field here is sbi->s_groups_count: as long as * that retains its old value, nobody is going to access the new * group. 
* @@ -859,7 +859,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) smp_wmb(); /* Update the global fs size fields */ - EXT3_SB(sb)->s_groups_count++; + sbi->s_groups_count++; ext3_journal_dirty_metadata(handle, primary); @@ -874,7 +874,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) percpu_counter_mod(&sbi->s_freeinodes_counter, EXT3_INODES_PER_GROUP(sb)); - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + ext3_journal_dirty_metadata(handle, sbi->s_sbh); sb->s_dirt = 1; exit_journal: @@ -937,7 +937,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (last == 0) { ext3_warning(sb, __FUNCTION__, - "need to use ext2online to resize further\n"); + "need to use ext2online to resize further"); return -EPERM; } @@ -973,7 +973,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, lock_super(sb); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __FUNCTION__, - "multiple resizers run on filesystem!\n"); + "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_put; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 4e6730622d90..56bf76586019 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -43,7 +43,8 @@ #include "acl.h" #include "namei.h" -static int ext3_load_journal(struct super_block *, struct ext3_super_block *); +static int ext3_load_journal(struct super_block *, struct ext3_super_block *, + unsigned long journal_devnum); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, int); static void ext3_commit_super (struct super_block * sb, @@ -628,7 +629,7 @@ enum { Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, - Opt_commit, Opt_journal_update, Opt_journal_inum, + Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, 
Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, @@ -666,6 +667,7 @@ static match_table_t tokens = { {Opt_commit, "commit=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, + {Opt_journal_dev, "journal_dev=%u"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, @@ -705,8 +707,9 @@ static unsigned long get_sb_block(void **data) return sb_block; } -static int parse_options (char * options, struct super_block *sb, - unsigned long * inum, unsigned long *n_blocks_count, int is_remount) +static int parse_options (char *options, struct super_block *sb, + unsigned long *inum, unsigned long *journal_devnum, + unsigned long *n_blocks_count, int is_remount) { struct ext3_sb_info *sbi = EXT3_SB(sb); char * p; @@ -839,6 +842,16 @@ static int parse_options (char * options, struct super_block *sb, return 0; *inum = option; break; + case Opt_journal_dev: + if (is_remount) { + printk(KERN_ERR "EXT3-fs: cannot specify " + "journal on remount\n"); + return 0; + } + if (match_int(&args[0], &option)) + return 0; + *journal_devnum = option; + break; case Opt_noload: set_opt (sbi->s_mount_opt, NOLOAD); break; @@ -1331,6 +1344,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) unsigned long logic_sb_block; unsigned long offset = 0; unsigned long journal_inum = 0; + unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; int blocksize; @@ -1411,7 +1425,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, RESERVATION); - if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) + if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, + NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -1622,7 +1637,7 @@ static int ext3_fill_super 
(struct super_block *sb, void *data, int silent) */ if (!test_opt(sb, NOLOAD) && EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { - if (ext3_load_journal(sb, es)) + if (ext3_load_journal(sb, es, journal_devnum)) goto failed_mount2; } else if (journal_inum) { if (ext3_create_journal(sb, es, journal_inum)) @@ -1902,15 +1917,24 @@ out_bdev: return NULL; } -static int ext3_load_journal(struct super_block * sb, - struct ext3_super_block * es) +static int ext3_load_journal(struct super_block *sb, + struct ext3_super_block *es, + unsigned long journal_devnum) { journal_t *journal; int journal_inum = le32_to_cpu(es->s_journal_inum); - dev_t journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + dev_t journal_dev; int err = 0; int really_read_only; + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + printk(KERN_INFO "EXT3-fs: external journal device major/minor " + "numbers have changed\n"); + journal_dev = new_decode_dev(journal_devnum); + } else + journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); + really_read_only = bdev_read_only(sb->s_bdev); /* @@ -1969,6 +1993,16 @@ static int ext3_load_journal(struct super_block * sb, EXT3_SB(sb)->s_journal = journal; ext3_clear_journal_err(sb, es); + + if (journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); + sb->s_dirt = 1; + + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + } + return 0; } @@ -2116,7 +2150,7 @@ int ext3_force_commit(struct super_block *sb) static void ext3_write_super (struct super_block * sb) { - if (down_trylock(&sb->s_lock) == 0) + if (mutex_trylock(&sb->s_lock) != 0) BUG(); sb->s_dirt = 0; } @@ -2197,7 +2231,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) /* * Allow the "check" option to be passed as a remount option. 
*/ - if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } @@ -2567,7 +2601,7 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type, struct buffer_head *bh; handle_t *handle = journal_current_handle(); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -2610,7 +2644,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; ext3_mark_inode_dirty(handle, inode); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 430de9f63be3..e8d60bf6b7df 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -140,7 +140,7 @@ ext3_xattr_handler(int name_index) /* * Inode operation listxattr() * - * dentry->d_inode->i_sem: don't care + * dentry->d_inode->i_mutex: don't care */ ssize_t ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -946,10 +946,6 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, }; int error; - if (IS_RDONLY(inode)) - return -EROFS; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; if (!name) return -EINVAL; if (strlen(name) > 255) diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index f68bfd1cf519..86d91f1186dc 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -7,6 +7,7 @@ #include <linux/module.h> #include <linux/string.h> +#include <linux/capability.h> #include <linux/fs.h> #include <linux/smp_lock.h> #include <linux/ext3_jbd.h> @@ -39,8 +40,6 @@ ext3_xattr_trusted_get(struct inode *inode, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -51,8 +50,6 @@ ext3_xattr_trusted_set(struct inode *inode, const char 
*name, { if (strcmp(name, "") == 0) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index e907cae7a07c..a85a0a17c4fd 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -37,16 +37,10 @@ static int ext3_xattr_user_get(struct inode *inode, const char *name, void *buffer, size_t size) { - int error; - if (strcmp(name, "") == 0) return -EINVAL; if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - error = permission(inode, MAY_READ, NULL); - if (error) - return error; - return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size); } @@ -54,19 +48,10 @@ static int ext3_xattr_user_set(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - int error; - if (strcmp(name, "") == 0) return -EINVAL; if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - if ( !S_ISREG(inode->i_mode) && - (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) - return -EPERM; - error = permission(inode, MAY_WRITE, NULL); - if (error) - return error; - return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 77c24fcf712a..1acc941245fb 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -295,7 +295,8 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } -int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys) +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -303,9 +304,12 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys) int cluster, offset; *phys = 0; + *mapped_blocks = 0; if ((sbi->fat_bits != 32) && (inode->i_ino == MSDOS_ROOT_INO)) { - if (sector < (sbi->dir_entries >> sbi->dir_per_block_bits)) + if (sector < 
(sbi->dir_entries >> sbi->dir_per_block_bits)) { *phys = sector + sbi->dir_start; + *mapped_blocks = 1; + } return 0; } last_block = (MSDOS_I(inode)->mmu_private + (sb->s_blocksize - 1)) @@ -318,7 +322,11 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys) cluster = fat_bmap_cluster(inode, cluster); if (cluster < 0) return cluster; - else if (cluster) + else if (cluster) { *phys = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } return 0; } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index ba824964b9bb..db0de5c621c7 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -45,8 +45,8 @@ static inline void fat_dir_readahead(struct inode *dir, sector_t iblock, if ((sbi->fat_bits != 32) && (dir->i_ino == MSDOS_ROOT_INO)) return; - bh = sb_getblk(sb, phys); - if (bh && !buffer_uptodate(bh)) { + bh = sb_find_get_block(sb, phys); + if (bh == NULL || !buffer_uptodate(bh)) { for (sec = 0; sec < sbi->sec_per_clus; sec++) sb_breadahead(sb, phys + sec); } @@ -68,8 +68,8 @@ static int fat__get_entry(struct inode *dir, loff_t *pos, { struct super_block *sb = dir->i_sb; sector_t phys, iblock; - int offset; - int err; + unsigned long mapped_blocks; + int err, offset; next: if (*bh) @@ -77,7 +77,7 @@ next: *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; - err = fat_bmap(dir, iblock, &phys); + err = fat_bmap(dir, iblock, &phys, &mapped_blocks); if (err || !phys) return -1; /* beyond EOF or error */ @@ -418,7 +418,7 @@ EODir: return err; } -EXPORT_SYMBOL(fat_search_long); +EXPORT_SYMBOL_GPL(fat_search_long); struct fat_ioctl_filldir_callback { struct dirent __user *dirent; @@ -729,13 +729,13 @@ static int fat_dir_ioctl(struct inode * inode, struct file * filp, buf.dirent = d1; buf.result = 0; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = -ENOENT; if (!IS_DEADDIR(inode)) { ret = __fat_readdir(inode, filp, &buf, fat_ioctl_filldir, 
short_only, both); } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret >= 0) ret = buf.result; return ret; @@ -780,7 +780,7 @@ int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, return -ENOENT; } -EXPORT_SYMBOL(fat_get_dotdot_entry); +EXPORT_SYMBOL_GPL(fat_get_dotdot_entry); /* See if directory is empty */ int fat_dir_empty(struct inode *dir) @@ -803,7 +803,7 @@ int fat_dir_empty(struct inode *dir) return result; } -EXPORT_SYMBOL(fat_dir_empty); +EXPORT_SYMBOL_GPL(fat_dir_empty); /* * fat_subdirs counts the number of sub-directories of dir. It can be run @@ -849,7 +849,7 @@ int fat_scan(struct inode *dir, const unsigned char *name, return -ENOENT; } -EXPORT_SYMBOL(fat_scan); +EXPORT_SYMBOL_GPL(fat_scan); static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) { @@ -936,7 +936,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) return 0; } -EXPORT_SYMBOL(fat_remove_entries); +EXPORT_SYMBOL_GPL(fat_remove_entries); static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, struct buffer_head **bhs, int nr_bhs) @@ -1048,7 +1048,7 @@ error: return err; } -EXPORT_SYMBOL(fat_alloc_new_dir); +EXPORT_SYMBOL_GPL(fat_alloc_new_dir); static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, int *nr_cluster, struct msdos_dir_entry **de, @@ -1264,4 +1264,4 @@ error_remove: return err; } -EXPORT_SYMBOL(fat_add_entries); +EXPORT_SYMBOL_GPL(fat_add_entries); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 4164cd54c4d1..a1a9e0451217 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -476,6 +476,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) sbi->prev_free = entry; if (sbi->free_clusters != -1) sbi->free_clusters--; + sb->s_dirt = 1; cluster[idx_clus] = entry; idx_clus++; @@ -496,6 +497,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) /* Couldn't allocate the free entries */ sbi->free_clusters = 0; + 
sb->s_dirt = 1; err = -ENOSPC; out: @@ -509,7 +511,6 @@ out: } for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); - fat_clusters_flush(sb); if (err && idx_clus) fat_free_clusters(inode, cluster[0]); @@ -542,8 +543,10 @@ int fat_free_clusters(struct inode *inode, int cluster) } ops->ent_put(&fatent, FAT_ENT_FREE); - if (sbi->free_clusters != -1) + if (sbi->free_clusters != -1) { sbi->free_clusters++; + sb->s_dirt = 1; + } if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { if (sb->s_flags & MS_SYNCHRONOUS) { @@ -578,7 +581,7 @@ error: return err; } -EXPORT_SYMBOL(fat_free_clusters); +EXPORT_SYMBOL_GPL(fat_free_clusters); int fat_count_free_clusters(struct super_block *sb) { @@ -605,6 +608,7 @@ int fat_count_free_clusters(struct super_block *sb) } while (fat_ent_next(sbi, &fatent)); } sbi->free_clusters = free; + sb->s_dirt = 1; fatent_brelse(&fatent); out: unlock_fat(sbi); diff --git a/fs/fat/file.c b/fs/fat/file.c index 7134403d5be2..e99c5a73b39e 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -6,11 +6,13 @@ * regular file handling primitives for fat-based filesystems */ +#include <linux/capability.h> #include <linux/module.h> #include <linux/time.h> #include <linux/msdos_fs.h> #include <linux/smp_lock.h> #include <linux/buffer_head.h> +#include <linux/writeback.h> int fat_generic_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) @@ -40,7 +42,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, if (err) return err; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (IS_RDONLY(inode)) { err = -EROFS; @@ -102,7 +104,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED; mark_inode_dirty(inode); up: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return err; } default: @@ -124,6 +126,24 @@ struct file_operations fat_file_operations = { .sendfile = generic_file_sendfile, }; +static int fat_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space 
*mapping = inode->i_mapping; + loff_t start = inode->i_size, count = size - inode->i_size; + int err; + + err = generic_cont_expand_simple(inode, size); + if (err) + goto out; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (IS_SYNC(inode)) + err = sync_page_range_nolock(inode, mapping, start, count); +out: + return err; +} + int fat_notify_change(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); @@ -132,11 +152,17 @@ int fat_notify_change(struct dentry *dentry, struct iattr *attr) lock_kernel(); - /* FAT cannot truncate to a longer file */ + /* + * Expand the file. Since inode_setattr() updates ->i_size + * before calling the ->truncate(), but FAT needs to fill the + * hole before it. + */ if (attr->ia_valid & ATTR_SIZE) { if (attr->ia_size > inode->i_size) { - error = -EPERM; - goto out; + error = fat_cont_expand(inode, attr->ia_size); + if (error || attr->ia_valid == ATTR_SIZE) + goto out; + attr->ia_valid &= ~ATTR_SIZE; } } @@ -173,7 +199,7 @@ out: return error; } -EXPORT_SYMBOL(fat_notify_change); +EXPORT_SYMBOL_GPL(fat_notify_change); /* Free all clusters after the skip'th cluster. 
*/ static int fat_free(struct inode *inode, int skip) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a0f9b9fe1307..e7f4aa7fc686 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -18,10 +18,12 @@ #include <linux/seq_file.h> #include <linux/msdos_fs.h> #include <linux/pagemap.h> +#include <linux/mpage.h> #include <linux/buffer_head.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/parser.h> +#include <linux/uio.h> #include <asm/unaligned.h> #ifndef CONFIG_FAT_DEFAULT_IOCHARSET @@ -48,51 +50,97 @@ static int fat_add_cluster(struct inode *inode) return err; } -static int fat_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int __fat_get_blocks(struct inode *inode, sector_t iblock, + unsigned long *max_blocks, + struct buffer_head *bh_result, int create) { struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); sector_t phys; - int err; + unsigned long mapped_blocks; + int err, offset; - err = fat_bmap(inode, iblock, &phys); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks); if (err) return err; if (phys) { map_bh(bh_result, sb, phys); + *max_blocks = min(mapped_blocks, *max_blocks); return 0; } if (!create) return 0; + if (iblock != MSDOS_I(inode)->mmu_private >> sb->s_blocksize_bits) { fat_fs_panic(sb, "corrupted file size (i_pos %lld, %lld)", MSDOS_I(inode)->i_pos, MSDOS_I(inode)->mmu_private); return -EIO; } - if (!((unsigned long)iblock & (MSDOS_SB(sb)->sec_per_clus - 1))) { + + offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); + if (!offset) { + /* TODO: multiple cluster allocation would be desirable. 
*/ err = fat_add_cluster(inode); if (err) return err; } - MSDOS_I(inode)->mmu_private += sb->s_blocksize; - err = fat_bmap(inode, iblock, &phys); + /* available blocks on this cluster */ + mapped_blocks = sbi->sec_per_clus - offset; + + *max_blocks = min(mapped_blocks, *max_blocks); + MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; + + err = fat_bmap(inode, iblock, &phys, &mapped_blocks); if (err) return err; - if (!phys) - BUG(); + BUG_ON(!phys); + BUG_ON(*max_blocks != mapped_blocks); set_buffer_new(bh_result); map_bh(bh_result, sb, phys); return 0; } +static int fat_get_blocks(struct inode *inode, sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + int err; + + err = __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create); + if (err) + return err; + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + return 0; +} + +static int fat_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + unsigned long max_blocks = 1; + return __fat_get_blocks(inode, iblock, &max_blocks, bh_result, create); +} + static int fat_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, fat_get_block, wbc); } +static int fat_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, fat_get_block); +} + static int fat_readpage(struct file *file, struct page *page) { - return block_read_full_page(page, fat_get_block); + return mpage_readpage(page, fat_get_block); +} + +static int fat_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, fat_get_block); } static int fat_prepare_write(struct file *file, struct page *page, @@ -115,6 +163,34 @@ static int fat_commit_write(struct file *file, struct page *page, return err; } +static 
ssize_t fat_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + + if (rw == WRITE) { + /* + * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(), + * so we need to update the ->mmu_private to block boundary. + * + * But we must fill the remaining area or hole by nul for + * updating ->mmu_private. + */ + loff_t size = offset + iov_length(iov, nr_segs); + if (MSDOS_I(inode)->mmu_private < size) + return -EINVAL; + } + + /* + * FAT need to use the DIO_LOCKING for avoiding the race + * condition of fat_get_block() and ->truncate(). + */ + return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, fat_get_blocks, NULL); +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, fat_get_block); @@ -122,10 +198,13 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) static struct address_space_operations fat_aops = { .readpage = fat_readpage, + .readpages = fat_readpages, .writepage = fat_writepage, + .writepages = fat_writepages, .sync_page = block_sync_page, .prepare_write = fat_prepare_write, .commit_write = fat_commit_write, + .direct_IO = fat_direct_IO, .bmap = _fat_bmap }; @@ -182,7 +261,7 @@ void fat_attach(struct inode *inode, loff_t i_pos) spin_unlock(&sbi->inode_hash_lock); } -EXPORT_SYMBOL(fat_attach); +EXPORT_SYMBOL_GPL(fat_attach); void fat_detach(struct inode *inode) { @@ -193,7 +272,7 @@ void fat_detach(struct inode *inode) spin_unlock(&sbi->inode_hash_lock); } -EXPORT_SYMBOL(fat_detach); +EXPORT_SYMBOL_GPL(fat_detach); struct inode *fat_iget(struct super_block *sb, loff_t i_pos) { @@ -347,7 +426,7 @@ out: return inode; } -EXPORT_SYMBOL(fat_build_inode); +EXPORT_SYMBOL_GPL(fat_build_inode); static void fat_delete_inode(struct inode *inode) { @@ -374,12 +453,17 @@ static void fat_clear_inode(struct 
inode *inode) unlock_kernel(); } -static void fat_put_super(struct super_block *sb) +static void fat_write_super(struct super_block *sb) { - struct msdos_sb_info *sbi = MSDOS_SB(sb); + sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) fat_clusters_flush(sb); +} + +static void fat_put_super(struct super_block *sb) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); if (sbi->nls_disk) { unload_nls(sbi->nls_disk); @@ -537,7 +621,7 @@ int fat_sync_inode(struct inode *inode) return fat_write_inode(inode, 1); } -EXPORT_SYMBOL(fat_sync_inode); +EXPORT_SYMBOL_GPL(fat_sync_inode); static int fat_show_options(struct seq_file *m, struct vfsmount *mnt); static struct super_operations fat_sops = { @@ -546,6 +630,7 @@ static struct super_operations fat_sops = { .write_inode = fat_write_inode, .delete_inode = fat_delete_inode, .put_super = fat_put_super, + .write_super = fat_write_super, .statfs = fat_statfs, .clear_inode = fat_clear_inode, .remount_fs = fat_remount, @@ -1347,7 +1432,7 @@ out_fail: return error; } -EXPORT_SYMBOL(fat_fill_super); +EXPORT_SYMBOL_GPL(fat_fill_super); int __init fat_cache_init(void); void fat_cache_destroy(void); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 2a0df2122f5d..32fb0a3f1da4 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -33,7 +33,7 @@ void fat_fs_panic(struct super_block *s, const char *fmt, ...) } } -EXPORT_SYMBOL(fat_fs_panic); +EXPORT_SYMBOL_GPL(fat_fs_panic); /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. 
Currently only writes 1 */ @@ -67,8 +67,6 @@ void fat_clusters_flush(struct super_block *sb) if (sbi->prev_free != -1) fsinfo->next_cluster = cpu_to_le32(sbi->prev_free); mark_buffer_dirty(bh); - if (sb->s_flags & MS_SYNCHRONOUS) - sync_dirty_buffer(bh); } brelse(bh); } @@ -194,7 +192,7 @@ void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date) *date = cpu_to_le16(nl_day-day_n[month-1]+1+(month << 5)+(year << 9)); } -EXPORT_SYMBOL(fat_date_unix2dos); +EXPORT_SYMBOL_GPL(fat_date_unix2dos); int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { @@ -222,4 +220,4 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) return err; } -EXPORT_SYMBOL(fat_sync_bhs); +EXPORT_SYMBOL_GPL(fat_sync_bhs); diff --git a/fs/fcntl.c b/fs/fcntl.c index 863b46e0d78a..d0767fe58362 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/fs.h> #include <linux/file.h> +#include <linux/capability.h> #include <linux/dnotify.h> #include <linux/smp_lock.h> #include <linux/slab.h> @@ -457,11 +458,11 @@ static void send_sigio_to_task(struct task_struct *p, else si.si_band = band_table[reason - POLL_IN]; si.si_fd = fd; - if (!send_group_sig_info(fown->signum, &si, p)) + if (!group_send_sig_info(fown->signum, &si, p)) break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: - send_group_sig_info(SIGIO, SEND_SIG_PRIV, p); + group_send_sig_info(SIGIO, SEND_SIG_PRIV, p); } } @@ -495,7 +496,7 @@ static void send_sigurg_to_task(struct task_struct *p, struct fown_struct *fown) { if (sigio_perm(p, fown, SIGURG)) - send_group_sig_info(SIGURG, SEND_SIG_PRIV, p); + group_send_sig_info(SIGURG, SEND_SIG_PRIV, p); } int send_sigurg(struct fown_struct *fown) diff --git a/fs/fifo.c b/fs/fifo.c index 5455916241f0..923371b753ab 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -35,7 +35,7 @@ static int fifo_open(struct inode *inode, struct file *filp) int ret; ret = -ERESTARTSYS; - if (down_interruptible(PIPE_SEM(*inode))) + if 
(mutex_lock_interruptible(PIPE_MUTEX(*inode))) goto err_nolock_nocleanup; if (!inode->i_pipe) { @@ -119,7 +119,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return 0; err_rd: @@ -139,7 +139,7 @@ err: free_pipe_info(inode); err_nocleanup: - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); err_nolock_nocleanup: return ret; diff --git a/fs/file_table.c b/fs/file_table.c index c3a5e2fd663b..768b58167543 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -16,6 +16,7 @@ #include <linux/eventpoll.h> #include <linux/rcupdate.h> #include <linux/mount.h> +#include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> @@ -117,7 +118,7 @@ EXPORT_SYMBOL(get_empty_filp); void fastcall fput(struct file *file) { - if (rcuref_dec_and_test(&file->f_count)) + if (atomic_dec_and_test(&file->f_count)) __fput(file); } @@ -166,7 +167,7 @@ struct file fastcall *fget(unsigned int fd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!rcuref_inc_lf(&file->f_count)) { + if (!atomic_inc_not_zero(&file->f_count)) { /* File object ref couldn't be taken */ rcu_read_unlock(); return NULL; @@ -198,7 +199,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (rcuref_inc_lf(&file->f_count)) + if (atomic_inc_not_zero(&file->f_count)) *fput_needed = 1; else /* Didn't get the reference, someone's freed */ @@ -213,7 +214,7 @@ struct file fastcall *fget_light(unsigned int fd, int *fput_needed) void put_filp(struct file *file) { - if (rcuref_dec_and_test(&file->f_count)) { + if (atomic_dec_and_test(&file->f_count)) { security_file_free(file); file_kill(file); file_free(file); diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index d0401dc68d41..6f5df1700e95 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -99,8 +99,8 @@ static int 
vxfs_immed_readpage(struct file *fp, struct page *pp) { struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); - u_int64_t offset = pp->index << PAGE_CACHE_SHIFT; - caddr_t kaddr; + u_int64_t offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT; + caddr_t kaddr; kaddr = kmap(pp); memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8f873e621f41..e08ab4702d97 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -148,6 +148,26 @@ void fuse_release_background(struct fuse_req *req) spin_unlock(&fuse_lock); } +static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + struct fuse_init_out *arg = &req->misc.init_out; + + if (arg->major != FUSE_KERNEL_VERSION) + fc->conn_error = 1; + else { + fc->minor = arg->minor; + fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; + } + + /* After INIT reply is received other requests can go + out. So do (FUSE_MAX_OUTSTANDING - 1) number of + up()s on outstanding_sem. The last up() is done in + fuse_putback_request() */ + for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) + up(&fc->outstanding_sem); +} + /* * This function is called when a request is finished. Either a reply * has arrived or it was interrupted (and not yet sent) or some error @@ -172,19 +192,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) up_read(&fc->sbput_sem); } wake_up(&req->waitq); - if (req->in.h.opcode == FUSE_INIT) { - int i; - - if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION) - fc->conn_error = 1; - - /* After INIT reply is received other requests can go - out. So do (FUSE_MAX_OUTSTANDING - 1) number of - up()s on outstanding_sem. 
The last up() is done in - fuse_putback_request() */ - for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) - up(&fc->outstanding_sem); - } else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) { + if (req->in.h.opcode == FUSE_INIT) + process_init_reply(fc, req); + else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) { /* Special case for failed iget in CREATE */ u64 nodeid = req->in.h.nodeid; __fuse_get_request(req); @@ -357,7 +367,7 @@ void fuse_send_init(struct fuse_conn *fc) /* This is called from fuse_read_super() so there's guaranteed to be a request available */ struct fuse_req *req = do_get_request(fc); - struct fuse_init_in_out *arg = &req->misc.init_in_out; + struct fuse_init_in *arg = &req->misc.init_in; arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; req->in.h.opcode = FUSE_INIT; @@ -365,8 +375,12 @@ void fuse_send_init(struct fuse_conn *fc) req->in.args[0].size = sizeof(*arg); req->in.args[0].value = arg; req->out.numargs = 1; - req->out.args[0].size = sizeof(*arg); - req->out.args[0].value = arg; + /* Variable length arguement used for backward compatibility + with interface version < 7.5. 
Rest of init_out is zeroed + by do_get_request(), so a short reply is not a problem */ + req->out.argvar = 1; + req->out.args[0].size = sizeof(struct fuse_init_out); + req->out.args[0].value = &req->misc.init_out; request_send_background(fc, req); } @@ -615,6 +629,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, struct fuse_copy_state cs; unsigned reqsize; + restart: spin_lock(&fuse_lock); fc = file->private_data; err = -EPERM; @@ -630,20 +645,25 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, req = list_entry(fc->pending.next, struct fuse_req, list); list_del_init(&req->list); - spin_unlock(&fuse_lock); in = &req->in; - reqsize = req->in.h.len; - fuse_copy_init(&cs, 1, req, iov, nr_segs); - err = -EINVAL; - if (iov_length(iov, nr_segs) >= reqsize) { - err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); - if (!err) - err = fuse_copy_args(&cs, in->numargs, in->argpages, - (struct fuse_arg *) in->args, 0); + reqsize = in->h.len; + /* If request is too large, reply with an error and restart the read */ + if (iov_length(iov, nr_segs) < reqsize) { + req->out.h.error = -EIO; + /* SETXATTR is special, since it may contain too large data */ + if (in->h.opcode == FUSE_SETXATTR) + req->out.h.error = -E2BIG; + request_end(fc, req); + goto restart; } + spin_unlock(&fuse_lock); + fuse_copy_init(&cs, 1, req, iov, nr_segs); + err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); + if (!err) + err = fuse_copy_args(&cs, in->numargs, in->argpages, + (struct fuse_arg *) in->args, 0); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); req->locked = 0; if (!err && req->interrupted) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 51f5da652771..417bcee466f6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -13,8 +13,16 @@ #include <linux/gfp.h> #include <linux/sched.h> #include <linux/namei.h> -#include <linux/mount.h> +/* + * FUSE caches dentries and attributes with separate timeout. 
The + * time in jiffies until the dentry/attributes are valid is stored in + * dentry->d_time and fuse_inode->i_time respectively. + */ + +/* + * Calculate the time in jiffies until a dentry/attributes are valid + */ static inline unsigned long time_to_jiffies(unsigned long sec, unsigned long nsec) { @@ -22,6 +30,50 @@ static inline unsigned long time_to_jiffies(unsigned long sec, return jiffies + timespec_to_jiffies(&ts); } +/* + * Set dentry and possibly attribute timeouts from the lookup/mk* + * replies + */ +static void fuse_change_timeout(struct dentry *entry, struct fuse_entry_out *o) +{ + entry->d_time = time_to_jiffies(o->entry_valid, o->entry_valid_nsec); + if (entry->d_inode) + get_fuse_inode(entry->d_inode)->i_time = + time_to_jiffies(o->attr_valid, o->attr_valid_nsec); +} + +/* + * Mark the attributes as stale, so that at the next call to + * ->getattr() they will be fetched from userspace + */ +void fuse_invalidate_attr(struct inode *inode) +{ + get_fuse_inode(inode)->i_time = jiffies - 1; +} + +/* + * Just mark the entry as stale, so that a next attempt to look it up + * will result in a new lookup call to userspace + * + * This is called when a dentry is about to become negative and the + * timeout is unknown (unlink, rmdir, rename and in some cases + * lookup) + */ +static void fuse_invalidate_entry_cache(struct dentry *entry) +{ + entry->d_time = jiffies - 1; +} + +/* + * Same as fuse_invalidate_entry_cache(), but also try to remove the + * dentry from the hash + */ +static void fuse_invalidate_entry(struct dentry *entry) +{ + d_invalidate(entry); + fuse_invalidate_entry_cache(entry); +} + static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, struct dentry *entry, struct fuse_entry_out *outarg) @@ -37,17 +89,34 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, req->out.args[0].value = outarg; } +/* + * Check whether the dentry is still valid + * + * If the entry validity timeout has expired and the dentry is + 
* positive, try to redo the lookup. If the lookup results in a + * different inode, then let the VFS invalidate the dentry and redo + * the lookup once more. If the lookup results in the same inode, + * then refresh the attributes, timeouts and mark the dentry valid. + */ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) { - if (!entry->d_inode || is_bad_inode(entry->d_inode)) + struct inode *inode = entry->d_inode; + + if (inode && is_bad_inode(inode)) return 0; else if (time_after(jiffies, entry->d_time)) { int err; struct fuse_entry_out outarg; - struct inode *inode = entry->d_inode; - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); + struct fuse_conn *fc; + struct fuse_req *req; + + /* Doesn't hurt to "reset" the validity timeout */ + fuse_invalidate_entry_cache(entry); + if (!inode) + return 0; + + fc = get_fuse_conn(inode); + req = fuse_get_request(fc); if (!req) return 0; @@ -55,6 +124,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) request_send(fc, req); err = req->out.h.error; if (!err) { + struct fuse_inode *fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { fuse_send_forget(fc, req, outarg.nodeid, 1); return 0; @@ -66,18 +136,18 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 0; fuse_change_attributes(inode, &outarg.attr); - entry->d_time = time_to_jiffies(outarg.entry_valid, - outarg.entry_valid_nsec); - fi->i_time = time_to_jiffies(outarg.attr_valid, - outarg.attr_valid_nsec); + fuse_change_timeout(entry, &outarg); } return 1; } +/* + * Check if there's already a hashed alias of this directory inode. + * If yes, then lookup and mkdir must not create a new alias. 
+ */ static int dir_alias(struct inode *inode) { if (S_ISDIR(inode->i_mode)) { - /* Don't allow creating an alias to a directory */ struct dentry *alias = d_find_alias(inode); if (alias) { dput(alias); @@ -96,8 +166,14 @@ static struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, }; -static int fuse_lookup_iget(struct inode *dir, struct dentry *entry, - struct inode **inodep) +static inline int valid_mode(int m) +{ + return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || + S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); +} + +static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + struct nameidata *nd) { int err; struct fuse_entry_out outarg; @@ -106,53 +182,49 @@ static int fuse_lookup_iget(struct inode *dir, struct dentry *entry, struct fuse_req *req; if (entry->d_name.len > FUSE_NAME_MAX) - return -ENAMETOOLONG; + return ERR_PTR(-ENAMETOOLONG); req = fuse_get_request(fc); if (!req) - return -EINTR; + return ERR_PTR(-EINTR); fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); err = req->out.h.error; - if (!err && invalid_nodeid(outarg.nodeid)) + if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) || + !valid_mode(outarg.attr.mode))) err = -EIO; - if (!err) { + if (!err && outarg.nodeid) { inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr); if (!inode) { fuse_send_forget(fc, req, outarg.nodeid, 1); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } } fuse_put_request(fc, req); if (err && err != -ENOENT) - return err; + return ERR_PTR(err); - if (inode) { - struct fuse_inode *fi = get_fuse_inode(inode); - entry->d_time = time_to_jiffies(outarg.entry_valid, - outarg.entry_valid_nsec); - fi->i_time = time_to_jiffies(outarg.attr_valid, - outarg.attr_valid_nsec); + if (inode && dir_alias(inode)) { + iput(inode); + return ERR_PTR(-EIO); } - + d_add(entry, inode); entry->d_op = &fuse_dentry_operations; - *inodep = inode; - return 0; -} - -void 
fuse_invalidate_attr(struct inode *inode) -{ - get_fuse_inode(inode)->i_time = jiffies - 1; -} - -static void fuse_invalidate_entry(struct dentry *entry) -{ - d_invalidate(entry); - entry->d_time = jiffies - 1; + if (!err) + fuse_change_timeout(entry, &outarg); + else + fuse_invalidate_entry_cache(entry); + return NULL; } +/* + * Atomic create+open operation + * + * If the filesystem doesn't support this, then fall back to separate + * 'mknod' + 'open' requests. + */ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct nameidata *nd) { @@ -163,7 +235,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct fuse_open_in inarg; struct fuse_open_out outopen; struct fuse_entry_out outentry; - struct fuse_inode *fi; struct fuse_file *ff; struct file *file; int flags = nd->intent.open.flags - 1; @@ -172,10 +243,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (fc->no_create) goto out; - err = -ENAMETOOLONG; - if (entry->d_name.len > FUSE_NAME_MAX) - goto out; - err = -EINTR; req = fuse_get_request(fc); if (!req) @@ -220,17 +287,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); ff->fh = outopen.fh; + /* Special release, with inode = NULL, this will + trigger a 'forget' request when the release is + complete */ fuse_send_release(fc, ff, outentry.nodeid, NULL, flags, 0); goto out_put_request; } fuse_put_request(fc, req); - entry->d_time = time_to_jiffies(outentry.entry_valid, - outentry.entry_valid_nsec); - fi = get_fuse_inode(inode); - fi->i_time = time_to_jiffies(outentry.attr_valid, - outentry.attr_valid_nsec); - d_instantiate(entry, inode); + fuse_change_timeout(entry, &outentry); file = lookup_instantiate_filp(nd, entry, generic_file_open); if (IS_ERR(file)) { ff->fh = outopen.fh; @@ -248,13 +313,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int 
mode, return err; } +/* + * Code shared between mknod, mkdir, symlink and link + */ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, struct inode *dir, struct dentry *entry, int mode) { struct fuse_entry_out outarg; struct inode *inode; - struct fuse_inode *fi; int err; req->in.h.nodeid = get_node_id(dir); @@ -268,10 +335,13 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, fuse_put_request(fc, req); return err; } - if (invalid_nodeid(outarg.nodeid)) { - fuse_put_request(fc, req); - return -EIO; - } + err = -EIO; + if (invalid_nodeid(outarg.nodeid)) + goto out_put_request; + + if ((outarg.attr.mode ^ mode) & S_IFMT) + goto out_put_request; + inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr); if (!inode) { @@ -280,22 +350,19 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, } fuse_put_request(fc, req); - /* Don't allow userspace to do really stupid things... */ - if (((inode->i_mode ^ mode) & S_IFMT) || dir_alias(inode)) { + if (dir_alias(inode)) { iput(inode); return -EIO; } - entry->d_time = time_to_jiffies(outarg.entry_valid, - outarg.entry_valid_nsec); - - fi = get_fuse_inode(inode); - fi->i_time = time_to_jiffies(outarg.attr_valid, - outarg.attr_valid_nsec); - d_instantiate(entry, inode); + fuse_change_timeout(entry, &outarg); fuse_invalidate_attr(dir); return 0; + + out_put_request: + fuse_put_request(fc, req); + return err; } static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, @@ -355,12 +422,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req; - - if (len > FUSE_SYMLINK_MAX) - return -ENAMETOOLONG; - - req = fuse_get_request(fc); + struct fuse_req *req = fuse_get_request(fc); if (!req) return -EINTR; @@ -399,6 +461,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) inode->i_nlink = 0; 
fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -424,6 +487,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) if (!err) { entry->d_inode->i_nlink = 0; fuse_invalidate_attr(dir); + fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -459,6 +523,10 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); + + /* newent will end up negative */ + if (newent->d_inode) + fuse_invalidate_entry_cache(newent); } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation @@ -566,6 +634,15 @@ static int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) return 0; } +/* + * Check whether the inode attributes are still valid + * + * If the attribute validity timeout has expired, then fetch the fresh + * attributes with a 'getattr' request + * + * I'm not sure why cached attributes are never returned for the root + * inode, this is probably being too cautious. + */ static int fuse_revalidate(struct dentry *entry) { struct inode *inode = entry->d_inode; @@ -613,6 +690,19 @@ static int fuse_access(struct inode *inode, int mask) return err; } +/* + * Check permission. The two basic access models of FUSE are: + * + * 1) Local access checking ('default_permissions' mount option) based + * on file mode. This is the plain old disk filesystem permission + * modell. + * + * 2) "Remote" access checking, where server is responsible for + * checking permission in each inode operation. An exception to this + * is if ->permission() was invoked from sys_access() in which case an + * access request is sent. Execute permission is still checked + * locally based on file mode. 
+ */ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -631,14 +721,10 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd) err = generic_permission(inode, mask, NULL); } - /* FIXME: Need some mechanism to revoke permissions: - currently if the filesystem suddenly changes the - file mode, we will not be informed about it, and - continue to allow access to the file/directory. - - This is actually not so grave, since the user can - simply keep access to the file/directory anyway by - keeping it open... */ + /* Note: the opposite of the above test does not + exist. So if permissions are revoked this won't be + noticed immediately, only after the attribute + timeout has expired */ return err; } else { @@ -691,7 +777,12 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) struct page *page; struct inode *inode = file->f_dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_request(fc); if (!req) return -EINTR; @@ -806,6 +897,15 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) } } +/* + * Set attributes, and at the same time refresh them. + * + * Truncation is slightly complicated, because the 'truncate' request + * may fail, in which case we don't want to touch the mapping. + * vmtruncate() doesn't allow for this case. So do the rlimit + * checking by hand and call vmtruncate() only after the file has + * actually been truncated. 
+ */ static int fuse_setattr(struct dentry *entry, struct iattr *attr) { struct inode *inode = entry->d_inode; @@ -883,23 +983,6 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, return err; } -static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, - struct nameidata *nd) -{ - struct inode *inode; - int err; - - err = fuse_lookup_iget(dir, entry, &inode); - if (err) - return ERR_PTR(err); - if (inode && dir_alias(inode)) { - iput(inode); - return ERR_PTR(-EIO); - } - d_add(entry, inode); - return NULL; -} - static int fuse_setxattr(struct dentry *entry, const char *name, const void *value, size_t size, int flags) { @@ -909,9 +992,6 @@ static int fuse_setxattr(struct dentry *entry, const char *name, struct fuse_setxattr_in inarg; int err; - if (size > FUSE_XATTR_SIZE_MAX) - return -E2BIG; - if (fc->no_setxattr) return -EOPNOTSUPP; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2ca86141d13a..63d2980df5c9 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -163,6 +163,9 @@ static int fuse_flush(struct file *file) struct fuse_flush_in inarg; int err; + if (is_bad_inode(inode)) + return -EIO; + if (fc->no_flush) return 0; @@ -199,6 +202,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, struct fuse_fsync_in inarg; int err; + if (is_bad_inode(inode)) + return -EIO; + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; @@ -272,16 +278,22 @@ static int fuse_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - loff_t pos = (loff_t) page->index << PAGE_CACHE_SHIFT; - struct fuse_req *req = fuse_get_request(fc); - int err = -EINTR; + struct fuse_req *req; + int err; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + err = -EINTR; + req = fuse_get_request(fc); if (!req) goto out; req->out.page_zeroing = 1; req->num_pages = 1; req->pages[0] = page; - fuse_send_read(req, file, inode, pos, 
PAGE_CACHE_SIZE); + fuse_send_read(req, file, inode, page_offset(page), PAGE_CACHE_SIZE); err = req->out.h.error; fuse_put_request(fc, req); if (!err) @@ -295,7 +307,7 @@ static int fuse_readpage(struct file *file, struct page *page) static int fuse_send_readpages(struct fuse_req *req, struct file *file, struct inode *inode) { - loff_t pos = (loff_t) req->pages[0]->index << PAGE_CACHE_SHIFT; + loff_t pos = page_offset(req->pages[0]); size_t count = req->num_pages << PAGE_CACHE_SHIFT; unsigned i; req->out.page_zeroing = 1; @@ -345,6 +357,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_readpages_data data; int err; + + if (is_bad_inode(inode)) + return -EIO; + data.file = file; data.inode = inode; data.req = fuse_get_request(fc); @@ -402,8 +418,13 @@ static int fuse_commit_write(struct file *file, struct page *page, unsigned count = to - offset; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + offset; - struct fuse_req *req = fuse_get_request(fc); + loff_t pos = page_offset(page) + offset; + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_request(fc); if (!req) return -EINTR; @@ -454,7 +475,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - npages = min(npages, FUSE_MAX_PAGES_PER_REQ); + npages = min(max(npages, 1), FUSE_MAX_PAGES_PER_REQ); down_read(&current->mm->mmap_sem); npages = get_user_pages(current, current->mm, user_addr, npages, write, 0, req->pages, NULL); @@ -475,12 +496,16 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, size_t nmax = write ?
fc->max_write : fc->max_read; loff_t pos = *ppos; ssize_t res = 0; - struct fuse_req *req = fuse_get_request(fc); + struct fuse_req *req; + + if (is_bad_inode(inode)) + return -EIO; + + req = fuse_get_request(fc); if (!req) return -EINTR; while (count) { - size_t tmp; size_t nres; size_t nbytes = min(count, nmax); int err = fuse_get_user_pages(req, buf, nbytes, !write); @@ -488,8 +513,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, res = err; break; } - tmp = (req->num_pages << PAGE_SHIFT) - req->page_offset; - nbytes = min(nbytes, tmp); + nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; + nbytes = min(count, nbytes); if (write) nres = fuse_send_write(req, file, inode, pos, nbytes); else @@ -535,9 +560,9 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, struct inode *inode = file->f_dentry->d_inode; ssize_t res; /* Don't allow parallel writes to the same file */ - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); res = fuse_direct_io(file, buf, count, ppos, 1); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return res; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0ea5301f86be..74c8d098a14a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -21,6 +21,9 @@ /** If more requests are outstanding, then the operation will block */ #define FUSE_MAX_OUTSTANDING 10 +/** It could be as large as PATH_MAX, but would that have any uses? */ +#define FUSE_NAME_MAX 1024 + /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem module will check permissions based on the file mode. 
Otherwise no permission checking is done in the kernel */ @@ -108,9 +111,6 @@ struct fuse_out { struct fuse_arg args[3]; }; -struct fuse_req; -struct fuse_conn; - /** * A request to the client */ @@ -159,7 +159,8 @@ struct fuse_req { union { struct fuse_forget_in forget_in; struct fuse_release_in release_in; - struct fuse_init_in_out init_in_out; + struct fuse_init_in init_in; + struct fuse_init_out init_out; } misc; /** page vector */ @@ -272,6 +273,9 @@ struct fuse_conn { /** Is create not implemented by fs? */ unsigned no_create : 1; + /** Negotiated minor version */ + unsigned minor; + /** Backing dev info */ struct backing_dev_info bdi; }; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e69a546844d0..04c80cc957a3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -135,12 +135,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) fuse_init_common(inode); init_special_inode(inode, inode->i_mode, new_decode_dev(attr->rdev)); - } else { - /* Don't let user create weird files */ - inode->i_mode = S_IFREG; - fuse_init_common(inode); - fuse_init_file_inode(inode); - } + } else + BUG(); } static int fuse_inode_eq(struct inode *inode, void *_nodeidp) @@ -218,6 +214,7 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr { stbuf->f_type = FUSE_SUPER_MAGIC; stbuf->f_bsize = attr->bsize; + stbuf->f_frsize = attr->frsize; stbuf->f_blocks = attr->blocks; stbuf->f_bfree = attr->bfree; stbuf->f_bavail = attr->bavail; @@ -238,10 +235,12 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf) if (!req) return -EINTR; + memset(&outarg, 0, sizeof(outarg)); req->in.numargs = 0; req->in.h.opcode = FUSE_STATFS; req->out.numargs = 1; - req->out.args[0].size = sizeof(outarg); + req->out.args[0].size = + fc->minor < 4 ? 
FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); req->out.args[0].value = &outarg; request_send(fc, req); err = req->out.h.error; @@ -482,7 +481,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->max_read = d.max_read; if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages) fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE; - fc->max_write = FUSE_MAX_IN / 2; err = -ENOMEM; root = get_root_inode(sb, d.rootmode); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index d499393a8ae7..050a49276499 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -547,13 +547,13 @@ static int hfs_file_release(struct inode *inode, struct file *file) if (atomic_read(&file->f_count) != 0) return 0; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); hfs_file_truncate(inode); //if (inode->i_flags & S_DEAD) { // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); // hfs_delete_inode(inode); //} - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index c7d316455fa0..9fb51632303c 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -29,7 +29,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma return size; dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); - down(&HFSPLUS_SB(sb).alloc_file->i_sem); + mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; page = read_cache_page(mapping, offset / PAGE_CACHE_BITS, (filler_t *)mapping->a_ops->readpage, NULL); @@ -143,7 +143,7 @@ done: sb->s_dirt = 1; dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); out: - up(&HFSPLUS_SB(sb).alloc_file->i_sem); + mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); return start; } @@ -164,7 +164,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) if ((offset + count) > HFSPLUS_SB(sb).total_blocks) return -2; - 
down(&HFSPLUS_SB(sb).alloc_file->i_sem); + mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; pnr = offset / PAGE_CACHE_BITS; page = read_cache_page(mapping, pnr, (filler_t *)mapping->a_ops->readpage, NULL); @@ -215,7 +215,7 @@ out: kunmap(page); HFSPLUS_SB(sb).free_blocks += len; sb->s_dirt = 1; - up(&HFSPLUS_SB(sb).alloc_file->i_sem); + mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); return 0; } diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index df16fcbff3fb..0fa1ab6250bf 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -143,9 +143,6 @@ struct hfsplus_sb_info { unsigned long flags; - atomic_t inode_cnt; - u32 last_inode_cnt; - struct hlist_head rsrc_inodes; }; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index fc98583cf045..7acff6c5464f 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -182,11 +182,6 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent igrab(dir); hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); mark_inode_dirty(inode); - { - void hfsplus_inode_check(struct super_block *sb); - atomic_inc(&HFSPLUS_SB(sb).inode_cnt); - hfsplus_inode_check(sb); - } out: d_add(dentry, inode); return NULL; @@ -276,13 +271,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) if (atomic_read(&file->f_count) != 0) return 0; if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); hfsplus_delete_inode(inode); } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } return 0; } @@ -317,11 +312,6 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode) if (!inode) return NULL; - { - void hfsplus_inode_check(struct super_block *sb); - atomic_inc(&HFSPLUS_SB(sb).inode_cnt); - hfsplus_inode_check(sb); - } inode->i_ino 
= HFSPLUS_SB(sb).next_cnid++; inode->i_mode = mode; inode->i_uid = current->fsuid; diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index e07aa096e07c..13cf848ac833 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -12,6 +12,7 @@ * hfsplus ioctls */ +#include <linux/capability.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/xattr.h> diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 8093351bd7c3..d791780def50 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -22,29 +22,12 @@ static void hfsplus_destroy_inode(struct inode *inode); #include "hfsplus_fs.h" -void hfsplus_inode_check(struct super_block *sb) -{ -#if 0 - u32 cnt = atomic_read(&HFSPLUS_SB(sb).inode_cnt); - u32 last_cnt = HFSPLUS_SB(sb).last_inode_cnt; - - if (cnt <= (last_cnt / 2) || - cnt >= (last_cnt * 2)) { - HFSPLUS_SB(sb).last_inode_cnt = cnt; - printk("inode_check: %u,%u,%u\n", cnt, last_cnt, - HFSPLUS_SB(sb).cat_tree ? HFSPLUS_SB(sb).cat_tree->node_hash_cnt : 0); - } -#endif -} - static void hfsplus_read_inode(struct inode *inode) { struct hfs_find_data fd; struct hfsplus_vh *vhdr; int err; - atomic_inc(&HFSPLUS_SB(inode->i_sb).inode_cnt); - hfsplus_inode_check(inode->i_sb); INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); init_MUTEX(&HFSPLUS_I(inode).extents_lock); HFSPLUS_I(inode).flags = 0; @@ -155,12 +138,10 @@ static int hfsplus_write_inode(struct inode *inode, int unused) static void hfsplus_clear_inode(struct inode *inode) { dprint(DBG_INODE, "hfsplus_clear_inode: %lu\n", inode->i_ino); - atomic_dec(&HFSPLUS_SB(inode->i_sb).inode_cnt); if (HFSPLUS_IS_RSRC(inode)) { HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; iput(HFSPLUS_I(inode).rsrc_inode); } - hfsplus_inode_check(inode->i_sb); } static void hfsplus_write_super(struct super_block *sb) @@ -320,7 +301,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) /* temporarily use utf8 to correctly find the hidden dir below */ nls = sbi->nls; sbi->nls = 
load_nls("utf8"); - if (!nls) { + if (!sbi->nls) { printk("HFS+: unable to load nls for utf8\n"); err = -EINVAL; goto cleanup; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 0217c3a04441..5591f9623aa2 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -32,19 +32,19 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) /*printk("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; - down(&i->i_sem); + mutex_lock(&i->i_mutex); pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; while (pos != new_off) { if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); else goto fail; if (pos == 12) goto fail; } - up(&i->i_sem); + mutex_unlock(&i->i_mutex); ok: unlock_kernel(); return filp->f_pos = new_off; fail: - up(&i->i_sem); + mutex_unlock(&i->i_mutex); /*printk("illegal lseek: %016llx\n", new_off);*/ unlock_kernel(); return -ESPIPE; diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index 52930915bad8..a44dc5897399 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -171,12 +171,12 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, err = -ENOMEM; parent = HPPFS_I(ino)->proc_dentry; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); proc_dentry = d_lookup(parent, &dentry->d_name); if(proc_dentry == NULL){ proc_dentry = d_alloc(parent, &dentry->d_name); if(proc_dentry == NULL){ - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); goto out; } new = (*parent->d_inode->i_op->lookup)(parent->d_inode, @@ -186,7 +186,7 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, proc_dentry = new; } } - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); if(IS_ERR(proc_dentry)) return(proc_dentry); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8c1cef3bb677..ab4c3a9d51b8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c 
@@ -18,6 +18,7 @@ #include <linux/highmem.h> #include <linux/init.h> #include <linux/string.h> +#include <linux/capability.h> #include <linux/backing-dev.h> #include <linux/hugetlb.h> #include <linux/pagevec.h> @@ -100,9 +101,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) loff_t len, vma_len; int ret; - if ((vma->vm_flags & (VM_MAYSHARE | VM_WRITE)) == VM_WRITE) - return -EINVAL; - if (vma->vm_pgoff & (HPAGE_SIZE / PAGE_SIZE - 1)) return -EINVAL; @@ -121,7 +119,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma_len = (loff_t)(vma->vm_end - vma->vm_start); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); file_accessed(file); vma->vm_flags |= VM_HUGETLB | VM_RESERVED; vma->vm_ops = &hugetlb_vm_ops; @@ -136,7 +134,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (inode->i_size < len) inode->i_size = len; out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/fs/inode.c b/fs/inode.c index d8d04bd72b59..108138d4e909 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -22,6 +22,7 @@ #include <linux/cdev.h> #include <linux/bootmem.h> #include <linux/inotify.h> +#include <linux/mount.h> /* * This is needed for the following functions: @@ -192,7 +193,7 @@ void inode_init_once(struct inode *inode) INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_dentry); INIT_LIST_HEAD(&inode->i_devices); - sema_init(&inode->i_sem, 1); + mutex_init(&inode->i_mutex); init_rwsem(&inode->i_alloc_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); rwlock_init(&inode->i_data.tree_lock); @@ -770,7 +771,7 @@ EXPORT_SYMBOL(igrab); * * Note, @test is called with the inode_lock held, so can't sleep. 
*/ -static inline struct inode *ifind(struct super_block *sb, +static struct inode *ifind(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data, const int wait) { @@ -804,7 +805,7 @@ static inline struct inode *ifind(struct super_block *sb, * * Otherwise NULL is returned. */ -static inline struct inode *ifind_fast(struct super_block *sb, +static struct inode *ifind_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino) { struct inode *inode; @@ -1176,22 +1177,33 @@ sector_t bmap(struct inode * inode, sector_t block) EXPORT_SYMBOL(bmap); /** - * update_atime - update the access time + * touch_atime - update the access time + * @mnt: mount the inode is accessed on * @inode: inode accessed * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -void update_atime(struct inode *inode) +void touch_atime(struct vfsmount *mnt, struct dentry *dentry) { + struct inode *inode = dentry->d_inode; struct timespec now; - if (IS_NOATIME(inode)) + if (IS_RDONLY(inode)) return; - if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + + if ((inode->i_flags & S_NOATIME) || + (inode->i_sb->s_flags & MS_NOATIME) || + ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) return; - if (IS_RDONLY(inode)) + + /* + * We may have a NULL vfsmount when coming from NFSD + */ + if (mnt && + ((mnt->mnt_flags & MNT_NOATIME) || + ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))) return; now = current_fs_time(inode->i_sb); @@ -1201,19 +1213,23 @@ void update_atime(struct inode *inode) } } -EXPORT_SYMBOL(update_atime); +EXPORT_SYMBOL(touch_atime); /** - * inode_update_time - update mtime and ctime time - * @inode: inode accessed - * @ctime_too: update ctime too + * file_update_time - update mtime and ctime time + * @file: file accessed * - * Update the mtime 
time on an inode and mark it for writeback. - * When ctime_too is specified update the ctime too. + * Update the mtime and ctime members of an inode and mark the inode + * for writeback. Note that this function is meant exclusively for + * usage in the file write path of filesystems, and filesystems may + * choose to explicitly ignore update via this function with the + * S_NOCTIME inode flag, e.g. for network filesystem where these + * timestamps are handled by the server. */ -void inode_update_time(struct inode *inode, int ctime_too) +void file_update_time(struct file *file) { + struct inode *inode = file->f_dentry->d_inode; struct timespec now; int sync_it = 0; @@ -1227,16 +1243,15 @@ void inode_update_time(struct inode *inode, int ctime_too) sync_it = 1; inode->i_mtime = now; - if (ctime_too) { - if (!timespec_equal(&inode->i_ctime, &now)) - sync_it = 1; - inode->i_ctime = now; - } + if (!timespec_equal(&inode->i_ctime, &now)) + sync_it = 1; + inode->i_ctime = now; + if (sync_it) mark_inode_dirty_sync(inode); } -EXPORT_SYMBOL(inode_update_time); +EXPORT_SYMBOL(file_update_time); int inode_needs_sync(struct inode *inode) { diff --git a/fs/ioctl.c b/fs/ioctl.c index 569209181425..f8aeec3ca10c 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -8,6 +8,7 @@ #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/smp_lock.h> +#include <linux/capability.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/security.h> diff --git a/fs/ioprio.c b/fs/ioprio.c index 4bf1c6365a19..ca77008146c0 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -22,6 +22,7 @@ #include <linux/kernel.h> #include <linux/ioprio.h> #include <linux/blkdev.h> +#include <linux/capability.h> #include <linux/syscalls.h> static int set_task_ioprio(struct task_struct *task, int ioprio) diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 014a51fd00d7..cb3cef525c3b 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -24,29 +24,75 @@ #include <linux/slab.h> /* - * Unlink a 
buffer from a transaction. + * Unlink a buffer from a transaction checkpoint list. * * Called with j_list_lock held. */ -static inline void __buffer_unlink(struct journal_head *jh) +static void __buffer_unlink_first(struct journal_head *jh) { transaction_t *transaction; transaction = jh->b_cp_transaction; - jh->b_cp_transaction = NULL; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) + if (transaction->t_checkpoint_list == jh) { transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } +} + +/* + * Unlink a buffer from a transaction checkpoint(io) list. + * + * Called with j_list_lock held. + */ + +static inline void __buffer_unlink(struct journal_head *jh) +{ + transaction_t *transaction; + + transaction = jh->b_cp_transaction; + + __buffer_unlink_first(jh); + if (transaction->t_checkpoint_io_list == jh) { + transaction->t_checkpoint_io_list = jh->b_cpnext; + if (transaction->t_checkpoint_io_list == jh) + transaction->t_checkpoint_io_list = NULL; + } +} + +/* + * Move a buffer from the checkpoint list to the checkpoint io list + * + * Called with j_list_lock held + */ + +static inline void __buffer_relink_io(struct journal_head *jh) +{ + transaction_t *transaction; + + transaction = jh->b_cp_transaction; + __buffer_unlink_first(jh); + + if (!transaction->t_checkpoint_io_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_io_list; + jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_io_list = jh; } /* * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it. + * Returns 1 if we released it and 2 if we also released the + * whole transaction. 
+ * * Requires j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ @@ -57,12 +103,11 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); - __journal_remove_checkpoint(jh); + ret = __journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); - ret = 1; } else { jbd_unlock_bh_state(bh); } @@ -117,83 +162,53 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) } /* - * Clean up a transaction's checkpoint list. - * - * We wait for any pending IO to complete and make sure any clean - * buffers are removed from the transaction. - * - * Return 1 if we performed any actions which might have destroyed the - * checkpoint. (journal_remove_checkpoint() deletes the transaction when - * the last checkpoint buffer is cleansed) + * Clean up transaction's list of buffers submitted for io. + * We wait for any pending IO to complete and remove any clean + * buffers. Note that we take the buffers in the opposite ordering + * from the one in which they were submitted for IO. * * Called with j_list_lock held. 
*/ -static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) + +static void __wait_cp_io(journal_t *journal, transaction_t *transaction) { - struct journal_head *jh, *next_jh, *last_jh; + struct journal_head *jh; struct buffer_head *bh; - int ret = 0; - - assert_spin_locked(&journal->j_list_lock); - jh = transaction->t_checkpoint_list; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - next_jh = jh; - do { - jh = next_jh; + tid_t this_tid; + int released = 0; + + this_tid = transaction->t_tid; +restart: + /* Didn't somebody clean up the transaction in the meanwhile */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + return; + while (!released && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); + goto restart; + } if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); - goto out_return_1; - } - - /* - * This is foul - */ - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - goto out_return_1; + spin_lock(&journal->j_list_lock); + goto restart; } - - if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; - - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - goto out_return_1; - } - /* - * AKPM: I think the buffer_jbddirty test is redundant - it - * shouldn't have NULL b_transaction? 
+ * Now in whatever state the buffer currently is, we know that + * it has been written out and so we can drop it from the list */ - next_jh = jh->b_cpnext; - if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - __brelse(bh); - ret = 1; - } else { - jbd_unlock_bh_state(bh); - } - } while (jh != last_jh); - - return ret; -out_return_1: - spin_lock(&journal->j_list_lock); - return 1; + released = __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + } } #define NR_BATCH 64 @@ -203,9 +218,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; - spin_unlock(&journal->j_list_lock); ll_rw_block(SWRITE, *batch_count, bhs); - spin_lock(&journal->j_list_lock); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); @@ -221,19 +234,46 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Return 1 if something happened which requires us to abort the current * scan of the checkpoint list. * - * Called with j_list_lock held. 
+ * Called with j_list_lock held and drops it if 1 is returned * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ -static int __flush_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count, - int *drop_count) +static int __process_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count) { struct buffer_head *bh = jh2bh(jh); int ret = 0; - if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { - J_ASSERT_JH(jh, jh->b_transaction == NULL); + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + put_bh(bh); + ret = 1; + } + else if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + ret = 1; + } + else if (!buffer_dirty(bh)) { + J_ASSERT_JH(jh, !buffer_jbddirty(bh)); + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + put_bh(bh); + ret = 1; + } + else { /* * Important: we are about to write the buffer, and * possibly block, while still holding the journal lock. @@ -246,45 +286,30 @@ static int __flush_buffer(journal_t *journal, struct journal_head *jh, J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); bhs[*batch_count] = bh; + __buffer_relink_io(jh); jbd_unlock_bh_state(bh); (*batch_count)++; if (*batch_count == NR_BATCH) { + spin_unlock(&journal->j_list_lock); __flush_batch(journal, bhs, batch_count); ret = 1; } - } else { - int last_buffer = 0; - if (jh->b_cpnext == jh) { - /* We may be about to drop the transaction. Tell the - * caller that the lists have changed. 
- */ - last_buffer = 1; - } - if (__try_to_free_cp_buf(jh)) { - (*drop_count)++; - ret = last_buffer; - } } return ret; } /* - * Perform an actual checkpoint. We don't write out only enough to - * satisfy the current blocked requests: rather we submit a reasonably - * sized chunk of the outstanding data to disk at once for - * efficiency. __log_wait_for_space() will retry if we didn't free enough. + * Perform an actual checkpoint. We take the first transaction on the + * list of transactions to be checkpointed and send all its buffers + * to disk. We submit larger chunks of data at once. * - * However, we _do_ take into account the amount requested so that once - * the IO has been queued, we can return as soon as enough of it has - * completed to disk. - * * The journal should be locked before calling this function. */ int log_do_checkpoint(journal_t *journal) { + transaction_t *transaction; + tid_t this_tid; int result; - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; jbd_debug(1, "Start checkpoint\n"); @@ -299,79 +324,70 @@ int log_do_checkpoint(journal_t *journal) return result; /* - * OK, we need to start writing disk blocks. Try to free up a - * quarter of the log in a single checkpoint if we can. + * OK, we need to start writing disk blocks. Take one transaction + * and write it. */ + spin_lock(&journal->j_list_lock); + if (!journal->j_checkpoint_transactions) + goto out; + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; +restart: /* - * AKPM: check this code. I had a feeling a while back that it - * degenerates into a busy loop at unmount time. + * If someone cleaned up this transaction while we slept, we're + * done (maybe it's a new transaction, but it fell at the same + * address). 
*/ - spin_lock(&journal->j_list_lock); - while (journal->j_checkpoint_transactions) { - transaction_t *transaction; - struct journal_head *jh, *last_jh, *next_jh; - int drop_count = 0; - int cleanup_ret, retry = 0; - tid_t this_tid; - - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; - jh = transaction->t_checkpoint_list; - last_jh = jh->b_cpprev; - next_jh = jh; - do { + if (journal->j_checkpoint_transactions == transaction || + transaction->t_tid == this_tid) { + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; + struct journal_head *jh; + int retry = 0; + + while (!retry && transaction->t_checkpoint_list) { struct buffer_head *bh; - jh = next_jh; - next_jh = jh->b_cpnext; + jh = transaction->t_checkpoint_list; bh = jh2bh(jh); if (!jbd_trylock_bh_state(bh)) { jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); retry = 1; break; } - retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); - if (cond_resched_lock(&journal->j_list_lock)) { + retry = __process_buffer(journal, jh, bhs, + &batch_count); + if (!retry && + lock_need_resched(&journal->j_list_lock)) { + spin_unlock(&journal->j_list_lock); retry = 1; break; } - } while (jh != last_jh && !retry); + } if (batch_count) { + if (!retry) { + spin_unlock(&journal->j_list_lock); + retry = 1; + } __flush_batch(journal, bhs, &batch_count); - retry = 1; } + if (retry) { + spin_lock(&journal->j_list_lock); + goto restart; + } /* - * If someone cleaned up this transaction while we slept, we're - * done - */ - if (journal->j_checkpoint_transactions != transaction) - break; - if (retry) - continue; - /* - * Maybe it's a new transaction, but it fell at the same - * address - */ - if (transaction->t_tid != this_tid) - continue; - /* - * We have walked the whole transaction list without - * finding anything to write to disk. We had better be - * able to make some progress or we are in trouble. 
+ * Now we have cleaned up the first transaction's checkpoint + * list. Let's clean up the second one. */ - cleanup_ret = __cleanup_transaction(journal, transaction); - J_ASSERT(drop_count != 0 || cleanup_ret != 0); - if (journal->j_checkpoint_transactions != transaction) - break; + __wait_cp_io(journal, transaction); } +out: spin_unlock(&journal->j_list_lock); result = cleanup_journal_tail(journal); if (result < 0) return result; - return 0; } @@ -456,52 +472,91 @@ int cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ /* + * journal_clean_one_cp_list + * + * Find all the written-back checkpoint buffers in the given list and release them. + * + * Called with the journal locked. + * Called with j_list_lock held. + * Returns number of bufers reaped (for debug) + */ + +static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +{ + struct journal_head *last_jh; + struct journal_head *next_jh = jh; + int ret, freed = 0; + + *released = 0; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranking */ + if (jbd_trylock_bh_state(jh2bh(jh))) { + ret = __try_to_free_cp_buf(jh); + if (ret) { + freed++; + if (ret == 2) { + *released = 1; + return freed; + } + } + } + /* + * This function only frees up some memory if possible so we + * dont have an obligation to finish processing. Bail out if + * preemption requested: + */ + if (need_resched()) + return freed; + } while (jh != last_jh); + + return freed; +} + +/* * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. * * Called with the journal locked. * Called with j_list_lock held. 
- * Returns number of bufers reaped (for debug) + * Returns number of buffers reaped (for debug) */ int __journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0; + int ret = 0, released; transaction = journal->j_checkpoint_transactions; - if (transaction == 0) + if (!transaction) goto out; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { - struct journal_head *jh; - transaction = next_transaction; next_transaction = transaction->t_cpnext; - jh = transaction->t_checkpoint_list; - if (jh) { - struct journal_head *last_jh = jh->b_cpprev; - struct journal_head *next_jh = jh; - - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranknig */ - if (jbd_trylock_bh_state(jh2bh(jh))) - ret += __try_to_free_cp_buf(jh); - /* - * This function only frees up some memory - * if possible so we dont have an obligation - * to finish processing. Bail out if preemption - * requested: - */ - if (need_resched()) - goto out; - } while (jh != last_jh); - } + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_list, &released); + if (need_resched()) + goto out; + if (released) + continue; + /* + * It is essential that we are as careful as in the case of + * t_checkpoint_list with removing the buffer from the list as + * we can possibly see not yet submitted buffers on io_list + */ + ret += journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list, &released); + if (need_resched()) + goto out; } while (transaction != last_transaction); out: return ret; @@ -516,18 +571,22 @@ out: * buffer updates committed in that transaction have safely been stored * elsewhere on disk. 
To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint - * list until they have been rewritten, at which point this function is + * lists until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's - * checkpoint list. + * checkpoint lists. + * + * The function returns 1 if it frees the transaction, 0 otherwise. * * This function is called with the journal locked. * This function is called with j_list_lock held. + * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ -void __journal_remove_checkpoint(struct journal_head *jh) +int __journal_remove_checkpoint(struct journal_head *jh) { transaction_t *transaction; journal_t *journal; + int ret = 0; JBUFFER_TRACE(jh, "entry"); @@ -538,8 +597,10 @@ void __journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal; __buffer_unlink(jh); + jh->b_cp_transaction = NULL; - if (transaction->t_checkpoint_list != NULL) + if (transaction->t_checkpoint_list != NULL || + transaction->t_checkpoint_io_list != NULL) goto out; JBUFFER_TRACE(jh, "transaction has no more buffers"); @@ -565,8 +626,10 @@ void __journal_remove_checkpoint(struct journal_head *jh) /* Just in case anybody was waiting for more transactions to be checkpointed... 
*/ wake_up(&journal->j_wait_logspace); + ret = 1; out: JBUFFER_TRACE(jh, "exit"); + return ret; } /* @@ -628,6 +691,7 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); + J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(transaction->t_updates == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 3dcc6d2162cb..fc3855a1aef3 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -757,7 +757,7 @@ jffs_do_readpage_nolock(struct file *file, struct page *page) read_len = 0; result = 0; - offset = page->index << PAGE_CACHE_SHIFT; + offset = page_offset(page); kmap(page); buf = page_address(page); @@ -1415,7 +1415,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count, * This will never trigger with sane page sizes. leave it in * anyway, since I'm thinking about how to merge larger writes * (the current idea is to poke a thread that does the actual - * I/O and starts by doing a down(&inode->i_sem). then we + * I/O and starts by doing a mutex_lock(&inode->i_mutex). then we * would need to get the page cache pages and have a list of * I/O requests and do write-merging here. 
* -- prumpf @@ -1545,7 +1545,7 @@ jffs_commit_write(struct file *filp, struct page *page, { void *addr = page_address(page) + from; /* XXX: PAGE_CACHE_SHIFT or PAGE_SHIFT */ - loff_t pos = (page->index<<PAGE_CACHE_SHIFT) + from; + loff_t pos = page_offset(page) + from; return jffs_file_write(filp, addr, to-from, &pos); } /* jffs_commit_write() */ diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index d0fcc5f3497e..09e5d10b8840 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -11,6 +11,7 @@ * */ +#include <linux/capability.h> #include <linux/config.h> #include <linux/kernel.h> #include <linux/sched.h> diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 68000a50ceb6..2967b7393415 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -302,8 +302,7 @@ int dbSync(struct inode *ipbmap) /* * write out dirty pages of bmap */ - filemap_fdatawrite(ipbmap->i_mapping); - filemap_fdatawait(ipbmap->i_mapping); + filemap_write_and_wait(ipbmap->i_mapping); diWriteSpecial(ipbmap, 0); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 28201b194f53..31b4aa13dd4b 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -265,8 +265,7 @@ int diSync(struct inode *ipimap) /* * write out dirty pages of imap */ - filemap_fdatawrite(ipimap->i_mapping); - filemap_fdatawait(ipimap->i_mapping); + filemap_write_and_wait(ipimap->i_mapping); diWriteSpecial(ipimap, 0); @@ -565,8 +564,7 @@ void diFreeSpecial(struct inode *ip) jfs_err("diFreeSpecial called with NULL ip!"); return; } - filemap_fdatawrite(ip->i_mapping); - filemap_fdatawait(ip->i_mapping); + filemap_write_and_wait(ip->i_mapping); truncate_inode_pages(ip->i_mapping, 0); iput(ip); } diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index c0fd7b3eadc6..dc21a5bd54d4 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -58,7 +58,7 @@ struct jfs_inode_info { /* * rdwrlock serializes xtree between reads & writes and synchronizes * changes to special inodes. 
It's use would be redundant on - * directories since the i_sem taken in the VFS is sufficient. + * directories since the i_mutex taken in the VFS is sufficient. */ struct rw_semaphore rdwrlock; /* @@ -68,7 +68,7 @@ struct jfs_inode_info { * inode is blocked in txBegin or TxBeginAnon */ struct semaphore commit_sem; - /* xattr_sem allows us to access the xattrs without taking i_sem */ + /* xattr_sem allows us to access the xattrs without taking i_mutex */ struct rw_semaphore xattr_sem; lid_t xtlid; /* lid of xtree lock on directory */ #ifdef CONFIG_JFS_POSIX_ACL diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index b660c93c92de..2ddb6b892bcf 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -1231,10 +1231,8 @@ int txCommit(tid_t tid, /* transaction identifier */ * when we don't need to worry about it at all. * * if ((!S_ISDIR(ip->i_mode)) - * && (tblk->flag & COMMIT_DELETE) == 0) { - * filemap_fdatawrite(ip->i_mapping); - * filemap_fdatawait(ip->i_mapping); - * } + * && (tblk->flag & COMMIT_DELETE) == 0) + * filemap_write_and_wait(ip->i_mapping); */ /* diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c index 5cf91785b541..21eaf7ac0fcb 100644 --- a/fs/jfs/jfs_umount.c +++ b/fs/jfs/jfs_umount.c @@ -108,8 +108,7 @@ int jfs_umount(struct super_block *sb) * Make sure all metadata makes it to disk before we mark * the superblock as clean */ - filemap_fdatawrite(sbi->direct_inode->i_mapping); - filemap_fdatawait(sbi->direct_inode->i_mapping); + filemap_write_and_wait(sbi->direct_inode->i_mapping); /* * ensure all file system file pages are propagated to their @@ -161,8 +160,7 @@ int jfs_umount_rw(struct super_block *sb) * mark the superblock clean before everything is flushed to * disk. 
*/ - filemap_fdatawrite(sbi->direct_inode->i_mapping); - filemap_fdatawait(sbi->direct_inode->i_mapping); + filemap_write_and_wait(sbi->direct_inode->i_mapping); updateSuper(sb, FM_CLEAN); diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index c6dc254d3253..45180361871c 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -376,8 +376,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * by txCommit(); */ filemap_fdatawait(ipbmap->i_mapping); - filemap_fdatawrite(ipbmap->i_mapping); - filemap_fdatawait(ipbmap->i_mapping); + filemap_write_and_wait(ipbmap->i_mapping); diWriteSpecial(ipbmap, 0); newPage = nPages; /* first new page number */ diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4226af3ea91b..8d31f1336431 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -502,8 +502,7 @@ out_no_rw: jfs_err("jfs_umount failed with return code %d", rc); } out_mount_failed: - filemap_fdatawrite(sbi->direct_inode->i_mapping); - filemap_fdatawait(sbi->direct_inode->i_mapping); + filemap_write_and_wait(sbi->direct_inode->i_mapping); truncate_inode_pages(sbi->direct_inode->i_mapping, 0); make_bad_inode(sbi->direct_inode); iput(sbi->direct_inode); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 23aa5066b5a4..f23048f9471f 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -17,6 +17,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/capability.h> #include <linux/fs.h> #include <linux/xattr.h> #include <linux/posix_acl_xattr.h> @@ -83,21 +84,6 @@ struct ea_buffer { #define EA_NEW 0x0004 #define EA_MALLOC 0x0008 -/* Namespaces */ -#define XATTR_SYSTEM_PREFIX "system." -#define XATTR_SYSTEM_PREFIX_LEN (sizeof (XATTR_SYSTEM_PREFIX) - 1) - -#define XATTR_USER_PREFIX "user." -#define XATTR_USER_PREFIX_LEN (sizeof (XATTR_USER_PREFIX) - 1) - -#define XATTR_OS2_PREFIX "os2." 
-#define XATTR_OS2_PREFIX_LEN (sizeof (XATTR_OS2_PREFIX) - 1) - -/* XATTR_SECURITY_PREFIX is defined in include/linux/xattr.h */ -#define XATTR_SECURITY_PREFIX_LEN (sizeof (XATTR_SECURITY_PREFIX) - 1) - -#define XATTR_TRUSTED_PREFIX "trusted." -#define XATTR_TRUSTED_PREFIX_LEN (sizeof (XATTR_TRUSTED_PREFIX) - 1) /* * These three routines are used to recognize on-disk extended attributes @@ -773,36 +759,23 @@ static int can_set_system_xattr(struct inode *inode, const char *name, static int can_set_xattr(struct inode *inode, const char *name, const void *value, size_t value_len) { - if (IS_RDONLY(inode)) - return -EROFS; - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - - if(strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0) - /* - * "system.*" - */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return can_set_system_xattr(inode, name, value, value_len); - if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) - return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); - -#ifdef CONFIG_JFS_SECURITY - if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) - == 0) - return 0; /* Leave it to the security module */ -#endif - - if((strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) != 0) && - (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) != 0)) + /* + * Don't allow setting an attribute in an unknown namespace. 
+ */ + if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) return -EOPNOTSUPP; if (!S_ISREG(inode->i_mode) && (!S_ISDIR(inode->i_mode) || inode->i_mode &S_ISVTX)) return -EPERM; - return permission(inode, MAY_WRITE, NULL); + return 0; } int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, @@ -972,22 +945,6 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, return rc; } -static int can_get_xattr(struct inode *inode, const char *name) -{ -#ifdef CONFIG_JFS_SECURITY - if(strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) - return 0; -#endif - - if(strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) - return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); - - if(strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0) - return 0; - - return permission(inode, MAY_READ, NULL); -} - ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, size_t buf_size) { @@ -998,12 +955,8 @@ ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, ssize_t size; int namelen = strlen(name); char *os2name = NULL; - int rc; char *value; - if ((rc = can_get_xattr(inode, name))) - return rc; - if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { os2name = kmalloc(namelen - XATTR_OS2_PREFIX_LEN + 1, GFP_KERNEL); diff --git a/fs/libfs.c b/fs/libfs.c index 58101dff2c66..63c020e6589e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -74,7 +74,7 @@ int dcache_dir_close(struct inode *inode, struct file *file) loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) { - down(&file->f_dentry->d_inode->i_sem); + mutex_lock(&file->f_dentry->d_inode->i_mutex); switch (origin) { case 1: offset += file->f_pos; @@ -82,7 +82,7 @@ loff_t dcache_dir_lseek(struct file *file, 
loff_t offset, int origin) if (offset >= 0) break; default: - up(&file->f_dentry->d_inode->i_sem); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -93,20 +93,20 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int origin) loff_t n = file->f_pos - 2; spin_lock(&dcache_lock); - list_del(&cursor->d_child); + list_del(&cursor->d_u.d_child); p = file->f_dentry->d_subdirs.next; while (n && p != &file->f_dentry->d_subdirs) { struct dentry *next; - next = list_entry(p, struct dentry, d_child); + next = list_entry(p, struct dentry, d_u.d_child); if (!d_unhashed(next) && next->d_inode) n--; p = p->next; } - list_add_tail(&cursor->d_child, p); + list_add_tail(&cursor->d_u.d_child, p); spin_unlock(&dcache_lock); } } - up(&file->f_dentry->d_inode->i_sem); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); return offset; } @@ -126,7 +126,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct dentry *dentry = filp->f_dentry; struct dentry *cursor = filp->private_data; - struct list_head *p, *q = &cursor->d_child; + struct list_head *p, *q = &cursor->d_u.d_child; ino_t ino; int i = filp->f_pos; @@ -153,7 +153,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) } for (p=q->next; p != &dentry->d_subdirs; p=p->next) { struct dentry *next; - next = list_entry(p, struct dentry, d_child); + next = list_entry(p, struct dentry, d_u.d_child); if (d_unhashed(next) || !next->d_inode) continue; @@ -261,7 +261,7 @@ int simple_empty(struct dentry *dentry) int ret = 0; spin_lock(&dcache_lock); - list_for_each_entry(child, &dentry->d_subdirs, d_child) + list_for_each_entry(child, &dentry->d_subdirs, d_u.d_child) if (simple_positive(child)) goto out; ret = 1; @@ -356,7 +356,7 @@ int simple_commit_write(struct file *file, struct page *page, /* * No need to use i_size_read() here, the i_size - * cannot change under us because we hold the i_sem. 
+ * cannot change under us because we hold the i_mutex. */ if (pos > inode->i_size) i_size_write(inode, pos); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index c5a33648e9fd..145524039577 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -26,11 +26,12 @@ static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); -static void nlmclnt_unlock_callback(struct rpc_task *); -static void nlmclnt_cancel_callback(struct rpc_task *); static int nlm_stat_to_errno(u32 stat); static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); +static const struct rpc_call_ops nlmclnt_unlock_ops; +static const struct rpc_call_ops nlmclnt_cancel_ops; + /* * Cookie counter for NLM requests */ @@ -221,8 +222,7 @@ nlmclnt_proc(struct inode *inode, int cmd, struct file_lock *fl) goto done; } clnt->cl_softrtry = nfssrv->client->cl_softrtry; - clnt->cl_intr = nfssrv->client->cl_intr; - clnt->cl_chatty = nfssrv->client->cl_chatty; + clnt->cl_intr = nfssrv->client->cl_intr; } /* Keep the old signal mask */ @@ -399,8 +399,7 @@ in_grace_period: /* * Generic NLM call, async version. 
*/ -int -nlmsvc_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback) +int nlmsvc_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) { struct nlm_host *host = req->a_host; struct rpc_clnt *clnt; @@ -419,13 +418,12 @@ nlmsvc_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback) msg.rpc_proc = &clnt->cl_procinfo[proc]; /* bootstrap and kick off the async RPC call */ - status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, callback, req); + status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req); return status; } -static int -nlmclnt_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback) +static int nlmclnt_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) { struct nlm_host *host = req->a_host; struct rpc_clnt *clnt; @@ -448,7 +446,7 @@ nlmclnt_async_call(struct nlm_rqst *req, u32 proc, rpc_action callback) /* Increment host refcount */ nlm_get_host(host); /* bootstrap and kick off the async RPC call */ - status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, callback, req); + status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, tk_ops, req); if (status < 0) nlm_release_host(host); return status; @@ -664,7 +662,7 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) if (req->a_flags & RPC_TASK_ASYNC) { status = nlmclnt_async_call(req, NLMPROC_UNLOCK, - nlmclnt_unlock_callback); + &nlmclnt_unlock_ops); /* Hrmf... 
Do the unlock early since locks_remove_posix() * really expects us to free the lock synchronously */ do_vfs_lock(fl); @@ -692,10 +690,9 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) return -ENOLCK; } -static void -nlmclnt_unlock_callback(struct rpc_task *task) +static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) { - struct nlm_rqst *req = (struct nlm_rqst *) task->tk_calldata; + struct nlm_rqst *req = data; int status = req->a_res.status; if (RPC_ASSASSINATED(task)) @@ -722,6 +719,10 @@ die: rpc_restart_call(task); } +static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_done = nlmclnt_unlock_callback, +}; + /* * Cancel a blocked lock request. * We always use an async RPC call for this in order not to hang a @@ -750,8 +751,7 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl) nlmclnt_setlockargs(req, fl); - status = nlmclnt_async_call(req, NLMPROC_CANCEL, - nlmclnt_cancel_callback); + status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status < 0) { nlmclnt_release_lockargs(req); kfree(req); @@ -765,10 +765,9 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl) return status; } -static void -nlmclnt_cancel_callback(struct rpc_task *task) +static void nlmclnt_cancel_callback(struct rpc_task *task, void *data) { - struct nlm_rqst *req = (struct nlm_rqst *) task->tk_calldata; + struct nlm_rqst *req = data; if (RPC_ASSASSINATED(task)) goto die; @@ -807,6 +806,10 @@ retry_cancel: rpc_delay(task, 30 * HZ); } +static const struct rpc_call_ops nlmclnt_cancel_ops = { + .rpc_call_done = nlmclnt_cancel_callback, +}; + /* * Convert an NLM status code to a generic kernel errno */ diff --git a/fs/lockd/host.c b/fs/lockd/host.c index c4c8601096e0..82f7a0b1d8ae 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -177,7 +177,7 @@ nlm_bind_host(struct nlm_host *host) if ((clnt = host->h_rpcclnt) != NULL) { xprt = clnt->cl_xprt; if (time_after_eq(jiffies, host->h_nextrebind)) { - 
clnt->cl_port = 0; + rpc_force_rebind(clnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; dprintk("lockd: next rebind in %ld jiffies\n", host->h_nextrebind - jiffies); @@ -217,7 +217,7 @@ nlm_rebind_host(struct nlm_host *host) { dprintk("lockd: rebind host %s\n", host->h_name); if (host->h_rpcclnt && time_after_eq(jiffies, host->h_nextrebind)) { - host->h_rpcclnt->cl_port = 0; + rpc_force_rebind(host->h_rpcclnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; } } diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 2d144abe84ad..0edc03e67966 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -123,7 +123,6 @@ nsm_create(void) if (IS_ERR(clnt)) goto out_err; clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; clnt->cl_oneshot = 1; return clnt; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 12a857c29e25..71a30b416d1a 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -178,6 +178,8 @@ lockd(struct svc_rqst *rqstp) } + flush_signals(current); + /* * Check whether there's a new lockd process before * shutting down the hosts and clearing the slot. 
@@ -192,8 +194,6 @@ lockd(struct svc_rqst *rqstp) "lockd: new process, skipping host shutdown\n"); wake_up(&lockd_exit); - flush_signals(current); - /* Exit the RPC thread */ svc_exit_thread(rqstp); diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 489670e21769..4063095d849e 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -22,7 +22,8 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT static u32 nlm4svc_callback(struct svc_rqst *, u32, struct nlm_res *); -static void nlm4svc_callback_exit(struct rpc_task *); + +static const struct rpc_call_ops nlm4svc_callback_ops; /* * Obtain client and file from arguments @@ -470,7 +471,6 @@ nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, } - /* * This is the generic lockd callback for async RPC calls */ @@ -494,7 +494,7 @@ nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp) call->a_host = host; memcpy(&call->a_args, resp, sizeof(*resp)); - if (nlmsvc_async_call(call, proc, nlm4svc_callback_exit) < 0) + if (nlmsvc_async_call(call, proc, &nlm4svc_callback_ops) < 0) goto error; return rpc_success; @@ -504,10 +504,9 @@ nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp) return rpc_system_err; } -static void -nlm4svc_callback_exit(struct rpc_task *task) +static void nlm4svc_callback_exit(struct rpc_task *task, void *data) { - struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; + struct nlm_rqst *call = data; if (task->tk_status < 0) { dprintk("lockd: %4d callback failed (errno = %d)\n", @@ -517,6 +516,10 @@ nlm4svc_callback_exit(struct rpc_task *task) kfree(call); } +static const struct rpc_call_ops nlm4svc_callback_ops = { + .rpc_call_done = nlm4svc_callback_exit, +}; + /* * NLM Server procedures. 
*/ diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 49f959796b66..9cfced65d4a2 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -41,7 +41,8 @@ static void nlmsvc_insert_block(struct nlm_block *block, unsigned long); static int nlmsvc_remove_block(struct nlm_block *block); -static void nlmsvc_grant_callback(struct rpc_task *task); + +static const struct rpc_call_ops nlmsvc_grant_ops; /* * The list of blocked locks to retry @@ -226,31 +227,27 @@ failed: * It is the caller's responsibility to check whether the file * can be closed hereafter. */ -static void +static int nlmsvc_delete_block(struct nlm_block *block, int unlock) { struct file_lock *fl = &block->b_call.a_args.lock.fl; struct nlm_file *file = block->b_file; struct nlm_block **bp; + int status = 0; dprintk("lockd: deleting block %p...\n", block); /* Remove block from list */ nlmsvc_remove_block(block); - if (fl->fl_next) - posix_unblock_lock(file->f_file, fl); - if (unlock) { - fl->fl_type = F_UNLCK; - posix_lock_file(file->f_file, fl); - block->b_granted = 0; - } + if (unlock) + status = posix_unblock_lock(file->f_file, fl); /* If the block is in the middle of a GRANT callback, * don't kill it yet. */ if (block->b_incall) { nlmsvc_insert_block(block, NLM_NEVER); block->b_done = 1; - return; + return status; } /* Remove block from file's list of blocks */ @@ -265,6 +262,7 @@ nlmsvc_delete_block(struct nlm_block *block, int unlock) nlm_release_host(block->b_host); nlmclnt_freegrantargs(&block->b_call); kfree(block); + return status; } /* @@ -275,6 +273,7 @@ int nlmsvc_traverse_blocks(struct nlm_host *host, struct nlm_file *file, int action) { struct nlm_block *block, *next; + /* XXX: Will everything get cleaned up if we don't unlock here? 
*/ down(&file->f_sema); for (block = file->f_blocks; block; block = next) { @@ -444,6 +443,7 @@ u32 nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) { struct nlm_block *block; + int status = 0; dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", file->f_file->f_dentry->d_inode->i_sb->s_id, @@ -454,9 +454,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) down(&file->f_sema); if ((block = nlmsvc_lookup_block(file, lock, 1)) != NULL) - nlmsvc_delete_block(block, 1); + status = nlmsvc_delete_block(block, 1); up(&file->f_sema); - return nlm_granted; + return status ? nlm_lck_denied : nlm_granted; } /* @@ -562,7 +562,7 @@ callback: /* Call the client */ nlm_get_host(block->b_call.a_host); if (nlmsvc_async_call(&block->b_call, NLMPROC_GRANTED_MSG, - nlmsvc_grant_callback) < 0) + &nlmsvc_grant_ops) < 0) nlm_release_host(block->b_call.a_host); up(&file->f_sema); } @@ -575,10 +575,9 @@ callback: * chain once more in order to have it removed by lockd itself (which can * then sleep on the file semaphore without disrupting e.g. the nfs client). */ -static void -nlmsvc_grant_callback(struct rpc_task *task) +static void nlmsvc_grant_callback(struct rpc_task *task, void *data) { - struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; + struct nlm_rqst *call = data; struct nlm_block *block; unsigned long timeout; struct sockaddr_in *peer_addr = RPC_PEERADDR(task->tk_client); @@ -614,6 +613,10 @@ nlmsvc_grant_callback(struct rpc_task *task) nlm_release_host(call->a_host); } +static const struct rpc_call_ops nlmsvc_grant_ops = { + .rpc_call_done = nlmsvc_grant_callback, +}; + /* * We received a GRANT_RES callback. Try to find the corresponding * block. 
@@ -633,11 +636,12 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status file->f_count++; down(&file->f_sema); - if ((block = nlmsvc_find_block(cookie,&rqstp->rq_addr)) != NULL) { + block = nlmsvc_find_block(cookie, &rqstp->rq_addr); + if (block) { if (status == NLM_LCK_DENIED_GRACE_PERIOD) { /* Try again in a couple of seconds */ nlmsvc_insert_block(block, 10 * HZ); - block = NULL; + up(&file->f_sema); } else { /* Lock is now held by client, or has been rejected. * In both cases, the block should be removed. */ @@ -648,8 +652,6 @@ nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status nlmsvc_delete_block(block, 1); } } - if (!block) - up(&file->f_sema); nlm_release_file(file); } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 757e344cf200..3bc437e0cf5b 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -23,7 +23,8 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT static u32 nlmsvc_callback(struct svc_rqst *, u32, struct nlm_res *); -static void nlmsvc_callback_exit(struct rpc_task *); + +static const struct rpc_call_ops nlmsvc_callback_ops; #ifdef CONFIG_LOCKD_V4 static u32 @@ -518,7 +519,7 @@ nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp) call->a_host = host; memcpy(&call->a_args, resp, sizeof(*resp)); - if (nlmsvc_async_call(call, proc, nlmsvc_callback_exit) < 0) + if (nlmsvc_async_call(call, proc, &nlmsvc_callback_ops) < 0) goto error; return rpc_success; @@ -528,10 +529,9 @@ nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_res *resp) return rpc_system_err; } -static void -nlmsvc_callback_exit(struct rpc_task *task) +static void nlmsvc_callback_exit(struct rpc_task *task, void *data) { - struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; + struct nlm_rqst *call = data; if (task->tk_status < 0) { dprintk("lockd: %4d callback failed (errno = %d)\n", @@ -541,6 +541,10 @@ nlmsvc_callback_exit(struct rpc_task *task) kfree(call); } +static const struct 
rpc_call_ops nlmsvc_callback_ops = { + .rpc_call_done = nlmsvc_callback_exit, +}; + /* * NLM Server procedures. */ diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index ae4d6b426c62..fdcf105a5303 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -354,7 +354,9 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, u32 *p, struct nlm_reboot *argp) return 0; argp->state = ntohl(*p++); /* Preserve the address in network byte order */ - argp->addr = *p++; + argp->addr = *p++; + argp->vers = *p++; + argp->proto = *p++; return xdr_argsize_check(rqstp, p); } diff --git a/fs/locks.c b/fs/locks.c index 250ef53d25ef..909eab8fb1d0 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -154,7 +154,7 @@ static struct file_lock *locks_alloc_lock(void) } /* Free a lock which is not in use. */ -static inline void locks_free_lock(struct file_lock *fl) +static void locks_free_lock(struct file_lock *fl) { if (fl == NULL) { BUG(); @@ -475,8 +475,7 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2) /* * Check whether two locks have the same owner. */ -static inline int -posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) +static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) { if (fl1->fl_lmops && fl1->fl_lmops->fl_compare_owner) return fl2->fl_lmops == fl1->fl_lmops && @@ -487,7 +486,7 @@ posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) /* Remove waiter from blocker's block list. * When blocker ends up pointing to itself then the list is empty. */ -static inline void __locks_delete_block(struct file_lock *waiter) +static void __locks_delete_block(struct file_lock *waiter) { list_del_init(&waiter->fl_block); list_del_init(&waiter->fl_link); @@ -1958,22 +1957,18 @@ EXPORT_SYMBOL(posix_block_lock); * * lockd needs to block waiting for locks. */ -void +int posix_unblock_lock(struct file *filp, struct file_lock *waiter) { - /* - * A remote machine may cancel the lock request after it's been - * granted locally. 
If that happens, we need to delete the lock. - */ + int status = 0; + lock_kernel(); - if (waiter->fl_next) { + if (waiter->fl_next) __locks_delete_block(waiter); - unlock_kernel(); - } else { - unlock_kernel(); - waiter->fl_type = F_UNLCK; - posix_lock_file(filp, waiter); - } + else + status = -ENOENT; + unlock_kernel(); + return status; } EXPORT_SYMBOL(posix_unblock_lock); diff --git a/fs/mpage.c b/fs/mpage.c index c5adcdddf3cc..e431cb3878d6 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -184,7 +184,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, if (page_has_buffers(page)) goto confused; - block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits); + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = (i_size_read(inode) + blocksize - 1) >> blkbits; bh.b_page = page; @@ -466,7 +466,7 @@ __mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, * The page has no buffers: map it to disk */ BUG_ON(!PageUptodate(page)); - block_in_file = page->index << (PAGE_CACHE_SHIFT - blkbits); + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); last_block = (i_size - 1) >> blkbits; map_bh.b_page = page; for (page_block = 0; page_block < blocks_per_page; ) { @@ -721,7 +721,7 @@ retry: &last_block_in_bio, &ret, wbc, page->mapping->a_ops->writepage); } - if (unlikely(ret == WRITEPAGE_ACTIVATE)) + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) unlock_page(page); if (ret || (--(wbc->nr_to_write) <= 0)) done = 1; diff --git a/fs/namei.c b/fs/namei.c index 6dbbd42d8b95..1e5746eb1380 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -28,6 +28,7 @@ #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> +#include <linux/capability.h> #include <linux/file.h> #include <asm/namei.h> #include <asm/uaccess.h> @@ -438,7 +439,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s struct dentry * result; struct inode *dir = parent->d_inode; - 
down(&dir->i_sem); + mutex_lock(&dir->i_mutex); /* * First re-do the cached lookup just in case it was created * while we waited for the directory semaphore.. @@ -464,7 +465,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s else result = dentry; } - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); return result; } @@ -472,7 +473,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s * Uhhuh! Nasty case: the cache was re-populated while * we waited on the semaphore. Need to revalidate. */ - up(&dir->i_sem); + mutex_unlock(&dir->i_mutex); if (result->d_op && result->d_op->d_revalidate) { if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { dput(result); @@ -1366,7 +1367,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) struct dentry *p; if (p1 == p2) { - down(&p1->d_inode->i_sem); + mutex_lock(&p1->d_inode->i_mutex); return NULL; } @@ -1374,30 +1375,30 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) for (p = p1; p->d_parent != p; p = p->d_parent) { if (p->d_parent == p2) { - down(&p2->d_inode->i_sem); - down(&p1->d_inode->i_sem); + mutex_lock(&p2->d_inode->i_mutex); + mutex_lock(&p1->d_inode->i_mutex); return p; } } for (p = p2; p->d_parent != p; p = p->d_parent) { if (p->d_parent == p1) { - down(&p1->d_inode->i_sem); - down(&p2->d_inode->i_sem); + mutex_lock(&p1->d_inode->i_mutex); + mutex_lock(&p2->d_inode->i_mutex); return p; } } - down(&p1->d_inode->i_sem); - down(&p2->d_inode->i_sem); + mutex_lock(&p1->d_inode->i_mutex); + mutex_lock(&p2->d_inode->i_mutex); return NULL; } void unlock_rename(struct dentry *p1, struct dentry *p2) { - up(&p1->d_inode->i_sem); + mutex_unlock(&p1->d_inode->i_mutex); if (p1 != p2) { - up(&p2->d_inode->i_sem); + mutex_unlock(&p2->d_inode->i_mutex); up(&p1->d_inode->i_sb->s_vfs_rename_sem); } } @@ -1491,7 +1492,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) if (!error) { DQUOT_INIT(inode); - error 
= do_truncate(dentry, 0, NULL); + error = do_truncate(dentry, 0, ATTR_MTIME|ATTR_CTIME, NULL); } put_write_access(inode); if (error) @@ -1563,14 +1564,14 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) dir = nd->dentry; nd->flags &= ~LOOKUP_PARENT; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); path.dentry = lookup_hash(nd); path.mnt = nd->mnt; do_last: error = PTR_ERR(path.dentry); if (IS_ERR(path.dentry)) { - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); goto exit; } @@ -1579,7 +1580,7 @@ do_last: if (!IS_POSIXACL(dir->d_inode)) mode &= ~current->fs->umask; error = vfs_create(dir->d_inode, path.dentry, mode, nd); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); dput(nd->dentry); nd->dentry = path.dentry; if (error) @@ -1593,7 +1594,7 @@ do_last: /* * It already exists. */ - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); error = -EEXIST; if (flag & O_EXCL) @@ -1665,7 +1666,7 @@ do_link: goto exit; } dir = nd->dentry; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); path.dentry = lookup_hash(nd); path.mnt = nd->mnt; __putname(nd->last.name); @@ -1680,13 +1681,13 @@ do_link: * Simple function to lookup and return a dentry and create it * if it doesn't exist. Is SMP-safe. * - * Returns with nd->dentry->d_inode->i_sem locked. + * Returns with nd->dentry->d_inode->i_mutex locked. */ struct dentry *lookup_create(struct nameidata *nd, int is_dir) { struct dentry *dentry = ERR_PTR(-EEXIST); - down(&nd->dentry->d_inode->i_sem); + mutex_lock(&nd->dentry->d_inode->i_mutex); /* * Yucky last component or no last component at all? 
* (foo/., foo/.., /////) @@ -1784,7 +1785,7 @@ asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev) } dput(dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); path_release(&nd); out: putname(tmp); @@ -1836,7 +1837,7 @@ asmlinkage long sys_mkdir(const char __user * pathname, int mode) error = vfs_mkdir(nd.dentry->d_inode, dentry, mode); dput(dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); path_release(&nd); out: putname(tmp); @@ -1885,7 +1886,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) DQUOT_INIT(dir); - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); dentry_unhash(dentry); if (d_mountpoint(dentry)) error = -EBUSY; @@ -1897,7 +1898,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) dentry->d_inode->i_flags |= S_DEAD; } } - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); if (!error) { d_delete(dentry); } @@ -1932,14 +1933,14 @@ asmlinkage long sys_rmdir(const char __user * pathname) error = -EBUSY; goto exit1; } - down(&nd.dentry->d_inode->i_sem); + mutex_lock(&nd.dentry->d_inode->i_mutex); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { error = vfs_rmdir(nd.dentry->d_inode, dentry); dput(dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); exit1: path_release(&nd); exit: @@ -1959,7 +1960,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) DQUOT_INIT(dir); - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); if (d_mountpoint(dentry)) error = -EBUSY; else { @@ -1967,7 +1968,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (!error) error = dir->i_op->unlink(dir, dentry); } - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); /* We don't d_delete() NFS sillyrenamed files--they still exist. 
*/ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { @@ -1979,7 +1980,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) /* * Make sure that the actual truncation of the file will occur outside its - * directory's i_sem. Truncate can take a long time if there is a lot of + * directory's i_mutex. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ @@ -2001,7 +2002,7 @@ asmlinkage long sys_unlink(const char __user * pathname) error = -EISDIR; if (nd.last_type != LAST_NORM) goto exit1; - down(&nd.dentry->d_inode->i_sem); + mutex_lock(&nd.dentry->d_inode->i_mutex); dentry = lookup_hash(&nd); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -2015,7 +2016,7 @@ asmlinkage long sys_unlink(const char __user * pathname) exit2: dput(dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ exit1: @@ -2075,7 +2076,7 @@ asmlinkage long sys_symlink(const char __user * oldname, const char __user * new error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO); dput(dentry); } - up(&nd.dentry->d_inode->i_sem); + mutex_unlock(&nd.dentry->d_inode->i_mutex); path_release(&nd); out: putname(to); @@ -2113,10 +2114,10 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (error) return error; - down(&old_dentry->d_inode->i_sem); + mutex_lock(&old_dentry->d_inode->i_mutex); DQUOT_INIT(dir); error = dir->i_op->link(old_dentry, dir, new_dentry); - up(&old_dentry->d_inode->i_sem); + mutex_unlock(&old_dentry->d_inode->i_mutex); if (!error) fsnotify_create(dir, new_dentry->d_name.name); return error; @@ -2157,7 +2158,7 @@ asmlinkage long sys_link(const char __user * oldname, const char __user * newnam error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); dput(new_dentry); } - up(&nd.dentry->d_inode->i_sem); + 
mutex_unlock(&nd.dentry->d_inode->i_mutex); out_release: path_release(&nd); out: @@ -2178,7 +2179,7 @@ exit: * sb->s_vfs_rename_sem. We might be more accurate, but that's another * story. * c) we have to lock _three_ objects - parents and victim (if it exists). - * And that - after we got ->i_sem on parents (until then we don't know + * And that - after we got ->i_mutex on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_sem _and_ that parent of the object we @@ -2195,9 +2196,9 @@ exit: * stuff into VFS), but the former is not going away. Solution: the same * trick as in rmdir(). * e) conversion from fhandle to dentry may come in the wrong moment - when - * we are removing the target. Solution: we will have to grab ->i_sem + * we are removing the target. Solution: we will have to grab ->i_mutex * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on - * ->i_sem on parents, which works but leads to some truely excessive + * ->i_mutex on parents, which works but leads to some truely excessive * locking]. 
*/ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, @@ -2222,7 +2223,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, target = new_dentry->d_inode; if (target) { - down(&target->i_sem); + mutex_lock(&target->i_mutex); dentry_unhash(new_dentry); } if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) @@ -2232,7 +2233,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (target) { if (!error) target->i_flags |= S_DEAD; - up(&target->i_sem); + mutex_unlock(&target->i_mutex); if (d_unhashed(new_dentry)) d_rehash(new_dentry); dput(new_dentry); @@ -2255,7 +2256,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, dget(new_dentry); target = new_dentry->d_inode; if (target) - down(&target->i_sem); + mutex_lock(&target->i_mutex); if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) error = -EBUSY; else @@ -2266,7 +2267,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, d_move(old_dentry, new_dentry); } if (target) - up(&target->i_sem); + mutex_unlock(&target->i_mutex); dput(new_dentry); return error; } diff --git a/fs/namespace.c b/fs/namespace.c index 2019899f2ab8..8bc15b362d23 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -16,6 +16,7 @@ #include <linux/init.h> #include <linux/quotaops.h> #include <linux/acct.h> +#include <linux/capability.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/namespace.h> @@ -355,14 +356,14 @@ static int show_vfsmnt(struct seq_file *m, void *v) { MS_SYNCHRONOUS, ",sync" }, { MS_DIRSYNC, ",dirsync" }, { MS_MANDLOCK, ",mand" }, - { MS_NOATIME, ",noatime" }, - { MS_NODIRATIME, ",nodiratime" }, { 0, NULL } }; static struct proc_fs_info mnt_info[] = { { MNT_NOSUID, ",nosuid" }, { MNT_NODEV, ",nodev" }, { MNT_NOEXEC, ",noexec" }, + { MNT_NOATIME, ",noatime" }, + { MNT_NODIRATIME, ",nodiratime" }, { 0, NULL } }; struct proc_fs_info *fs_infop; @@ -451,7 +452,7 @@ 
EXPORT_SYMBOL(may_umount); void release_mounts(struct list_head *head) { struct vfsmount *mnt; - while(!list_empty(head)) { + while (!list_empty(head)) { mnt = list_entry(head->next, struct vfsmount, mnt_hash); list_del_init(&mnt->mnt_hash); if (mnt->mnt_parent != mnt) { @@ -814,7 +815,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd) return -ENOTDIR; err = -ENOENT; - down(&nd->dentry->d_inode->i_sem); + mutex_lock(&nd->dentry->d_inode->i_mutex); if (IS_DEADDIR(nd->dentry->d_inode)) goto out_unlock; @@ -826,7 +827,7 @@ static int graft_tree(struct vfsmount *mnt, struct nameidata *nd) if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) err = attach_recursive_mnt(mnt, nd, NULL); out_unlock: - up(&nd->dentry->d_inode->i_sem); + mutex_unlock(&nd->dentry->d_inode->i_mutex); if (!err) security_sb_post_addmount(mnt, nd); return err; @@ -962,7 +963,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name) goto out; err = -ENOENT; - down(&nd->dentry->d_inode->i_sem); + mutex_lock(&nd->dentry->d_inode->i_mutex); if (IS_DEADDIR(nd->dentry->d_inode)) goto out1; @@ -1004,7 +1005,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name) list_del_init(&old_nd.mnt->mnt_expire); spin_unlock(&vfsmount_lock); out1: - up(&nd->dentry->d_inode->i_sem); + mutex_unlock(&nd->dentry->d_inode->i_mutex); out: up_write(&namespace_sem); if (!err) @@ -1286,7 +1287,13 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; - flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE); + if (flags & MS_NOATIME) + mnt_flags |= MNT_NOATIME; + if (flags & MS_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | + MS_NOATIME | MS_NODIRATIME); /* ... 
and get the mountpoint */ retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); @@ -1526,6 +1533,10 @@ static void chroot_fs_refs(struct nameidata *old_nd, struct nameidata *new_nd) * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. + * See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives + * in this situation. + * * Notes: * - we don't move root/cwd if they are not at the root (reason: if something * cared enough to change them, it's probably wrong to force them elsewhere) @@ -1569,7 +1580,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root, user_nd.dentry = dget(current->fs->root); read_unlock(&current->fs->lock); down_write(&namespace_sem); - down(&old_nd.dentry->d_inode->i_sem); + mutex_lock(&old_nd.dentry->d_inode->i_mutex); error = -EINVAL; if (IS_MNT_SHARED(old_nd.mnt) || IS_MNT_SHARED(new_nd.mnt->mnt_parent) || @@ -1622,7 +1633,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root, path_release(&root_parent); path_release(&parent_nd); out2: - up(&old_nd.dentry->d_inode->i_sem); + mutex_unlock(&old_nd.dentry->d_inode->i_mutex); up_write(&namespace_sem); path_release(&user_nd); path_release(&old_nd); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index a9f7a8ab1d59..cfd76f431dc0 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -365,7 +365,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dent = list_entry(next, struct dentry, d_child); + dent = list_entry(next, struct dentry, d_u.d_child); if ((unsigned long)dent->d_fsdata == fpos) { if (dent->d_inode) dget_locked(dent); diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 4947d9b11fc1..973b444d6914 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -262,7
+262,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * } vfree(bouncebuffer); - inode_update_time(inode, 1); + file_update_time(file); *ppos = pos; diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index fd3efdca5ae3..d6e0c089e1b1 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -10,6 +10,7 @@ #include <linux/config.h> #include <asm/uaccess.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/ioctl.h> diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 9e4dc30c2435..799e5c2bec55 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -196,7 +196,7 @@ ncp_renew_dentries(struct dentry *parent) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_child); + dentry = list_entry(next, struct dentry, d_u.d_child); if (dentry->d_fsdata == NULL) ncp_age_dentry(server, dentry); @@ -218,7 +218,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_child); + dentry = list_entry(next, struct dentry, d_u.d_child); dentry->d_fsdata = NULL; ncp_age_dentry(server, dentry); next = next->next; diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 8b3bb715d177..ec61fd56a1a9 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -13,4 +13,5 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o nfs-$(CONFIG_NFS_DIRECTIO) += direct.o +nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-objs := $(nfs-y) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index f2ca782aba33..fcd97406a778 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -14,6 +14,9 @@ #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfs_fs.h> + +#include <net/inet_sock.h> + 
#include "nfs4_fs.h" #include "callback.h" @@ -31,6 +34,7 @@ static struct nfs_callback_data nfs_callback_info; static DECLARE_MUTEX(nfs_callback_sema); static struct svc_program nfs4_callback_program; +unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; /* @@ -95,7 +99,7 @@ int nfs_callback_up(void) if (!serv) goto out_err; /* FIXME: We don't want to register this socket with the portmapper */ - ret = svc_makesock(serv, IPPROTO_TCP, 0); + ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport); if (ret < 0) goto out_destroy; if (!list_empty(&serv->sv_permsocks)) { diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index a0db2d4f9415..b252e7fe53a5 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -65,6 +65,7 @@ extern unsigned nfs4_callback_recall(struct cb_recallargs *args, void *dummy); extern int nfs_callback_up(void); extern int nfs_callback_down(void); +extern unsigned int nfs_callback_set_tcpport; extern unsigned short nfs_callback_tcpport; #endif /* __LINUX_FS_NFS_CALLBACK_H */ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 65f1e19e4d19..462cfceb50c5 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -35,7 +35,9 @@ unsigned nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) goto out_iput; res->size = i_size_read(inode); - res->change_attr = NFS_CHANGE_ATTR(inode); + res->change_attr = delegation->change_attr; + if (nfsi->npages != 0) + res->change_attr++; res->ctime = inode->i_ctime; res->mtime = inode->i_mtime; res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 618a327027b3..c6f07c1c71e6 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -8,6 +8,7 @@ */ #include <linux/config.h> #include <linux/completion.h> +#include <linux/kthread.h> #include <linux/module.h> #include <linux/sched.h> #include <linux/spinlock.h> @@ 
-130,6 +131,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct sizeof(delegation->stateid.data)); delegation->type = res->delegation_type; delegation->maxsize = res->maxsize; + delegation->change_attr = nfsi->change_attr; delegation->cred = get_rpccred(cred); delegation->inode = inode; @@ -157,8 +159,6 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation * { int res = 0; - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid); nfs_free_delegation(delegation); return res; @@ -231,6 +231,49 @@ restart: spin_unlock(&clp->cl_lock); } +int nfs_do_expire_all_delegations(void *ptr) +{ + struct nfs4_client *clp = ptr; + struct nfs_delegation *delegation; + struct inode *inode; + + allow_signal(SIGKILL); +restart: + spin_lock(&clp->cl_lock); + if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0) + goto out; + if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) + goto out; + list_for_each_entry(delegation, &clp->cl_delegations, super_list) { + inode = igrab(delegation->inode); + if (inode == NULL) + continue; + spin_unlock(&clp->cl_lock); + nfs_inode_return_delegation(inode); + iput(inode); + goto restart; + } +out: + spin_unlock(&clp->cl_lock); + nfs4_put_client(clp); + module_put_and_exit(0); +} + +void nfs_expire_all_delegations(struct nfs4_client *clp) +{ + struct task_struct *task; + + __module_get(THIS_MODULE); + atomic_inc(&clp->cl_count); + task = kthread_run(nfs_do_expire_all_delegations, clp, + "%u.%u.%u.%u-delegreturn", + NIPQUAD(clp->cl_addr)); + if (!IS_ERR(task)) + return; + nfs4_put_client(clp); + module_put(THIS_MODULE); +} + /* * Return all delegations following an NFS4ERR_CB_PATH_DOWN error. 
*/ diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 2fcc30de924b..7a0b2bfce771 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -21,6 +21,7 @@ struct nfs_delegation { #define NFS_DELEGATION_NEED_RECLAIM 1 long flags; loff_t maxsize; + __u64 change_attr; }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); @@ -30,6 +31,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s struct inode *nfs_delegation_find_inode(struct nfs4_client *clp, const struct nfs_fh *fhandle); void nfs_return_all_delegations(struct super_block *sb); +void nfs_expire_all_delegations(struct nfs4_client *clp); void nfs_handle_cb_pathdown(struct nfs4_client *clp); void nfs_delegation_mark_reclaim(struct nfs4_client *clp); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c0d1a214572c..a1554bead692 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -194,7 +194,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) spin_unlock(&inode->i_lock); /* Ensure consistent page alignment of the data. * Note: assumes we have exclusive access to this mapping either - * through inode->i_sem or some other mechanism. + * through inode->i_mutex or some other mechanism. 
*/ if (page->index == 0) invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1); @@ -573,7 +573,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) { - down(&filp->f_dentry->d_inode->i_sem); + mutex_lock(&filp->f_dentry->d_inode->i_mutex); switch (origin) { case 1: offset += filp->f_pos; @@ -589,7 +589,7 @@ loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin) ((struct nfs_open_context *)filp->private_data)->dir_cookie = 0; } out: - up(&filp->f_dentry->d_inode->i_sem); + mutex_unlock(&filp->f_dentry->d_inode->i_mutex); return offset; } @@ -1001,7 +1001,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) openflags &= ~(O_CREAT|O_TRUNC); /* - * Note: we're not holding inode->i_sem and so may be racing with + * Note: we're not holding inode->i_mutex and so may be racing with * operations that change the directory. We therefore save the * change attribute *before* we do the RPC call. */ @@ -1051,7 +1051,7 @@ static struct dentry *nfs_readdir_lookup(nfs_readdir_descriptor_t *desc) return dentry; if (!desc->plus || !(entry->fattr->valid & NFS_ATTR_FATTR)) return NULL; - /* Note: caller is already holding the dir->i_sem! */ + /* Note: caller is already holding the dir->i_mutex! 
*/ dentry = d_alloc(parent, &name); if (dentry == NULL) return NULL; @@ -1550,8 +1550,10 @@ go_ahead: } nfs_inode_return_delegation(old_inode); - if (new_inode) + if (new_inode != NULL) { + nfs_inode_return_delegation(new_inode); d_delete(new_dentry); + } nfs_begin_data_update(old_dir); nfs_begin_data_update(new_dir); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 079228817603..10ae377e68ff 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -122,9 +122,10 @@ nfs_free_user_pages(struct page **pages, int npages, int do_dirty) { int i; for (i = 0; i < npages; i++) { - if (do_dirty) - set_page_dirty_lock(pages[i]); - page_cache_release(pages[i]); + struct page *page = pages[i]; + if (do_dirty && !PageCompound(page)) + set_page_dirty_lock(page); + page_cache_release(page); } kfree(pages); } @@ -154,6 +155,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int struct list_head *list; struct nfs_direct_req *dreq; unsigned int reads = 0; + unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL); if (!dreq) @@ -167,7 +169,7 @@ static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, unsigned int list = &dreq->list; for(;;) { - struct nfs_read_data *data = nfs_readdata_alloc(); + struct nfs_read_data *data = nfs_readdata_alloc(rpages); if (unlikely(!data)) { while (!list_empty(list)) { @@ -268,8 +270,6 @@ static void nfs_direct_read_schedule(struct nfs_direct_req *dreq, NFS_PROTO(inode)->read_setup(data); data->task.tk_cookie = (unsigned long) inode; - data->task.tk_calldata = data; - data->task.tk_release = nfs_readdata_release; data->complete = nfs_direct_read_result; lock_kernel(); @@ -433,7 +433,7 @@ static ssize_t nfs_direct_write_seg(struct inode *inode, struct nfs_writeverf first_verf; struct nfs_write_data *wdata; - wdata = nfs_writedata_alloc(); + wdata = nfs_writedata_alloc(NFS_SERVER(inode)->wpages); if (!wdata) return -ENOMEM; @@ -662,10 
+662,10 @@ nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t .iov_len = count, }; - dprintk("nfs: direct read(%s/%s, %lu@%lu)\n", + dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n", file->f_dentry->d_parent->d_name.name, file->f_dentry->d_name.name, - (unsigned long) count, (unsigned long) pos); + (unsigned long) count, (long long) pos); if (!is_sync_kiocb(iocb)) goto out; @@ -718,9 +718,7 @@ out: ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos) { - ssize_t retval = -EINVAL; - loff_t *ppos = &iocb->ki_pos; - unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + ssize_t retval; struct file *file = iocb->ki_filp; struct nfs_open_context *ctx = (struct nfs_open_context *) file->private_data; @@ -728,35 +726,32 @@ nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, struct inode *inode = mapping->host; struct iovec iov = { .iov_base = (char __user *)buf, - .iov_len = count, }; - dfprintk(VFS, "nfs: direct write(%s/%s(%ld), %lu@%lu)\n", + dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n", file->f_dentry->d_parent->d_name.name, - file->f_dentry->d_name.name, inode->i_ino, - (unsigned long) count, (unsigned long) pos); + file->f_dentry->d_name.name, + (unsigned long) count, (long long) pos); + retval = -EINVAL; if (!is_sync_kiocb(iocb)) goto out; - if (count < 0) - goto out; - if (pos < 0) + + retval = generic_write_checks(file, &pos, &count, 0); + if (retval) goto out; - retval = -EFAULT; - if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len)) + + retval = -EINVAL; + if ((ssize_t) count < 0) goto out; - retval = -EFBIG; - if (limit != RLIM_INFINITY) { - if (pos >= limit) { - send_sig(SIGXFSZ, current, 0); - goto out; - } - if (count > limit - (unsigned long) pos) - count = limit - (unsigned long) pos; - } retval = 0; if (!count) goto out; + iov.iov_len = count, + + retval = -EFAULT; + if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len)) + goto 
out; retval = nfs_sync_mapping(mapping); if (retval) @@ -766,7 +761,7 @@ nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, if (mapping->nrpages) invalidate_inode_pages2(mapping); if (retval > 0) - *ppos = pos + retval; + iocb->ki_pos = pos + retval; out: return retval; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index ffb8df91dc34..821edd30333b 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -54,7 +54,11 @@ #define IDMAP_HASH_SZ 128 +/* Default cache timeout is 10 minutes */ +unsigned int nfs_idmap_cache_timeout = 600 * HZ; + struct idmap_hashent { + unsigned long ih_expires; __u32 ih_id; int ih_namelen; char ih_name[IDMAP_NAMESZ]; @@ -149,6 +153,8 @@ idmap_lookup_name(struct idmap_hashtable *h, const char *name, size_t len) if (he->ih_namelen != len || memcmp(he->ih_name, name, len) != 0) return NULL; + if (time_after(jiffies, he->ih_expires)) + return NULL; return he; } @@ -164,6 +170,8 @@ idmap_lookup_id(struct idmap_hashtable *h, __u32 id) struct idmap_hashent *he = idmap_id_hash(h, id); if (he->ih_id != id || he->ih_namelen == 0) return NULL; + if (time_after(jiffies, he->ih_expires)) + return NULL; return he; } @@ -192,6 +200,7 @@ idmap_update_entry(struct idmap_hashent *he, const char *name, memcpy(he->ih_name, name, namelen); he->ih_name[namelen] = '\0'; he->ih_namelen = namelen; + he->ih_expires = jiffies + nfs_idmap_cache_timeout; } /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 432f41cd75e6..a77ee95b7efb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -40,6 +40,7 @@ #include <asm/uaccess.h> #include "nfs4_fs.h" +#include "callback.h" #include "delegation.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -221,10 +222,10 @@ nfs_calc_block_size(u64 tsize) static inline unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) { - if (bsize < 1024) - bsize = NFS_DEF_FILE_IO_BUFFER_SIZE; - else if (bsize >= NFS_MAX_FILE_IO_BUFFER_SIZE) - bsize = NFS_MAX_FILE_IO_BUFFER_SIZE; + if (bsize < 
NFS_MIN_FILE_IO_SIZE) + bsize = NFS_DEF_FILE_IO_SIZE; + else if (bsize >= NFS_MAX_FILE_IO_SIZE) + bsize = NFS_MAX_FILE_IO_SIZE; return nfs_block_bits(bsize, nrbitsp); } @@ -307,20 +308,15 @@ nfs_sb_init(struct super_block *sb, rpc_authflavor_t authflavor) max_rpc_payload = nfs_block_size(rpc_max_payload(server->client), NULL); if (server->rsize > max_rpc_payload) server->rsize = max_rpc_payload; - if (server->wsize > max_rpc_payload) - server->wsize = max_rpc_payload; - + if (server->rsize > NFS_MAX_FILE_IO_SIZE) + server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->rpages > NFS_READ_MAXIOV) { - server->rpages = NFS_READ_MAXIOV; - server->rsize = server->rpages << PAGE_CACHE_SHIFT; - } + if (server->wsize > max_rpc_payload) + server->wsize = max_rpc_payload; + if (server->wsize > NFS_MAX_FILE_IO_SIZE) + server->wsize = NFS_MAX_FILE_IO_SIZE; server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->wpages > NFS_WRITE_MAXIOV) { - server->wpages = NFS_WRITE_MAXIOV; - server->wsize = server->wpages << PAGE_CACHE_SHIFT; - } if (sb->s_blocksize == 0) sb->s_blocksize = nfs_block_bits(server->wsize, @@ -417,7 +413,6 @@ nfs_create_client(struct nfs_server *server, const struct nfs_mount_data *data) clnt->cl_intr = 1; clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; return clnt; @@ -575,11 +570,10 @@ nfs_statfs(struct super_block *sb, struct kstatfs *buf) buf->f_namelen = server->namelen; out: unlock_kernel(); - return 0; out_err: - printk(KERN_WARNING "nfs_statfs: statfs error = %d\n", -error); + dprintk("%s: statfs error = %d\n", __FUNCTION__, -error); buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; goto out; @@ -650,10 +644,7 @@ int nfs_sync_mapping(struct address_space *mapping) if (mapping->nrpages == 0) return 0; unmap_mapping_range(mapping, 0, 0, 0); - ret = filemap_fdatawrite(mapping); - if (ret != 0) - goto out; - ret = 
filemap_fdatawait(mapping); + ret = filemap_write_and_wait(mapping); if (ret != 0) goto out; ret = nfs_wb_all(mapping->host); @@ -870,8 +861,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) nfs_begin_data_update(inode); /* Write all dirty data if we're changing file permissions or size */ if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE)) != 0) { - if (filemap_fdatawrite(inode->i_mapping) == 0) - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); nfs_wb_all(inode); } /* @@ -958,11 +948,22 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err; - if (__IS_FLG(inode, MS_NOATIME)) - need_atime = 0; - else if (__IS_FLG(inode, MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + /* Flush out writes to the server in order to update c/mtime */ + nfs_sync_inode(inode, 0, 0, FLUSH_WAIT|FLUSH_NOCOMMIT); + + /* + * We may force a getattr if the user cares about atime. + * + * Note that we only have to check the vfsmount flags here: + * - NFS always sets S_NOATIME by so checking it would give a + * bogus result + * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * no point in checking those. 
+ */ + if ((mnt->mnt_flags & MNT_NOATIME) || + ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) need_atime = 0; - /* We may force a getattr if the user cares about atime */ + if (need_atime) err = __nfs_revalidate_inode(NFS_SERVER(inode), inode); else @@ -1252,6 +1253,33 @@ void nfs_end_data_update(struct inode *inode) atomic_dec(&nfsi->data_updates); } +static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0 + && nfsi->change_attr == fattr->pre_change_attr) { + nfsi->change_attr = fattr->change_attr; + nfsi->cache_change_attribute = jiffies; + } + + /* If we have atomic WCC data, we may update some attributes */ + if ((fattr->valid & NFS_ATTR_WCC) != 0) { + if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) { + memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); + nfsi->cache_change_attribute = jiffies; + } + if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) { + memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); + nfsi->cache_change_attribute = jiffies; + } + if (inode->i_size == fattr->pre_size && nfsi->npages == 0) { + inode->i_size = fattr->size; + nfsi->cache_change_attribute = jiffies; + } + } +} + /** * nfs_check_inode_attributes - verify consistency of the inode attribute cache * @inode - pointer to inode @@ -1268,22 +1296,20 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat int data_unstable; + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + /* Are we in the process of updating data on the server? 
*/ data_unstable = nfs_caches_unstable(inode); - if (fattr->valid & NFS_ATTR_FATTR_V4) { - if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0 - && nfsi->change_attr == fattr->pre_change_attr) - nfsi->change_attr = fattr->change_attr; - if (nfsi->change_attr != fattr->change_attr) { - nfsi->cache_validity |= NFS_INO_INVALID_ATTR; - if (!data_unstable) - nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; - } - } + /* Do atomic weak cache consistency updates */ + nfs_wcc_update_inode(inode, fattr); - if ((fattr->valid & NFS_ATTR_FATTR) == 0) { - return 0; + if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && + nfsi->change_attr != fattr->change_attr) { + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; + if (!data_unstable) + nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE; } /* Has the inode gone and changed behind our back? */ @@ -1295,14 +1321,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); - /* If we have atomic WCC data, we may update some attributes */ - if ((fattr->valid & NFS_ATTR_WCC) != 0) { - if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime)) - memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); - if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) - memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); - } - /* Verify a few of the more important attributes */ if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) { nfsi->cache_validity |= NFS_INO_INVALID_ATTR; @@ -1410,14 +1428,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; - if (nfsi->fileid != fattr->fileid) { - printk(KERN_ERR "%s: inode number mismatch\n" - "expected (%s/0x%Lx), got (%s/0x%Lx)\n", - __FUNCTION__, - inode->i_sb->s_id, (long long)nfsi->fileid, - inode->i_sb->s_id, (long long)fattr->fileid); - goto out_err; - } + if (nfsi->fileid != fattr->fileid) + goto out_fileid; /* * Make sure 
the inode's type hasn't changed. @@ -1436,6 +1448,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (data_stable) nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME); + /* Do atomic weak cache consistency updates */ + nfs_wcc_update_inode(inode, fattr); + /* Check if our cached file size is stale */ new_isize = nfs_size_to_loff_t(fattr->size); cur_isize = i_size_read(inode); @@ -1539,6 +1554,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfs_invalidate_inode(inode); return -ESTALE; + + out_fileid: + printk(KERN_ERR "NFS: server %s error: fileid changed\n" + "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", + NFS_SERVER(inode)->hostname, inode->i_sb->s_id, + (long long)nfsi->fileid, (long long)fattr->fileid); + goto out_err; } /* @@ -1820,25 +1842,10 @@ static int nfs4_fill_super(struct super_block *sb, struct nfs4_mount_data *data, } clnt->cl_intr = 1; clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; clp->cl_rpcclient = clnt; - clp->cl_cred = rpcauth_lookupcred(clnt->cl_auth, 0); - if (IS_ERR(clp->cl_cred)) { - up_write(&clp->cl_sem); - err = PTR_ERR(clp->cl_cred); - clp->cl_cred = NULL; - goto out_fail; - } memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr)); nfs_idmap_new(clp); } - if (list_empty(&clp->cl_superblocks)) { - err = nfs4_init_client(clp); - if (err != 0) { - up_write(&clp->cl_sem); - goto out_fail; - } - } list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); clnt = rpc_clone_client(clp->cl_rpcclient); if (!IS_ERR(clnt)) @@ -2033,6 +2040,35 @@ static struct file_system_type nfs4_fs_type = { .fs_flags = FS_ODD_RENAME|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; +static int param_set_port(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) + return 
-EINVAL; + *((int *)kp->arg) = num; + return 0; +} + +module_param_call(callback_tcpport, param_set_port, param_get_int, + &nfs_callback_set_tcpport, 0644); + +static int param_set_idmap_timeout(const char *val, struct kernel_param *kp) +{ + char *endp; + int num = simple_strtol(val, &endp, 0); + int jif = num * HZ; + if (endp == val || *endp || num < 0 || jif < num) + return -EINVAL; + *((int *)kp->arg) = jif; + return 0; +} + +module_param_call(idmap_cache_timeout, param_set_idmap_timeout, param_get_int, + &nfs_idmap_cache_timeout, 0644); + #define nfs4_init_once(nfsi) \ do { \ INIT_LIST_HEAD(&(nfsi)->open_states); \ @@ -2040,8 +2076,25 @@ static struct file_system_type nfs4_fs_type = { nfsi->delegation_state = 0; \ init_rwsem(&nfsi->rwsem); \ } while(0) -#define register_nfs4fs() register_filesystem(&nfs4_fs_type) -#define unregister_nfs4fs() unregister_filesystem(&nfs4_fs_type) + +static inline int register_nfs4fs(void) +{ + int ret; + + ret = nfs_register_sysctl(); + if (ret != 0) + return ret; + ret = register_filesystem(&nfs4_fs_type); + if (ret != 0) + nfs_unregister_sysctl(); + return ret; +} + +static inline void unregister_nfs4fs(void) +{ + unregister_filesystem(&nfs4_fs_type); + nfs_unregister_sysctl(); +} #else #define nfs4_init_once(nfsi) \ do { } while (0) @@ -2166,11 +2219,11 @@ out: #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif - nfs_destroy_writepagecache(); #ifdef CONFIG_NFS_DIRECTIO -out0: nfs_destroy_directcache(); +out0: #endif + nfs_destroy_writepagecache(); out1: nfs_destroy_readpagecache(); out2: diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 0e82617f2de0..db99b8f678f8 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -82,7 +82,6 @@ mnt_create(char *hostname, struct sockaddr_in *srvaddr, int version, RPC_AUTH_UNIX); if (!IS_ERR(clnt)) { clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; clnt->cl_oneshot = 1; clnt->cl_intr = 1; } diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 59049e864ca7..7fc0560c89c9 
100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -146,23 +146,23 @@ xdr_decode_fattr(u32 *p, struct nfs_fattr *fattr) return p; } -#define SATTR(p, attr, flag, field) \ - *p++ = (attr->ia_valid & flag) ? htonl(attr->field) : ~(u32) 0 static inline u32 * xdr_encode_sattr(u32 *p, struct iattr *attr) { - SATTR(p, attr, ATTR_MODE, ia_mode); - SATTR(p, attr, ATTR_UID, ia_uid); - SATTR(p, attr, ATTR_GID, ia_gid); - SATTR(p, attr, ATTR_SIZE, ia_size); + const u32 not_set = __constant_htonl(0xFFFFFFFF); + + *p++ = (attr->ia_valid & ATTR_MODE) ? htonl(attr->ia_mode) : not_set; + *p++ = (attr->ia_valid & ATTR_UID) ? htonl(attr->ia_uid) : not_set; + *p++ = (attr->ia_valid & ATTR_GID) ? htonl(attr->ia_gid) : not_set; + *p++ = (attr->ia_valid & ATTR_SIZE) ? htonl(attr->ia_size) : not_set; if (attr->ia_valid & ATTR_ATIME_SET) { p = xdr_encode_time(p, &attr->ia_atime); } else if (attr->ia_valid & ATTR_ATIME) { p = xdr_encode_current_server_time(p, &attr->ia_atime); } else { - *p++ = ~(u32) 0; - *p++ = ~(u32) 0; + *p++ = not_set; + *p++ = not_set; } if (attr->ia_valid & ATTR_MTIME_SET) { @@ -170,12 +170,11 @@ xdr_encode_sattr(u32 *p, struct iattr *attr) } else if (attr->ia_valid & ATTR_MTIME) { p = xdr_encode_current_server_time(p, &attr->ia_mtime); } else { - *p++ = ~(u32) 0; - *p++ = ~(u32) 0; + *p++ = not_set; + *p++ = not_set; } return p; } -#undef SATTR /* * NFS encode functions diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 92c870d19ccd..ed67567f0556 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -68,27 +68,39 @@ nfs3_async_handle_jukebox(struct rpc_task *task) return 1; } -/* - * Bare-bones access to getattr: this is for nfs_read_super. 
- */ static int -nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) +do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) { int status; dprintk("%s: call fsinfo\n", __FUNCTION__); nfs_fattr_init(info->fattr); - status = rpc_call(server->client_sys, NFS3PROC_FSINFO, fhandle, info, 0); + status = rpc_call(client, NFS3PROC_FSINFO, fhandle, info, 0); dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status); if (!(info->fattr->valid & NFS_ATTR_FATTR)) { - status = rpc_call(server->client_sys, NFS3PROC_GETATTR, fhandle, info->fattr, 0); + status = rpc_call(client, NFS3PROC_GETATTR, fhandle, info->fattr, 0); dprintk("%s: reply getattr: %d\n", __FUNCTION__, status); } return status; } /* + * Bare-bones access to getattr: this is for nfs_read_super. + */ +static int +nfs3_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_get_root(server->client, fhandle, info); + if (status && server->client_sys != server->client) + status = do_proc_get_root(server->client_sys, fhandle, info); + return status; +} + +/* * One function for each procedure in the NFS protocol. 
*/ static int @@ -732,19 +744,23 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, extern u32 *nfs3_decode_dirent(u32 *, struct nfs_entry *, int); -static void -nfs3_read_done(struct rpc_task *task) +static void nfs3_read_done(struct rpc_task *task, void *calldata) { - struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; + struct nfs_read_data *data = calldata; if (nfs3_async_handle_jukebox(task)) return; /* Call back common NFS readpage processing */ if (task->tk_status >= 0) nfs_refresh_inode(data->inode, &data->fattr); - nfs_readpage_result(task); + nfs_readpage_result(task, calldata); } +static const struct rpc_call_ops nfs3_read_ops = { + .rpc_call_done = nfs3_read_done, + .rpc_release = nfs_readdata_release, +}; + static void nfs3_proc_read_setup(struct nfs_read_data *data) { @@ -762,23 +778,26 @@ nfs3_proc_read_setup(struct nfs_read_data *data) flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), nfs3_read_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_read_ops, data); rpc_call_setup(task, &msg, 0); } -static void -nfs3_write_done(struct rpc_task *task) +static void nfs3_write_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data; + struct nfs_write_data *data = calldata; if (nfs3_async_handle_jukebox(task)) return; - data = (struct nfs_write_data *)task->tk_calldata; if (task->tk_status >= 0) nfs_post_op_update_inode(data->inode, data->res.fattr); - nfs_writeback_done(task); + nfs_writeback_done(task, calldata); } +static const struct rpc_call_ops nfs3_write_ops = { + .rpc_call_done = nfs3_write_done, + .rpc_release = nfs_writedata_release, +}; + static void nfs3_proc_write_setup(struct nfs_write_data *data, int how) { @@ -806,23 +825,26 @@ nfs3_proc_write_setup(struct nfs_write_data *data, int how) flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), nfs3_write_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_write_ops, data); rpc_call_setup(task, &msg, 0); } -static void -nfs3_commit_done(struct rpc_task *task) +static void nfs3_commit_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data; + struct nfs_write_data *data = calldata; if (nfs3_async_handle_jukebox(task)) return; - data = (struct nfs_write_data *)task->tk_calldata; if (task->tk_status >= 0) nfs_post_op_update_inode(data->inode, data->res.fattr); - nfs_commit_done(task); + nfs_commit_done(task, calldata); } +static const struct rpc_call_ops nfs3_commit_ops = { + .rpc_call_done = nfs3_commit_done, + .rpc_release = nfs_commit_release, +}; + static void nfs3_proc_commit_setup(struct nfs_write_data *data, int how) { @@ -840,7 +862,7 @@ nfs3_proc_commit_setup(struct nfs_write_data *data, int how) flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), nfs3_commit_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs3_commit_ops, data); rpc_call_setup(task, &msg, 0); } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 0498bd36602c..b6c0b5012bce 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -182,7 +182,7 @@ xdr_encode_sattr(u32 *p, struct iattr *attr) { if (attr->ia_valid & ATTR_MODE) { *p++ = xdr_one; - *p++ = htonl(attr->ia_mode); + *p++ = htonl(attr->ia_mode & S_IALLUGO); } else { *p++ = xdr_zero; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index b7f262dcb6e3..0f5e4e7cddec 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -38,7 +38,8 @@ struct idmap; ((err) != NFSERR_NOFILEHANDLE)) enum nfs4_client_state { - NFS4CLNT_OK = 0, + NFS4CLNT_STATE_RECOVER = 0, + NFS4CLNT_LEASE_EXPIRED, }; /* @@ -67,7 +68,6 @@ struct nfs4_client { atomic_t cl_count; struct rpc_clnt * cl_rpcclient; - struct rpc_cred * cl_cred; struct list_head cl_superblocks; /* List of nfs_server structs */ @@ 
-76,7 +76,6 @@ struct nfs4_client { struct work_struct cl_renewd; struct work_struct cl_recoverd; - wait_queue_head_t cl_waitq; struct rpc_wait_queue cl_rpcwaitq; /* used for the setclientid verifier */ @@ -182,8 +181,9 @@ struct nfs4_state { nfs4_stateid stateid; - unsigned int nreaders; - unsigned int nwriters; + unsigned int n_rdonly; + unsigned int n_wronly; + unsigned int n_rdwr; int state; /* State on the server (R,W, or RW) */ atomic_t count; }; @@ -210,10 +210,10 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t); /* nfs4proc.c */ extern int nfs4_map_errors(int err); -extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short); -extern int nfs4_proc_setclientid_confirm(struct nfs4_client *); -extern int nfs4_proc_async_renew(struct nfs4_client *); -extern int nfs4_proc_renew(struct nfs4_client *); +extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short, struct rpc_cred *); +extern int nfs4_proc_setclientid_confirm(struct nfs4_client *, struct rpc_cred *); +extern int nfs4_proc_async_renew(struct nfs4_client *, struct rpc_cred *); +extern int nfs4_proc_renew(struct nfs4_client *, struct rpc_cred *); extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state); extern struct dentry *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *); extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nameidata *); @@ -237,8 +237,8 @@ extern void init_nfsv4_state(struct nfs_server *); extern void destroy_nfsv4_state(struct nfs_server *); extern struct nfs4_client *nfs4_get_client(struct in_addr *); extern void nfs4_put_client(struct nfs4_client *clp); -extern int nfs4_init_client(struct nfs4_client *clp); extern struct nfs4_client *nfs4_find_client(struct in_addr *); +struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp); extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *); extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct 
rpc_cred *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f988a9417b13..984ca3454d04 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -57,11 +57,13 @@ #define NFS4_POLL_RETRY_MIN (1*HZ) #define NFS4_POLL_RETRY_MAX (15*HZ) -static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid); +struct nfs4_opendata; +static int _nfs4_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *); static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry); static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception); +static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp); extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; @@ -173,8 +175,7 @@ static void nfs4_setup_readdir(u64 cookie, u32 *verifier, struct dentry *dentry, kunmap_atomic(start, KM_USER0); } -static void -renew_lease(struct nfs_server *server, unsigned long timestamp) +static void renew_lease(const struct nfs_server *server, unsigned long timestamp) { struct nfs4_client *clp = server->nfs4_state; spin_lock(&clp->cl_lock); @@ -194,21 +195,123 @@ static void update_changeattr(struct inode *inode, struct nfs4_change_info *cinf spin_unlock(&inode->i_lock); } +struct nfs4_opendata { + atomic_t count; + struct nfs_openargs o_arg; + struct nfs_openres o_res; + struct nfs_open_confirmargs c_arg; + struct nfs_open_confirmres c_res; + struct nfs_fattr f_attr; + struct nfs_fattr dir_attr; + struct dentry *dentry; + struct dentry *dir; + struct nfs4_state_owner *owner; + struct iattr attrs; + unsigned long timestamp; + int rpc_status; + int cancelled; +}; + +static struct nfs4_opendata 
*nfs4_opendata_alloc(struct dentry *dentry, + struct nfs4_state_owner *sp, int flags, + const struct iattr *attrs) +{ + struct dentry *parent = dget_parent(dentry); + struct inode *dir = parent->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + goto err; + p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); + if (p->o_arg.seqid == NULL) + goto err_free; + atomic_set(&p->count, 1); + p->dentry = dget(dentry); + p->dir = parent; + p->owner = sp; + atomic_inc(&sp->so_count); + p->o_arg.fh = NFS_FH(dir); + p->o_arg.open_flags = flags, + p->o_arg.clientid = server->nfs4_state->cl_clientid; + p->o_arg.id = sp->so_id; + p->o_arg.name = &dentry->d_name; + p->o_arg.server = server; + p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; + p->o_res.f_attr = &p->f_attr; + p->o_res.dir_attr = &p->dir_attr; + p->o_res.server = server; + nfs_fattr_init(&p->f_attr); + nfs_fattr_init(&p->dir_attr); + if (flags & O_EXCL) { + u32 *s = (u32 *) p->o_arg.u.verifier.data; + s[0] = jiffies; + s[1] = current->pid; + } else if (flags & O_CREAT) { + p->o_arg.u.attrs = &p->attrs; + memcpy(&p->attrs, attrs, sizeof(p->attrs)); + } + p->c_arg.fh = &p->o_res.fh; + p->c_arg.stateid = &p->o_res.stateid; + p->c_arg.seqid = p->o_arg.seqid; + return p; +err_free: + kfree(p); +err: + dput(parent); + return NULL; +} + +static void nfs4_opendata_free(struct nfs4_opendata *p) +{ + if (p != NULL && atomic_dec_and_test(&p->count)) { + nfs_free_seqid(p->o_arg.seqid); + nfs4_put_state_owner(p->owner); + dput(p->dir); + dput(p->dentry); + kfree(p); + } +} + /* Helper for asynchronous RPC calls */ -static int nfs4_call_async(struct rpc_clnt *clnt, rpc_action tk_begin, - rpc_action tk_exit, void *calldata) +static int nfs4_call_async(struct rpc_clnt *clnt, + const struct rpc_call_ops *tk_ops, void *calldata) { struct rpc_task *task; - if (!(task = rpc_new_task(clnt, tk_exit, RPC_TASK_ASYNC))) + 
if (!(task = rpc_new_task(clnt, RPC_TASK_ASYNC, tk_ops, calldata))) return -ENOMEM; - - task->tk_calldata = calldata; - task->tk_action = tk_begin; rpc_execute(task); return 0; } +static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task) +{ + sigset_t oldset; + int ret; + + rpc_clnt_sigmask(task->tk_client, &oldset); + ret = rpc_wait_for_completion_task(task); + rpc_clnt_sigunmask(task->tk_client, &oldset); + return ret; +} + +static inline void update_open_stateflags(struct nfs4_state *state, mode_t open_flags) +{ + switch (open_flags) { + case FMODE_WRITE: + state->n_wronly++; + break; + case FMODE_READ: + state->n_rdonly++; + break; + case FMODE_READ|FMODE_WRITE: + state->n_rdwr++; + } +} + static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags) { struct inode *inode = state->inode; @@ -218,41 +321,134 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, spin_lock(&state->owner->so_lock); spin_lock(&inode->i_lock); memcpy(&state->stateid, stateid, sizeof(state->stateid)); - if ((open_flags & FMODE_WRITE)) - state->nwriters++; - if (open_flags & FMODE_READ) - state->nreaders++; + update_open_stateflags(state, open_flags); nfs4_state_set_mode_locked(state, state->state | open_flags); spin_unlock(&inode->i_lock); spin_unlock(&state->owner->so_lock); } +static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + struct inode *inode; + struct nfs4_state *state = NULL; + + if (!(data->f_attr.valid & NFS_ATTR_FATTR)) + goto out; + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr); + if (inode == NULL) + goto out; + state = nfs4_get_open_state(inode, data->owner); + if (state == NULL) + goto put_inode; + update_open_stateid(state, &data->o_res.stateid, data->o_arg.open_flags); +put_inode: + iput(inode); +out: + return state; +} + +static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state) +{ + struct nfs_inode *nfsi = 
NFS_I(state->inode); + struct nfs_open_context *ctx; + + spin_lock(&state->inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + get_nfs_open_context(ctx); + spin_unlock(&state->inode->i_lock); + return ctx; + } + spin_unlock(&state->inode->i_lock); + return ERR_PTR(-ENOENT); +} + +static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, nfs4_stateid *stateid) +{ + int ret; + + opendata->o_arg.open_flags = openflags; + ret = _nfs4_proc_open(opendata); + if (ret != 0) + return ret; + memcpy(stateid->data, opendata->o_res.stateid.data, + sizeof(stateid->data)); + return 0; +} + +static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state) +{ + nfs4_stateid stateid; + struct nfs4_state *newstate; + int mode = 0; + int delegation = 0; + int ret; + + /* memory barrier prior to reading state->n_* */ + smp_rmb(); + if (state->n_rdwr != 0) { + ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &stateid); + if (ret != 0) + return ret; + mode |= FMODE_READ|FMODE_WRITE; + if (opendata->o_res.delegation_type != 0) + delegation = opendata->o_res.delegation_type; + smp_rmb(); + } + if (state->n_wronly != 0) { + ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &stateid); + if (ret != 0) + return ret; + mode |= FMODE_WRITE; + if (opendata->o_res.delegation_type != 0) + delegation = opendata->o_res.delegation_type; + smp_rmb(); + } + if (state->n_rdonly != 0) { + ret = nfs4_open_recover_helper(opendata, FMODE_READ, &stateid); + if (ret != 0) + return ret; + mode |= FMODE_READ; + } + clear_bit(NFS_DELEGATED_STATE, &state->flags); + if (mode == 0) + return 0; + if (opendata->o_res.delegation_type == 0) + opendata->o_res.delegation_type = delegation; + opendata->o_arg.open_flags |= mode; + newstate = nfs4_opendata_to_nfs4_state(opendata); + if (newstate != NULL) { + if (opendata->o_res.delegation_type != 0) { + struct nfs_inode *nfsi = 
NFS_I(newstate->inode); + int delegation_flags = 0; + if (nfsi->delegation) + delegation_flags = nfsi->delegation->flags; + if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM)) + nfs_inode_set_delegation(newstate->inode, + opendata->owner->so_cred, + &opendata->o_res); + else + nfs_inode_reclaim_delegation(newstate->inode, + opendata->owner->so_cred, + &opendata->o_res); + } + nfs4_close_state(newstate, opendata->o_arg.open_flags); + } + if (newstate != state) + return -ESTALE; + return 0; +} + /* * OPEN_RECLAIM: * reclaim state on the server after a reboot. */ -static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) +static int _nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) { - struct inode *inode = state->inode; - struct nfs_server *server = NFS_SERVER(inode); - struct nfs_delegation *delegation = NFS_I(inode)->delegation; - struct nfs_openargs o_arg = { - .fh = NFS_FH(inode), - .id = sp->so_id, - .open_flags = state->state, - .clientid = server->nfs4_state->cl_clientid, - .claim = NFS4_OPEN_CLAIM_PREVIOUS, - .bitmask = server->attr_bitmask, - }; - struct nfs_openres o_res = { - .server = server, /* Grrr */ - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR], - .rpc_argp = &o_arg, - .rpc_resp = &o_res, - .rpc_cred = sp->so_cred, - }; + struct nfs_delegation *delegation = NFS_I(state->inode)->delegation; + struct nfs4_opendata *opendata; + int delegation_type = 0; int status; if (delegation != NULL) { @@ -262,38 +458,27 @@ static int _nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *st set_bit(NFS_DELEGATED_STATE, &state->flags); return 0; } - o_arg.u.delegation_type = delegation->type; + delegation_type = delegation->type; } - o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); - if (o_arg.seqid == NULL) + opendata = nfs4_opendata_alloc(dentry, sp, 0, NULL); + if (opendata == NULL) return -ENOMEM; - status = 
rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - /* Confirm the sequence as being established */ - nfs_confirm_seqid(&sp->so_seqid, status); - nfs_increment_open_seqid(status, o_arg.seqid); - if (status == 0) { - memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); - if (o_res.delegation_type != 0) { - nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res); - /* Did the server issue an immediate delegation recall? */ - if (o_res.do_recall) - nfs_async_inode_return_delegation(inode, &o_res.stateid); - } - } - nfs_free_seqid(o_arg.seqid); - clear_bit(NFS_DELEGATED_STATE, &state->flags); - /* Ensure we update the inode attributes */ - NFS_CACHEINV(inode); + opendata->o_arg.claim = NFS4_OPEN_CLAIM_PREVIOUS; + opendata->o_arg.fh = NFS_FH(state->inode); + nfs_copy_fh(&opendata->o_res.fh, opendata->o_arg.fh); + opendata->o_arg.u.delegation_type = delegation_type; + status = nfs4_open_recover(opendata, state); + nfs4_opendata_free(opendata); return status; } -static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) +static int nfs4_do_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) { struct nfs_server *server = NFS_SERVER(state->inode); struct nfs4_exception exception = { }; int err; do { - err = _nfs4_open_reclaim(sp, state); + err = _nfs4_do_open_reclaim(sp, state, dentry); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -301,63 +486,36 @@ static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *sta return err; } +static int nfs4_open_reclaim(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + struct nfs_open_context *ctx; + int ret; + + ctx = nfs4_state_find_open_context(state); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = nfs4_do_open_reclaim(sp, state, ctx->dentry); + put_nfs_open_context(ctx); + return ret; +} + static int _nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) { struct 
nfs4_state_owner *sp = state->owner; - struct inode *inode = dentry->d_inode; - struct nfs_server *server = NFS_SERVER(inode); - struct dentry *parent = dget_parent(dentry); - struct nfs_openargs arg = { - .fh = NFS_FH(parent->d_inode), - .clientid = server->nfs4_state->cl_clientid, - .name = &dentry->d_name, - .id = sp->so_id, - .server = server, - .bitmask = server->attr_bitmask, - .claim = NFS4_OPEN_CLAIM_DELEGATE_CUR, - }; - struct nfs_openres res = { - .server = server, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR], - .rpc_argp = &arg, - .rpc_resp = &res, - .rpc_cred = sp->so_cred, - }; - int status = 0; + struct nfs4_opendata *opendata; + int ret; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) - goto out; - if (state->state == 0) - goto out; - arg.seqid = nfs_alloc_seqid(&sp->so_seqid); - status = -ENOMEM; - if (arg.seqid == NULL) - goto out; - arg.open_flags = state->state; - memcpy(arg.u.delegation.data, state->stateid.data, sizeof(arg.u.delegation.data)); - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - nfs_increment_open_seqid(status, arg.seqid); - if (status != 0) - goto out_free; - if(res.rflags & NFS4_OPEN_RESULT_CONFIRM) { - status = _nfs4_proc_open_confirm(server->client, NFS_FH(inode), - sp, &res.stateid, arg.seqid); - if (status != 0) - goto out_free; - } - nfs_confirm_seqid(&sp->so_seqid, 0); - if (status >= 0) { - memcpy(state->stateid.data, res.stateid.data, - sizeof(state->stateid.data)); - clear_bit(NFS_DELEGATED_STATE, &state->flags); - } -out_free: - nfs_free_seqid(arg.seqid); -out: - dput(parent); - return status; + return 0; + opendata = nfs4_opendata_alloc(dentry, sp, 0, NULL); + if (opendata == NULL) + return -ENOMEM; + opendata->o_arg.claim = NFS4_OPEN_CLAIM_DELEGATE_CUR; + memcpy(opendata->o_arg.u.delegation.data, state->stateid.data, + sizeof(opendata->o_arg.u.delegation.data)); + ret = nfs4_open_recover(opendata, state); + nfs4_opendata_free(opendata); + return ret; 
} int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) @@ -382,82 +540,202 @@ int nfs4_open_delegation_recall(struct dentry *dentry, struct nfs4_state *state) return err; } -static int _nfs4_proc_open_confirm(struct rpc_clnt *clnt, const struct nfs_fh *fh, struct nfs4_state_owner *sp, nfs4_stateid *stateid, struct nfs_seqid *seqid) +static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata) { - struct nfs_open_confirmargs arg = { - .fh = fh, - .seqid = seqid, - .stateid = *stateid, - }; - struct nfs_open_confirmres res; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], - .rpc_argp = &arg, - .rpc_resp = &res, - .rpc_cred = sp->so_cred, + struct nfs4_opendata *data = calldata; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], + .rpc_argp = &data->c_arg, + .rpc_resp = &data->c_res, + .rpc_cred = data->owner->so_cred, }; + data->timestamp = jiffies; + rpc_call_setup(task, &msg, 0); +} + +static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; + + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) + return; + if (data->rpc_status == 0) { + memcpy(data->o_res.stateid.data, data->c_res.stateid.data, + sizeof(data->o_res.stateid.data)); + renew_lease(data->o_res.server, data->timestamp); + } + nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid); + nfs_confirm_seqid(&data->owner->so_seqid, data->rpc_status); +} + +static void nfs4_open_confirm_release(void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state *state = NULL; + + /* If this request hasn't been cancelled, do nothing */ + if (data->cancelled == 0) + goto out_free; + /* In case of error, no cleanup! 
*/ + if (data->rpc_status != 0) + goto out_free; + nfs_confirm_seqid(&data->owner->so_seqid, 0); + state = nfs4_opendata_to_nfs4_state(data); + if (state != NULL) + nfs4_close_state(state, data->o_arg.open_flags); +out_free: + nfs4_opendata_free(data); +} + +static const struct rpc_call_ops nfs4_open_confirm_ops = { + .rpc_call_prepare = nfs4_open_confirm_prepare, + .rpc_call_done = nfs4_open_confirm_done, + .rpc_release = nfs4_open_confirm_release, +}; + +/* + * Note: On error, nfs4_proc_open_confirm will free the struct nfs4_opendata + */ +static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) +{ + struct nfs_server *server = NFS_SERVER(data->dir->d_inode); + struct rpc_task *task; int status; - status = rpc_call_sync(clnt, &msg, RPC_TASK_NOINTR); - /* Confirm the sequence as being established */ - nfs_confirm_seqid(&sp->so_seqid, status); - nfs_increment_open_seqid(status, seqid); - if (status >= 0) - memcpy(stateid, &res.stateid, sizeof(*stateid)); + atomic_inc(&data->count); + task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_confirm_ops, data); + if (IS_ERR(task)) { + nfs4_opendata_free(data); + return PTR_ERR(task); + } + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_release_task(task); return status; } -static int _nfs4_proc_open(struct inode *dir, struct nfs4_state_owner *sp, struct nfs_openargs *o_arg, struct nfs_openres *o_res) +static void nfs4_open_prepare(struct rpc_task *task, void *calldata) { - struct nfs_server *server = NFS_SERVER(dir); + struct nfs4_opendata *data = calldata; + struct nfs4_state_owner *sp = data->owner; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN], - .rpc_argp = o_arg, - .rpc_resp = o_res, + .rpc_argp = &data->o_arg, + .rpc_resp = &data->o_res, .rpc_cred = sp->so_cred, }; - int status; + + if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) + return; + /* Update sequence 
id. */ + data->o_arg.id = sp->so_id; + data->o_arg.clientid = sp->so_client->cl_clientid; + if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) + msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; + data->timestamp = jiffies; + rpc_call_setup(task, &msg, 0); +} - /* Update sequence id. The caller must serialize! */ - o_arg->id = sp->so_id; - o_arg->clientid = sp->so_client->cl_clientid; +static void nfs4_open_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_opendata *data = calldata; - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - if (status == 0) { - /* OPEN on anything except a regular file is disallowed in NFSv4 */ - switch (o_res->f_attr->mode & S_IFMT) { + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) + return; + if (task->tk_status == 0) { + switch (data->o_res.f_attr->mode & S_IFMT) { case S_IFREG: break; case S_IFLNK: - status = -ELOOP; + data->rpc_status = -ELOOP; break; case S_IFDIR: - status = -EISDIR; + data->rpc_status = -EISDIR; break; default: - status = -ENOTDIR; + data->rpc_status = -ENOTDIR; } + renew_lease(data->o_res.server, data->timestamp); } + nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid); +} + +static void nfs4_open_release(void *calldata) +{ + struct nfs4_opendata *data = calldata; + struct nfs4_state *state = NULL; - nfs_increment_open_seqid(status, o_arg->seqid); + /* If this request hasn't been cancelled, do nothing */ + if (data->cancelled == 0) + goto out_free; + /* In case of error, no cleanup! */ + if (data->rpc_status != 0) + goto out_free; + /* In case we need an open_confirm, no cleanup! 
*/ + if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) + goto out_free; + nfs_confirm_seqid(&data->owner->so_seqid, 0); + state = nfs4_opendata_to_nfs4_state(data); + if (state != NULL) + nfs4_close_state(state, data->o_arg.open_flags); +out_free: + nfs4_opendata_free(data); +} + +static const struct rpc_call_ops nfs4_open_ops = { + .rpc_call_prepare = nfs4_open_prepare, + .rpc_call_done = nfs4_open_done, + .rpc_release = nfs4_open_release, +}; + +/* + * Note: On error, nfs4_proc_open will free the struct nfs4_opendata + */ +static int _nfs4_proc_open(struct nfs4_opendata *data) +{ + struct inode *dir = data->dir->d_inode; + struct nfs_server *server = NFS_SERVER(dir); + struct nfs_openargs *o_arg = &data->o_arg; + struct nfs_openres *o_res = &data->o_res; + struct rpc_task *task; + int status; + + atomic_inc(&data->count); + task = rpc_run_task(server->client, RPC_TASK_ASYNC, &nfs4_open_ops, data); + if (IS_ERR(task)) { + nfs4_opendata_free(data); + return PTR_ERR(task); + } + status = nfs4_wait_for_completion_rpc_task(task); + if (status != 0) { + data->cancelled = 1; + smp_wmb(); + } else + status = data->rpc_status; + rpc_release_task(task); if (status != 0) - goto out; + return status; + if (o_arg->open_flags & O_CREAT) { update_changeattr(dir, &o_res->cinfo); nfs_post_op_update_inode(dir, o_res->dir_attr); } else nfs_refresh_inode(dir, o_res->dir_attr); if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { - status = _nfs4_proc_open_confirm(server->client, &o_res->fh, - sp, &o_res->stateid, o_arg->seqid); + status = _nfs4_proc_open_confirm(data); if (status != 0) - goto out; + return status; } - nfs_confirm_seqid(&sp->so_seqid, 0); + nfs_confirm_seqid(&data->owner->so_seqid, 0); if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) - status = server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); -out: - return status; + return server->rpc_ops->getattr(server, &o_res->fh, o_res->f_attr); + return 0; } static int _nfs4_do_access(struct inode *inode, struct rpc_cred 
*cred, int openflags) @@ -488,6 +766,15 @@ out: return -EACCES; } +int nfs4_recover_expired_lease(struct nfs_server *server) +{ + struct nfs4_client *clp = server->nfs4_state; + + if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) + nfs4_schedule_state_recovery(clp); + return nfs4_wait_clnt_recover(server->client, clp); +} + /* * OPEN_EXPIRED: * reclaim state on the server after a network partition. @@ -495,77 +782,31 @@ out: */ static int _nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) { - struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; struct inode *inode = state->inode; - struct nfs_server *server = NFS_SERVER(dir); struct nfs_delegation *delegation = NFS_I(inode)->delegation; - struct nfs_fattr f_attr, dir_attr; - struct nfs_openargs o_arg = { - .fh = NFS_FH(dir), - .open_flags = state->state, - .name = &dentry->d_name, - .bitmask = server->attr_bitmask, - .claim = NFS4_OPEN_CLAIM_NULL, - }; - struct nfs_openres o_res = { - .f_attr = &f_attr, - .dir_attr = &dir_attr, - .server = server, - }; - int status = 0; + struct nfs4_opendata *opendata; + int openflags = state->state & (FMODE_READ|FMODE_WRITE); + int ret; if (delegation != NULL && !(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) { - status = _nfs4_do_access(inode, sp->so_cred, state->state); - if (status < 0) - goto out; + ret = _nfs4_do_access(inode, sp->so_cred, openflags); + if (ret < 0) + return ret; memcpy(&state->stateid, &delegation->stateid, sizeof(state->stateid)); set_bit(NFS_DELEGATED_STATE, &state->flags); - goto out; + return 0; } - o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); - status = -ENOMEM; - if (o_arg.seqid == NULL) - goto out; - nfs_fattr_init(&f_attr); - nfs_fattr_init(&dir_attr); - status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); - if (status != 0) - goto out_nodeleg; - /* Check if files differ */ - if ((f_attr.mode & S_IFMT) != (inode->i_mode & S_IFMT)) - goto out_stale; - /* Has 
the file handle changed? */ - if (nfs_compare_fh(&o_res.fh, NFS_FH(inode)) != 0) { - /* Verify if the change attributes are the same */ - if (f_attr.change_attr != NFS_I(inode)->change_attr) - goto out_stale; - if (nfs_size_to_loff_t(f_attr.size) != inode->i_size) - goto out_stale; - /* Lets just pretend that this is the same file */ - nfs_copy_fh(NFS_FH(inode), &o_res.fh); - NFS_I(inode)->fileid = f_attr.fileid; - } - memcpy(&state->stateid, &o_res.stateid, sizeof(state->stateid)); - if (o_res.delegation_type != 0) { - if (!(delegation->flags & NFS_DELEGATION_NEED_RECLAIM)) - nfs_inode_set_delegation(inode, sp->so_cred, &o_res); - else - nfs_inode_reclaim_delegation(inode, sp->so_cred, &o_res); + opendata = nfs4_opendata_alloc(dentry, sp, openflags, NULL); + if (opendata == NULL) + return -ENOMEM; + ret = nfs4_open_recover(opendata, state); + if (ret == -ESTALE) { + /* Invalidate the state owner so we don't ever use it again */ + nfs4_drop_state_owner(sp); + d_drop(dentry); } -out_nodeleg: - nfs_free_seqid(o_arg.seqid); - clear_bit(NFS_DELEGATED_STATE, &state->flags); -out: - dput(parent); - return status; -out_stale: - status = -ESTALE; - /* Invalidate the state owner so we don't ever use it again */ - nfs4_drop_state_owner(sp); - d_drop(dentry); - /* Should we be trying to close that stateid? 
*/ - goto out_nodeleg; + nfs4_opendata_free(opendata); + return ret; } static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state, struct dentry *dentry) @@ -584,26 +825,19 @@ static inline int nfs4_do_open_expired(struct nfs4_state_owner *sp, struct nfs4_ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) { - struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_open_context *ctx; - int status; + int ret; - spin_lock(&state->inode->i_lock); - list_for_each_entry(ctx, &nfsi->open_files, list) { - if (ctx->state != state) - continue; - get_nfs_open_context(ctx); - spin_unlock(&state->inode->i_lock); - status = nfs4_do_open_expired(sp, state, ctx->dentry); - put_nfs_open_context(ctx); - return status; - } - spin_unlock(&state->inode->i_lock); - return -ENOENT; + ctx = nfs4_state_find_open_context(state); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = nfs4_do_open_expired(sp, state, ctx->dentry); + put_nfs_open_context(ctx); + return ret; } /* - * Returns an nfs4_state + an extra reference to the inode + * Returns a referenced nfs4_state if there is an open delegation on the file */ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred *cred, struct nfs4_state **res) { @@ -616,6 +850,14 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred int open_flags = flags & (FMODE_READ|FMODE_WRITE); int err; + err = -ENOMEM; + if (!(sp = nfs4_get_state_owner(server, cred))) { + dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__); + return err; + } + err = nfs4_recover_expired_lease(server); + if (err != 0) + goto out_put_state_owner; /* Protect against reboot recovery - NOTE ORDER! 
*/ down_read(&clp->cl_sem); /* Protect against delegation recall */ @@ -625,10 +867,6 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred if (delegation == NULL || (delegation->type & open_flags) != open_flags) goto out_err; err = -ENOMEM; - if (!(sp = nfs4_get_state_owner(server, cred))) { - dprintk("%s: nfs4_get_state_owner failed!\n", __FUNCTION__); - goto out_err; - } state = nfs4_get_open_state(inode, sp); if (state == NULL) goto out_err; @@ -636,39 +874,34 @@ static int _nfs4_open_delegated(struct inode *inode, int flags, struct rpc_cred err = -ENOENT; if ((state->state & open_flags) == open_flags) { spin_lock(&inode->i_lock); - if (open_flags & FMODE_READ) - state->nreaders++; - if (open_flags & FMODE_WRITE) - state->nwriters++; + update_open_stateflags(state, open_flags); spin_unlock(&inode->i_lock); goto out_ok; } else if (state->state != 0) - goto out_err; + goto out_put_open_state; lock_kernel(); err = _nfs4_do_access(inode, cred, open_flags); unlock_kernel(); if (err != 0) - goto out_err; + goto out_put_open_state; set_bit(NFS_DELEGATED_STATE, &state->flags); update_open_stateid(state, &delegation->stateid, open_flags); out_ok: nfs4_put_state_owner(sp); up_read(&nfsi->rwsem); up_read(&clp->cl_sem); - igrab(inode); *res = state; - return 0; + return 0; +out_put_open_state: + nfs4_put_open_state(state); out_err: - if (sp != NULL) { - if (state != NULL) - nfs4_put_open_state(state); - nfs4_put_state_owner(sp); - } up_read(&nfsi->rwsem); up_read(&clp->cl_sem); if (err != -EACCES) nfs_inode_return_delegation(inode); +out_put_state_owner: + nfs4_put_state_owner(sp); return err; } @@ -689,7 +922,7 @@ static struct nfs4_state *nfs4_open_delegated(struct inode *inode, int flags, st } /* - * Returns an nfs4_state + an referenced inode + * Returns a referenced nfs4_state */ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res) { @@ -697,73 
+930,46 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st struct nfs4_state *state = NULL; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_client *clp = server->nfs4_state; - struct inode *inode = NULL; + struct nfs4_opendata *opendata; int status; - struct nfs_fattr f_attr, dir_attr; - struct nfs_openargs o_arg = { - .fh = NFS_FH(dir), - .open_flags = flags, - .name = &dentry->d_name, - .server = server, - .bitmask = server->attr_bitmask, - .claim = NFS4_OPEN_CLAIM_NULL, - }; - struct nfs_openres o_res = { - .f_attr = &f_attr, - .dir_attr = &dir_attr, - .server = server, - }; /* Protect against reboot recovery conflicts */ - down_read(&clp->cl_sem); status = -ENOMEM; if (!(sp = nfs4_get_state_owner(server, cred))) { dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); goto out_err; } - if (flags & O_EXCL) { - u32 *p = (u32 *) o_arg.u.verifier.data; - p[0] = jiffies; - p[1] = current->pid; - } else - o_arg.u.attrs = sattr; - /* Serialization for the sequence id */ + status = nfs4_recover_expired_lease(server); + if (status != 0) + goto err_put_state_owner; + down_read(&clp->cl_sem); + status = -ENOMEM; + opendata = nfs4_opendata_alloc(dentry, sp, flags, sattr); + if (opendata == NULL) + goto err_put_state_owner; - o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid); - if (o_arg.seqid == NULL) - return -ENOMEM; - nfs_fattr_init(&f_attr); - nfs_fattr_init(&dir_attr); - status = _nfs4_proc_open(dir, sp, &o_arg, &o_res); + status = _nfs4_proc_open(opendata); if (status != 0) - goto out_err; + goto err_opendata_free; status = -ENOMEM; - inode = nfs_fhget(dir->i_sb, &o_res.fh, &f_attr); - if (!inode) - goto out_err; - state = nfs4_get_open_state(inode, sp); - if (!state) - goto out_err; - update_open_stateid(state, &o_res.stateid, flags); - if (o_res.delegation_type != 0) - nfs_inode_set_delegation(inode, cred, &o_res); - nfs_free_seqid(o_arg.seqid); + state = nfs4_opendata_to_nfs4_state(opendata); + if (state == NULL) + goto 
err_opendata_free; + if (opendata->o_res.delegation_type != 0) + nfs_inode_set_delegation(state->inode, cred, &opendata->o_res); + nfs4_opendata_free(opendata); nfs4_put_state_owner(sp); up_read(&clp->cl_sem); *res = state; return 0; +err_opendata_free: + nfs4_opendata_free(opendata); +err_put_state_owner: + nfs4_put_state_owner(sp); out_err: - if (sp != NULL) { - if (state != NULL) - nfs4_put_open_state(state); - nfs_free_seqid(o_arg.seqid); - nfs4_put_state_owner(sp); - } /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */ up_read(&clp->cl_sem); - if (inode != NULL) - iput(inode); *res = NULL; return status; } @@ -830,6 +1036,7 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr, .rpc_argp = &arg, .rpc_resp = &res, }; + unsigned long timestamp = jiffies; int status; nfs_fattr_init(fattr); @@ -841,6 +1048,8 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr, memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); status = rpc_call_sync(server->client, &msg, 0); + if (status == 0 && state != NULL) + renew_lease(server, timestamp); return status; } @@ -865,12 +1074,13 @@ struct nfs4_closedata { struct nfs_closeargs arg; struct nfs_closeres res; struct nfs_fattr fattr; + unsigned long timestamp; }; -static void nfs4_free_closedata(struct nfs4_closedata *calldata) +static void nfs4_free_closedata(void *data) { - struct nfs4_state *state = calldata->state; - struct nfs4_state_owner *sp = state->owner; + struct nfs4_closedata *calldata = data; + struct nfs4_state_owner *sp = calldata->state->owner; nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); @@ -878,12 +1088,14 @@ static void nfs4_free_closedata(struct nfs4_closedata *calldata) kfree(calldata); } -static void nfs4_close_done(struct rpc_task *task) +static void nfs4_close_done(struct rpc_task *task, void *data) { - struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; + struct 
nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); + if (RPC_ASSASSINATED(task)) + return; /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors */ @@ -892,6 +1104,7 @@ static void nfs4_close_done(struct rpc_task *task) case 0: memcpy(&state->stateid, &calldata->res.stateid, sizeof(state->stateid)); + renew_lease(server, calldata->timestamp); break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -904,12 +1117,11 @@ static void nfs4_close_done(struct rpc_task *task) } } nfs_refresh_inode(calldata->inode, calldata->res.fattr); - nfs4_free_closedata(calldata); } -static void nfs4_close_begin(struct rpc_task *task) +static void nfs4_close_prepare(struct rpc_task *task, void *data) { - struct nfs4_closedata *calldata = (struct nfs4_closedata *)task->tk_calldata; + struct nfs4_closedata *calldata = data; struct nfs4_state *state = calldata->state; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE], @@ -918,10 +1130,8 @@ static void nfs4_close_begin(struct rpc_task *task) .rpc_cred = state->owner->so_cred, }; int mode = 0, old_mode; - int status; - status = nfs_wait_on_sequence(calldata->arg.seqid, task); - if (status != 0) + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) return; /* Recalculate the new open mode in case someone reopened the file * while we were waiting in line to be scheduled. 
@@ -929,26 +1139,34 @@ static void nfs4_close_begin(struct rpc_task *task) spin_lock(&state->owner->so_lock); spin_lock(&calldata->inode->i_lock); mode = old_mode = state->state; - if (state->nreaders == 0) - mode &= ~FMODE_READ; - if (state->nwriters == 0) - mode &= ~FMODE_WRITE; + if (state->n_rdwr == 0) { + if (state->n_rdonly == 0) + mode &= ~FMODE_READ; + if (state->n_wronly == 0) + mode &= ~FMODE_WRITE; + } nfs4_state_set_mode_locked(state, mode); spin_unlock(&calldata->inode->i_lock); spin_unlock(&state->owner->so_lock); if (mode == old_mode || test_bit(NFS_DELEGATED_STATE, &state->flags)) { - nfs4_free_closedata(calldata); - task->tk_exit = NULL; - rpc_exit(task, 0); + /* Note: exit _without_ calling nfs4_close_done */ + task->tk_action = NULL; return; } nfs_fattr_init(calldata->res.fattr); if (mode != 0) msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; calldata->arg.open_flags = mode; + calldata->timestamp = jiffies; rpc_call_setup(task, &msg, 0); } +static const struct rpc_call_ops nfs4_close_ops = { + .rpc_call_prepare = nfs4_close_prepare, + .rpc_call_done = nfs4_close_done, + .rpc_release = nfs4_free_closedata, +}; + /* * It is possible for data to be read/written from a mem-mapped file * after the sys_close call (which hits the vfs layer as a flush). 
@@ -981,8 +1199,7 @@ int nfs4_do_close(struct inode *inode, struct nfs4_state *state) calldata->res.fattr = &calldata->fattr; calldata->res.server = server; - status = nfs4_call_async(server->client, nfs4_close_begin, - nfs4_close_done, calldata); + status = nfs4_call_async(server->client, &nfs4_close_ops, calldata); if (status == 0) goto out; @@ -1034,7 +1251,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) d_add(dentry, NULL); return (struct dentry *)state; } - res = d_add_unique(dentry, state->inode); + res = d_add_unique(dentry, igrab(state->inode)); if (res != NULL) dentry = res; nfs4_intent_set_file(nd, dentry, state); @@ -1046,7 +1263,6 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st { struct rpc_cred *cred; struct nfs4_state *state; - struct inode *inode; cred = rpcauth_lookupcred(NFS_SERVER(dir)->client->cl_auth, 0); if (IS_ERR(cred)) @@ -1070,9 +1286,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st } goto out_drop; } - inode = state->inode; - iput(inode); - if (inode == dentry->d_inode) { + if (state->inode == dentry->d_inode) { nfs4_intent_set_file(nd, dentry, state); return 1; } @@ -1508,11 +1722,13 @@ static int _nfs4_proc_write(struct nfs_write_data *wdata) wdata->args.bitmask = server->attr_bitmask; wdata->res.server = server; + wdata->timestamp = jiffies; nfs_fattr_init(fattr); status = rpc_call_sync(server->client, &msg, rpcflags); dprintk("NFS reply write: %d\n", status); if (status < 0) return status; + renew_lease(server, wdata->timestamp); nfs_post_op_update_inode(inode, fattr); return wdata->res.count; } @@ -1547,8 +1763,11 @@ static int _nfs4_proc_commit(struct nfs_write_data *cdata) cdata->args.bitmask = server->attr_bitmask; cdata->res.server = server; + cdata->timestamp = jiffies; nfs_fattr_init(fattr); status = rpc_call_sync(server->client, &msg, 0); + if (status >= 0) + renew_lease(server, cdata->timestamp); dprintk("NFS reply 
commit: %d\n", status); if (status >= 0) nfs_post_op_update_inode(inode, fattr); @@ -1601,7 +1820,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, status = PTR_ERR(state); goto out; } - d_instantiate(dentry, state->inode); + d_instantiate(dentry, igrab(state->inode)); if (flags & O_EXCL) { struct nfs_fattr fattr; status = nfs4_do_setattr(NFS_SERVER(dir), &fattr, @@ -2125,10 +2344,9 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } -static void -nfs4_read_done(struct rpc_task *task) +static void nfs4_read_done(struct rpc_task *task, void *calldata) { - struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; + struct nfs_read_data *data = calldata; struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { @@ -2138,9 +2356,14 @@ nfs4_read_done(struct rpc_task *task) if (task->tk_status > 0) renew_lease(NFS_SERVER(inode), data->timestamp); /* Call back common NFS readpage processing */ - nfs_readpage_result(task); + nfs_readpage_result(task, calldata); } +static const struct rpc_call_ops nfs4_read_ops = { + .rpc_call_done = nfs4_read_done, + .rpc_release = nfs_readdata_release, +}; + static void nfs4_proc_read_setup(struct nfs_read_data *data) { @@ -2160,14 +2383,13 @@ nfs4_proc_read_setup(struct nfs_read_data *data) flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), nfs4_read_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_read_ops, data); rpc_call_setup(task, &msg, 0); } -static void -nfs4_write_done(struct rpc_task *task) +static void nfs4_write_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_write_data *data = calldata; struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { @@ -2179,9 +2401,14 @@ nfs4_write_done(struct rpc_task *task) nfs_post_op_update_inode(inode, data->res.fattr); } /* Call back common NFS writeback processing */ - nfs_writeback_done(task); + nfs_writeback_done(task, calldata); } +static const struct rpc_call_ops nfs4_write_ops = { + .rpc_call_done = nfs4_write_done, + .rpc_release = nfs_writedata_release, +}; + static void nfs4_proc_write_setup(struct nfs_write_data *data, int how) { @@ -2214,14 +2441,13 @@ nfs4_proc_write_setup(struct nfs_write_data *data, int how) flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), nfs4_write_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_write_ops, data); rpc_call_setup(task, &msg, 0); } -static void -nfs4_commit_done(struct rpc_task *task) +static void nfs4_commit_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_write_data *data = calldata; struct inode *inode = data->inode; if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { @@ -2231,9 +2457,14 @@ nfs4_commit_done(struct rpc_task *task) if (task->tk_status >= 0) nfs_post_op_update_inode(inode, data->res.fattr); /* Call back common NFS writeback processing */ - nfs_commit_done(task); + nfs_commit_done(task, calldata); } +static const struct rpc_call_ops nfs4_commit_ops = { + .rpc_call_done = nfs4_commit_done, + .rpc_release = nfs_commit_release, +}; + static void nfs4_proc_commit_setup(struct nfs_write_data *data, int how) { @@ -2255,7 +2486,7 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how) flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), nfs4_commit_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs4_commit_ops, data); rpc_call_setup(task, &msg, 0); } @@ -2263,11 +2494,10 @@ nfs4_proc_commit_setup(struct nfs_write_data *data, int how) * nfs4_proc_async_renew(): This is not one of the nfs_rpc_ops; it is a special * standalone procedure for queueing an asynchronous RENEW. 
*/ -static void -renew_done(struct rpc_task *task) +static void nfs4_renew_done(struct rpc_task *task, void *data) { struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; - unsigned long timestamp = (unsigned long)task->tk_calldata; + unsigned long timestamp = (unsigned long)data; if (task->tk_status < 0) { switch (task->tk_status) { @@ -2284,26 +2514,28 @@ renew_done(struct rpc_task *task) spin_unlock(&clp->cl_lock); } -int -nfs4_proc_async_renew(struct nfs4_client *clp) +static const struct rpc_call_ops nfs4_renew_ops = { + .rpc_call_done = nfs4_renew_done, +}; + +int nfs4_proc_async_renew(struct nfs4_client *clp, struct rpc_cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], .rpc_argp = clp, - .rpc_cred = clp->cl_cred, + .rpc_cred = cred, }; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, - renew_done, (void *)jiffies); + &nfs4_renew_ops, (void *)jiffies); } -int -nfs4_proc_renew(struct nfs4_client *clp) +int nfs4_proc_renew(struct nfs4_client *clp, struct rpc_cred *cred) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], .rpc_argp = clp, - .rpc_cred = clp->cl_cred, + .rpc_cred = cred, }; unsigned long now = jiffies; int status; @@ -2519,7 +2751,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) case -NFS4ERR_EXPIRED: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL); nfs4_schedule_state_recovery(clp); - if (test_bit(NFS4CLNT_OK, &clp->cl_state)) + if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) rpc_wake_up_task(task); task->tk_status = 0; return -EAGAIN; @@ -2536,25 +2768,25 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) return 0; } +static int nfs4_wait_bit_interruptible(void *word) +{ + if (signal_pending(current)) + return -ERESTARTSYS; + schedule(); + return 0; +} + static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp) { - DEFINE_WAIT(wait); sigset_t 
oldset; - int interruptible, res = 0; + int res; might_sleep(); rpc_clnt_sigmask(clnt, &oldset); - interruptible = TASK_UNINTERRUPTIBLE; - if (clnt->cl_intr) - interruptible = TASK_INTERRUPTIBLE; - prepare_to_wait(&clp->cl_waitq, &wait, interruptible); - nfs4_schedule_state_recovery(clp); - if (clnt->cl_intr && signalled()) - res = -ERESTARTSYS; - else if (!test_bit(NFS4CLNT_OK, &clp->cl_state)) - schedule(); - finish_wait(&clp->cl_waitq, &wait); + res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER, + nfs4_wait_bit_interruptible, + TASK_INTERRUPTIBLE); rpc_clnt_sigunmask(clnt, &oldset); return res; } @@ -2597,6 +2829,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: + nfs4_schedule_state_recovery(clp); ret = nfs4_wait_clnt_recover(server->client, clp); if (ret == 0) exception->retry = 1; @@ -2613,7 +2846,7 @@ int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct return nfs4_map_errors(ret); } -int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port) +int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short port, struct rpc_cred *cred) { nfs4_verifier sc_verifier; struct nfs4_setclientid setclientid = { @@ -2624,7 +2857,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], .rpc_argp = &setclientid, .rpc_resp = clp, - .rpc_cred = clp->cl_cred, + .rpc_cred = cred, }; u32 *p; int loop = 0; @@ -2638,7 +2871,7 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p setclientid.sc_name_len = scnprintf(setclientid.sc_name, sizeof(setclientid.sc_name), "%s/%u.%u.%u.%u %s %u", clp->cl_ipaddr, NIPQUAD(clp->cl_addr.s_addr), - clp->cl_cred->cr_ops->cr_name, + cred->cr_ops->cr_name, clp->cl_id_uniquifier); setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, 
sizeof(setclientid.sc_netid), "tcp"); @@ -2661,14 +2894,14 @@ int nfs4_proc_setclientid(struct nfs4_client *clp, u32 program, unsigned short p } int -nfs4_proc_setclientid_confirm(struct nfs4_client *clp) +nfs4_proc_setclientid_confirm(struct nfs4_client *clp, struct rpc_cred *cred) { struct nfs_fsinfo fsinfo; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID_CONFIRM], .rpc_argp = clp, .rpc_resp = &fsinfo, - .rpc_cred = clp->cl_cred, + .rpc_cred = cred, }; unsigned long now; int status; @@ -2679,24 +2912,92 @@ nfs4_proc_setclientid_confirm(struct nfs4_client *clp) spin_lock(&clp->cl_lock); clp->cl_lease_time = fsinfo.lease_time * HZ; clp->cl_last_renewal = now; + clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); spin_unlock(&clp->cl_lock); } return status; } -static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) +struct nfs4_delegreturndata { + struct nfs4_delegreturnargs args; + struct nfs4_delegreturnres res; + struct nfs_fh fh; + nfs4_stateid stateid; + struct rpc_cred *cred; + unsigned long timestamp; + struct nfs_fattr fattr; + int rpc_status; +}; + +static void nfs4_delegreturn_prepare(struct rpc_task *task, void *calldata) { - struct nfs4_delegreturnargs args = { - .fhandle = NFS_FH(inode), - .stateid = stateid, - }; + struct nfs4_delegreturndata *data = calldata; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_DELEGRETURN], - .rpc_argp = &args, - .rpc_cred = cred, + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, }; + nfs_fattr_init(data->res.fattr); + rpc_call_setup(task, &msg, 0); +} - return rpc_call_sync(NFS_CLIENT(inode), &msg, 0); +static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_delegreturndata *data = calldata; + data->rpc_status = task->tk_status; + if (data->rpc_status == 0) + renew_lease(data->res.server, data->timestamp); +} + +static void nfs4_delegreturn_release(void 
*calldata) +{ + struct nfs4_delegreturndata *data = calldata; + + put_rpccred(data->cred); + kfree(calldata); +} + +const static struct rpc_call_ops nfs4_delegreturn_ops = { + .rpc_call_prepare = nfs4_delegreturn_prepare, + .rpc_call_done = nfs4_delegreturn_done, + .rpc_release = nfs4_delegreturn_release, +}; + +static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) +{ + struct nfs4_delegreturndata *data; + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_task *task; + int status; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + data->args.fhandle = &data->fh; + data->args.stateid = &data->stateid; + data->args.bitmask = server->attr_bitmask; + nfs_copy_fh(&data->fh, NFS_FH(inode)); + memcpy(&data->stateid, stateid, sizeof(data->stateid)); + data->res.fattr = &data->fattr; + data->res.server = server; + data->cred = get_rpccred(cred); + data->timestamp = jiffies; + data->rpc_status = 0; + + task = rpc_run_task(NFS_CLIENT(inode), RPC_TASK_ASYNC, &nfs4_delegreturn_ops, data); + if (IS_ERR(task)) { + nfs4_delegreturn_release(data); + return PTR_ERR(task); + } + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) { + status = data->rpc_status; + if (status == 0) + nfs_post_op_update_inode(inode, &data->fattr); + } + rpc_release_task(task); + return status; } int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid) @@ -2734,43 +3035,17 @@ nfs4_set_lock_task_retry(unsigned long timeout) return timeout; } -static inline int -nfs4_lck_type(int cmd, struct file_lock *request) -{ - /* set lock type */ - switch (request->fl_type) { - case F_RDLCK: - return IS_SETLKW(cmd) ? NFS4_READW_LT : NFS4_READ_LT; - case F_WRLCK: - return IS_SETLKW(cmd) ? 
NFS4_WRITEW_LT : NFS4_WRITE_LT; - case F_UNLCK: - return NFS4_WRITE_LT; - } - BUG(); - return 0; -} - -static inline uint64_t -nfs4_lck_length(struct file_lock *request) -{ - if (request->fl_end == OFFSET_MAX) - return ~(uint64_t)0; - return request->fl_end - request->fl_start + 1; -} - static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request) { struct inode *inode = state->inode; struct nfs_server *server = NFS_SERVER(inode); struct nfs4_client *clp = server->nfs4_state; - struct nfs_lockargs arg = { + struct nfs_lockt_args arg = { .fh = NFS_FH(inode), - .type = nfs4_lck_type(cmd, request), - .offset = request->fl_start, - .length = nfs4_lck_length(request), + .fl = request, }; - struct nfs_lockres res = { - .server = server, + struct nfs_lockt_res res = { + .denied = request, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKT], @@ -2778,36 +3053,23 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock .rpc_resp = &res, .rpc_cred = state->owner->so_cred, }; - struct nfs_lowner nlo; struct nfs4_lock_state *lsp; int status; down_read(&clp->cl_sem); - nlo.clientid = clp->cl_clientid; + arg.lock_owner.clientid = clp->cl_clientid; status = nfs4_set_lock_state(state, request); if (status != 0) goto out; lsp = request->fl_u.nfs4_fl.owner; - nlo.id = lsp->ls_id; - arg.u.lockt = &nlo; + arg.lock_owner.id = lsp->ls_id; status = rpc_call_sync(server->client, &msg, 0); - if (!status) { - request->fl_type = F_UNLCK; - } else if (status == -NFS4ERR_DENIED) { - int64_t len, start, end; - start = res.u.denied.offset; - len = res.u.denied.length; - end = start + len - 1; - if (end < 0 || len == 0) - request->fl_end = OFFSET_MAX; - else - request->fl_end = (loff_t)end; - request->fl_start = (loff_t)start; - request->fl_type = F_WRLCK; - if (res.u.denied.type & 1) - request->fl_type = F_RDLCK; - request->fl_pid = 0; - status = 0; + switch (status) { + case 0: + request->fl_type = F_UNLCK; + 
break; + case -NFS4ERR_DENIED: + status = 0; } out: up_read(&clp->cl_sem); @@ -2847,196 +3109,314 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl) } struct nfs4_unlockdata { - struct nfs_lockargs arg; - struct nfs_locku_opargs luargs; - struct nfs_lockres res; + struct nfs_locku_args arg; + struct nfs_locku_res res; struct nfs4_lock_state *lsp; struct nfs_open_context *ctx; - atomic_t refcount; - struct completion completion; + struct file_lock fl; + const struct nfs_server *server; + unsigned long timestamp; }; -static void nfs4_locku_release_calldata(struct nfs4_unlockdata *calldata) -{ - if (atomic_dec_and_test(&calldata->refcount)) { - nfs_free_seqid(calldata->luargs.seqid); - nfs4_put_lock_state(calldata->lsp); - put_nfs_open_context(calldata->ctx); - kfree(calldata); - } +static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, + struct nfs_open_context *ctx, + struct nfs4_lock_state *lsp, + struct nfs_seqid *seqid) +{ + struct nfs4_unlockdata *p; + struct inode *inode = lsp->ls_state->inode; + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return NULL; + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; + p->arg.seqid = seqid; + p->arg.stateid = &lsp->ls_stateid; + p->lsp = lsp; + atomic_inc(&lsp->ls_count); + /* Ensure we don't close file until we're done freeing locks! 
*/ + p->ctx = get_nfs_open_context(ctx); + memcpy(&p->fl, fl, sizeof(p->fl)); + p->server = NFS_SERVER(inode); + return p; } -static void nfs4_locku_complete(struct nfs4_unlockdata *calldata) +static void nfs4_locku_release_calldata(void *data) { - complete(&calldata->completion); - nfs4_locku_release_calldata(calldata); + struct nfs4_unlockdata *calldata = data; + nfs_free_seqid(calldata->arg.seqid); + nfs4_put_lock_state(calldata->lsp); + put_nfs_open_context(calldata->ctx); + kfree(calldata); } -static void nfs4_locku_done(struct rpc_task *task) +static void nfs4_locku_done(struct rpc_task *task, void *data) { - struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata; + struct nfs4_unlockdata *calldata = data; - nfs_increment_lock_seqid(task->tk_status, calldata->luargs.seqid); + if (RPC_ASSASSINATED(task)) + return; + nfs_increment_lock_seqid(task->tk_status, calldata->arg.seqid); switch (task->tk_status) { case 0: memcpy(calldata->lsp->ls_stateid.data, - calldata->res.u.stateid.data, + calldata->res.stateid.data, sizeof(calldata->lsp->ls_stateid.data)); + renew_lease(calldata->server, calldata->timestamp); break; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(calldata->res.server->nfs4_state); + nfs4_schedule_state_recovery(calldata->server->nfs4_state); break; default: - if (nfs4_async_handle_error(task, calldata->res.server) == -EAGAIN) { + if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN) { rpc_restart_call(task); - return; } } - nfs4_locku_complete(calldata); } -static void nfs4_locku_begin(struct rpc_task *task) +static void nfs4_locku_prepare(struct rpc_task *task, void *data) { - struct nfs4_unlockdata *calldata = (struct nfs4_unlockdata *)task->tk_calldata; + struct nfs4_unlockdata *calldata = data; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCKU], .rpc_argp = &calldata->arg, .rpc_resp = &calldata->res, .rpc_cred = 
calldata->lsp->ls_state->owner->so_cred, }; - int status; - status = nfs_wait_on_sequence(calldata->luargs.seqid, task); - if (status != 0) + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) return; if ((calldata->lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) { - nfs4_locku_complete(calldata); - task->tk_exit = NULL; - rpc_exit(task, 0); + /* Note: exit _without_ running nfs4_locku_done */ + task->tk_action = NULL; return; } + calldata->timestamp = jiffies; rpc_call_setup(task, &msg, 0); } +static const struct rpc_call_ops nfs4_locku_ops = { + .rpc_call_prepare = nfs4_locku_prepare, + .rpc_call_done = nfs4_locku_done, + .rpc_release = nfs4_locku_release_calldata, +}; + +static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, + struct nfs_open_context *ctx, + struct nfs4_lock_state *lsp, + struct nfs_seqid *seqid) +{ + struct nfs4_unlockdata *data; + struct rpc_task *task; + + data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); + if (data == NULL) { + nfs_free_seqid(seqid); + return ERR_PTR(-ENOMEM); + } + + /* Unlock _before_ we do the RPC call */ + do_vfs_lock(fl->fl_file, fl); + task = rpc_run_task(NFS_CLIENT(lsp->ls_state->inode), RPC_TASK_ASYNC, &nfs4_locku_ops, data); + if (IS_ERR(task)) + nfs4_locku_release_calldata(data); + return task; +} + static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs4_unlockdata *calldata; - struct inode *inode = state->inode; - struct nfs_server *server = NFS_SERVER(inode); + struct nfs_seqid *seqid; struct nfs4_lock_state *lsp; - int status; + struct rpc_task *task; + int status = 0; /* Is this a delegated lock? */ if (test_bit(NFS_DELEGATED_STATE, &state->flags)) - return do_vfs_lock(request->fl_file, request); + goto out_unlock; + /* Is this open_owner holding any locks on the server? 
*/ + if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) + goto out_unlock; status = nfs4_set_lock_state(state, request); if (status != 0) - return status; + goto out_unlock; lsp = request->fl_u.nfs4_fl.owner; - /* We might have lost the locks! */ - if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0) - return 0; - calldata = kmalloc(sizeof(*calldata), GFP_KERNEL); - if (calldata == NULL) - return -ENOMEM; - calldata->luargs.seqid = nfs_alloc_seqid(&lsp->ls_seqid); - if (calldata->luargs.seqid == NULL) { - kfree(calldata); - return -ENOMEM; - } - calldata->luargs.stateid = &lsp->ls_stateid; - calldata->arg.fh = NFS_FH(inode); - calldata->arg.type = nfs4_lck_type(cmd, request); - calldata->arg.offset = request->fl_start; - calldata->arg.length = nfs4_lck_length(request); - calldata->arg.u.locku = &calldata->luargs; - calldata->res.server = server; - calldata->lsp = lsp; - atomic_inc(&lsp->ls_count); - - /* Ensure we don't close file until we're done freeing locks! */ - calldata->ctx = get_nfs_open_context((struct nfs_open_context*)request->fl_file->private_data); - - atomic_set(&calldata->refcount, 2); - init_completion(&calldata->completion); - - status = nfs4_call_async(NFS_SERVER(inode)->client, nfs4_locku_begin, - nfs4_locku_done, calldata); - if (status == 0) - wait_for_completion_interruptible(&calldata->completion); + status = -ENOMEM; + seqid = nfs_alloc_seqid(&lsp->ls_seqid); + if (seqid == NULL) + goto out_unlock; + task = nfs4_do_unlck(request, request->fl_file->private_data, lsp, seqid); + status = PTR_ERR(task); + if (IS_ERR(task)) + goto out_unlock; + status = nfs4_wait_for_completion_rpc_task(task); + rpc_release_task(task); + return status; +out_unlock: do_vfs_lock(request->fl_file, request); - nfs4_locku_release_calldata(calldata); return status; } -static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *request, int reclaim) +struct nfs4_lockdata { + struct nfs_lock_args arg; + struct nfs_lock_res res; + struct nfs4_lock_state 
*lsp; + struct nfs_open_context *ctx; + struct file_lock fl; + unsigned long timestamp; + int rpc_status; + int cancelled; +}; + +static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, + struct nfs_open_context *ctx, struct nfs4_lock_state *lsp) { - struct inode *inode = state->inode; + struct nfs4_lockdata *p; + struct inode *inode = lsp->ls_state->inode; struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner; - struct nfs_lock_opargs largs = { - .lock_stateid = &lsp->ls_stateid, - .open_stateid = &state->stateid, - .lock_owner = { - .clientid = server->nfs4_state->cl_clientid, - .id = lsp->ls_id, - }, - .reclaim = reclaim, - }; - struct nfs_lockargs arg = { - .fh = NFS_FH(inode), - .type = nfs4_lck_type(cmd, request), - .offset = request->fl_start, - .length = nfs4_lck_length(request), - .u = { - .lock = &largs, - }, - }; - struct nfs_lockres res = { - .server = server, - }; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return NULL; + + p->arg.fh = NFS_FH(inode); + p->arg.fl = &p->fl; + p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); + if (p->arg.lock_seqid == NULL) + goto out_free; + p->arg.lock_stateid = &lsp->ls_stateid; + p->arg.lock_owner.clientid = server->nfs4_state->cl_clientid; + p->arg.lock_owner.id = lsp->ls_id; + p->lsp = lsp; + atomic_inc(&lsp->ls_count); + p->ctx = get_nfs_open_context(ctx); + memcpy(&p->fl, fl, sizeof(p->fl)); + return p; +out_free: + kfree(p); + return NULL; +} + +static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs4_lockdata *data = calldata; + struct nfs4_state *state = data->lsp->ls_state; + struct nfs4_state_owner *sp = state->owner; struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK], - .rpc_argp = &arg, - .rpc_resp = &res, - .rpc_cred = state->owner->so_cred, + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LOCK], + .rpc_argp = &data->arg, + .rpc_resp = &data->res, + .rpc_cred = 
sp->so_cred, }; - int status = -ENOMEM; - - largs.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid); - if (largs.lock_seqid == NULL) - return -ENOMEM; - if (!(lsp->ls_seqid.flags & NFS_SEQID_CONFIRMED)) { - struct nfs4_state_owner *owner = state->owner; - largs.open_seqid = nfs_alloc_seqid(&owner->so_seqid); - if (largs.open_seqid == NULL) + if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) + return; + dprintk("%s: begin!\n", __FUNCTION__); + /* Do we need to do an open_to_lock_owner? */ + if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { + data->arg.open_seqid = nfs_alloc_seqid(&sp->so_seqid); + if (data->arg.open_seqid == NULL) { + data->rpc_status = -ENOMEM; + task->tk_action = NULL; goto out; - largs.new_lock_owner = 1; - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - /* increment open seqid on success, and seqid mutating errors */ - if (largs.new_lock_owner != 0) { - nfs_increment_open_seqid(status, largs.open_seqid); - if (status == 0) - nfs_confirm_seqid(&lsp->ls_seqid, 0); } - nfs_free_seqid(largs.open_seqid); - } else - status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR); - /* increment lock seqid on success, and seqid mutating errors*/ - nfs_increment_lock_seqid(status, largs.lock_seqid); - /* save the returned stateid. 
*/ - if (status == 0) { - memcpy(lsp->ls_stateid.data, res.u.stateid.data, - sizeof(lsp->ls_stateid.data)); - lsp->ls_flags |= NFS_LOCK_INITIALIZED; - } else if (status == -NFS4ERR_DENIED) - status = -EAGAIN; + data->arg.open_stateid = &state->stateid; + data->arg.new_lock_owner = 1; + } + data->timestamp = jiffies; + rpc_call_setup(task, &msg, 0); out: - nfs_free_seqid(largs.lock_seqid); - return status; + dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status); +} + +static void nfs4_lock_done(struct rpc_task *task, void *calldata) +{ + struct nfs4_lockdata *data = calldata; + + dprintk("%s: begin!\n", __FUNCTION__); + + data->rpc_status = task->tk_status; + if (RPC_ASSASSINATED(task)) + goto out; + if (data->arg.new_lock_owner != 0) { + nfs_increment_open_seqid(data->rpc_status, data->arg.open_seqid); + if (data->rpc_status == 0) + nfs_confirm_seqid(&data->lsp->ls_seqid, 0); + else + goto out; + } + if (data->rpc_status == 0) { + memcpy(data->lsp->ls_stateid.data, data->res.stateid.data, + sizeof(data->lsp->ls_stateid.data)); + data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; + renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp); + } + nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid); +out: + dprintk("%s: done, ret = %d!\n", __FUNCTION__, data->rpc_status); +} + +static void nfs4_lock_release(void *calldata) +{ + struct nfs4_lockdata *data = calldata; + + dprintk("%s: begin!\n", __FUNCTION__); + if (data->arg.open_seqid != NULL) + nfs_free_seqid(data->arg.open_seqid); + if (data->cancelled != 0) { + struct rpc_task *task; + task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, + data->arg.lock_seqid); + if (!IS_ERR(task)) + rpc_release_task(task); + dprintk("%s: cancelling lock!\n", __FUNCTION__); + } else + nfs_free_seqid(data->arg.lock_seqid); + nfs4_put_lock_state(data->lsp); + put_nfs_open_context(data->ctx); + kfree(data); + dprintk("%s: done!\n", __FUNCTION__); +} + +static const struct rpc_call_ops nfs4_lock_ops = { + 
.rpc_call_prepare = nfs4_lock_prepare, + .rpc_call_done = nfs4_lock_done, + .rpc_release = nfs4_lock_release, +}; + +static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim) +{ + struct nfs4_lockdata *data; + struct rpc_task *task; + int ret; + + dprintk("%s: begin!\n", __FUNCTION__); + data = nfs4_alloc_lockdata(fl, fl->fl_file->private_data, + fl->fl_u.nfs4_fl.owner); + if (data == NULL) + return -ENOMEM; + if (IS_SETLKW(cmd)) + data->arg.block = 1; + if (reclaim != 0) + data->arg.reclaim = 1; + task = rpc_run_task(NFS_CLIENT(state->inode), RPC_TASK_ASYNC, + &nfs4_lock_ops, data); + if (IS_ERR(task)) { + nfs4_lock_release(data); + return PTR_ERR(task); + } + ret = nfs4_wait_for_completion_rpc_task(task); + if (ret == 0) { + ret = data->rpc_status; + if (ret == -NFS4ERR_DENIED) + ret = -EAGAIN; + } else + data->cancelled = 1; + rpc_release_task(task); + dprintk("%s: done, ret = %d!\n", __FUNCTION__, ret); + return ret; } static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index a3001628ad32..5d764d8e6d8a 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -54,6 +54,7 @@ #include <linux/nfs4.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" +#include "delegation.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -61,6 +62,7 @@ void nfs4_renew_state(void *data) { struct nfs4_client *clp = (struct nfs4_client *)data; + struct rpc_cred *cred; long lease, timeout; unsigned long last, now; @@ -68,7 +70,7 @@ nfs4_renew_state(void *data) dprintk("%s: start\n", __FUNCTION__); /* Are there any active superblocks? */ if (list_empty(&clp->cl_superblocks)) - goto out; + goto out; spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; last = clp->cl_last_renewal; @@ -76,9 +78,17 @@ nfs4_renew_state(void *data) timeout = (2 * lease) / 3 + (long)last - (long)now; /* Are we close to a lease timeout? 
*/ if (time_after(now, last + lease/3)) { + cred = nfs4_get_renew_cred(clp); + if (cred == NULL) { + set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); + spin_unlock(&clp->cl_lock); + nfs_expire_all_delegations(clp); + goto out; + } spin_unlock(&clp->cl_lock); /* Queue an asynchronous RENEW. */ - nfs4_proc_async_renew(clp); + nfs4_proc_async_renew(clp, cred); + put_rpccred(cred); timeout = (2 * lease) / 3; spin_lock(&clp->cl_lock); } else diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 5ef4c57618fe..afad0255e7db 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -43,6 +43,8 @@ #include <linux/smp_lock.h> #include <linux/nfs_fs.h> #include <linux/nfs_idmap.h> +#include <linux/kthread.h> +#include <linux/module.h> #include <linux/workqueue.h> #include <linux/bitops.h> @@ -57,8 +59,6 @@ const nfs4_stateid zero_stateid; static DEFINE_SPINLOCK(state_spinlock); static LIST_HEAD(nfs4_clientid_list); -static void nfs4_recover_state(void *); - void init_nfsv4_state(struct nfs_server *server) { @@ -91,11 +91,10 @@ nfs4_alloc_client(struct in_addr *addr) if (nfs_callback_up() < 0) return NULL; - if ((clp = kmalloc(sizeof(*clp), GFP_KERNEL)) == NULL) { + if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) { nfs_callback_down(); return NULL; } - memset(clp, 0, sizeof(*clp)); memcpy(&clp->cl_addr, addr, sizeof(clp->cl_addr)); init_rwsem(&clp->cl_sem); INIT_LIST_HEAD(&clp->cl_delegations); @@ -103,14 +102,12 @@ nfs4_alloc_client(struct in_addr *addr) INIT_LIST_HEAD(&clp->cl_unused); spin_lock_init(&clp->cl_lock); atomic_set(&clp->cl_count, 1); - INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp); INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp); INIT_LIST_HEAD(&clp->cl_superblocks); - init_waitqueue_head(&clp->cl_waitq); rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS4 client"); clp->cl_rpcclient = ERR_PTR(-EINVAL); clp->cl_boot_time = CURRENT_TIME; - clp->cl_state = 1 << NFS4CLNT_OK; + clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; return clp; } @@ -127,8 
+124,6 @@ nfs4_free_client(struct nfs4_client *clp) kfree(sp); } BUG_ON(!list_empty(&clp->cl_state_owners)); - if (clp->cl_cred) - put_rpccred(clp->cl_cred); nfs_idmap_delete(clp); if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); @@ -193,27 +188,22 @@ nfs4_put_client(struct nfs4_client *clp) list_del(&clp->cl_servers); spin_unlock(&state_spinlock); BUG_ON(!list_empty(&clp->cl_superblocks)); - wake_up_all(&clp->cl_waitq); rpc_wake_up(&clp->cl_rpcwaitq); nfs4_kill_renewd(clp); nfs4_free_client(clp); } -static int __nfs4_init_client(struct nfs4_client *clp) +static int nfs4_init_client(struct nfs4_client *clp, struct rpc_cred *cred) { - int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, nfs_callback_tcpport); + int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, + nfs_callback_tcpport, cred); if (status == 0) - status = nfs4_proc_setclientid_confirm(clp); + status = nfs4_proc_setclientid_confirm(clp, cred); if (status == 0) nfs4_schedule_state_renewal(clp); return status; } -int nfs4_init_client(struct nfs4_client *clp) -{ - return nfs4_map_errors(__nfs4_init_client(clp)); -} - u32 nfs4_alloc_lockowner_id(struct nfs4_client *clp) { @@ -235,6 +225,32 @@ nfs4_client_grab_unused(struct nfs4_client *clp, struct rpc_cred *cred) return sp; } +struct rpc_cred *nfs4_get_renew_cred(struct nfs4_client *clp) +{ + struct nfs4_state_owner *sp; + struct rpc_cred *cred = NULL; + + list_for_each_entry(sp, &clp->cl_state_owners, so_list) { + if (list_empty(&sp->so_states)) + continue; + cred = get_rpccred(sp->so_cred); + break; + } + return cred; +} + +struct rpc_cred *nfs4_get_setclientid_cred(struct nfs4_client *clp) +{ + struct nfs4_state_owner *sp; + + if (!list_empty(&clp->cl_state_owners)) { + sp = list_entry(clp->cl_state_owners.next, + struct nfs4_state_owner, so_list); + return get_rpccred(sp->so_cred); + } + return NULL; +} + static struct nfs4_state_owner * nfs4_find_state_owner(struct nfs4_client *clp, struct rpc_cred *cred) { @@ -349,14 
+365,9 @@ nfs4_alloc_open_state(void) { struct nfs4_state *state; - state = kmalloc(sizeof(*state), GFP_KERNEL); + state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; - state->state = 0; - state->nreaders = 0; - state->nwriters = 0; - state->flags = 0; - memset(state->stateid.data, 0, sizeof(state->stateid.data)); atomic_set(&state->count, 1); INIT_LIST_HEAD(&state->lock_states); spin_lock_init(&state->state_lock); @@ -475,15 +486,23 @@ void nfs4_close_state(struct nfs4_state *state, mode_t mode) /* Protect against nfs4_find_state() */ spin_lock(&owner->so_lock); spin_lock(&inode->i_lock); - if (mode & FMODE_READ) - state->nreaders--; - if (mode & FMODE_WRITE) - state->nwriters--; + switch (mode & (FMODE_READ | FMODE_WRITE)) { + case FMODE_READ: + state->n_rdonly--; + break; + case FMODE_WRITE: + state->n_wronly--; + break; + case FMODE_READ|FMODE_WRITE: + state->n_rdwr--; + } oldstate = newstate = state->state; - if (state->nreaders == 0) - newstate &= ~FMODE_READ; - if (state->nwriters == 0) - newstate &= ~FMODE_WRITE; + if (state->n_rdwr == 0) { + if (state->n_rdonly == 0) + newstate &= ~FMODE_READ; + if (state->n_wronly == 0) + newstate &= ~FMODE_WRITE; + } if (test_bit(NFS_DELEGATED_STATE, &state->flags)) { nfs4_state_set_mode_locked(state, newstate); oldstate = newstate; @@ -733,45 +752,43 @@ out: } static int reclaimer(void *); -struct reclaimer_args { - struct nfs4_client *clp; - struct completion complete; -}; + +static inline void nfs4_clear_recover_bit(struct nfs4_client *clp) +{ + smp_mb__before_clear_bit(); + clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state); + smp_mb__after_clear_bit(); + wake_up_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER); + rpc_wake_up(&clp->cl_rpcwaitq); +} /* * State recovery routine */ -void -nfs4_recover_state(void *data) +static void nfs4_recover_state(struct nfs4_client *clp) { - struct nfs4_client *clp = (struct nfs4_client *)data; - struct reclaimer_args args = { - .clp = clp, - }; - might_sleep(); - - 
init_completion(&args.complete); + struct task_struct *task; - if (kernel_thread(reclaimer, &args, CLONE_KERNEL) < 0) - goto out_failed_clear; - wait_for_completion(&args.complete); - return; -out_failed_clear: - set_bit(NFS4CLNT_OK, &clp->cl_state); - wake_up_all(&clp->cl_waitq); - rpc_wake_up(&clp->cl_rpcwaitq); + __module_get(THIS_MODULE); + atomic_inc(&clp->cl_count); + task = kthread_run(reclaimer, clp, "%u.%u.%u.%u-reclaim", + NIPQUAD(clp->cl_addr)); + if (!IS_ERR(task)) + return; + nfs4_clear_recover_bit(clp); + nfs4_put_client(clp); + module_put(THIS_MODULE); } /* * Schedule a state recovery attempt */ -void -nfs4_schedule_state_recovery(struct nfs4_client *clp) +void nfs4_schedule_state_recovery(struct nfs4_client *clp) { if (!clp) return; - if (test_and_clear_bit(NFS4CLNT_OK, &clp->cl_state)) - schedule_work(&clp->cl_recoverd); + if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) + nfs4_recover_state(clp); } static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state) @@ -887,18 +904,14 @@ static void nfs4_state_mark_reclaim(struct nfs4_client *clp) static int reclaimer(void *ptr) { - struct reclaimer_args *args = (struct reclaimer_args *)ptr; - struct nfs4_client *clp = args->clp; + struct nfs4_client *clp = ptr; struct nfs4_state_owner *sp; struct nfs4_state_recovery_ops *ops; + struct rpc_cred *cred; int status = 0; - daemonize("%u.%u.%u.%u-reclaim", NIPQUAD(clp->cl_addr)); allow_signal(SIGKILL); - atomic_inc(&clp->cl_count); - complete(&args->complete); - /* Ensure exclusive access to NFSv4 state */ lock_kernel(); down_write(&clp->cl_sem); @@ -906,20 +919,33 @@ static int reclaimer(void *ptr) if (list_empty(&clp->cl_superblocks)) goto out; restart_loop: - status = nfs4_proc_renew(clp); - switch (status) { - case 0: - case -NFS4ERR_CB_PATH_DOWN: - goto out; - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_LEASE_MOVED: - ops = &nfs4_reboot_recovery_ops; - break; - default: - ops = 
&nfs4_network_partition_recovery_ops; - }; + ops = &nfs4_network_partition_recovery_ops; + /* Are there any open files on this volume? */ + cred = nfs4_get_renew_cred(clp); + if (cred != NULL) { + /* Yes there are: try to renew the old lease */ + status = nfs4_proc_renew(clp, cred); + switch (status) { + case 0: + case -NFS4ERR_CB_PATH_DOWN: + put_rpccred(cred); + goto out; + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_LEASE_MOVED: + ops = &nfs4_reboot_recovery_ops; + } + } else { + /* "reboot" to ensure we clear all state on the server */ + clp->cl_boot_time = CURRENT_TIME; + cred = nfs4_get_setclientid_cred(clp); + } + /* We're going to have to re-establish a clientid */ nfs4_state_mark_reclaim(clp); - status = __nfs4_init_client(clp); + status = -ENOENT; + if (cred != NULL) { + status = nfs4_init_client(clp, cred); + put_rpccred(cred); + } if (status) goto out_error; /* Mark all delegations for reclaim */ @@ -940,14 +966,13 @@ restart_loop: } nfs_delegation_reap_unclaimed(clp); out: - set_bit(NFS4CLNT_OK, &clp->cl_state); up_write(&clp->cl_sem); unlock_kernel(); - wake_up_all(&clp->cl_waitq); - rpc_wake_up(&clp->cl_rpcwaitq); if (status == -NFS4ERR_CB_PATH_DOWN) nfs_handle_cb_pathdown(clp); + nfs4_clear_recover_bit(clp); nfs4_put_client(clp); + module_put_and_exit(0); return 0; out_error: printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u with error %d\n", diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fbbace8a30c4..4bbf5ef57785 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -392,9 +392,11 @@ static int nfs_stat_to_errno(int); decode_getattr_maxsz) #define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_delegreturn_maxsz) + encode_delegreturn_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ - decode_delegreturn_maxsz) + decode_delegreturn_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ 
encode_putfh_maxsz + \ encode_getattr_maxsz) @@ -564,7 +566,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s } if (iap->ia_valid & ATTR_MODE) { bmval1 |= FATTR4_WORD1_MODE; - WRITE32(iap->ia_mode); + WRITE32(iap->ia_mode & S_IALLUGO); } if (iap->ia_valid & ATTR_UID) { bmval1 |= FATTR4_WORD1_OWNER; @@ -742,69 +744,80 @@ static int encode_link(struct xdr_stream *xdr, const struct qstr *name) return 0; } +static inline int nfs4_lock_type(struct file_lock *fl, int block) +{ + if ((fl->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) == F_RDLCK) + return block ? NFS4_READW_LT : NFS4_READ_LT; + return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT; +} + +static inline uint64_t nfs4_lock_length(struct file_lock *fl) +{ + if (fl->fl_end == OFFSET_MAX) + return ~(uint64_t)0; + return fl->fl_end - fl->fl_start + 1; +} + /* * opcode,type,reclaim,offset,length,new_lock_owner = 32 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 */ -static int encode_lock(struct xdr_stream *xdr, const struct nfs_lockargs *arg) +static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args) { uint32_t *p; - struct nfs_lock_opargs *opargs = arg->u.lock; RESERVE_SPACE(32); WRITE32(OP_LOCK); - WRITE32(arg->type); - WRITE32(opargs->reclaim); - WRITE64(arg->offset); - WRITE64(arg->length); - WRITE32(opargs->new_lock_owner); - if (opargs->new_lock_owner){ + WRITE32(nfs4_lock_type(args->fl, args->block)); + WRITE32(args->reclaim); + WRITE64(args->fl->fl_start); + WRITE64(nfs4_lock_length(args->fl)); + WRITE32(args->new_lock_owner); + if (args->new_lock_owner){ RESERVE_SPACE(40); - WRITE32(opargs->open_seqid->sequence->counter); - WRITEMEM(opargs->open_stateid->data, sizeof(opargs->open_stateid->data)); - WRITE32(opargs->lock_seqid->sequence->counter); - WRITE64(opargs->lock_owner.clientid); + WRITE32(args->open_seqid->sequence->counter); + WRITEMEM(args->open_stateid->data, sizeof(args->open_stateid->data)); + 
WRITE32(args->lock_seqid->sequence->counter); + WRITE64(args->lock_owner.clientid); WRITE32(4); - WRITE32(opargs->lock_owner.id); + WRITE32(args->lock_owner.id); } else { RESERVE_SPACE(20); - WRITEMEM(opargs->lock_stateid->data, sizeof(opargs->lock_stateid->data)); - WRITE32(opargs->lock_seqid->sequence->counter); + WRITEMEM(args->lock_stateid->data, sizeof(args->lock_stateid->data)); + WRITE32(args->lock_seqid->sequence->counter); } return 0; } -static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockargs *arg) +static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args) { uint32_t *p; - struct nfs_lowner *opargs = arg->u.lockt; RESERVE_SPACE(40); WRITE32(OP_LOCKT); - WRITE32(arg->type); - WRITE64(arg->offset); - WRITE64(arg->length); - WRITE64(opargs->clientid); + WRITE32(nfs4_lock_type(args->fl, 0)); + WRITE64(args->fl->fl_start); + WRITE64(nfs4_lock_length(args->fl)); + WRITE64(args->lock_owner.clientid); WRITE32(4); - WRITE32(opargs->id); + WRITE32(args->lock_owner.id); return 0; } -static int encode_locku(struct xdr_stream *xdr, const struct nfs_lockargs *arg) +static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args) { uint32_t *p; - struct nfs_locku_opargs *opargs = arg->u.locku; RESERVE_SPACE(44); WRITE32(OP_LOCKU); - WRITE32(arg->type); - WRITE32(opargs->seqid->sequence->counter); - WRITEMEM(opargs->stateid->data, sizeof(opargs->stateid->data)); - WRITE64(arg->offset); - WRITE64(arg->length); + WRITE32(nfs4_lock_type(args->fl, 0)); + WRITE32(args->seqid->sequence->counter); + WRITEMEM(args->stateid->data, sizeof(args->stateid->data)); + WRITE64(args->fl->fl_start); + WRITE64(nfs4_lock_length(args->fl)); return 0; } @@ -964,9 +977,9 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con { uint32_t *p; - RESERVE_SPACE(8+sizeof(arg->stateid.data)); + RESERVE_SPACE(8+sizeof(arg->stateid->data)); WRITE32(OP_OPEN_CONFIRM); - WRITEMEM(arg->stateid.data, 
sizeof(arg->stateid.data)); + WRITEMEM(arg->stateid->data, sizeof(arg->stateid->data)); WRITE32(arg->seqid->sequence->counter); return 0; @@ -1499,9 +1512,6 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, uint32_t *p, struct nfs_opena }; int status; - status = nfs_wait_on_sequence(args->seqid, req->rq_task); - if (status != 0) - goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1538,9 +1548,6 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, uint32_t *p, struct n }; int status; - status = nfs_wait_on_sequence(args->seqid, req->rq_task); - if (status != 0) - goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1558,19 +1565,19 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, uint32_t *p, struct nf { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 3, }; int status; - status = nfs_wait_on_sequence(args->seqid, req->rq_task); - if (status != 0) - goto out; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); if (status) goto out; status = encode_open(&xdr, args); + if (status) + goto out; + status = encode_getfattr(&xdr, args->bitmask); out: return status; } @@ -1602,21 +1609,14 @@ out: /* * Encode a LOCK request */ -static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args) +static int nfs4_xdr_enc_lock(struct rpc_rqst *req, uint32_t *p, struct nfs_lock_args *args) { struct xdr_stream xdr; struct compound_hdr hdr = { .nops = 2, }; - struct nfs_lock_opargs *opargs = args->u.lock; int status; - status = nfs_wait_on_sequence(opargs->lock_seqid, req->rq_task); - if (status != 0) - goto out; - /* Do we need to do an open_to_lock_owner? 
*/ - if (opargs->lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED) - opargs->new_lock_owner = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); status = encode_putfh(&xdr, args->fh); @@ -1630,7 +1630,7 @@ out: /* * Encode a LOCKT request */ -static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args) +static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, uint32_t *p, struct nfs_lockt_args *args) { struct xdr_stream xdr; struct compound_hdr hdr = { @@ -1651,7 +1651,7 @@ out: /* * Encode a LOCKU request */ -static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_lockargs *args) +static int nfs4_xdr_enc_locku(struct rpc_rqst *req, uint32_t *p, struct nfs_locku_args *args) { struct xdr_stream xdr; struct compound_hdr hdr = { @@ -1985,14 +1985,20 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, uint32_t *p, const str { struct xdr_stream xdr; struct compound_hdr hdr = { - .nops = 2, + .nops = 3, }; int status; xdr_init_encode(&xdr, &req->rq_snd_buf, p); encode_compound_hdr(&xdr, &hdr); - if ((status = encode_putfh(&xdr, args->fhandle)) == 0) - status = encode_delegreturn(&xdr, args->stateid); + status = encode_putfh(&xdr, args->fhandle); + if (status != 0) + goto out; + status = encode_delegreturn(&xdr, args->stateid); + if (status != 0) + goto out; + status = encode_getfattr(&xdr, args->bitmask); +out: return status; } @@ -2955,55 +2961,64 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) /* * We create the owner, so we know a proper owner.id length is 4. 
*/ -static int decode_lock_denied (struct xdr_stream *xdr, struct nfs_lock_denied *denied) +static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) { + uint64_t offset, length, clientid; uint32_t *p; - uint32_t namelen; + uint32_t namelen, type; READ_BUF(32); - READ64(denied->offset); - READ64(denied->length); - READ32(denied->type); - READ64(denied->owner.clientid); + READ64(offset); + READ64(length); + READ32(type); + if (fl != NULL) { + fl->fl_start = (loff_t)offset; + fl->fl_end = fl->fl_start + (loff_t)length - 1; + if (length == ~(uint64_t)0) + fl->fl_end = OFFSET_MAX; + fl->fl_type = F_WRLCK; + if (type & 1) + fl->fl_type = F_RDLCK; + fl->fl_pid = 0; + } + READ64(clientid); READ32(namelen); READ_BUF(namelen); - if (namelen == 4) - READ32(denied->owner.id); return -NFS4ERR_DENIED; } -static int decode_lock(struct xdr_stream *xdr, struct nfs_lockres *res) +static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) { uint32_t *p; int status; status = decode_op_hdr(xdr, OP_LOCK); if (status == 0) { - READ_BUF(sizeof(res->u.stateid.data)); - COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data)); + READ_BUF(sizeof(res->stateid.data)); + COPYMEM(res->stateid.data, sizeof(res->stateid.data)); } else if (status == -NFS4ERR_DENIED) - return decode_lock_denied(xdr, &res->u.denied); + return decode_lock_denied(xdr, NULL); return status; } -static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockres *res) +static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) { int status; status = decode_op_hdr(xdr, OP_LOCKT); if (status == -NFS4ERR_DENIED) - return decode_lock_denied(xdr, &res->u.denied); + return decode_lock_denied(xdr, res->denied); return status; } -static int decode_locku(struct xdr_stream *xdr, struct nfs_lockres *res) +static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) { uint32_t *p; int status; status = decode_op_hdr(xdr, OP_LOCKU); if (status == 0) { - 
READ_BUF(sizeof(res->u.stateid.data)); - COPYMEM(res->u.stateid.data, sizeof(res->u.stateid.data)); + READ_BUF(sizeof(res->stateid.data)); + COPYMEM(res->stateid.data, sizeof(res->stateid.data)); } return status; } @@ -3831,6 +3846,9 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, uint32_t *p, struct if (status) goto out; status = decode_open(&xdr, res); + if (status) + goto out; + decode_getfattr(&xdr, res->f_attr, res->server); out: return status; } @@ -3864,7 +3882,7 @@ out: /* * Decode LOCK response */ -static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res) +static int nfs4_xdr_dec_lock(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lock_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -3885,7 +3903,7 @@ out: /* * Decode LOCKT response */ -static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res) +static int nfs4_xdr_dec_lockt(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockt_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -3906,7 +3924,7 @@ out: /* * Decode LOCKU response */ -static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_lockres *res) +static int nfs4_xdr_dec_locku(struct rpc_rqst *rqstp, uint32_t *p, struct nfs_locku_res *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -4174,7 +4192,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, uint32_t *p, s /* * DELEGRETURN request */ -static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *dummy) +static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, struct nfs4_delegreturnres *res) { struct xdr_stream xdr; struct compound_hdr hdr; @@ -4182,11 +4200,14 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, uint32_t *p, void *d xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); status = decode_compound_hdr(&xdr, &hdr); - if (status == 0) { - status = decode_putfh(&xdr); - if (status == 0) - status 
= decode_delegreturn(&xdr); - } + if (status != 0) + goto out; + status = decode_putfh(&xdr); + if (status != 0) + goto out; + status = decode_delegreturn(&xdr); + decode_getfattr(&xdr, res->fattr, res->server); +out: return status; } diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 1b272a135a31..e897e00c2c9d 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -275,7 +275,9 @@ static int __init root_nfs_parse(char *name, char *buf) case Opt_noacl: nfs_data.flags |= NFS_MOUNT_NOACL; break; - default : + default: + printk(KERN_WARNING "Root-NFS: unknown " + "option: %s\n", p); return 0; } } @@ -296,8 +298,8 @@ static int __init root_nfs_name(char *name) nfs_port = -1; nfs_data.version = NFS_MOUNT_VERSION; nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */ - nfs_data.rsize = NFS_DEF_FILE_IO_BUFFER_SIZE; - nfs_data.wsize = NFS_DEF_FILE_IO_BUFFER_SIZE; + nfs_data.rsize = NFS_DEF_FILE_IO_SIZE; + nfs_data.wsize = NFS_DEF_FILE_IO_SIZE; nfs_data.acregmin = 3; nfs_data.acregmax = 60; nfs_data.acdirmin = 30; diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index e1e3ca5d746b..f5150d71c03d 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -111,6 +111,9 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, }; int status; + /* Mask out the non-modebit related stuff from attr->ia_mode */ + sattr->ia_mode &= S_IALLUGO; + dprintk("NFS call setattr\n"); nfs_fattr_init(fattr); status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0); @@ -547,10 +550,9 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); -static void -nfs_read_done(struct rpc_task *task) +static void nfs_read_done(struct rpc_task *task, void *calldata) { - struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; + struct nfs_read_data *data = calldata; if (task->tk_status >= 0) { nfs_refresh_inode(data->inode, data->res.fattr); @@ -560,9 +562,14 @@ nfs_read_done(struct 
rpc_task *task) if (data->args.offset + data->args.count >= data->res.fattr->size) data->res.eof = 1; } - nfs_readpage_result(task); + nfs_readpage_result(task, calldata); } +static const struct rpc_call_ops nfs_read_ops = { + .rpc_call_done = nfs_read_done, + .rpc_release = nfs_readdata_release, +}; + static void nfs_proc_read_setup(struct nfs_read_data *data) { @@ -580,20 +587,24 @@ nfs_proc_read_setup(struct nfs_read_data *data) flags = RPC_TASK_ASYNC | (IS_SWAPFILE(inode)? NFS_RPC_SWAPFLAGS : 0); /* Finalize the task. */ - rpc_init_task(task, NFS_CLIENT(inode), nfs_read_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_read_ops, data); rpc_call_setup(task, &msg, 0); } -static void -nfs_write_done(struct rpc_task *task) +static void nfs_write_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_write_data *data = calldata; if (task->tk_status >= 0) nfs_post_op_update_inode(data->inode, data->res.fattr); - nfs_writeback_done(task); + nfs_writeback_done(task, calldata); } +static const struct rpc_call_ops nfs_write_ops = { + .rpc_call_done = nfs_write_done, + .rpc_release = nfs_writedata_release, +}; + static void nfs_proc_write_setup(struct nfs_write_data *data, int how) { @@ -614,7 +625,7 @@ nfs_proc_write_setup(struct nfs_write_data *data, int how) flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; /* Finalize the task. 
*/ - rpc_init_task(task, NFS_CLIENT(inode), nfs_write_done, flags); + rpc_init_task(task, NFS_CLIENT(inode), flags, &nfs_write_ops, data); rpc_call_setup(task, &msg, 0); } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 5f20eafba8ec..05eb43fadf8e 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -42,9 +42,8 @@ mempool_t *nfs_rdata_mempool; #define MIN_POOL_READ (32) -void nfs_readdata_release(struct rpc_task *task) +void nfs_readdata_release(void *data) { - struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; nfs_readdata_free(data); } @@ -84,7 +83,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode, int result; struct nfs_read_data *rdata; - rdata = nfs_readdata_alloc(); + rdata = nfs_readdata_alloc(1); if (!rdata) return -ENOMEM; @@ -220,9 +219,6 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, NFS_PROTO(inode)->read_setup(data); data->task.tk_cookie = (unsigned long)inode; - data->task.tk_calldata = data; - /* Release requests */ - data->task.tk_release = nfs_readdata_release; dprintk("NFS: %4d initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", data->task.tk_pid, @@ -287,7 +283,7 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode) nbytes = req->wb_bytes; for(;;) { - data = nfs_readdata_alloc(); + data = nfs_readdata_alloc(1); if (!data) goto out_bad; INIT_LIST_HEAD(&data->pages); @@ -343,7 +339,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode) if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) return nfs_pagein_multi(head, inode); - data = nfs_readdata_alloc(); + data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages); if (!data) goto out_bad; @@ -452,9 +448,9 @@ static void nfs_readpage_result_full(struct nfs_read_data *data, int status) * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). 
*/ -void nfs_readpage_result(struct rpc_task *task) +void nfs_readpage_result(struct rpc_task *task, void *calldata) { - struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; + struct nfs_read_data *data = calldata; struct nfs_readargs *argp = &data->args; struct nfs_readres *resp = &data->res; int status = task->tk_status; diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c new file mode 100644 index 000000000000..4c486eb867ca --- /dev/null +++ b/fs/nfs/sysctl.c @@ -0,0 +1,84 @@ +/* + * linux/fs/nfs/sysctl.c + * + * Sysctl interface to NFS parameters + */ +#include <linux/config.h> +#include <linux/types.h> +#include <linux/linkage.h> +#include <linux/ctype.h> +#include <linux/fs.h> +#include <linux/sysctl.h> +#include <linux/module.h> +#include <linux/nfs4.h> +#include <linux/nfs_idmap.h> + +#include "callback.h" + +static const int nfs_set_port_min = 0; +static const int nfs_set_port_max = 65535; +static struct ctl_table_header *nfs_callback_sysctl_table; +/* + * Something that isn't CTL_ANY, CTL_NONE or a value that may clash. 
+ * Use the same values as fs/lockd/svc.c + */ +#define CTL_UNNUMBERED -2 + +static ctl_table nfs_cb_sysctls[] = { +#ifdef CONFIG_NFS_V4 + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nfs_callback_tcpport", + .data = &nfs_callback_set_tcpport, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (int *)&nfs_set_port_min, + .extra2 = (int *)&nfs_set_port_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "idmap_cache_timeout", + .data = &nfs_idmap_cache_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, +#endif + { .ctl_name = 0 } +}; + +static ctl_table nfs_cb_sysctl_dir[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nfs", + .mode = 0555, + .child = nfs_cb_sysctls, + }, + { .ctl_name = 0 } +}; + +static ctl_table nfs_cb_sysctl_root[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = nfs_cb_sysctl_dir, + }, + { .ctl_name = 0 } +}; + +int nfs_register_sysctl(void) +{ + nfs_callback_sysctl_table = register_sysctl_table(nfs_cb_sysctl_root, 0); + if (nfs_callback_sysctl_table == NULL) + return -ENOMEM; + return 0; +} + +void nfs_unregister_sysctl(void) +{ + unregister_sysctl_table(nfs_callback_sysctl_table); + nfs_callback_sysctl_table = NULL; +} diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index d639d172d568..a65c7b53d558 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -87,10 +87,9 @@ nfs_copy_dname(struct dentry *dentry, struct nfs_unlinkdata *data) * We delay initializing RPC info until after the call to dentry_iput() * in order to minimize races against rename(). 
*/ -static void -nfs_async_unlink_init(struct rpc_task *task) +static void nfs_async_unlink_init(struct rpc_task *task, void *calldata) { - struct nfs_unlinkdata *data = (struct nfs_unlinkdata *)task->tk_calldata; + struct nfs_unlinkdata *data = calldata; struct dentry *dir = data->dir; struct rpc_message msg = { .rpc_cred = data->cred, @@ -116,10 +115,9 @@ nfs_async_unlink_init(struct rpc_task *task) * * Do the directory attribute update. */ -static void -nfs_async_unlink_done(struct rpc_task *task) +static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) { - struct nfs_unlinkdata *data = (struct nfs_unlinkdata *)task->tk_calldata; + struct nfs_unlinkdata *data = calldata; struct dentry *dir = data->dir; struct inode *dir_i; @@ -141,13 +139,18 @@ nfs_async_unlink_done(struct rpc_task *task) * We need to call nfs_put_unlinkdata as a 'tk_release' task since the * rpc_task would be freed too. */ -static void -nfs_async_unlink_release(struct rpc_task *task) +static void nfs_async_unlink_release(void *calldata) { - struct nfs_unlinkdata *data = (struct nfs_unlinkdata *)task->tk_calldata; + struct nfs_unlinkdata *data = calldata; nfs_put_unlinkdata(data); } +static const struct rpc_call_ops nfs_unlink_ops = { + .rpc_call_prepare = nfs_async_unlink_init, + .rpc_call_done = nfs_async_unlink_done, + .rpc_release = nfs_async_unlink_release, +}; + /** * nfs_async_unlink - asynchronous unlinking of a file * @dentry: dentry to unlink @@ -157,7 +160,6 @@ nfs_async_unlink(struct dentry *dentry) { struct dentry *dir = dentry->d_parent; struct nfs_unlinkdata *data; - struct rpc_task *task; struct rpc_clnt *clnt = NFS_CLIENT(dir->d_inode); int status = -ENOMEM; @@ -178,17 +180,13 @@ nfs_async_unlink(struct dentry *dentry) nfs_deletes = data; data->count = 1; - task = &data->task; - rpc_init_task(task, clnt, nfs_async_unlink_done , RPC_TASK_ASYNC); - task->tk_calldata = data; - task->tk_action = nfs_async_unlink_init; - task->tk_release = nfs_async_unlink_release; + 
rpc_init_task(&data->task, clnt, RPC_TASK_ASYNC, &nfs_unlink_ops, data); spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NFSFS_RENAMED; spin_unlock(&dentry->d_lock); - rpc_sleep_on(&nfs_delete_queue, task, NULL, NULL); + rpc_sleep_on(&nfs_delete_queue, &data->task, NULL, NULL); status = 0; out: return status; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3107908e5f3f..9449b6835509 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -89,24 +89,38 @@ static mempool_t *nfs_commit_mempool; static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion); -static inline struct nfs_write_data *nfs_commit_alloc(void) +static inline struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount) { struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS); + if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->pages); + if (pagecount < NFS_PAGEVEC_SIZE) + p->pagevec = &p->page_array[0]; + else { + size_t size = ++pagecount * sizeof(struct page *); + p->pagevec = kmalloc(size, GFP_NOFS); + if (p->pagevec) { + memset(p->pagevec, 0, size); + } else { + mempool_free(p, nfs_commit_mempool); + p = NULL; + } + } } return p; } static inline void nfs_commit_free(struct nfs_write_data *p) { + if (p && (p->pagevec != &p->page_array[0])) + kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } -static void nfs_writedata_release(struct rpc_task *task) +void nfs_writedata_release(void *wdata) { - struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; nfs_writedata_free(wdata); } @@ -168,7 +182,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode, int result, written = 0; struct nfs_write_data *wdata; - wdata = nfs_writedata_alloc(); + wdata = nfs_writedata_alloc(1); if (!wdata) return -ENOMEM; @@ -232,19 +246,16 @@ static int nfs_writepage_async(struct nfs_open_context *ctx, unsigned int offset, unsigned int count) { struct nfs_page *req; - int status; req = nfs_update_request(ctx, inode, page, offset, count); - status 
= (IS_ERR(req)) ? PTR_ERR(req) : 0; - if (status < 0) - goto out; + if (IS_ERR(req)) + return PTR_ERR(req); /* Update file length */ nfs_grow_file(page, offset, count); /* Set the PG_uptodate flag? */ nfs_mark_uptodate(page, offset, count); nfs_unlock_request(req); - out: - return status; + return 0; } static int wb_priority(struct writeback_control *wbc) @@ -304,11 +315,8 @@ do_it: lock_kernel(); if (!IS_SYNC(inode) && inode_referenced) { err = nfs_writepage_async(ctx, inode, page, 0, offset); - if (err >= 0) { - err = 0; - if (wbc->for_reclaim) - nfs_flush_inode(inode, 0, 0, FLUSH_STABLE); - } + if (!wbc->for_writepages) + nfs_flush_inode(inode, 0, 0, wb_priority(wbc)); } else { err = nfs_writepage_sync(ctx, inode, page, 0, offset, priority); @@ -877,9 +885,6 @@ static void nfs_write_rpcsetup(struct nfs_page *req, data->task.tk_priority = flush_task_priority(how); data->task.tk_cookie = (unsigned long)inode; - data->task.tk_calldata = data; - /* Release requests */ - data->task.tk_release = nfs_writedata_release; dprintk("NFS: %4d initiated write call (req %s/%Ld, %u bytes @ offset %Lu)\n", data->task.tk_pid, @@ -919,7 +924,7 @@ static int nfs_flush_multi(struct list_head *head, struct inode *inode, int how) nbytes = req->wb_bytes; for (;;) { - data = nfs_writedata_alloc(); + data = nfs_writedata_alloc(1); if (!data) goto out_bad; list_add(&data->pages, &list); @@ -983,7 +988,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how) if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE) return nfs_flush_multi(head, inode, how); - data = nfs_writedata_alloc(); + data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages); if (!data) goto out_bad; @@ -1137,9 +1142,9 @@ static void nfs_writeback_done_full(struct nfs_write_data *data, int status) /* * This function is called when the WRITE call is complete. 
*/ -void nfs_writeback_done(struct rpc_task *task) +void nfs_writeback_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct nfs_write_data *data = calldata; struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; @@ -1206,9 +1211,8 @@ void nfs_writeback_done(struct rpc_task *task) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -static void nfs_commit_release(struct rpc_task *task) +void nfs_commit_release(void *wdata) { - struct nfs_write_data *wdata = (struct nfs_write_data *)task->tk_calldata; nfs_commit_free(wdata); } @@ -1244,9 +1248,6 @@ static void nfs_commit_rpcsetup(struct list_head *head, data->task.tk_priority = flush_task_priority(how); data->task.tk_cookie = (unsigned long)inode; - data->task.tk_calldata = data; - /* Release requests */ - data->task.tk_release = nfs_commit_release; dprintk("NFS: %4d initiated commit call\n", data->task.tk_pid); } @@ -1255,12 +1256,12 @@ static void nfs_commit_rpcsetup(struct list_head *head, * Commit dirty pages */ static int -nfs_commit_list(struct list_head *head, int how) +nfs_commit_list(struct inode *inode, struct list_head *head, int how) { struct nfs_write_data *data; struct nfs_page *req; - data = nfs_commit_alloc(); + data = nfs_commit_alloc(NFS_SERVER(inode)->wpages); if (!data) goto out_bad; @@ -1283,10 +1284,9 @@ nfs_commit_list(struct list_head *head, int how) /* * COMMIT call returned */ -void -nfs_commit_done(struct rpc_task *task) +void nfs_commit_done(struct rpc_task *task, void *calldata) { - struct nfs_write_data *data = (struct nfs_write_data *)task->tk_calldata; + struct nfs_write_data *data = calldata; struct nfs_page *req; int res = 0; @@ -1366,7 +1366,7 @@ int nfs_commit_inode(struct inode *inode, int how) res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&nfsi->req_lock); if (res) { - error = nfs_commit_list(&head, how); + error = nfs_commit_list(inode, &head, how); if (error < 0) 
return error; } @@ -1377,22 +1377,23 @@ int nfs_commit_inode(struct inode *inode, int how) int nfs_sync_inode(struct inode *inode, unsigned long idx_start, unsigned int npages, int how) { - int error, - wait; + int nocommit = how & FLUSH_NOCOMMIT; + int wait = how & FLUSH_WAIT; + int error; - wait = how & FLUSH_WAIT; - how &= ~FLUSH_WAIT; + how &= ~(FLUSH_WAIT|FLUSH_NOCOMMIT); do { - error = 0; - if (wait) + if (wait) { error = nfs_wait_on_requests(inode, idx_start, npages); - if (error == 0) - error = nfs_flush_inode(inode, idx_start, npages, how); -#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) - if (error == 0) + if (error != 0) + continue; + } + error = nfs_flush_inode(inode, idx_start, npages, how); + if (error != 0) + continue; + if (!nocommit) error = nfs_commit_inode(inode, how); -#endif } while (error > 0); return error; } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 041380fe667b..6d2dfed1de08 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -56,13 +56,20 @@ static int nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, struct nfsd3_attrstat *resp) { - int nfserr; + int err, nfserr; dprintk("nfsd: GETATTR(3) %s\n", - SVCFH_fmt(&argp->fh)); + SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + err = vfs_getattr(resp->fh.fh_export->ex_mnt, + resp->fh.fh_dentry, &resp->stat); + nfserr = nfserrno(err); + RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9147b8524d05..243d94b9653a 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -154,37 +154,34 @@ decode_sattr3(u32 *p, struct iattr *iap) } static inline u32 * -encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) +encode_fattr3(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp, + struct kstat *stat) { - struct vfsmount *mnt = fhp->fh_export->ex_mnt; struct dentry *dentry = fhp->fh_dentry; - struct kstat stat; struct 
timespec time; - vfs_getattr(mnt, dentry, &stat); - - *p++ = htonl(nfs3_ftypes[(stat.mode & S_IFMT) >> 12]); - *p++ = htonl((u32) stat.mode); - *p++ = htonl((u32) stat.nlink); - *p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid)); - *p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid)); - if (S_ISLNK(stat.mode) && stat.size > NFS3_MAXPATHLEN) { + *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); + *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); + if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) { p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN); } else { - p = xdr_encode_hyper(p, (u64) stat.size); + p = xdr_encode_hyper(p, (u64) stat->size); } - p = xdr_encode_hyper(p, ((u64)stat.blocks) << 9); - *p++ = htonl((u32) MAJOR(stat.rdev)); - *p++ = htonl((u32) MINOR(stat.rdev)); + p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9); + *p++ = htonl((u32) MAJOR(stat->rdev)); + *p++ = htonl((u32) MINOR(stat->rdev)); if (is_fsid(fhp, rqstp->rq_reffh)) p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); else - p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat.dev)); - p = xdr_encode_hyper(p, (u64) stat.ino); - p = encode_time3(p, &stat.atime); + p = xdr_encode_hyper(p, (u64) huge_encode_dev(stat->dev)); + p = xdr_encode_hyper(p, (u64) stat->ino); + p = encode_time3(p, &stat->atime); lease_get_mtime(dentry->d_inode, &time); p = encode_time3(p, &time); - p = encode_time3(p, &stat.ctime); + p = encode_time3(p, &stat->ctime); return p; } @@ -232,8 +229,14 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; if (dentry && dentry->d_inode != NULL) { - *p++ = xdr_one; /* attributes follow */ - return encode_fattr3(rqstp, p, fhp); + int err; + struct kstat stat; + + err = vfs_getattr(fhp->fh_export->ex_mnt, dentry, &stat); + if (!err) { + *p++ = xdr_one; /* attributes follow */ + return 
encode_fattr3(rqstp, p, fhp, &stat); + } } *p++ = xdr_zero; return p; @@ -616,7 +619,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, u32 *p, struct nfsd3_attrstat *resp) { if (resp->status == 0) - p = encode_fattr3(rqstp, p, &resp->fh); + p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 583c0710e45e..d828662d737d 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -53,7 +53,7 @@ #define NFSPROC4_CB_COMPOUND 1 /* declarations */ -static void nfs4_cb_null(struct rpc_task *task); +static const struct rpc_call_ops nfs4_cb_null_ops; /* Index of predefined Linux callback client operations */ @@ -431,7 +431,6 @@ nfsd4_probe_callback(struct nfs4_client *clp) } clnt->cl_intr = 0; clnt->cl_softrtry = 1; - clnt->cl_chatty = 1; /* Kick rpciod, put the call on the wire. */ @@ -447,7 +446,7 @@ nfsd4_probe_callback(struct nfs4_client *clp) msg.rpc_cred = nfsd4_lookupcred(clp,0); if (IS_ERR(msg.rpc_cred)) goto out_rpciod; - status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, nfs4_cb_null, NULL); + status = rpc_call_async(clnt, &msg, RPC_TASK_ASYNC, &nfs4_cb_null_ops, NULL); put_rpccred(msg.rpc_cred); if (status != 0) { @@ -469,7 +468,7 @@ out_err: } static void -nfs4_cb_null(struct rpc_task *task) +nfs4_cb_null(struct rpc_task *task, void *dummy) { struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; struct nfs4_callback *cb = &clp->cl_callback; @@ -488,6 +487,10 @@ out: put_nfs4_client(clp); } +static const struct rpc_call_ops nfs4_cb_null_ops = { + .rpc_call_done = nfs4_cb_null, +}; + /* * called with dp->dl_count inc'ed. * nfs4_lock_state() may or may not have been called. 
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 954cf893d50c..be963a133aaa 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -121,9 +121,9 @@ out: static void nfsd4_sync_rec_dir(void) { - down(&rec_dir.dentry->d_inode->i_sem); + mutex_lock(&rec_dir.dentry->d_inode->i_mutex); nfsd_sync_dir(rec_dir.dentry); - up(&rec_dir.dentry->d_inode->i_sem); + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); } int @@ -143,7 +143,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) nfs4_save_user(&uid, &gid); /* lock the parent */ - down(&rec_dir.dentry->d_inode->i_sem); + mutex_lock(&rec_dir.dentry->d_inode->i_mutex); dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1); if (IS_ERR(dentry)) { @@ -159,7 +159,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) out_put: dput(dentry); out_unlock: - up(&rec_dir.dentry->d_inode->i_sem); + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); if (status == 0) { clp->cl_firststate = 1; nfsd4_sync_rec_dir(); @@ -259,9 +259,9 @@ nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry) printk("nfsd4: non-file found in client recovery directory\n"); return -EINVAL; } - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); status = vfs_unlink(dir->d_inode, dentry); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return status; } @@ -274,9 +274,9 @@ nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry) * any regular files anyway, just in case the directory was created by * a kernel from the future.... */ nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file); - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); status = vfs_rmdir(dir->d_inode, dentry); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return status; } @@ -288,9 +288,9 @@ nfsd4_unlink_clid_dir(char *name, int namlen) dprintk("NFSD: nfsd4_unlink_clid_dir. 
name %.*s\n", namlen, name); - down(&rec_dir.dentry->d_inode->i_sem); + mutex_lock(&rec_dir.dentry->d_inode->i_mutex); dentry = lookup_one_len(name, rec_dir.dentry, namlen); - up(&rec_dir.dentry->d_inode->i_sem); + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); return status; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index b45999ff33e6..aa7bb41b293d 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -152,46 +152,44 @@ decode_sattr(u32 *p, struct iattr *iap) } static inline u32 * -encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) +encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp, + struct kstat *stat) { - struct vfsmount *mnt = fhp->fh_export->ex_mnt; struct dentry *dentry = fhp->fh_dentry; - struct kstat stat; int type; struct timespec time; - vfs_getattr(mnt, dentry, &stat); - type = (stat.mode & S_IFMT); + type = (stat->mode & S_IFMT); *p++ = htonl(nfs_ftypes[type >> 12]); - *p++ = htonl((u32) stat.mode); - *p++ = htonl((u32) stat.nlink); - *p++ = htonl((u32) nfsd_ruid(rqstp, stat.uid)); - *p++ = htonl((u32) nfsd_rgid(rqstp, stat.gid)); + *p++ = htonl((u32) stat->mode); + *p++ = htonl((u32) stat->nlink); + *p++ = htonl((u32) nfsd_ruid(rqstp, stat->uid)); + *p++ = htonl((u32) nfsd_rgid(rqstp, stat->gid)); - if (S_ISLNK(type) && stat.size > NFS_MAXPATHLEN) { + if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) { *p++ = htonl(NFS_MAXPATHLEN); } else { - *p++ = htonl((u32) stat.size); + *p++ = htonl((u32) stat->size); } - *p++ = htonl((u32) stat.blksize); + *p++ = htonl((u32) stat->blksize); if (S_ISCHR(type) || S_ISBLK(type)) - *p++ = htonl(new_encode_dev(stat.rdev)); + *p++ = htonl(new_encode_dev(stat->rdev)); else *p++ = htonl(0xffffffff); - *p++ = htonl((u32) stat.blocks); + *p++ = htonl((u32) stat->blocks); if (is_fsid(fhp, rqstp->rq_reffh)) *p++ = htonl((u32) fhp->fh_export->ex_fsid); else - *p++ = htonl(new_encode_dev(stat.dev)); - *p++ = htonl((u32) stat.ino); - *p++ = 
htonl((u32) stat.atime.tv_sec); - *p++ = htonl(stat.atime.tv_nsec ? stat.atime.tv_nsec / 1000 : 0); + *p++ = htonl(new_encode_dev(stat->dev)); + *p++ = htonl((u32) stat->ino); + *p++ = htonl((u32) stat->atime.tv_sec); + *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); lease_get_mtime(dentry->d_inode, &time); *p++ = htonl((u32) time.tv_sec); *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); - *p++ = htonl((u32) stat.ctime.tv_sec); - *p++ = htonl(stat.ctime.tv_nsec ? stat.ctime.tv_nsec / 1000 : 0); + *p++ = htonl((u32) stat->ctime.tv_sec); + *p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0); return p; } @@ -199,7 +197,9 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) /* Helper function for NFSv2 ACL code */ u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) { - return encode_fattr(rqstp, p, fhp); + struct kstat stat; + vfs_getattr(fhp->fh_export->ex_mnt, fhp->fh_dentry, &stat); + return encode_fattr(rqstp, p, fhp, &stat); } /* @@ -394,7 +394,7 @@ int nfssvc_encode_attrstat(struct svc_rqst *rqstp, u32 *p, struct nfsd_attrstat *resp) { - p = encode_fattr(rqstp, p, &resp->fh); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); return xdr_ressize_check(rqstp, p); } @@ -403,7 +403,7 @@ nfssvc_encode_diropres(struct svc_rqst *rqstp, u32 *p, struct nfsd_diropres *resp) { p = encode_fh(p, &resp->fh); - p = encode_fattr(rqstp, p, &resp->fh); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); return xdr_ressize_check(rqstp, p); } @@ -428,7 +428,7 @@ int nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p, struct nfsd_readres *resp) { - p = encode_fattr(rqstp, p, &resp->fh); + p = encode_fattr(rqstp, p, &resp->fh, &resp->stat); *p++ = htonl(resp->count); xdr_ressize_check(rqstp, p); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index af7c3c3074b0..eef0576a7785 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -48,8 +48,8 @@ #include <linux/fsnotify.h> #include <linux/posix_acl.h> #include 
<linux/posix_acl_xattr.h> -#ifdef CONFIG_NFSD_V4 #include <linux/xattr.h> +#ifdef CONFIG_NFSD_V4 #include <linux/nfs4.h> #include <linux/nfs4_acl.h> #include <linux/nfsd_idmap.h> @@ -365,8 +365,30 @@ out_nfserr: goto out; } -#if defined(CONFIG_NFSD_V4) +#if defined(CONFIG_NFSD_V2_ACL) || \ + defined(CONFIG_NFSD_V3_ACL) || \ + defined(CONFIG_NFSD_V4) +static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) +{ + ssize_t buflen; + int error; + + buflen = vfs_getxattr(dentry, key, NULL, 0); + if (buflen <= 0) + return buflen; + + *buf = kmalloc(buflen, GFP_KERNEL); + if (!*buf) + return -ENOMEM; + error = vfs_getxattr(dentry, key, *buf, buflen); + if (error < 0) + return error; + return buflen; +} +#endif + +#if defined(CONFIG_NFSD_V4) static int set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) { @@ -374,7 +396,6 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) size_t buflen; char *buf = NULL; int error = 0; - struct inode *inode = dentry->d_inode; buflen = posix_acl_xattr_size(pacl->a_count); buf = kmalloc(buflen, GFP_KERNEL); @@ -388,15 +409,7 @@ set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) goto out; } - error = -EOPNOTSUPP; - if (inode->i_op && inode->i_op->setxattr) { - down(&inode->i_sem); - security_inode_setxattr(dentry, key, buf, len, 0); - error = inode->i_op->setxattr(dentry, key, buf, len, 0); - if (!error) - security_inode_post_setxattr(dentry, key, buf, len, 0); - up(&inode->i_sem); - } + error = vfs_setxattr(dentry, key, buf, len, 0); out: kfree(buf); return error; @@ -455,44 +468,19 @@ out_nfserr: static struct posix_acl * _get_posix_acl(struct dentry *dentry, char *key) { - struct inode *inode = dentry->d_inode; - char *buf = NULL; - int buflen, error = 0; + void *buf = NULL; struct posix_acl *pacl = NULL; + int buflen; - error = -EOPNOTSUPP; - if (inode->i_op == NULL) - goto out_err; - if (inode->i_op->getxattr == NULL) - goto out_err; - - error = 
security_inode_getxattr(dentry, key); - if (error) - goto out_err; - - buflen = inode->i_op->getxattr(dentry, key, NULL, 0); - if (buflen <= 0) { - error = buflen < 0 ? buflen : -ENODATA; - goto out_err; - } - - buf = kmalloc(buflen, GFP_KERNEL); - if (buf == NULL) { - error = -ENOMEM; - goto out_err; - } - - error = inode->i_op->getxattr(dentry, key, buf, buflen); - if (error < 0) - goto out_err; + buflen = nfsd_getxattr(dentry, key, &buf); + if (!buflen) + buflen = -ENODATA; + if (buflen <= 0) + return ERR_PTR(buflen); pacl = posix_acl_from_xattr(buf, buflen); - out: kfree(buf); return pacl; - out_err: - pacl = ERR_PTR(error); - goto out; } int @@ -717,27 +705,33 @@ nfsd_close(struct file *filp) * As this calls fsync (not fdatasync) there is no need for a write_inode * after it. */ -static inline void nfsd_dosync(struct file *filp, struct dentry *dp, - struct file_operations *fop) +static inline int nfsd_dosync(struct file *filp, struct dentry *dp, + struct file_operations *fop) { struct inode *inode = dp->d_inode; int (*fsync) (struct file *, struct dentry *, int); + int err = nfs_ok; filemap_fdatawrite(inode->i_mapping); if (fop && (fsync = fop->fsync)) - fsync(filp, dp, 0); + err=fsync(filp, dp, 0); filemap_fdatawait(inode->i_mapping); + + return nfserrno(err); } -static void +static int nfsd_sync(struct file *filp) { + int err; struct inode *inode = filp->f_dentry->d_inode; dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name); - down(&inode->i_sem); - nfsd_dosync(filp, filp->f_dentry, filp->f_op); - up(&inode->i_sem); + mutex_lock(&inode->i_mutex); + err=nfsd_dosync(filp, filp->f_dentry, filp->f_op); + mutex_unlock(&inode->i_mutex); + + return err; } void @@ -874,6 +868,16 @@ out: return err; } +static void kill_suid(struct dentry *dentry) +{ + struct iattr ia; + ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID; + + mutex_lock(&dentry->d_inode->i_mutex); + notify_change(dentry, &ia); + mutex_unlock(&dentry->d_inode->i_mutex); +} + static inline int 
nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, @@ -927,14 +931,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, } /* clear setuid/setgid flag after write */ - if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) { - struct iattr ia; - ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID; - - down(&inode->i_sem); - notify_change(dentry, &ia); - up(&inode->i_sem); - } + if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) + kill_suid(dentry); if (err >= 0 && stable) { static ino_t last_ino; @@ -962,7 +960,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (inode->i_state & I_DIRTY) { dprintk("nfsd: write sync %d\n", current->pid); - nfsd_sync(file); + err=nfsd_sync(file); } #if 0 wake_up(&inode->i_wait); @@ -1066,7 +1064,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; if (EX_ISSYNC(fhp->fh_export)) { if (file->f_op && file->f_op->fsync) { - nfsd_sync(file); + err = nfsd_sync(file); } else { err = nfserr_notsupp; } @@ -1874,39 +1872,25 @@ nfsd_get_posix_acl(struct svc_fh *fhp, int type) ssize_t size; struct posix_acl *acl; - if (!IS_POSIXACL(inode) || !inode->i_op || !inode->i_op->getxattr) + if (!IS_POSIXACL(inode)) + return ERR_PTR(-EOPNOTSUPP); + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + break; + default: return ERR_PTR(-EOPNOTSUPP); - switch(type) { - case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; - break; - case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; - break; - default: - return ERR_PTR(-EOPNOTSUPP); } - size = inode->i_op->getxattr(fhp->fh_dentry, name, NULL, 0); + size = nfsd_getxattr(fhp->fh_dentry, name, &value); + if (size < 0) + return ERR_PTR(size); - if (size < 0) { - acl = ERR_PTR(size); - goto getout; - } else if (size > 0) { - value = kmalloc(size, GFP_KERNEL); - if 
(!value) { - acl = ERR_PTR(-ENOMEM); - goto getout; - } - size = inode->i_op->getxattr(fhp->fh_dentry, name, value, size); - if (size < 0) { - acl = ERR_PTR(size); - goto getout; - } - } acl = posix_acl_from_xattr(value, size); - -getout: kfree(value); return acl; } @@ -1947,16 +1931,13 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) } else size = 0; - if (!fhp->fh_locked) - fh_lock(fhp); /* unlocking is done automatically */ if (size) - error = inode->i_op->setxattr(fhp->fh_dentry, name, - value, size, 0); + error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); else { if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) error = 0; else { - error = inode->i_op->removexattr(fhp->fh_dentry, name); + error = vfs_removexattr(fhp->fh_dentry, name); if (error == -ENODATA) error = 0; } diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog index 50a7749cfca1..02f44094bda9 100644 --- a/fs/ntfs/ChangeLog +++ b/fs/ntfs/ChangeLog @@ -884,7 +884,7 @@ ToDo/Notes: - Add handling for initialized_size != data_size in compressed files. - Reduce function local stack usage from 0x3d4 bytes to just noise in - fs/ntfs/upcase.c. (Randy Dunlap <rddunlap@osdl.ord>) + fs/ntfs/upcase.c. (Randy Dunlap <rdunlap@xenotime.net>) - Remove compiler warnings for newer gcc. - Pages are no longer kmapped by mm/filemap.c::generic_file_write() around calls to ->{prepare,commit}_write. Adapt NTFS appropriately diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index eda056bac256..9480a0526cd3 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1532,7 +1532,7 @@ int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, * NOTE to self: No changes in the attribute list are required to move from * a resident to a non-resident attribute. * - * Locking: - The caller must hold i_sem on the inode. + * Locking: - The caller must hold i_mutex on the inode. 
*/ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) { @@ -1728,7 +1728,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) /* * This needs to be last since the address space operations ->readpage * and ->writepage can run concurrently with us as they are not - * serialized on i_sem. Note, we are not allowed to fail once we flip + * serialized on i_mutex. Note, we are not allowed to fail once we flip * this switch, which is another reason to do this last. */ NInoSetNonResident(ni); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 795c3d1930f5..b0690d4c8906 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -69,7 +69,7 @@ ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'), * work but we don't care for how quickly one can access them. This also fixes * the dcache aliasing issues. * - * Locking: - Caller must hold i_sem on the directory. + * Locking: - Caller must hold i_mutex on the directory. * - Each page cache page in the index allocation mapping must be * locked whilst being accessed otherwise we may find a corrupt * page due to it being under ->writepage at the moment which @@ -1085,11 +1085,11 @@ static inline int ntfs_filldir(ntfs_volume *vol, loff_t fpos, * While this will return the names in random order this doesn't matter for * ->readdir but OTOH results in a faster ->readdir. * - * VFS calls ->readdir without BKL but with i_sem held. This protects the VFS + * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS * parts (e.g. ->f_pos and ->i_size, and it also protects against directory * modifications). * - * Locking: - Caller must hold i_sem on the directory. + * Locking: - Caller must hold i_mutex on the directory. 
* - Each page cache page in the index allocation mapping must be * locked whilst being accessed otherwise we may find a corrupt * page due to it being under ->writepage at the moment which @@ -1520,7 +1520,7 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp) * Note: In the past @filp could be NULL so we ignore it as we don't need it * anyway. * - * Locking: Caller must hold i_sem on the inode. + * Locking: Caller must hold i_mutex on the inode. * * TODO: We should probably also write all attribute/index inodes associated * with this inode but since we have no simple way of getting to them we ignore diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 727533891813..fb413d3d8618 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -106,7 +106,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * this is the case, the necessary zeroing will also have happened and that all * metadata is self-consistent. * - * Locking: i_sem on the vfs inode corrseponsind to the ntfs inode @ni must be + * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be * held by the caller. */ static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, @@ -473,7 +473,7 @@ static inline int ntfs_submit_bh_for_read(struct buffer_head *bh) * @bytes: number of bytes to be written * * This is called for non-resident attributes from ntfs_file_buffered_write() - * with i_sem held on the inode (@pages[0]->mapping->host). There are + * with i_mutex held on the inode (@pages[0]->mapping->host). There are * @nr_pages pages in @pages which are locked but not kmap()ped. The source * data has not yet been copied into the @pages. 
* @@ -1637,7 +1637,7 @@ err_out: * @pos: byte position in file at which the write begins * @bytes: number of bytes to be written * - * This is called from ntfs_file_buffered_write() with i_sem held on the inode + * This is called from ntfs_file_buffered_write() with i_mutex held on the inode * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are * locked but not kmap()ped. The source data has already been copied into the * @page. ntfs_prepare_pages_for_non_resident_write() has been called before @@ -1814,7 +1814,7 @@ err_out: /** * ntfs_file_buffered_write - * - * Locking: The vfs is holding ->i_sem on the inode. + * Locking: The vfs is holding ->i_mutex on the inode. */ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, @@ -2173,7 +2173,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, err = remove_suid(file->f_dentry); if (err) goto out; - inode_update_time(inode, 1); + file_update_time(file); written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos, count); out: @@ -2196,9 +2196,9 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const char __user *buf, BUG_ON(iocb->ki_pos != pos); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err = sync_page_range(inode, mapping, pos, ret); if (err < 0) @@ -2221,12 +2221,12 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov, struct kiocb kiocb; ssize_t ret; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); init_sync_kiocb(&kiocb, file); ret = ntfs_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err = 
sync_page_range(inode, mapping, *ppos - ret, ret); if (err < 0) @@ -2269,7 +2269,7 @@ static ssize_t ntfs_file_write(struct file *file, const char __user *buf, * Note: In the past @filp could be NULL so we ignore it as we don't need it * anyway. * - * Locking: Caller must hold i_sem on the inode. + * Locking: Caller must hold i_mutex on the inode. * * TODO: We should probably also write all attribute/index inodes associated * with this inode but since we have no simple way of getting to them we ignore diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c index 8f2d5727546f..9f5427c2d105 100644 --- a/fs/ntfs/index.c +++ b/fs/ntfs/index.c @@ -32,7 +32,7 @@ * Allocate a new index context, initialize it with @idx_ni and return it. * Return NULL if allocation failed. * - * Locking: Caller must hold i_sem on the index inode. + * Locking: Caller must hold i_mutex on the index inode. */ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) { @@ -50,7 +50,7 @@ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) * * Release the index context @ictx, releasing all associated resources. * - * Locking: Caller must hold i_sem on the index inode. + * Locking: Caller must hold i_mutex on the index inode. */ void ntfs_index_ctx_put(ntfs_index_context *ictx) { @@ -106,7 +106,7 @@ void ntfs_index_ctx_put(ntfs_index_context *ictx) * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to * ensure that the changes are written to disk. * - * Locking: - Caller must hold i_sem on the index inode. + * Locking: - Caller must hold i_mutex on the index inode. 
* - Each page cache page in the index allocation mapping must be * locked whilst being accessed otherwise we may find a corrupt * page due to it being under ->writepage at the moment which diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index b24f4c4b2c5c..ea1bd3feea1b 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2125,13 +2125,13 @@ void ntfs_put_inode(struct inode *vi) ntfs_inode *ni = NTFS_I(vi); if (NInoIndexAllocPresent(ni)) { struct inode *bvi = NULL; - down(&vi->i_sem); + mutex_lock(&vi->i_mutex); if (atomic_read(&vi->i_count) == 2) { bvi = ni->itype.index.bmp_ino; if (bvi) ni->itype.index.bmp_ino = NULL; } - up(&vi->i_sem); + mutex_unlock(&vi->i_mutex); if (bvi) iput(bvi); } @@ -2311,7 +2311,7 @@ static const char *es = " Leaving inconsistent metadata. Unmount and run " * * Returns 0 on success or -errno on error. * - * Called with ->i_sem held. In all but one case ->i_alloc_sem is held for + * Called with ->i_mutex held. In all but one case ->i_alloc_sem is held for * writing. The only case in the kernel where ->i_alloc_sem is not held is * mm/filemap.c::generic_file_buffered_write() where vmtruncate() is called * with the current i_size as the offset. The analogous place in NTFS is in @@ -2767,7 +2767,25 @@ unm_done: up_write(&ni->runlist.lock); done: /* Update the mtime and ctime on the base inode. */ - inode_update_time(VFS_I(base_ni), 1); + /* normally ->truncate shouldn't update ctime or mtime, + * but ntfs did before so it got a copy & paste version + * of file_update_time. one day someone should fix this + * for real. 
+ */ + if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { + struct timespec now = current_fs_time(VFS_I(base_ni)->i_sb); + int sync_it = 0; + + if (!timespec_equal(&VFS_I(base_ni)->i_mtime, &now) || + !timespec_equal(&VFS_I(base_ni)->i_ctime, &now)) + sync_it = 1; + VFS_I(base_ni)->i_mtime = now; + VFS_I(base_ni)->i_ctime = now; + + if (sync_it) + mark_inode_dirty_sync(VFS_I(base_ni)); + } + if (likely(!err)) { NInoClearTruncateFailed(ni); ntfs_debug("Done."); @@ -2831,7 +2849,7 @@ void ntfs_truncate_vfs(struct inode *vi) { * We also abort all changes of user, group, and mode as we do not implement * the NTFS ACLs yet. * - * Called with ->i_sem held. For the ATTR_SIZE (i.e. ->truncate) case, also + * Called with ->i_mutex held. For the ATTR_SIZE (i.e. ->truncate) case, also * called with ->i_alloc_sem held for writing. * * Basically this is a copy of generic notify_change() and inode_setattr() diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 351dbc3b6e40..5ea9eb93af62 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -96,7 +96,7 @@ * name. We then convert the name to the current NLS code page, and proceed * searching for a dentry with this name, etc, as in case 2), above. * - * Locking: Caller must hold i_sem on the directory. + * Locking: Caller must hold i_mutex on the directory. */ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, struct nameidata *nd) @@ -254,7 +254,7 @@ handle_name: nls_name.hash = full_name_hash(nls_name.name, nls_name.len); /* - * Note: No need for dent->d_lock lock as i_sem is held on the + * Note: No need for dent->d_lock lock as i_mutex is held on the * parent inode. */ @@ -374,7 +374,7 @@ struct inode_operations ntfs_dir_inode_ops = { * The code is based on the ext3 ->get_parent() implementation found in * fs/ext3/namei.c::ext3_get_parent(). * - * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_sem down. 
+ * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down. * * Return the dentry of the parent directory on success or the error code on * error (IS_ERR() is true). diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c index 833df2a4e9fb..d0ef4182147b 100644 --- a/fs/ntfs/quota.c +++ b/fs/ntfs/quota.c @@ -48,7 +48,7 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_error(vol->sb, "Quota inodes are not open."); return FALSE; } - down(&vol->quota_q_ino->i_sem); + mutex_lock(&vol->quota_q_ino->i_mutex); ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); if (!ictx) { ntfs_error(vol->sb, "Failed to get index context."); @@ -98,7 +98,7 @@ BOOL ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_index_entry_mark_dirty(ictx); set_done: ntfs_index_ctx_put(ictx); - up(&vol->quota_q_ino->i_sem); + mutex_unlock(&vol->quota_q_ino->i_mutex); /* * We set the flag so we do not try to mark the quotas out of date * again on remount. @@ -110,7 +110,7 @@ done: err_out: if (ictx) ntfs_index_ctx_put(ictx); - up(&vol->quota_q_ino->i_sem); + mutex_unlock(&vol->quota_q_ino->i_mutex); return FALSE; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 6c16db9e1a8a..c3a3f1a8310b 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -443,8 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_debug("Entering with remount options string: %s", opt); #ifndef NTFS_RW - /* For read-only compiled driver, enforce all read-only flags. */ - *flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + /* For read-only compiled driver, enforce read-only flag. */ + *flags |= MS_RDONLY; #else /* NTFS_RW */ /* * For the read-write compiled driver, if we are remounting read-write, @@ -1213,10 +1213,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol) * Find the inode number for the hibernation file by looking up the * filename hiberfil.sys in the root directory. 
*/ - down(&vol->root_ino->i_sem); + mutex_lock(&vol->root_ino->i_mutex); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, &name); - up(&vol->root_ino->i_sem); + mutex_unlock(&vol->root_ino->i_mutex); if (IS_ERR_MREF(mref)) { ret = MREF_ERR(mref); /* If the file does not exist, Windows is not hibernated. */ @@ -1307,10 +1307,10 @@ static BOOL load_and_init_quota(ntfs_volume *vol) * Find the inode number for the quota file by looking up the filename * $Quota in the extended system files directory $Extend. */ - down(&vol->extend_ino->i_sem); + mutex_lock(&vol->extend_ino->i_mutex); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, &name); - up(&vol->extend_ino->i_sem); + mutex_unlock(&vol->extend_ino->i_mutex); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, quotas are disabled and have @@ -1390,10 +1390,10 @@ static BOOL load_and_init_usnjrnl(ntfs_volume *vol) * Find the inode number for the transaction log file by looking up the * filename $UsnJrnl in the extended system files directory $Extend. */ - down(&vol->extend_ino->i_sem); + mutex_lock(&vol->extend_ino->i_mutex); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, &name); - up(&vol->extend_ino->i_sem); + mutex_unlock(&vol->extend_ino->i_mutex); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, transaction logging is disabled, @@ -1721,7 +1721,7 @@ static BOOL load_system_files(ntfs_volume *vol) es3); goto iput_mirr_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", !vol->mftmirr_ino ? es1 : es2, es3); } else @@ -1837,7 +1837,7 @@ get_ctx_vol_failed: es1, es2); goto iput_vol_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. 
Will not be able to remount " @@ -1874,7 +1874,7 @@ get_ctx_vol_failed: } goto iput_logfile_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -1919,7 +1919,7 @@ get_ctx_vol_failed: es1, es2); goto iput_root_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -1943,7 +1943,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; /* * Do not set NVolErrors() because ntfs_remount() might manage * to set the dirty flag in which case all would be well. @@ -1970,7 +1970,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; NVolSetErrors(vol); } #endif @@ -1989,7 +1989,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2030,7 +2030,7 @@ get_ctx_vol_failed: es1, es2); goto iput_quota_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2053,7 +2053,7 @@ get_ctx_vol_failed: goto iput_quota_err_out; } ntfs_error(sb, "%s. 
Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; NVolSetErrors(vol); } /* @@ -2074,7 +2074,7 @@ get_ctx_vol_failed: es1, es2); goto iput_usnjrnl_err_out; } - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2097,7 +2097,7 @@ get_ctx_vol_failed: goto iput_usnjrnl_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2312,9 +2312,9 @@ static void ntfs_put_super(struct super_block *sb) if (!list_empty(&sb->s_dirty)) { const char *s1, *s2; - down(&vol->mft_ino->i_sem); + mutex_lock(&vol->mft_ino->i_mutex); truncate_inode_pages(vol->mft_ino->i_mapping, 0); - up(&vol->mft_ino->i_sem); + mutex_unlock(&vol->mft_ino->i_mutex); write_inode_now(vol->mft_ino, 1); if (!list_empty(&sb->s_dirty)) { static const char *_s1 = "inodes"; @@ -2689,7 +2689,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_debug("Entering."); #ifndef NTFS_RW - sb->s_flags |= MS_RDONLY | MS_NOATIME | MS_NODIRATIME; + sb->s_flags |= MS_RDONLY; #endif /* ! NTFS_RW */ /* Allocate a new ntfs_volume and place it in sb->s_fs_info. 
*/ sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile new file mode 100644 index 000000000000..7d3be845a614 --- /dev/null +++ b/fs/ocfs2/Makefile @@ -0,0 +1,33 @@ +EXTRA_CFLAGS += -Ifs/ocfs2 + +EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES + +obj-$(CONFIG_OCFS2_FS) += ocfs2.o + +ocfs2-objs := \ + alloc.o \ + aops.o \ + buffer_head_io.o \ + dcache.o \ + dir.o \ + dlmglue.o \ + export.o \ + extent_map.o \ + file.o \ + heartbeat.o \ + inode.o \ + journal.o \ + localalloc.o \ + mmap.o \ + namei.o \ + slot_map.o \ + suballoc.o \ + super.o \ + symlink.o \ + sysfile.o \ + uptodate.o \ + ver.o \ + vote.o + +obj-$(CONFIG_OCFS2_FS) += cluster/ +obj-$(CONFIG_OCFS2_FS) += dlm/ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c new file mode 100644 index 000000000000..6b9812db3779 --- /dev/null +++ b/fs/ocfs2/alloc.c @@ -0,0 +1,2040 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.c + * + * Extent allocs and frees + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "sysfile.h" +#include "file.h" +#include "super.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +static int ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + u64 blkno); + +static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]); + +static int ocfs2_add_branch(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *eb_bh, + struct buffer_head *last_eb_bh, + struct ocfs2_alloc_context *meta_ac); + +static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh); + +static int ocfs2_do_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 blkno, + u32 new_clusters); + +static int ocfs2_find_branch_target(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **target_bh); + +static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe, + unsigned int new_i_clusters, + struct buffer_head *old_last_eb, + struct buffer_head **new_last_eb); + +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); + +static int ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + u64 
blkno) +{ + return blkno == (le64_to_cpu(ext->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(ext->e_clusters))); +} + +/* + * How many free extents have we got before we need more meta data? + */ +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe) +{ + int retval; + struct ocfs2_extent_list *el; + struct ocfs2_extent_block *eb; + struct buffer_head *eb_bh = NULL; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + retval = -EIO; + goto bail; + } + + if (fe->i_last_eb_blk) { + retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &eb_bh, OCFS2_BH_CACHED, inode); + if (retval < 0) { + mlog_errno(retval); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + + BUG_ON(el->l_tree_depth != 0); + + retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); +bail: + if (eb_bh) + brelse(eb_bh); + + mlog_exit(retval); + return retval; +} + +/* expects array to already be allocated + * + * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and + * l_count for you + */ +static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + int wanted, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head *bhs[]) +{ + int count, status, i; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + struct ocfs2_extent_block *eb; + + mlog_entry_void(); + + count = 0; + while (count < wanted) { + status = ocfs2_claim_metadata(osb, + handle, + meta_ac, + wanted - count, + &suballoc_bit_start, + &num_got, + &first_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = count; i < (num_got + count); i++) { + bhs[i] = sb_getblk(osb->sb, first_blkno); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 
+ + status = ocfs2_journal_access(handle, inode, bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) bhs[i]->b_data; + /* Ok, setup the minimal stuff here. */ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(first_blkno); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + +#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS + /* we always use slot zero's suballocator */ + eb->h_suballoc_slot = 0; +#else + eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); +#endif + eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + suballoc_bit_start++; + first_blkno++; + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. */ + status = ocfs2_journal_dirty(handle, bhs[i]); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + count += num_got; + } + + status = 0; +bail: + if (status < 0) { + for(i = 0; i < wanted; i++) { + if (bhs[i]) + brelse(bhs[i]); + bhs[i] = NULL; + } + } + mlog_exit(status); + return status; +} + +/* + * Add an entire tree branch to our inode. eb_bh is the extent block + * to start at, if we don't want to start the branch at the dinode + * structure. + * + * last_eb_bh is required as we have to update it's next_leaf pointer + * for the new last extent block. + * + * the new branch will be 'empty' in the sense that every block will + * contain a single record with e_clusters == 0. 
+ */ +static int ocfs2_add_branch(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *eb_bh, + struct buffer_head *last_eb_bh, + struct ocfs2_alloc_context *meta_ac) +{ + int status, new_blocks, i; + u64 next_blkno, new_last_eb_blk; + struct buffer_head *bh; + struct buffer_head **new_eb_bhs = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *eb_el; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + BUG_ON(!last_eb_bh); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (eb_bh) { + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + + /* we never add a branch to a leaf. */ + BUG_ON(!el->l_tree_depth); + + new_blocks = le16_to_cpu(el->l_tree_depth); + + /* allocate the number of new eb blocks we need */ + new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!new_eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, + meta_ac, new_eb_bhs); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be + * linked with the rest of the tree. + * conversly, new_eb_bhs[0] is the new bottommost leaf. + * + * when we leave the loop, new_last_eb_blk will point to the + * newest leaf, and next_blkno will point to the topmost extent + * block. 
*/ + next_blkno = new_last_eb_blk = 0; + for(i = 0; i < new_blocks; i++) { + bh = new_eb_bhs[i]; + eb = (struct ocfs2_extent_block *) bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + eb_el = &eb->h_list; + + status = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb->h_next_leaf_blk = 0; + eb_el->l_tree_depth = cpu_to_le16(i); + eb_el->l_next_free_rec = cpu_to_le16(1); + eb_el->l_recs[0].e_cpos = fe->i_clusters; + eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); + eb_el->l_recs[0].e_clusters = cpu_to_le32(0); + if (!eb_el->l_tree_depth) + new_last_eb_blk = le64_to_cpu(eb->h_blkno); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + next_blkno = le64_to_cpu(eb->h_blkno); + } + + /* This is a bit hairy. We want to update up to three blocks + * here without leaving any of them in an inconsistent state + * in case of error. We don't have to worry about + * journal_dirty erroring as it won't unless we've aborted the + * handle (in which case we would never be here) so reserving + * the write with journal_access is all we need to do. */ + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (eb_bh) { + status = ocfs2_journal_access(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* Link the new branch into the rest of the tree (el will + * either be on the fe, or the extent block passed in. 
*/ + i = le16_to_cpu(el->l_next_free_rec); + el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); + el->l_recs[i].e_cpos = fe->i_clusters; + el->l_recs[i].e_clusters = 0; + le16_add_cpu(&el->l_next_free_rec, 1); + + /* fe needs a new last extent block pointer, as does the + * next_leaf on the previously last-extent-block. */ + fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); + + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); + + status = ocfs2_journal_dirty(handle, last_eb_bh); + if (status < 0) + mlog_errno(status); + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) + mlog_errno(status); + if (eb_bh) { + status = ocfs2_journal_dirty(handle, eb_bh); + if (status < 0) + mlog_errno(status); + } + + status = 0; +bail: + if (new_eb_bhs) { + for (i = 0; i < new_blocks; i++) + if (new_eb_bhs[i]) + brelse(new_eb_bhs[i]); + kfree(new_eb_bhs); + } + + mlog_exit(status); + return status; +} + +/* + * adds another level to the allocation tree. + * returns back the new extent block so you can add a branch to it + * after this call. 
+ */ +static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **ret_new_eb_bh) +{ + int status, i; + struct buffer_head *new_eb_bh = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *fe_el; + struct ocfs2_extent_list *eb_el; + + mlog_entry_void(); + + status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, + &new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + + eb_el = &eb->h_list; + fe = (struct ocfs2_dinode *) fe_bh->b_data; + fe_el = &fe->id2.i_list; + + status = ocfs2_journal_access(handle, inode, new_eb_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* copy the fe data into the new extent block */ + eb_el->l_tree_depth = fe_el->l_tree_depth; + eb_el->l_next_free_rec = fe_el->l_next_free_rec; + for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { + eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; + eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters; + eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno; + } + + status = ocfs2_journal_dirty(handle, new_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* update fe now */ + le16_add_cpu(&fe_el->l_tree_depth, 1); + fe_el->l_recs[0].e_cpos = 0; + fe_el->l_recs[0].e_blkno = eb->h_blkno; + fe_el->l_recs[0].e_clusters = fe->i_clusters; + for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { + fe_el->l_recs[i].e_cpos = 0; + fe_el->l_recs[i].e_clusters = 0; + 
fe_el->l_recs[i].e_blkno = 0; + } + fe_el->l_next_free_rec = cpu_to_le16(1); + + /* If this is our 1st tree depth shift, then last_eb_blk + * becomes the allocated extent block */ + if (fe_el->l_tree_depth == cpu_to_le16(1)) + fe->i_last_eb_blk = eb->h_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *ret_new_eb_bh = new_eb_bh; + new_eb_bh = NULL; + status = 0; +bail: + if (new_eb_bh) + brelse(new_eb_bh); + + mlog_exit(status); + return status; +} + +/* + * Expects the tree to already have room in the rightmost leaf for the + * extent. Updates all the extent blocks (and the dinode) on the way + * down. + */ +static int ocfs2_do_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 start_blk, + u32 new_clusters) +{ + int status, i, num_bhs = 0; + u64 next_blkno; + u16 next_free; + struct buffer_head **eb_bhs = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + if (el->l_tree_depth) { + /* This is another operation where we want to be + * careful about our tree updates. An error here means + * none of the previous changes we made should roll + * forward. As a result, we have to record the buffers + * for this part of the tree in an array and reserve a + * journal write to them before making any changes. 
*/ + num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); + eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), + GFP_KERNEL); + if (!eb_bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + i = 0; + while(el->l_tree_depth) { + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" has a bad " + "extent list", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); + + BUG_ON(i >= num_bhs); + status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, + eb); + status = -EIO; + goto bail; + } + + status = ocfs2_journal_access(handle, inode, eb_bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + el = &eb->h_list; + i++; + /* When we leave this loop, eb_bhs[num_bhs - 1] will + * hold the bottom-most leaf extent block. */ + } + BUG_ON(el->l_tree_depth); + + el = &fe->id2.i_list; + /* If we have tree depth, then the fe update is + * trivial, and we want to switch el out for the + * bottom-most leaf in order to update it with the + * actual extent data below. */ + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" has a bad " + "extent list", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, + new_clusters); + /* (num_bhs - 1) to avoid the leaf */ + for(i = 0; i < (num_bhs - 1); i++) { + eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; + el = &eb->h_list; + + /* finally, make our actual change to the + * intermediate extent blocks. 
*/ + next_free = le16_to_cpu(el->l_next_free_rec); + le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, + new_clusters); + + status = ocfs2_journal_dirty(handle, eb_bhs[i]); + if (status < 0) + mlog_errno(status); + } + BUG_ON(i != (num_bhs - 1)); + /* note that the leaf block wasn't touched in + * the loop above */ + eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; + el = &eb->h_list; + BUG_ON(el->l_tree_depth); + } + + /* yay, we can finally add the actual extent now! */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le16_to_cpu(el->l_next_free_rec) && + ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { + le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); + } else if (le16_to_cpu(el->l_next_free_rec) && + (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { + /* having an empty extent at eof is legal. */ + if (el->l_recs[i].e_cpos != fe->i_clusters) { + ocfs2_error(inode->i_sb, + "Dinode %"MLFu64" trailing extent is bad: " + "cpos (%u) != number of clusters (%u)", + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(fe->i_clusters)); + status = -EIO; + goto bail; + } + el->l_recs[i].e_blkno = cpu_to_le64(start_blk); + el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); + } else { + /* No contiguous record, or no empty record at eof, so + * we add a new one. */ + + BUG_ON(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count)); + i = le16_to_cpu(el->l_next_free_rec); + + el->l_recs[i].e_blkno = cpu_to_le64(start_blk); + el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); + el->l_recs[i].e_cpos = fe->i_clusters; + le16_add_cpu(&el->l_next_free_rec, 1); + } + + /* + * extent_map errors are not fatal, so they are ignored outside + * of flushing the thing. 
+ */ + status = ocfs2_extent_map_append(inode, &el->l_recs[i], + new_clusters); + if (status) { + mlog_errno(status); + ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); + } + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) + mlog_errno(status); + if (fe->id2.i_list.l_tree_depth) { + status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); + if (status < 0) + mlog_errno(status); + } + + status = 0; +bail: + if (eb_bhs) { + for (i = 0; i < num_bhs; i++) + if (eb_bhs[i]) + brelse(eb_bhs[i]); + kfree(eb_bhs); + } + + mlog_exit(status); + return status; +} + +/* + * Should only be called when there is no space left in any of the + * leaf nodes. What we want to do is find the lowest tree depth + * non-leaf extent block with room for new records. There are three + * valid results of this search: + * + * 1) a lowest extent block is found, then we pass it back in + * *lowest_eb_bh and return '0' + * + * 2) the search fails to find anything, but the dinode has room. We + * pass NULL back in *lowest_eb_bh, but still return '0' + * + * 3) the search fails to find anything AND the dinode is full, in + * which case we return > 0 + * + * return status < 0 indicates an error. 
+ */ +static int ocfs2_find_branch_target(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head **target_bh) +{ + int status = 0, i; + u64 blkno; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + struct buffer_head *lowest_bh = NULL; + + mlog_entry_void(); + + *target_bh = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + + while(le16_to_cpu(el->l_tree_depth) > 1) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty " + "extent list (next_free_rec == 0)", + OCFS2_I(inode)->ip_blkno); + status = -EIO; + goto bail; + } + i = le16_to_cpu(el->l_next_free_rec) - 1; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (!blkno) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent " + "list where extent # %d has no physical " + "block start", + OCFS2_I(inode)->ip_blkno, i); + status = -EIO; + goto bail; + } + + if (bh) { + brelse(bh); + bh = NULL; + } + + status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &eb->h_list; + + if (le16_to_cpu(el->l_next_free_rec) < + le16_to_cpu(el->l_count)) { + if (lowest_bh) + brelse(lowest_bh); + lowest_bh = bh; + get_bh(lowest_bh); + } + } + + /* If we didn't find one and the fe doesn't have any room, + * then return '1' */ + if (!lowest_bh + && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) + status = 1; + + *target_bh = lowest_bh; +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +/* the caller needs to update fe->i_clusters */ +int ocfs2_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, 
+ struct buffer_head *fe_bh, + u64 start_blk, + u32 new_clusters, + struct ocfs2_alloc_context *meta_ac) +{ + int status, i, shift; + struct buffer_head *last_eb_bh = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + mlog_entry_void(); + + mlog(0, "add %u clusters starting at block %"MLFu64" to " + "inode %"MLFu64"\n", + new_clusters, start_blk, OCFS2_I(inode)->ip_blkno); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + el = &fe->id2.i_list; + + if (el->l_tree_depth) { + /* jump to end of tree */ + status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + /* log the failure here; the single mlog_exit() for + * this function is issued at bail */ + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + el = &eb->h_list; + } + + /* Can we allocate without adding/shifting tree bits? */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le16_to_cpu(el->l_next_free_rec) == 0 + || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) + || le32_to_cpu(el->l_recs[i].e_clusters) == 0 + || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) + goto out_add; + + mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " + "tree now.\n"); + + shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); + if (shift < 0) { + status = shift; + mlog_errno(status); + goto bail; + } + + /* We traveled all the way to the bottom of the allocation tree + * and didn't find room for any more extents - we need to add + * another tree level */ + if (shift) { + /* if we hit a leaf, it had better be full (no free + * records left) :) */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) != + le16_to_cpu(el->l_count)); + BUG_ON(bh); + mlog(0, "ocfs2_allocate_extent: need to shift tree depth " + "(current = %u)\n", + le16_to_cpu(fe->id2.i_list.l_tree_depth)); + + /* ocfs2_shift_tree_depth will return us a buffer with + * the new extent block (so we can pass that to + * ocfs2_add_branch). 
*/ + status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, + meta_ac, &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + /* Special case: we have room now if we shifted from + * tree_depth 0 */ + if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) + goto out_add; + } + + /* call ocfs2_add_branch to add the final part of the tree with + * the new data. */ + mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); + status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, + meta_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +out_add: + /* Finally, we can add clusters. */ + status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, + start_blk, new_clusters); + if (status < 0) + mlog_errno(status); + +bail: + if (bh) + brelse(bh); + + if (last_eb_bh) + brelse(last_eb_bh); + + mlog_exit(status); + return status; +} + +static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) +{ + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + + mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), + "slot %d, invalid truncate log parameters: used = " + "%u, count = %u\n", osb->slot_num, + le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); + return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); +} + +static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, + unsigned int new_start) +{ + unsigned int tail_index; + unsigned int current_tail; + + /* No records, nothing to coalesce */ + if (!le16_to_cpu(tl->tl_used)) + return 0; + + tail_index = le16_to_cpu(tl->tl_used) - 1; + current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); + current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); + + return current_tail == new_start; +} + +static int ocfs2_truncate_log_append(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, 
+ u64 start_blk, + unsigned int num_clusters) +{ + int status, index; + unsigned int start_cluster, tl_count; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk, + num_clusters); + + BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + + start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); + status = -EIO; + goto bail; + } + + tl_count = le16_to_cpu(tl->tl_count); + mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || + tl_count == 0, + "Truncate record count on #%"MLFu64" invalid (" + "wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno, + ocfs2_truncate_recs_per_inode(osb->sb), + le16_to_cpu(tl->tl_count)); + + /* Caller should have known to flush before calling us. */ + index = le16_to_cpu(tl->tl_used); + if (index >= tl_count) { + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Log truncate of %u clusters starting at cluster %u to " + "%"MLFu64" (index = %d)\n", num_clusters, start_cluster, + OCFS2_I(tl_inode)->ip_blkno, index); + + if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { + /* + * Move index back to the record we are coalescing with. 
+ * ocfs2_truncate_log_can_coalesce() guarantees nonzero + */ + index--; + + num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); + mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", + index, le32_to_cpu(tl->tl_recs[index].t_start), + num_clusters); + } else { + tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); + tl->tl_used = cpu_to_le16(index + 1); + } + tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); + + status = ocfs2_journal_dirty(handle, tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *data_alloc_inode, + struct buffer_head *data_alloc_bh) +{ + int status = 0; + int i; + unsigned int num_clusters; + u64 start_blk; + struct ocfs2_truncate_rec rec; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + struct inode *tl_inode = osb->osb_tl_inode; + struct buffer_head *tl_bh = osb->osb_tl_bh; + + mlog_entry_void(); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + i = le16_to_cpu(tl->tl_used) - 1; + while (i >= 0) { + /* Caller has given us at least enough credits to + * update the truncate log dinode */ + status = ocfs2_journal_access(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + tl->tl_used = cpu_to_le16(i); + + status = ocfs2_journal_dirty(handle, tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* TODO: Perhaps we can calculate the bulk of the + * credits up front rather than extending like + * this. 
*/ + status = ocfs2_extend_trans(handle, + OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + rec = tl->tl_recs[i]; + start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, + le32_to_cpu(rec.t_start)); + num_clusters = le32_to_cpu(rec.t_clusters); + + /* if start_blk is not set, we ignore the record as + * invalid. */ + if (start_blk) { + mlog(0, "free record %d, start = %u, clusters = %u\n", + i, le32_to_cpu(rec.t_start), num_clusters); + + status = ocfs2_free_clusters(handle, data_alloc_inode, + data_alloc_bh, start_blk, + num_clusters); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + i--; + } + +bail: + mlog_exit(status); + return status; +} + +/* Expects you to already be holding tl_inode->i_mutex */ +static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + unsigned int num_to_flush; + struct ocfs2_journal_handle *handle = NULL; + struct inode *tl_inode = osb->osb_tl_inode; + struct inode *data_alloc_inode = NULL; + struct buffer_head *tl_bh = osb->osb_tl_bh; + struct buffer_head *data_alloc_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + mlog_entry_void(); + + BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); + status = -EIO; + goto bail; + } + + num_to_flush = le16_to_cpu(tl->tl_used); + mlog(0, "Flush %u records from truncate log #%"MLFu64"\n", + num_to_flush, OCFS2_I(tl_inode)->ip_blkno); + if (!num_to_flush) { + status = 0; + goto bail; + } + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + data_alloc_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!data_alloc_inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get bitmap inode!\n"); + goto bail; + } + + 
ocfs2_handle_add_inode(handle, data_alloc_inode); + status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, + data_alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (data_alloc_inode) + iput(data_alloc_inode); + + if (data_alloc_bh) + brelse(data_alloc_bh); + + mlog_exit(status); + return status; +} + +int ocfs2_flush_truncate_log(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + mutex_lock(&tl_inode->i_mutex); + status = __ocfs2_flush_truncate_log(osb); + mutex_unlock(&tl_inode->i_mutex); + + return status; +} + +static void ocfs2_truncate_log_worker(void *data) +{ + int status; + struct ocfs2_super *osb = data; + + mlog_entry_void(); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); +} + +#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel) +{ + if (osb->osb_tl_inode) { + /* We want to push off log flushes while truncates are + * still running. 
*/ + if (cancel) + cancel_delayed_work(&osb->osb_truncate_log_wq); + + queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, + OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); + } +} + +static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, + int slot_num, + struct inode **tl_inode, + struct buffer_head **tl_bh) +{ + int status; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + + inode = ocfs2_get_system_file_inode(osb, + TRUNCATE_LOG_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get load truncate log inode!\n"); + goto bail; + } + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + iput(inode); + mlog_errno(status); + goto bail; + } + + *tl_inode = inode; + *tl_bh = bh; +bail: + mlog_exit(status); + return status; +} + +/* called during the 1st stage of node recovery. we stamp a clean + * truncate log and pass back a copy for processing later. if the + * truncate log does not require processing, a *tl_copy is set to + * NULL. 
*/ +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_truncate_log *tl; + + *tl_copy = NULL; + + mlog(0, "recover truncate log from slot %d\n", slot_num); + + status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) tl_bh->b_data; + tl = &di->id2.i_dealloc; + if (!OCFS2_IS_VALID_DINODE(di)) { + OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); + status = -EIO; + goto bail; + } + + if (le16_to_cpu(tl->tl_used)) { + mlog(0, "We'll have %u logs to recover\n", + le16_to_cpu(tl->tl_used)); + + *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); + if (!(*tl_copy)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* Assuming the write-out below goes well, this copy + * will be passed back to recovery for processing. */ + memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); + + /* All we need to do to clear the truncate log is set + * tl_used. 
*/ + tl->tl_used = 0; + + status = ocfs2_write_block(osb, tl_bh, tl_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + +bail: + if (tl_inode) + iput(tl_inode); + if (tl_bh) + brelse(tl_bh); + + if (status < 0 && (*tl_copy)) { + kfree(*tl_copy); + *tl_copy = NULL; + } + + mlog_exit(status); + return status; +} + +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy) +{ + int status = 0; + int i; + unsigned int clusters, num_recs, start_cluster; + u64 start_blk; + struct ocfs2_journal_handle *handle; + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_truncate_log *tl; + + mlog_entry_void(); + + if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { + mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); + return -EINVAL; + } + + tl = &tl_copy->id2.i_dealloc; + num_recs = le16_to_cpu(tl->tl_used); + mlog(0, "cleanup %u records from %"MLFu64"\n", num_recs, + tl_copy->i_blkno); + + mutex_lock(&tl_inode->i_mutex); + for(i = 0; i < num_recs; i++) { + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + + handle = ocfs2_start_trans(osb, NULL, + OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_up; + } + + clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); + start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); + start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); + + status = ocfs2_truncate_log_append(osb, handle, + start_blk, clusters); + ocfs2_commit_trans(handle); + if (status < 0) { + mlog_errno(status); + goto bail_up; + } + } + +bail_up: + mutex_unlock(&tl_inode->i_mutex); + + mlog_exit(status); + return status; +} + +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = osb->osb_tl_inode; + + mlog_entry_void(); + + if (tl_inode) { + 
cancel_delayed_work(&osb->osb_truncate_log_wq); + flush_workqueue(ocfs2_wq); + + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + + brelse(osb->osb_tl_bh); + iput(osb->osb_tl_inode); + } + + mlog_exit_void(); +} + +int ocfs2_truncate_log_init(struct ocfs2_super *osb) +{ + int status; + struct inode *tl_inode = NULL; + struct buffer_head *tl_bh = NULL; + + mlog_entry_void(); + + status = ocfs2_get_truncate_log_info(osb, + osb->slot_num, + &tl_inode, + &tl_bh); + if (status < 0) + mlog_errno(status); + + /* ocfs2_truncate_log_shutdown keys on the existence of + * osb->osb_tl_inode so we don't set any of the osb variables + * until we're sure all is well. */ + INIT_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker, osb); + osb->osb_tl_bh = tl_bh; + osb->osb_tl_inode = tl_inode; + + mlog_exit(status); + return status; +} + +/* This function will figure out whether the currently last extent + * block will be deleted, and if it will, what the new last extent + * block will be so we can update his h_next_leaf_blk field, as well + * as the dinodes i_last_eb_blk */ +static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe, + u32 new_i_clusters, + struct buffer_head *old_last_eb, + struct buffer_head **new_last_eb) +{ + int i, status = 0; + u64 block = 0; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *bh = NULL; + + *new_last_eb = NULL; + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto bail; + } + + /* we have no tree, so of course, no last_eb. */ + if (!fe->id2.i_list.l_tree_depth) + goto bail; + + /* trunc to zero special case - this makes tree_depth = 0 + * regardless of what it is. 
*/ + if (!new_i_clusters) + goto bail; + + eb = (struct ocfs2_extent_block *) old_last_eb->b_data; + el = &(eb->h_list); + BUG_ON(!el->l_next_free_rec); + + /* Make sure that this guy will actually be empty after we + * clear away the data. */ + if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) + goto bail; + + /* Ok, at this point, we know that last_eb will definitely + * change, so lets traverse the tree and find the second to + * last extent block. */ + el = &(fe->id2.i_list); + /* go down the tree, */ + do { + for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { + if (le32_to_cpu(el->l_recs[i].e_cpos) < + new_i_clusters) { + block = le64_to_cpu(el->l_recs[i].e_blkno); + break; + } + } + BUG_ON(i < 0); + + if (bh) { + brelse(bh); + bh = NULL; + } + + status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + } while (el->l_tree_depth); + + *new_last_eb = bh; + get_bh(*new_last_eb); + mlog(0, "returning block %"MLFu64"\n", le64_to_cpu(eb->h_blkno)); +bail: + if (bh) + brelse(bh); + + return status; +} + +static int ocfs2_do_truncate(struct ocfs2_super *osb, + unsigned int clusters_to_del, + struct inode *inode, + struct buffer_head *fe_bh, + struct buffer_head *old_last_eb_bh, + struct ocfs2_journal_handle *handle, + struct ocfs2_truncate_context *tc) +{ + int status, i, depth; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_block *last_eb = NULL; + struct ocfs2_extent_list *el; + struct buffer_head *eb_bh = NULL; + struct buffer_head *last_eb_bh = NULL; + u64 next_eb = 0; + u64 delete_blk = 0; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + status = ocfs2_find_new_last_ext_blk(osb, + inode, + fe, + le32_to_cpu(fe->i_clusters) - + 
clusters_to_del, + old_last_eb_bh, + &last_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (last_eb_bh) + last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + el = &(fe->id2.i_list); + + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - + clusters_to_del; + spin_unlock(&OCFS2_I(inode)->ip_lock); + le32_add_cpu(&fe->i_clusters, -clusters_to_del); + fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); + fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); + le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); + /* tree depth zero, we can just delete the clusters, otherwise + * we need to record the offset of the next level extent block + * as we may overwrite it. */ + if (!el->l_tree_depth) + delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) + + ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(el->l_recs[i].e_clusters)); + else + next_eb = le64_to_cpu(el->l_recs[i].e_blkno); + + if (!el->l_recs[i].e_clusters) { + /* if we deleted the whole extent record, then clear + * out the other fields and update the extent + * list. For depth > 0 trees, we've already recorded + * the extent block in 'next_eb' */ + el->l_recs[i].e_cpos = 0; + el->l_recs[i].e_blkno = 0; + BUG_ON(!el->l_next_free_rec); + le16_add_cpu(&el->l_next_free_rec, -1); + } + + depth = le16_to_cpu(el->l_tree_depth); + if (!fe->i_clusters) { + /* trunc to zero is a special case. 
*/ + el->l_tree_depth = 0; + fe->i_last_eb_blk = 0; + } else if (last_eb) + fe->i_last_eb_blk = last_eb->h_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (last_eb) { + /* If there will be a new last extent block, then by + * definition, there cannot be any leaves to the right of + * him. */ + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + last_eb->h_next_leaf_blk = 0; + status = ocfs2_journal_dirty(handle, last_eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* if our tree depth > 0, update all the tree blocks below us. */ + while (depth) { + mlog(0, "traveling tree (depth = %d, next_eb = %"MLFu64")\n", + depth, next_eb); + status = ocfs2_read_block(osb, next_eb, &eb_bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *)eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + + status = ocfs2_journal_access(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + mlog(0, "extent block %"MLFu64", before: record %d: " + "(%u, %u, %"MLFu64"), next = %u\n", + le64_to_cpu(eb->h_blkno), i, + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(el->l_recs[i].e_clusters), + le64_to_cpu(el->l_recs[i].e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); + le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); + + next_eb = le64_to_cpu(el->l_recs[i].e_blkno); + /* bottom-most block requires us to delete data.*/ + if 
(!el->l_tree_depth) + delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) + + ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(el->l_recs[i].e_clusters)); + if (!el->l_recs[i].e_clusters) { + el->l_recs[i].e_cpos = 0; + el->l_recs[i].e_blkno = 0; + BUG_ON(!el->l_next_free_rec); + le16_add_cpu(&el->l_next_free_rec, -1); + } + mlog(0, "extent block %"MLFu64", after: record %d: " + "(%u, %u, %"MLFu64"), next = %u\n", + le64_to_cpu(eb->h_blkno), i, + le32_to_cpu(el->l_recs[i].e_cpos), + le32_to_cpu(el->l_recs[i].e_clusters), + le64_to_cpu(el->l_recs[i].e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + status = ocfs2_journal_dirty(handle, eb_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (!el->l_next_free_rec) { + mlog(0, "deleting this extent block.\n"); + + ocfs2_remove_from_cache(inode, eb_bh); + + BUG_ON(eb->h_suballoc_slot); + BUG_ON(el->l_recs[0].e_clusters); + BUG_ON(el->l_recs[0].e_cpos); + BUG_ON(el->l_recs[0].e_blkno); + status = ocfs2_free_extent_block(handle, + tc->tc_ext_alloc_inode, + tc->tc_ext_alloc_bh, + eb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + brelse(eb_bh); + eb_bh = NULL; + depth--; + } + + BUG_ON(!delete_blk); + status = ocfs2_truncate_log_append(osb, handle, delete_blk, + clusters_to_del); + if (status < 0) { + mlog_errno(status); + goto bail; + } + status = 0; +bail: + if (!status) + ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); + else + ocfs2_extent_map_drop(inode, 0); + mlog_exit(status); + return status; +} + +/* + * It is expected, that by the time you call this function, + * inode->i_size and fe->i_size have been adjusted. 
+ * + * WARNING: This will kfree the truncate context + */ +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context *tc) +{ + int status, i, credits, tl_sem = 0; + u32 clusters_to_del, target_i_clusters; + u64 last_eb = 0; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *last_eb_bh; + struct ocfs2_journal_handle *handle = NULL; + struct inode *tl_inode = osb->osb_tl_inode; + + mlog_entry_void(); + + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + + last_eb_bh = tc->tc_last_eb_bh; + tc->tc_last_eb_bh = NULL; + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (fe->id2.i_list.l_tree_depth) { + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + el = &eb->h_list; + } else + el = &fe->id2.i_list; + last_eb = le64_to_cpu(fe->i_last_eb_blk); +start: + mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " + "last_eb = %"MLFu64", fe->i_last_eb_blk = %"MLFu64", " + "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", + le32_to_cpu(fe->i_clusters), last_eb, + le64_to_cpu(fe->i_last_eb_blk), + le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); + + if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { + mlog(0, "last_eb changed!\n"); + BUG_ON(!fe->id2.i_list.l_tree_depth); + last_eb = le64_to_cpu(fe->i_last_eb_blk); + /* i_last_eb_blk may have changed, read it if + * necessary. We don't have to worry about the + * truncate to zero case here (where there becomes no + * last_eb) because we never loop back after our work + * is done. 
*/ + if (last_eb_bh) { + brelse(last_eb_bh); + last_eb_bh = NULL; + } + + status = ocfs2_read_block(osb, last_eb, + &last_eb_bh, OCFS2_BH_CACHED, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + } + + /* by now, el will point to the extent list on the bottom most + * portion of this tree. */ + i = le16_to_cpu(el->l_next_free_rec) - 1; + if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) + clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); + else + clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + + le32_to_cpu(el->l_recs[i].e_cpos)) - + target_i_clusters; + + mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); + + mutex_lock(&tl_inode->i_mutex); + tl_sem = 1; + /* ocfs2_truncate_log_needs_flush guarantees us at least one + * record is free for use. If there isn't any, we flush to get + * an empty truncate log. 
*/ + if (ocfs2_truncate_log_needs_flush(osb)) { + status = __ocfs2_flush_truncate_log(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, + fe, el); + handle = ocfs2_start_trans(osb, NULL, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) + mlog_errno(status); + + status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, + last_eb_bh, handle, tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mutex_unlock(&tl_inode->i_mutex); + tl_sem = 0; + + ocfs2_commit_trans(handle); + handle = NULL; + + BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); + if (le32_to_cpu(fe->i_clusters) > target_i_clusters) + goto start; +bail: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_schedule_truncate_log_flush(osb, 1); + + if (tl_sem) + mutex_unlock(&tl_inode->i_mutex); + + if (handle) + ocfs2_commit_trans(handle); + + if (last_eb_bh) + brelse(last_eb_bh); + + /* This will drop the ext_alloc cluster lock for us */ + ocfs2_free_truncate_context(tc); + + mlog_exit(status); + return status; +} + + +/* + * Expects the inode to already be locked. This will figure out which + * inodes need to be locked and will put them on the returned truncate + * context. 
+ */ +int ocfs2_prepare_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context **tc) +{ + int status, metadata_delete; + unsigned int new_i_clusters; + struct ocfs2_dinode *fe; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct buffer_head *last_eb_bh = NULL; + struct inode *ext_alloc_inode = NULL; + struct buffer_head *ext_alloc_bh = NULL; + + mlog_entry_void(); + + *tc = NULL; + + new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + i_size_read(inode)); + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" + "%"MLFu64"\n", fe->i_clusters, new_i_clusters, fe->i_size); + + if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { + ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has cluster count " + "%u and size %"MLFu64" whereas struct inode has " + "cluster count %u and size %llu which caused an " + "invalid truncate to %u clusters.", + le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->i_clusters), + le64_to_cpu(fe->i_size), + OCFS2_I(inode)->ip_clusters, i_size_read(inode), + new_i_clusters); + mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres); + status = -EIO; + goto bail; + } + + *tc = kcalloc(1, sizeof(struct ocfs2_truncate_context), GFP_KERNEL); + if (!(*tc)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + metadata_delete = 0; + if (fe->id2.i_list.l_tree_depth) { + /* If we have a tree, then the truncate may result in + * metadata deletes. 
Figure this out from the + * rightmost leaf block.*/ + status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + + brelse(last_eb_bh); + status = -EIO; + goto bail; + } + el = &(eb->h_list); + if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) + metadata_delete = 1; + } + + (*tc)->tc_last_eb_bh = last_eb_bh; + + if (metadata_delete) { + mlog(0, "Will have to delete metadata for this trunc. " + "locking allocator.\n"); + ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0); + if (!ext_alloc_inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + mutex_lock(&ext_alloc_inode->i_mutex); + (*tc)->tc_ext_alloc_inode = ext_alloc_inode; + + status = ocfs2_meta_lock(ext_alloc_inode, + NULL, + &ext_alloc_bh, + 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + (*tc)->tc_ext_alloc_bh = ext_alloc_bh; + (*tc)->tc_ext_alloc_locked = 1; + } + + status = 0; +bail: + if (status < 0) { + if (*tc) + ocfs2_free_truncate_context(*tc); + *tc = NULL; + } + mlog_exit_void(); + return status; +} + +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) +{ + if (tc->tc_ext_alloc_inode) { + if (tc->tc_ext_alloc_locked) + ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); + + mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); + iput(tc->tc_ext_alloc_inode); + } + + if (tc->tc_ext_alloc_bh) + brelse(tc->tc_ext_alloc_bh); + + if (tc->tc_last_eb_bh) + brelse(tc->tc_last_eb_bh); + + kfree(tc); +} diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h new file mode 100644 index 000000000000..12ba897743f4 --- /dev/null +++ b/fs/ocfs2/alloc.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * alloc.h + * + * Function 
prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_ALLOC_H +#define OCFS2_ALLOC_H + +struct ocfs2_alloc_context; +int ocfs2_insert_extent(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 blkno, + u32 new_clusters, + struct ocfs2_alloc_context *meta_ac); +int ocfs2_num_free_extents(struct ocfs2_super *osb, + struct inode *inode, + struct ocfs2_dinode *fe); +/* how many new metadata chunks would an allocation need at maximum? */ +static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) +{ + /* + * Rather than do all the work of determining how much we need + * (involves a ton of reads and locks), just ask for the + * maximal limit. That's a tree depth shift. So, one block for + * level of the tree (current l_tree_depth), one block for the + * new tree_depth==0 extent_block, and one block at the new + * top-of-the tree. 
+ */ + return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; +} + +int ocfs2_truncate_log_init(struct ocfs2_super *osb); +void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb); +void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, + int cancel); +int ocfs2_flush_truncate_log(struct ocfs2_super *osb); +int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **tl_copy); +int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *tl_copy); + +struct ocfs2_truncate_context { + struct inode *tc_ext_alloc_inode; + struct buffer_head *tc_ext_alloc_bh; + int tc_ext_alloc_locked; /* is it cluster locked? */ + /* these get destroyed once it's passed to ocfs2_commit_truncate. */ + struct buffer_head *tc_last_eb_bh; +}; + +int ocfs2_prepare_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context **tc); +int ocfs2_commit_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_truncate_context *tc); + +#endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c new file mode 100644 index 000000000000..8f4467a930a5 --- /dev/null +++ b/fs/ocfs2/aops.c @@ -0,0 +1,643 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <asm/byteorder.h> + +#define MLOG_MASK_PREFIX ML_FILE_IO +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "super.h" +#include "symlink.h" + +#include "buffer_head_io.h" + +static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int status; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *buffer_cache_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + void *kaddr; + + mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, + (unsigned long long)iblock, bh_result, create); + + BUG_ON(ocfs2_inode_is_fast_symlink(inode)); + + if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { + mlog(ML_ERROR, "block offset > PATH_MAX: %llu", + (unsigned long long)iblock); + goto bail; + } + + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, + &bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + fe = (struct ocfs2_dinode *) bh->b_data; + + if (!OCFS2_IS_VALID_DINODE(fe)) { + mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", + fe->i_blkno, 7, fe->i_signature); + goto bail; + } + + if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(fe->i_clusters))) { + mlog(ML_ERROR, "block offset is outside the allocated size: " + "%llu\n", (unsigned long long)iblock); + goto bail; + } + + /* We don't use the page cache to create symlink data, so if + * need be, copy it 
over from the buffer cache. */ + if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { + u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + + iblock; + buffer_cache_bh = sb_getblk(osb->sb, blkno); + if (!buffer_cache_bh) { + mlog(ML_ERROR, "couldn't getblock for symlink!\n"); + goto bail; + } + + /* we haven't locked out transactions, so a commit + * could've happened. Since we've got a reference on + * the bh, even if it commits while we're doing the + * copy, the data is still good. */ + if (buffer_jbd(buffer_cache_bh) + && ocfs2_inode_is_new(inode)) { + kaddr = kmap_atomic(bh_result->b_page, KM_USER0); + if (!kaddr) { + mlog(ML_ERROR, "couldn't kmap!\n"); + /* drop the sb_getblk() reference taken above; + * the bail path only releases bh */ + brelse(buffer_cache_bh); + goto bail; + } + memcpy(kaddr + (bh_result->b_size * iblock), + buffer_cache_bh->b_data, + bh_result->b_size); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh_result); + } + brelse(buffer_cache_bh); + } + + map_bh(bh_result, inode->i_sb, + le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); + + err = 0; + +bail: + if (bh) + brelse(bh); + + mlog_exit(err); + return err; +} + +/* + * get_block callback: map logical block iblock of inode to its disk + * block in bh_result. Symlink inodes are handed off to + * ocfs2_symlink_get_block(). Every failure is reported to the caller + * as -EIO. + */ +static int ocfs2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int err = 0; + /* p_blkno is printed in the lookup error path, so give it a + * defined value in case the lookup fails without filling it in */ + u64 p_blkno = 0, past_eof; + + mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, + (unsigned long long)iblock, bh_result, create); + + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) + mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", + inode, inode->i_ino); + + if (S_ISLNK(inode->i_mode)) { + /* this always does I/O for some reason. */ + err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); + goto bail; + } + + /* this can happen if another node truncs after our extend! 
*/ + spin_lock(&OCFS2_I(inode)->ip_lock); + if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb, + OCFS2_I(inode)->ip_clusters)) + err = -EIO; + spin_unlock(&OCFS2_I(inode)->ip_lock); + if (err) + goto bail; + + err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, + NULL); + if (err) { + mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " + "%"MLFu64", NULL)\n", err, inode, + (unsigned long long)iblock, p_blkno); + goto bail; + } + + map_bh(bh_result, inode->i_sb, p_blkno); + + if (bh_result->b_blocknr == 0) { + err = -EIO; + mlog(ML_ERROR, "iblock = %llu p_blkno = %"MLFu64" " + "blkno=(%"MLFu64")\n", (unsigned long long)iblock, + p_blkno, OCFS2_I(inode)->ip_blkno); + } + + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + mlog(0, "Inode %lu, past_eof = %"MLFu64"\n", inode->i_ino, past_eof); + + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + +bail: + if (err < 0) + err = -EIO; + + mlog_exit(err); + return err; +} + +static int ocfs2_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; + int ret, unlock = 1; + + mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); + + ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); + if (ret != 0) { + if (ret == AOP_TRUNCATED_PAGE) + unlock = 0; + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + /* + * i_size might have just been updated as we grabed the meta lock. We + * might now be discovering a truncate that hit on another node. + * block_read_full_page->get_block freaks out if it is asked to read + * beyond the end of a file, so we check here. Callers + * (generic_file_read, fault->nopage) are clever enough to check i_size + * and notice that the page they just read isn't needed. + * + * XXX sys_readahead() seems to get that wrong? 
+ */ + if (start >= i_size_read(inode)) { + char *addr = kmap(page); + memset(addr, 0, PAGE_SIZE); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + ret = 0; + goto out_alloc; + } + + ret = ocfs2_data_lock_with_page(inode, 0, page); + if (ret != 0) { + if (ret == AOP_TRUNCATED_PAGE) + unlock = 0; + mlog_errno(ret); + goto out_alloc; + } + + ret = block_read_full_page(page, ocfs2_get_block); + unlock = 0; + + ocfs2_data_unlock(inode, 0); +out_alloc: + up_read(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_meta_unlock(inode, 0); +out: + if (unlock) + unlock_page(page); + mlog_exit(ret); + return ret; +} + +/* Note: Because we don't support holes, our allocation has + * already happened (allocation writes zeros to the file data) + * so we don't have to worry about ordered writes in + * ocfs2_writepage. + * + * ->writepage is called during the process of invalidating the page cache + * during blocked lock processing. It can't block on any cluster locks + * to during block mapping. It's relying on the fact that the block + * mapping can't have disappeared under the dirty pages that it is + * being asked to write back. + */ +static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret; + + mlog_entry("(0x%p)\n", page); + + ret = block_write_full_page(page, ocfs2_get_block, wbc); + + mlog_exit(ret); + + return ret; +} + +/* + * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called + * from loopback. It must be able to perform its own locking around + * ocfs2_get_block(). 
+ */ +int ocfs2_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + int ret; + + mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); + + ret = ocfs2_meta_lock_with_page(inode, NULL, NULL, 0, page); + if (ret != 0) { + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + ret = block_prepare_write(page, from, to, ocfs2_get_block); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_meta_unlock(inode, 0); +out: + mlog_exit(ret); + return ret; +} + +/* Taken from ext3. We don't necessarily need the full blown + * functionality yet, but IMHO it's better to cut and paste the whole + * thing so we can avoid introducing our own bugs (and easily pick up + * their fixes when they happen) --Mark */ +static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + struct buffer_head *next; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) + { + next = bh->b_this_page; + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * Start an OCFS2_INODE_UPDATE_CREDITS transaction and, when + * ocfs2_should_order_data(inode) is true, run ocfs2_journal_dirty_data + * over the buffers of the page that overlap [from, to). + * + * Returns the transaction handle on success. On failure an ERR_PTR() + * is returned; a transaction that was already started is committed + * before returning. + */ +struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, + struct page *page, + unsigned from, + unsigned to) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_journal_handle *handle = NULL; + int ret = 0; + + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + /* ocfs2_start_trans() reports failure through ERR_PTR(), not NULL */ + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + /* nothing was started, so the cleanup at out: must not + * try to commit the error pointer */ + handle = NULL; + mlog_errno(ret); + goto out; + } + + if (ocfs2_should_order_data(inode)) { + ret = 
walk_page_buffers(handle->k_handle, + page_buffers(page), + from, to, NULL, + ocfs2_journal_dirty_data); + if (ret < 0) + mlog_errno(ret); + } +out: + if (ret) { + if (handle) + ocfs2_commit_trans(handle); + handle = ERR_PTR(ret); + } + return handle; +} + +static int ocfs2_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + int ret, extending = 0, locklevel = 0; + loff_t new_i_size; + struct buffer_head *di_bh = NULL; + struct inode *inode = page->mapping->host; + struct ocfs2_journal_handle *handle = NULL; + + mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); + + /* NOTE: ocfs2_file_aio_write has ensured that it's safe for + * us to sample inode->i_size here without the metadata lock: + * + * 1) We're currently holding the inode alloc lock, so no + * nodes can change it underneath us. + * + * 2) We've had to take the metadata lock at least once + * already to check for extending writes, hence ensuring + * that our current copy is also up to date. + */ + new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + if (new_i_size > i_size_read(inode)) { + extending = 1; + locklevel = 1; + } + + ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page); + if (ret != 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_data_lock_with_page(inode, 1, page); + if (ret != 0) { + mlog_errno(ret); + goto out_unlock_meta; + } + + if (extending) { + handle = ocfs2_start_walk_page_trans(inode, page, from, to); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out_unlock_data; + } + + /* Mark our buffer early. We'd rather catch this error up here + * as opposed to after a successful commit_write which would + * require us to set back inode->i_size. 
*/ + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + } + + /* might update i_size */ + ret = generic_commit_write(file, page, from, to); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + if (extending) { + loff_t size = (u64) i_size_read(inode); + struct ocfs2_dinode *di = + (struct ocfs2_dinode *)di_bh->b_data; + + /* ocfs2_mark_inode_dirty is too heavy to use here. */ + inode->i_blocks = ocfs2_align_bytes_to_sectors(size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di->i_size = cpu_to_le64(size); + di->i_ctime = di->i_mtime = + cpu_to_le64(inode->i_mtime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = + cpu_to_le32(inode->i_mtime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + } + + BUG_ON(extending && (i_size_read(inode) != new_i_size)); + +out_commit: + if (handle) + ocfs2_commit_trans(handle); +out_unlock_data: + ocfs2_data_unlock(inode, 1); +out_unlock_meta: + ocfs2_meta_unlock(inode, locklevel); +out: + if (di_bh) + brelse(di_bh); + + mlog_exit(ret); + return ret; +} + +static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) +{ + sector_t status; + u64 p_blkno = 0; + int err = 0; + struct inode *inode = mapping->host; + + mlog_entry("(block = %llu)\n", (unsigned long long)block); + + /* We don't need to lock journal system files, since they aren't + * accessed concurrently from multiple nodes. 
+ */ + if (!INODE_JOURNAL(inode)) { + err = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + down_read(&OCFS2_I(inode)->ip_alloc_sem); + } + + err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, + NULL); + + if (!INODE_JOURNAL(inode)) { + up_read(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_meta_unlock(inode, 0); + } + + if (err) { + mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", + (unsigned long long)block); + mlog_errno(err); + goto bail; + } + + +bail: + status = err ? 0 : p_blkno; + + mlog_exit((int)status); + + return status; +} + +/* + * TODO: Make this into a generic get_blocks function. + * + * From do_direct_io in direct-io.c: + * "So what we do is to permit the ->get_blocks function to populate + * bh.b_size with the size of IO which is permitted at this offset and + * this i_blkbits." + * + * This function is called directly from get_more_blocks in direct-io.c. + * + * called like this: dio->get_blocks(dio->inode, fs_startblk, + * fs_count, map_bh, dio->rw == WRITE); + */ +static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, int create) +{ + int ret; + u64 vbo_max; /* file offset, max_blocks from iblock */ + u64 p_blkno; + int contig_blocks; + unsigned char blocksize_bits; + + if (!inode || !bh_result) { + mlog(ML_ERROR, "inode or bh_result is null\n"); + return -EIO; + } + + blocksize_bits = inode->i_sb->s_blocksize_bits; + + /* This function won't even be called if the request isn't all + * nicely aligned and of the right size, so there's no need + * for us to check any of that. 
*/ + + vbo_max = ((u64)iblock + max_blocks) << blocksize_bits; + + spin_lock(&OCFS2_I(inode)->ip_lock); + if ((iblock + max_blocks) > + ocfs2_clusters_to_blocks(inode->i_sb, + OCFS2_I(inode)->ip_clusters)) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + ret = -EIO; + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* This figures out the size of the next contiguous block, and + * our logical offset */ + ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, + &contig_blocks); + if (ret) { + mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", + (unsigned long long)iblock); + ret = -EIO; + goto bail; + } + + map_bh(bh_result, inode->i_sb, p_blkno); + + /* make sure we don't map more than max_blocks blocks here as + that's all the kernel will handle at this point. */ + if (max_blocks < contig_blocks) + contig_blocks = max_blocks; + bh_result->b_size = contig_blocks << blocksize_bits; +bail: + return ret; +} + +/* + * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're + * particularly interested in the aio/dio case. Like the core uses + * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from + * truncation on another. 
+ */ +static void ocfs2_dio_end_io(struct kiocb *iocb, + loff_t offset, + ssize_t bytes, + void *private) +{ + struct inode *inode = iocb->ki_filp->f_dentry->d_inode; + + /* this io's submitter should not have unlocked this before we could */ + BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + ocfs2_iocb_clear_rw_locked(iocb); + up_read(&inode->i_alloc_sem); + ocfs2_rw_unlock(inode, 0); +} + +static ssize_t ocfs2_direct_IO(int rw, + struct kiocb *iocb, + const struct iovec *iov, + loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + int ret; + + mlog_entry_void(); + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, + inode->i_sb->s_bdev, iov, offset, + nr_segs, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io); + mlog_exit(ret); + return ret; +} + +struct address_space_operations ocfs2_aops = { + .readpage = ocfs2_readpage, + .writepage = ocfs2_writepage, + .prepare_write = ocfs2_prepare_write, + .commit_write = ocfs2_commit_write, + .bmap = ocfs2_bmap, + .sync_page = block_sync_page, + .direct_IO = ocfs2_direct_IO +}; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h new file mode 100644 index 000000000000..d40456d509a0 --- /dev/null +++ b/fs/ocfs2/aops.h @@ -0,0 +1,41 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_AOPS_H +#define OCFS2_AOPS_H + +int ocfs2_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to); + +struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, + struct page *page, + unsigned from, + unsigned to); + +/* all ocfs2_dio_end_io()'s fault */ +#define ocfs2_iocb_is_rw_locked(iocb) \ + test_bit(0, (unsigned long *)&iocb->private) +#define ocfs2_iocb_set_rw_locked(iocb) \ + set_bit(0, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_rw_locked(iocb) \ + clear_bit(0, (unsigned long *)&iocb->private) + +#endif /* OCFS2_AOPS_H */ diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c new file mode 100644 index 000000000000..d424041b38e9 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.c @@ -0,0 +1,232 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * io.c + * + * Buffer cache handling + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "inode.h" +#include "journal.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, + struct inode *inode) +{ + int ret = 0; + + mlog_entry("(bh->b_blocknr = %llu, inode=%p)\n", + (unsigned long long)bh->b_blocknr, inode); + + BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); + BUG_ON(buffer_jbd(bh)); + + /* No need to check for a soft readonly file system here. non + * journalled writes are only ever done on system files which + * can get modified during recovery even if read-only. */ + if (ocfs2_is_hard_readonly(osb)) { + ret = -EROFS; + goto out; + } + + down(&OCFS2_I(inode)->ip_io_sem); + + lock_buffer(bh); + set_buffer_uptodate(bh); + + /* remove from dirty list before I/O. */ + clear_buffer_dirty(bh); + + get_bh(bh); /* for end_buffer_write_sync() */ + bh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, bh); + + wait_on_buffer(bh); + + if (buffer_uptodate(bh)) { + ocfs2_set_buffer_uptodate(inode, bh); + } else { + /* We don't need to remove the clustered uptodate + * information for this bh as it's not marked locally + * uptodate. 
*/ + ret = -EIO; + brelse(bh); + } + + up(&OCFS2_I(inode)->ip_io_sem); +out: + mlog_exit(ret); + return ret; +} + +int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, + struct buffer_head *bhs[], int flags, + struct inode *inode) +{ + int status = 0; + struct super_block *sb; + int i, ignore_cache = 0; + struct buffer_head *bh; + + mlog_entry("(block=(%"MLFu64"), nr=(%d), flags=%d, inode=%p)\n", + block, nr, flags, inode); + + if (osb == NULL || osb->sb == NULL || bhs == NULL) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + if (nr < 0) { + mlog(ML_ERROR, "asked to read %d blocks!\n", nr); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + if (nr == 0) { + mlog(ML_BH_IO, "No buffers will be read!\n"); + status = 0; + goto bail; + } + + sb = osb->sb; + + if (flags & OCFS2_BH_CACHED && !inode) + flags &= ~OCFS2_BH_CACHED; + + if (inode) + down(&OCFS2_I(inode)->ip_io_sem); + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(sb, block++); + if (bhs[i] == NULL) { + if (inode) + up(&OCFS2_I(inode)->ip_io_sem); + status = -EIO; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + ignore_cache = 0; + + if (flags & OCFS2_BH_CACHED && + !ocfs2_buffer_uptodate(inode, bh)) { + mlog(ML_UPTODATE, + "bh (%llu), inode %"MLFu64" not uptodate\n", + (unsigned long long)bh->b_blocknr, + OCFS2_I(inode)->ip_blkno); + ignore_cache = 1; + } + + /* XXX: Can we ever get this and *not* have the cached + * flag set? */ + if (buffer_jbd(bh)) { + if (!(flags & OCFS2_BH_CACHED) || ignore_cache) + mlog(ML_BH_IO, "trying to sync read a jbd " + "managed bh (blocknr = %llu)\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + mlog(ML_BH_IO, "asking me to sync read a dirty " + "buffer! 
(blocknr = %llu)\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + lock_buffer(bh); + if (buffer_jbd(bh)) { +#ifdef CATCH_BH_JBD_RACES + mlog(ML_ERROR, "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); +#else + unlock_buffer(bh); + continue; +#endif + } + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + bh->b_end_io = end_buffer_read_sync; + if (flags & OCFS2_BH_READAHEAD) + submit_bh(READA, bh); + else + submit_bh(READ, bh); + continue; + } + } + + status = 0; + + for (i = (nr - 1); i >= 0; i--) { + bh = bhs[i]; + + /* We know this can't have changed as we hold the + * inode sem. Avoid doing any work on the bh if the + * journal has it. */ + if (!buffer_jbd(bh)) + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. Don't need to + * remove the clustered uptodate information + * for this bh as it's not marked locally + * uptodate. */ + status = -EIO; + brelse(bh); + bhs[i] = NULL; + continue; + } + + if (inode) + ocfs2_set_buffer_uptodate(inode, bh); + } + if (inode) + up(&OCFS2_I(inode)->ip_io_sem); + + mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr, + (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); + +bail: + + mlog_exit(status); + return status; +} diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h new file mode 100644 index 000000000000..6ecb90937b68 --- /dev/null +++ b/fs/ocfs2/buffer_head_io.h @@ -0,0 +1,73 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_buffer_head.h + * + * Buffer cache handling functions defined + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_BUFFER_HEAD_IO_H +#define OCFS2_BUFFER_HEAD_IO_H + +#include <linux/buffer_head.h> + +void ocfs2_end_buffer_io_sync(struct buffer_head *bh, + int uptodate); + +static inline int ocfs2_read_block(struct ocfs2_super *osb, + u64 off, + struct buffer_head **bh, + int flags, + struct inode *inode); + +int ocfs2_write_block(struct ocfs2_super *osb, + struct buffer_head *bh, + struct inode *inode); +int ocfs2_read_blocks(struct ocfs2_super *osb, + u64 block, + int nr, + struct buffer_head *bhs[], + int flags, + struct inode *inode); + + +#define OCFS2_BH_CACHED 1 +#define OCFS2_BH_READAHEAD 8 /* use this to pass READA down to submit_bh */ + +static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, + struct buffer_head **bh, int flags, + struct inode *inode) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_blocks(osb, off, 1, bh, + flags, inode); + +bail: + return status; +} + +#endif /* OCFS2_BUFFER_HEAD_IO_H */ diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile new file mode 100644 index 000000000000..cdd162f13650 --- /dev/null +++ b/fs/ocfs2/cluster/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_OCFS2_FS) += 
ocfs2_nodemanager.o + +ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ + quorum.o tcp.o ver.o diff --git a/fs/ocfs2/cluster/endian.h b/fs/ocfs2/cluster/endian.h new file mode 100644 index 000000000000..2df9082f4e35 --- /dev/null +++ b/fs/ocfs2/cluster/endian.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_CLUSTER_ENDIAN_H +#define OCFS2_CLUSTER_ENDIAN_H + +static inline void be32_add_cpu(__be32 *var, u32 val) +{ + *var = cpu_to_be32(be32_to_cpu(*var) + val); +} + +#endif /* OCFS2_CLUSTER_ENDIAN_H */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c new file mode 100644 index 000000000000..7307ba528913 --- /dev/null +++ b/fs/ocfs2/cluster/heartbeat.c @@ -0,0 +1,1797 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/jiffies.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/file.h> +#include <linux/kthread.h> +#include <linux/configfs.h> +#include <linux/random.h> +#include <linux/crc32.h> +#include <linux/time.h> + +#include "heartbeat.h" +#include "tcp.h" +#include "nodemanager.h" +#include "quorum.h" + +#include "masklog.h" + + +/* + * The first heartbeat pass had one global thread that would serialize all hb + * callback calls. This global serializing sem should only be removed once + * we've made sure that all callees can deal with being called concurrently + * from multiple hb region threads. + */ +static DECLARE_RWSEM(o2hb_callback_sem); + +/* + * multiple hb threads are watching multiple regions. A node is live + * whenever any of the threads sees activity from the node in its region. 
+ */ +static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED; +static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; +static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +static LIST_HEAD(o2hb_node_events); +static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); + +static LIST_HEAD(o2hb_all_regions); + +static struct o2hb_callback { + struct list_head list; +} o2hb_callbacks[O2HB_NUM_CB]; + +static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); + +#define O2HB_DEFAULT_BLOCK_BITS 9 + +unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; + +/* Only sets a new threshold if there are no active regions. + * + * No locking or otherwise interesting code is required for reading + * o2hb_dead_threshold as it can't change once regions are active and + * it's not interesting to anyone until then anyway. */ +static void o2hb_dead_threshold_set(unsigned int threshold) +{ + if (threshold > O2HB_MIN_DEAD_THRESHOLD) { + spin_lock(&o2hb_live_lock); + if (list_empty(&o2hb_all_regions)) + o2hb_dead_threshold = threshold; + spin_unlock(&o2hb_live_lock); + } +} + +struct o2hb_node_event { + struct list_head hn_item; + enum o2hb_callback_type hn_event_type; + struct o2nm_node *hn_node; + int hn_node_num; +}; + +struct o2hb_disk_slot { + struct o2hb_disk_heartbeat_block *ds_raw_block; + u8 ds_node_num; + u64 ds_last_time; + u64 ds_last_generation; + u16 ds_equal_samples; + u16 ds_changed_samples; + struct list_head ds_live_item; +}; + +/* each thread owns a region.. 
when we're asked to tear down the region + * we ask the thread to stop, who cleans up the region */ +struct o2hb_region { + struct config_item hr_item; + + struct list_head hr_all_item; + unsigned hr_unclean_stop:1; + + /* protected by the hr_callback_sem */ + struct task_struct *hr_task; + + unsigned int hr_blocks; + unsigned long long hr_start_block; + + unsigned int hr_block_bits; + unsigned int hr_block_bytes; + + unsigned int hr_slots_per_page; + unsigned int hr_num_pages; + + struct page **hr_slot_data; + struct block_device *hr_bdev; + struct o2hb_disk_slot *hr_slots; + + /* let the person setting up hb wait for it to return until it + * has reached a 'steady' state. This will be fixed when we have + * a more complete api that doesn't lead to this sort of fragility. */ + atomic_t hr_steady_iterations; + + char hr_dev_name[BDEVNAME_SIZE]; + + unsigned int hr_timeout_ms; + + /* randomized as the region goes up and down so that a node + * recognizes a node going up and down in one iteration */ + u64 hr_generation; + + struct work_struct hr_write_timeout_work; + unsigned long hr_last_timeout_start; + + /* Used during o2hb_check_slot to hold a copy of the block + * being checked because we temporarily have to zero out the + * crc field. 
*/ + struct o2hb_disk_heartbeat_block *hr_tmp_block; +}; + +struct o2hb_bio_wait_ctxt { + atomic_t wc_num_reqs; + struct completion wc_io_complete; +}; + +static void o2hb_write_timeout(void *arg) +{ + struct o2hb_region *reg = arg; + + mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " + "milliseconds\n", reg->hr_dev_name, + jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); + o2quo_disk_timeout(); +} + +static void o2hb_arm_write_timeout(struct o2hb_region *reg) +{ + mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); + + cancel_delayed_work(&reg->hr_write_timeout_work); + reg->hr_last_timeout_start = jiffies; + schedule_delayed_work(&reg->hr_write_timeout_work, + msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); +} + +static void o2hb_disarm_write_timeout(struct o2hb_region *reg) +{ + cancel_delayed_work(&reg->hr_write_timeout_work); + flush_scheduled_work(); +} + +static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, + unsigned int num_ios) +{ + atomic_set(&wc->wc_num_reqs, num_ios); + init_completion(&wc->wc_io_complete); +} + +/* Used in error paths too */ +static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, + unsigned int num) +{ + /* sadly atomic_sub_and_test() isn't available on all platforms. 
The + * good news is that the fast path only completes one at a time */ + while(num--) { + if (atomic_dec_and_test(&wc->wc_num_reqs)) { + BUG_ON(num > 0); + complete(&wc->wc_io_complete); + } + } +} + +static void o2hb_wait_on_io(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *wc) +{ + struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping; + + blk_run_address_space(mapping); + + wait_for_completion(&wc->wc_io_complete); +} + +static int o2hb_bio_end_io(struct bio *bio, + unsigned int bytes_done, + int error) +{ + struct o2hb_bio_wait_ctxt *wc = bio->bi_private; + + if (error) + mlog(ML_ERROR, "IO Error %d\n", error); + + if (bio->bi_size) + return 1; + + o2hb_bio_wait_dec(wc, 1); + return 0; +} + +/* Setup a Bio to cover I/O against num_slots slots starting at + * start_slot. */ +static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, + struct o2hb_bio_wait_ctxt *wc, + unsigned int start_slot, + unsigned int num_slots) +{ + int i, nr_vecs, len, first_page, last_page; + unsigned int vec_len, vec_start; + unsigned int bits = reg->hr_block_bits; + unsigned int spp = reg->hr_slots_per_page; + struct bio *bio; + struct page *page; + + nr_vecs = (num_slots + spp - 1) / spp; + + /* Testing has shown this allocation to take long enough under + * GFP_KERNEL that the local node can get fenced. It would be + * nicest if we could pre-allocate these bios and avoid this + * all together. */ + bio = bio_alloc(GFP_ATOMIC, nr_vecs); + if (!bio) { + mlog(ML_ERROR, "Could not alloc slots BIO!\n"); + bio = ERR_PTR(-ENOMEM); + goto bail; + } + + /* Must put everything in 512 byte sectors for the bio... 
*/ + bio->bi_sector = (reg->hr_start_block + start_slot) << (bits - 9); + bio->bi_bdev = reg->hr_bdev; + bio->bi_private = wc; + bio->bi_end_io = o2hb_bio_end_io; + + first_page = start_slot / spp; + last_page = first_page + nr_vecs; + vec_start = (start_slot << bits) % PAGE_CACHE_SIZE; + for(i = first_page; i < last_page; i++) { + page = reg->hr_slot_data[i]; + + vec_len = PAGE_CACHE_SIZE; + /* last page might be short */ + if (((i + 1) * spp) > (start_slot + num_slots)) + vec_len = ((num_slots + start_slot) % spp) << bits; + vec_len -= vec_start; + + mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", + i, vec_len, vec_start); + + len = bio_add_page(bio, page, vec_len, vec_start); + if (len != vec_len) { + bio_put(bio); + bio = ERR_PTR(-EIO); + + mlog(ML_ERROR, "Error adding page to bio i = %d, " + "vec_len = %u, len = %d\n, start = %u\n", + i, vec_len, len, vec_start); + goto bail; + } + + vec_start = 0; + } + +bail: + return bio; +} + +/* + * Compute the maximum number of sectors the bdev can handle in one bio, + * as a power of two. + * + * Stolen from oracleasm, thanks Joel! + */ +static int compute_max_sectors(struct block_device *bdev) +{ + int max_pages, max_sectors, pow_two_sectors; + + struct request_queue *q; + + q = bdev_get_queue(bdev); + max_pages = q->max_sectors >> (PAGE_SHIFT - 9); + if (max_pages > BIO_MAX_PAGES) + max_pages = BIO_MAX_PAGES; + if (max_pages > q->max_phys_segments) + max_pages = q->max_phys_segments; + if (max_pages > q->max_hw_segments) + max_pages = q->max_hw_segments; + max_pages--; /* Handle I/Os that straddle a page */ + + max_sectors = max_pages << (PAGE_SHIFT - 9); + + /* Why is fls() 1-based???? 
*/ + pow_two_sectors = 1 << (fls(max_sectors) - 1); + + return pow_two_sectors; +} + +static inline void o2hb_compute_request_limits(struct o2hb_region *reg, + unsigned int num_slots, + unsigned int *num_bios, + unsigned int *slots_per_bio) +{ + unsigned int max_sectors, io_sectors; + + max_sectors = compute_max_sectors(reg->hr_bdev); + + io_sectors = num_slots << (reg->hr_block_bits - 9); + + *num_bios = (io_sectors + max_sectors - 1) / max_sectors; + *slots_per_bio = max_sectors >> (reg->hr_block_bits - 9); + + mlog(ML_HB_BIO, "My io size is %u sectors for %u slots. This " + "device can handle %u sectors of I/O\n", io_sectors, num_slots, + max_sectors); + mlog(ML_HB_BIO, "Will need %u bios holding %u slots each\n", + *num_bios, *slots_per_bio); +} + +static int o2hb_read_slots(struct o2hb_region *reg, + unsigned int max_slots) +{ + unsigned int num_bios, slots_per_bio, start_slot, num_slots; + int i, status; + struct o2hb_bio_wait_ctxt wc; + struct bio **bios; + struct bio *bio; + + o2hb_compute_request_limits(reg, max_slots, &num_bios, &slots_per_bio); + + bios = kcalloc(num_bios, sizeof(struct bio *), GFP_KERNEL); + if (!bios) { + status = -ENOMEM; + mlog_errno(status); + return status; + } + + o2hb_bio_wait_init(&wc, num_bios); + + num_slots = slots_per_bio; + for(i = 0; i < num_bios; i++) { + start_slot = i * slots_per_bio; + + /* adjust num_slots at last bio */ + if (max_slots < (start_slot + num_slots)) + num_slots = max_slots - start_slot; + + bio = o2hb_setup_one_bio(reg, &wc, start_slot, num_slots); + if (IS_ERR(bio)) { + o2hb_bio_wait_dec(&wc, num_bios - i); + + status = PTR_ERR(bio); + mlog_errno(status); + goto bail_and_wait; + } + bios[i] = bio; + + submit_bio(READ, bio); + } + + status = 0; + +bail_and_wait: + o2hb_wait_on_io(reg, &wc); + + if (bios) { + for(i = 0; i < num_bios; i++) + if (bios[i]) + bio_put(bios[i]); + kfree(bios); + } + + return status; +} + +static int o2hb_issue_node_write(struct o2hb_region *reg, + struct bio **write_bio, + 
struct o2hb_bio_wait_ctxt *write_wc) +{ + int status; + unsigned int slot; + struct bio *bio; + + o2hb_bio_wait_init(write_wc, 1); + + slot = o2nm_this_node(); + + bio = o2hb_setup_one_bio(reg, write_wc, slot, 1); + if (IS_ERR(bio)) { + status = PTR_ERR(bio); + mlog_errno(status); + goto bail; + } + + submit_bio(WRITE, bio); + + *write_bio = bio; + status = 0; +bail: + return status; +} + +static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg, + struct o2hb_disk_heartbeat_block *hb_block) +{ + __le32 old_cksum; + u32 ret; + + /* We want to compute the block crc with a 0 value in the + * hb_cksum field. Save it off here and replace after the + * crc. */ + old_cksum = hb_block->hb_cksum; + hb_block->hb_cksum = 0; + + ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes); + + hb_block->hb_cksum = old_cksum; + + return ret; +} + +static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block) +{ + mlog(ML_ERROR, "Dump slot information: seq = 0x%"MLFx64", node = %u, " + "cksum = 0x%x, generation 0x%"MLFx64"\n", + le64_to_cpu(hb_block->hb_seq), hb_block->hb_node, + le32_to_cpu(hb_block->hb_cksum), + le64_to_cpu(hb_block->hb_generation)); +} + +static int o2hb_verify_crc(struct o2hb_region *reg, + struct o2hb_disk_heartbeat_block *hb_block) +{ + u32 read, computed; + + read = le32_to_cpu(hb_block->hb_cksum); + computed = o2hb_compute_block_crc_le(reg, hb_block); + + return read == computed; +} + +/* We want to make sure that nobody is heartbeating on top of us -- + * this will help detect an invalid configuration. 
*/ +static int o2hb_check_last_timestamp(struct o2hb_region *reg) +{ + int node_num, ret; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + node_num = o2nm_this_node(); + + ret = 1; + slot = &reg->hr_slots[node_num]; + /* Don't check on our 1st timestamp */ + if (slot->ds_last_time) { + hb_block = slot->ds_raw_block; + + if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time) + ret = 0; + } + + return ret; +} + +static inline void o2hb_prepare_block(struct o2hb_region *reg, + u64 generation) +{ + int node_num; + u64 cputime; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + node_num = o2nm_this_node(); + slot = &reg->hr_slots[node_num]; + + hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; + memset(hb_block, 0, reg->hr_block_bytes); + /* TODO: time stuff */ + cputime = CURRENT_TIME.tv_sec; + if (!cputime) + cputime = 1; + + hb_block->hb_seq = cpu_to_le64(cputime); + hb_block->hb_node = node_num; + hb_block->hb_generation = cpu_to_le64(generation); + + /* This step must always happen last! 
*/ + hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, + hb_block)); + + mlog(ML_HB_BIO, "our node generation = 0x%"MLFx64", cksum = 0x%x\n", + cpu_to_le64(generation), le32_to_cpu(hb_block->hb_cksum)); +} + +static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, + struct o2nm_node *node, + int idx) +{ + struct list_head *iter; + struct o2hb_callback_func *f; + + list_for_each(iter, &hbcall->list) { + f = list_entry(iter, struct o2hb_callback_func, hc_item); + mlog(ML_HEARTBEAT, "calling funcs %p\n", f); + (f->hc_func)(node, idx, f->hc_data); + } +} + +/* Will run the list in order until we process the passed event */ +static void o2hb_run_event_list(struct o2hb_node_event *queued_event) +{ + int empty; + struct o2hb_callback *hbcall; + struct o2hb_node_event *event; + + spin_lock(&o2hb_live_lock); + empty = list_empty(&queued_event->hn_item); + spin_unlock(&o2hb_live_lock); + if (empty) + return; + + /* Holding callback sem assures we don't alter the callback + * lists when doing this, and serializes ourselves with other + * processes wanting callbacks. */ + down_write(&o2hb_callback_sem); + + spin_lock(&o2hb_live_lock); + while (!list_empty(&o2hb_node_events) + && !list_empty(&queued_event->hn_item)) { + event = list_entry(o2hb_node_events.next, + struct o2hb_node_event, + hn_item); + list_del_init(&event->hn_item); + spin_unlock(&o2hb_live_lock); + + mlog(ML_HEARTBEAT, "Node %s event for %d\n", + event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN", + event->hn_node_num); + + hbcall = hbcall_from_type(event->hn_event_type); + + /* We should *never* have gotten on to the list with a + * bad type... This isn't something that we should try + * to recover from. 
*/ + BUG_ON(IS_ERR(hbcall)); + + o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num); + + spin_lock(&o2hb_live_lock); + } + spin_unlock(&o2hb_live_lock); + + up_write(&o2hb_callback_sem); +} + +static void o2hb_queue_node_event(struct o2hb_node_event *event, + enum o2hb_callback_type type, + struct o2nm_node *node, + int node_num) +{ + assert_spin_locked(&o2hb_live_lock); + + event->hn_event_type = type; + event->hn_node = node; + event->hn_node_num = node_num; + + mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n", + type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num); + + list_add_tail(&event->hn_item, &o2hb_node_events); +} + +static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) +{ + struct o2hb_node_event event = + { .hn_item = LIST_HEAD_INIT(event.hn_item), }; + struct o2nm_node *node; + + node = o2nm_get_node_by_num(slot->ds_node_num); + if (!node) + return; + + spin_lock(&o2hb_live_lock); + if (!list_empty(&slot->ds_live_item)) { + mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n", + slot->ds_node_num); + + list_del_init(&slot->ds_live_item); + + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, + slot->ds_node_num); + } + } + spin_unlock(&o2hb_live_lock); + + o2hb_run_event_list(&event); + + o2nm_node_put(node); +} + +static int o2hb_check_slot(struct o2hb_region *reg, + struct o2hb_disk_slot *slot) +{ + int changed = 0, gen_changed = 0; + struct o2hb_node_event event = + { .hn_item = LIST_HEAD_INIT(event.hn_item), }; + struct o2nm_node *node; + struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; + u64 cputime; + + memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); + + /* Is this correct? Do we assume that the node doesn't exist + * if we're not configured for him? 
*/ + node = o2nm_get_node_by_num(slot->ds_node_num); + if (!node) + return 0; + + if (!o2hb_verify_crc(reg, hb_block)) { + /* all paths from here will drop o2hb_live_lock for + * us. */ + spin_lock(&o2hb_live_lock); + + /* Don't print an error on the console in this case - + * a freshly formatted heartbeat area will not have a + * crc set on it. */ + if (list_empty(&slot->ds_live_item)) + goto out; + + /* The node is live but pushed out a bad crc. We + * consider it a transient miss but don't populate any + * other values as they may be junk. */ + mlog(ML_ERROR, "Node %d has written a bad crc to %s\n", + slot->ds_node_num, reg->hr_dev_name); + o2hb_dump_slot(hb_block); + + slot->ds_equal_samples++; + goto fire_callbacks; + } + + /* we don't care if these wrap.. the state transitions below + * clear at the right places */ + cputime = le64_to_cpu(hb_block->hb_seq); + if (slot->ds_last_time != cputime) + slot->ds_changed_samples++; + else + slot->ds_equal_samples++; + slot->ds_last_time = cputime; + + /* The node changed heartbeat generations. We assume this to + * mean it dropped off but came back before we timed out. We + * want to consider it down for the time being but don't want + * to lose any changed_samples state we might build up to + * considering it live again. 
*/ + if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) { + gen_changed = 1; + slot->ds_equal_samples = 0; + mlog(ML_HEARTBEAT, "Node %d changed generation (0x%"MLFx64" " + "to 0x%"MLFx64")\n", slot->ds_node_num, + slot->ds_last_generation, + le64_to_cpu(hb_block->hb_generation)); + } + + slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); + + mlog(ML_HEARTBEAT, "Slot %d gen 0x%"MLFx64" cksum 0x%x " + "seq %"MLFu64" last %"MLFu64" changed %u equal %u\n", + slot->ds_node_num, slot->ds_last_generation, + le32_to_cpu(hb_block->hb_cksum), le64_to_cpu(hb_block->hb_seq), + slot->ds_last_time, slot->ds_changed_samples, + slot->ds_equal_samples); + + spin_lock(&o2hb_live_lock); + +fire_callbacks: + /* dead nodes only come to life after some number of + * changes at any time during their dead time */ + if (list_empty(&slot->ds_live_item) && + slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) { + mlog(ML_HEARTBEAT, "Node %d (id 0x%"MLFx64") joined my " + "region\n", slot->ds_node_num, slot->ds_last_generation); + + /* first on the list generates a callback */ + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + set_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node, + slot->ds_node_num); + + changed = 1; + } + + list_add_tail(&slot->ds_live_item, + &o2hb_live_slots[slot->ds_node_num]); + + slot->ds_equal_samples = 0; + goto out; + } + + /* if the list is dead, we're done.. */ + if (list_empty(&slot->ds_live_item)) + goto out; + + /* live nodes only go dead after enough consequtive missed + * samples.. 
reset the missed counter whenever we see + * activity */ + if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { + mlog(ML_HEARTBEAT, "Node %d left my region\n", + slot->ds_node_num); + + /* last off the live_slot generates a callback */ + list_del_init(&slot->ds_live_item); + if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { + clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); + + o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, + slot->ds_node_num); + + changed = 1; + } + + /* We don't clear this because the node is still + * actually writing new blocks. */ + if (!gen_changed) + slot->ds_changed_samples = 0; + goto out; + } + if (slot->ds_changed_samples) { + slot->ds_changed_samples = 0; + slot->ds_equal_samples = 0; + } +out: + spin_unlock(&o2hb_live_lock); + + o2hb_run_event_list(&event); + + o2nm_node_put(node); + return changed; +} + +/* This could be faster if we just implmented a find_last_bit, but I + * don't think the circumstances warrant it. */ +static int o2hb_highest_node(unsigned long *nodes, + int numbits) +{ + int highest, node; + + highest = numbits; + node = -1; + while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) { + if (node >= numbits) + break; + + highest = node; + } + + return highest; +} + +static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) +{ + int i, ret, highest_node, change = 0; + unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; + struct bio *write_bio; + struct o2hb_bio_wait_ctxt write_wc; + + if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) + return; + + highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); + if (highest_node >= O2NM_MAX_NODES) { + mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); + return; + } + + /* No sense in reading the slots of nodes that don't exist + * yet. Of course, if the node definitions have holes in them + * then we're reading an empty slot anyway... Consider this + * best-effort. 
*/ + ret = o2hb_read_slots(reg, highest_node + 1); + if (ret < 0) { + mlog_errno(ret); + return; + } + + /* With an up to date view of the slots, we can check that no + * other node has been improperly configured to heartbeat in + * our slot. */ + if (!o2hb_check_last_timestamp(reg)) + mlog(ML_ERROR, "Device \"%s\": another node is heartbeating " + "in our slot!\n", reg->hr_dev_name); + + /* fill in the proper info for our next heartbeat */ + o2hb_prepare_block(reg, reg->hr_generation); + + /* And fire off the write. Note that we don't wait on this I/O + * until later. */ + ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); + if (ret < 0) { + mlog_errno(ret); + return; + } + + i = -1; + while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + + change |= o2hb_check_slot(reg, ®->hr_slots[i]); + } + + /* + * We have to be sure we've advertised ourselves on disk + * before we can go to steady state. This ensures that + * people we find in our steady state have seen us. + */ + o2hb_wait_on_io(reg, &write_wc); + bio_put(write_bio); + o2hb_arm_write_timeout(reg); + + /* let the person who launched us know when things are steady */ + if (!change && (atomic_read(®->hr_steady_iterations) != 0)) { + if (atomic_dec_and_test(®->hr_steady_iterations)) + wake_up(&o2hb_steady_queue); + } +} + +/* Subtract b from a, storing the result in a. a *must* have a larger + * value than b. 
*/ +static void o2hb_tv_subtract(struct timeval *a, + struct timeval *b) +{ + /* just return 0 when a is after b */ + if (a->tv_sec < b->tv_sec || + (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { + a->tv_sec = 0; + a->tv_usec = 0; + return; + } + + a->tv_sec -= b->tv_sec; + a->tv_usec -= b->tv_usec; + while ( a->tv_usec < 0 ) { + a->tv_sec--; + a->tv_usec += 1000000; + } +} + +static unsigned int o2hb_elapsed_msecs(struct timeval *start, + struct timeval *end) +{ + struct timeval res = *end; + + o2hb_tv_subtract(&res, start); + + return res.tv_sec * 1000 + res.tv_usec / 1000; +} + +/* + * we ride the region ref that the region dir holds. before the region + * dir is removed and drops it ref it will wait to tear down this + * thread. + */ +static int o2hb_thread(void *data) +{ + int i, ret; + struct o2hb_region *reg = data; + struct bio *write_bio; + struct o2hb_bio_wait_ctxt write_wc; + struct timeval before_hb, after_hb; + unsigned int elapsed_msec; + + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); + + set_user_nice(current, -20); + + while (!kthread_should_stop() && !reg->hr_unclean_stop) { + /* We track the time spent inside + * o2hb_do_disk_heartbeat so that we avoid more then + * hr_timeout_ms between disk writes. On busy systems + * this should result in a heartbeat which is less + * likely to time itself out. */ + do_gettimeofday(&before_hb); + + o2hb_do_disk_heartbeat(reg); + + do_gettimeofday(&after_hb); + elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + + mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", + before_hb.tv_sec, before_hb.tv_usec, + after_hb.tv_sec, after_hb.tv_usec, elapsed_msec); + + if (elapsed_msec < reg->hr_timeout_ms) { + /* the kthread api has blocked signals for us so no + * need to record the return value. 
*/ + msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); + } + } + + o2hb_disarm_write_timeout(reg); + + /* unclean stop is only used in very bad situation */ + for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) + o2hb_shutdown_slot(®->hr_slots[i]); + + /* Explicit down notification - avoid forcing the other nodes + * to timeout on this region when we could just as easily + * write a clear generation - thus indicating to them that + * this node has left this region. + * + * XXX: Should we skip this on unclean_stop? */ + o2hb_prepare_block(reg, 0); + ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); + if (ret == 0) { + o2hb_wait_on_io(reg, &write_wc); + bio_put(write_bio); + } else { + mlog_errno(ret); + } + + mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); + + return 0; +} + +void o2hb_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++) + INIT_LIST_HEAD(&o2hb_callbacks[i].list); + + for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) + INIT_LIST_HEAD(&o2hb_live_slots[i]); + + INIT_LIST_HEAD(&o2hb_node_events); + + memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); +} + +/* if we're already in a callback then we're already serialized by the sem */ +static void o2hb_fill_node_map_from_callback(unsigned long *map, + unsigned bytes) +{ + BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); + + memcpy(map, &o2hb_live_node_bitmap, bytes); +} + +/* + * get a map of all nodes that are heartbeating in any regions + */ +void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +{ + /* callers want to serialize this map and callbacks so that they + * can trust that they don't miss nodes coming to the party */ + down_read(&o2hb_callback_sem); + spin_lock(&o2hb_live_lock); + o2hb_fill_node_map_from_callback(map, bytes); + spin_unlock(&o2hb_live_lock); + up_read(&o2hb_callback_sem); +} +EXPORT_SYMBOL_GPL(o2hb_fill_node_map); + +/* + * heartbeat configfs bits. 
The heartbeat set is a default set under + * the cluster set in nodemanager.c. + */ + +static struct o2hb_region *to_o2hb_region(struct config_item *item) +{ + return item ? container_of(item, struct o2hb_region, hr_item) : NULL; +} + +/* drop_item only drops its ref after killing the thread, nothing should + * be using the region anymore. this has to clean up any state that + * attributes might have built up. */ +static void o2hb_region_release(struct config_item *item) +{ + int i; + struct page *page; + struct o2hb_region *reg = to_o2hb_region(item); + + if (reg->hr_tmp_block) + kfree(reg->hr_tmp_block); + + if (reg->hr_slot_data) { + for (i = 0; i < reg->hr_num_pages; i++) { + page = reg->hr_slot_data[i]; + if (page) + __free_page(page); + } + kfree(reg->hr_slot_data); + } + + if (reg->hr_bdev) + blkdev_put(reg->hr_bdev); + + if (reg->hr_slots) + kfree(reg->hr_slots); + + spin_lock(&o2hb_live_lock); + list_del(®->hr_all_item); + spin_unlock(&o2hb_live_lock); + + kfree(reg); +} + +static int o2hb_read_block_input(struct o2hb_region *reg, + const char *page, + size_t count, + unsigned long *ret_bytes, + unsigned int *ret_bits) +{ + unsigned long bytes; + char *p = (char *)page; + + bytes = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + /* Heartbeat and fs min / max block sizes are the same. 
*/ + if (bytes > 4096 || bytes < 512) + return -ERANGE; + if (hweight16(bytes) != 1) + return -EINVAL; + + if (ret_bytes) + *ret_bytes = bytes; + if (ret_bits) + *ret_bits = ffs(bytes) - 1; + + return 0; +} + +static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%u\n", reg->hr_block_bytes); +} + +static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + int status; + unsigned long block_bytes; + unsigned int block_bits; + + if (reg->hr_bdev) + return -EINVAL; + + status = o2hb_read_block_input(reg, page, count, + &block_bytes, &block_bits); + if (status) + return status; + + reg->hr_block_bytes = (unsigned int)block_bytes; + reg->hr_block_bits = block_bits; + + return count; +} + +static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%llu\n", reg->hr_start_block); +} + +static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + unsigned long long tmp; + char *p = (char *)page; + + if (reg->hr_bdev) + return -EINVAL; + + tmp = simple_strtoull(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + reg->hr_start_block = tmp; + + return count; +} + +static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, + char *page) +{ + return sprintf(page, "%d\n", reg->hr_blocks); +} + +static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + if (reg->hr_bdev) + return -EINVAL; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp > O2NM_MAX_NODES || tmp == 0) + return -ERANGE; + + reg->hr_blocks = (unsigned int)tmp; + + return count; +} + +static ssize_t o2hb_region_dev_read(struct o2hb_region *reg, + char *page) +{ + unsigned int ret = 0; + + if (reg->hr_bdev) + ret = sprintf(page, "%s\n", 
reg->hr_dev_name); + + return ret; +} + +static void o2hb_init_region_params(struct o2hb_region *reg) +{ + reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; + reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; + + mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", + reg->hr_start_block, reg->hr_blocks); + mlog(ML_HEARTBEAT, "hr_block_bytes = %u, hr_block_bits = %u\n", + reg->hr_block_bytes, reg->hr_block_bits); + mlog(ML_HEARTBEAT, "hr_timeout_ms = %u\n", reg->hr_timeout_ms); + mlog(ML_HEARTBEAT, "dead threshold = %u\n", o2hb_dead_threshold); +} + +static int o2hb_map_slot_data(struct o2hb_region *reg) +{ + int i, j; + unsigned int last_slot; + unsigned int spp = reg->hr_slots_per_page; + struct page *page; + char *raw; + struct o2hb_disk_slot *slot; + + reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); + if (reg->hr_tmp_block == NULL) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + reg->hr_slots = kcalloc(reg->hr_blocks, + sizeof(struct o2hb_disk_slot), GFP_KERNEL); + if (reg->hr_slots == NULL) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + for(i = 0; i < reg->hr_blocks; i++) { + slot = ®->hr_slots[i]; + slot->ds_node_num = i; + INIT_LIST_HEAD(&slot->ds_live_item); + slot->ds_raw_block = NULL; + } + + reg->hr_num_pages = (reg->hr_blocks + spp - 1) / spp; + mlog(ML_HEARTBEAT, "Going to require %u pages to cover %u blocks " + "at %u blocks per page\n", + reg->hr_num_pages, reg->hr_blocks, spp); + + reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), + GFP_KERNEL); + if (!reg->hr_slot_data) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + for(i = 0; i < reg->hr_num_pages; i++) { + page = alloc_page(GFP_KERNEL); + if (!page) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + + reg->hr_slot_data[i] = page; + + last_slot = i * spp; + raw = page_address(page); + for (j = 0; + (j < spp) && ((j + last_slot) < reg->hr_blocks); + j++) { + BUG_ON((j + last_slot) >= reg->hr_blocks); + + slot = ®->hr_slots[j + last_slot]; 
+ slot->ds_raw_block = + (struct o2hb_disk_heartbeat_block *) raw; + + raw += reg->hr_block_bytes; + } + } + + return 0; +} + +/* Read in all the slots available and populate the tracking + * structures so that we can start with a baseline idea of what's + * there. */ +static int o2hb_populate_slot_data(struct o2hb_region *reg) +{ + int ret, i; + struct o2hb_disk_slot *slot; + struct o2hb_disk_heartbeat_block *hb_block; + + mlog_entry_void(); + + ret = o2hb_read_slots(reg, reg->hr_blocks); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* We only want to get an idea of the values initially in each + * slot, so we do no verification - o2hb_check_slot will + * actually determine if each configured slot is valid and + * whether any values have changed. */ + for(i = 0; i < reg->hr_blocks; i++) { + slot = ®->hr_slots[i]; + hb_block = (struct o2hb_disk_heartbeat_block *) slot->ds_raw_block; + + /* Only fill the values that o2hb_check_slot uses to + * determine changing slots */ + slot->ds_last_time = le64_to_cpu(hb_block->hb_seq); + slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation); + } + +out: + mlog_exit(ret); + return ret; +} + +/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ +static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, + const char *page, + size_t count) +{ + long fd; + int sectsize; + char *p = (char *)page; + struct file *filp = NULL; + struct inode *inode = NULL; + ssize_t ret = -EINVAL; + + if (reg->hr_bdev) + goto out; + + /* We can't heartbeat without having had our node number + * configured yet. 
*/ + if (o2nm_this_node() == O2NM_MAX_NODES) + goto out; + + fd = simple_strtol(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + goto out; + + if (fd < 0 || fd >= INT_MAX) + goto out; + + filp = fget(fd); + if (filp == NULL) + goto out; + + if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || + reg->hr_block_bytes == 0) + goto out; + + inode = igrab(filp->f_mapping->host); + if (inode == NULL) + goto out; + + if (!S_ISBLK(inode->i_mode)) + goto out; + + reg->hr_bdev = I_BDEV(filp->f_mapping->host); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, 0); + if (ret) { + reg->hr_bdev = NULL; + goto out; + } + inode = NULL; + + bdevname(reg->hr_bdev, reg->hr_dev_name); + + sectsize = bdev_hardsect_size(reg->hr_bdev); + if (sectsize != reg->hr_block_bytes) { + mlog(ML_ERROR, + "blocksize %u incorrect for device, expected %d", + reg->hr_block_bytes, sectsize); + ret = -EINVAL; + goto out; + } + + o2hb_init_region_params(reg); + + /* Generation of zero is invalid */ + do { + get_random_bytes(®->hr_generation, + sizeof(reg->hr_generation)); + } while (reg->hr_generation == 0); + + ret = o2hb_map_slot_data(reg); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = o2hb_populate_slot_data(reg); + if (ret) { + mlog_errno(ret); + goto out; + } + + INIT_WORK(®->hr_write_timeout_work, o2hb_write_timeout, reg); + + /* + * A node is considered live after it has beat LIVE_THRESHOLD + * times. We're not steady until we've given them a chance + * _after_ our first read. 
+ */ + atomic_set(®->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1); + + reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s", + reg->hr_item.ci_name); + if (IS_ERR(reg->hr_task)) { + ret = PTR_ERR(reg->hr_task); + mlog_errno(ret); + reg->hr_task = NULL; + goto out; + } + + ret = wait_event_interruptible(o2hb_steady_queue, + atomic_read(®->hr_steady_iterations) == 0); + if (ret) { + kthread_stop(reg->hr_task); + reg->hr_task = NULL; + goto out; + } + + ret = count; +out: + if (filp) + fput(filp); + if (inode) + iput(inode); + if (ret < 0) { + if (reg->hr_bdev) { + blkdev_put(reg->hr_bdev); + reg->hr_bdev = NULL; + } + } + return ret; +} + +struct o2hb_region_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2hb_region *, char *); + ssize_t (*store)(struct o2hb_region *, const char *, size_t); +}; + +static struct o2hb_region_attribute o2hb_region_attr_block_bytes = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "block_bytes", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_block_bytes_read, + .store = o2hb_region_block_bytes_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_start_block = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "start_block", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_start_block_read, + .store = o2hb_region_start_block_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_blocks = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "blocks", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_blocks_read, + .store = o2hb_region_blocks_write, +}; + +static struct o2hb_region_attribute o2hb_region_attr_dev = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "dev", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_region_dev_read, + .store = o2hb_region_dev_write, +}; + +static struct configfs_attribute *o2hb_region_attrs[] = { + &o2hb_region_attr_block_bytes.attr, + &o2hb_region_attr_start_block.attr, + &o2hb_region_attr_blocks.attr, + 
&o2hb_region_attr_dev.attr, + NULL, +}; + +static ssize_t o2hb_region_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2hb_region *reg = to_o2hb_region(item); + struct o2hb_region_attribute *o2hb_region_attr = + container_of(attr, struct o2hb_region_attribute, attr); + ssize_t ret = 0; + + if (o2hb_region_attr->show) + ret = o2hb_region_attr->show(reg, page); + return ret; +} + +static ssize_t o2hb_region_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2hb_region *reg = to_o2hb_region(item); + struct o2hb_region_attribute *o2hb_region_attr = + container_of(attr, struct o2hb_region_attribute, attr); + ssize_t ret = -EINVAL; + + if (o2hb_region_attr->store) + ret = o2hb_region_attr->store(reg, page, count); + return ret; +} + +static struct configfs_item_operations o2hb_region_item_ops = { + .release = o2hb_region_release, + .show_attribute = o2hb_region_show, + .store_attribute = o2hb_region_store, +}; + +static struct config_item_type o2hb_region_type = { + .ct_item_ops = &o2hb_region_item_ops, + .ct_attrs = o2hb_region_attrs, + .ct_owner = THIS_MODULE, +}; + +/* heartbeat set */ + +struct o2hb_heartbeat_group { + struct config_group hs_group; + /* some stuff? */ +}; + +static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group *group) +{ + return group ? 
+ container_of(group, struct o2hb_heartbeat_group, hs_group) + : NULL; +} + +static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, + const char *name) +{ + struct o2hb_region *reg = NULL; + struct config_item *ret = NULL; + + reg = kcalloc(1, sizeof(struct o2hb_region), GFP_KERNEL); + if (reg == NULL) + goto out; /* ENOMEM */ + + config_item_init_type_name(®->hr_item, name, &o2hb_region_type); + + ret = ®->hr_item; + + spin_lock(&o2hb_live_lock); + list_add_tail(®->hr_all_item, &o2hb_all_regions); + spin_unlock(&o2hb_live_lock); +out: + if (ret == NULL) + kfree(reg); + + return ret; +} + +static void o2hb_heartbeat_group_drop_item(struct config_group *group, + struct config_item *item) +{ + struct o2hb_region *reg = to_o2hb_region(item); + + /* stop the thread when the user removes the region dir */ + if (reg->hr_task) { + kthread_stop(reg->hr_task); + reg->hr_task = NULL; + } + + config_item_put(item); +} + +struct o2hb_heartbeat_group_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2hb_heartbeat_group *, char *); + ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t); +}; + +static ssize_t o2hb_heartbeat_group_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); + struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = + container_of(attr, struct o2hb_heartbeat_group_attribute, attr); + ssize_t ret = 0; + + if (o2hb_heartbeat_group_attr->show) + ret = o2hb_heartbeat_group_attr->show(reg, page); + return ret; +} + +static ssize_t o2hb_heartbeat_group_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); + struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = + container_of(attr, struct o2hb_heartbeat_group_attribute, attr); 
+ ssize_t ret = -EINVAL; + + if (o2hb_heartbeat_group_attr->store) + ret = o2hb_heartbeat_group_attr->store(reg, page, count); + return ret; +} + +static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group, + char *page) +{ + return sprintf(page, "%u\n", o2hb_dead_threshold); +} + +static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group, + const char *page, + size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 10); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + /* this will validate ranges for us. */ + o2hb_dead_threshold_set((unsigned int) tmp); + + return count; +} + +static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "dead_threshold", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2hb_heartbeat_group_threshold_show, + .store = o2hb_heartbeat_group_threshold_store, +}; + +static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { + &o2hb_heartbeat_group_attr_threshold.attr, + NULL, +}; + +static struct configfs_item_operations o2hb_hearbeat_group_item_ops = { + .show_attribute = o2hb_heartbeat_group_show, + .store_attribute = o2hb_heartbeat_group_store, +}; + +static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { + .make_item = o2hb_heartbeat_group_make_item, + .drop_item = o2hb_heartbeat_group_drop_item, +}; + +static struct config_item_type o2hb_heartbeat_group_type = { + .ct_group_ops = &o2hb_heartbeat_group_group_ops, + .ct_item_ops = &o2hb_hearbeat_group_item_ops, + .ct_attrs = o2hb_heartbeat_group_attrs, + .ct_owner = THIS_MODULE, +}; + +/* this is just here to avoid touching group in heartbeat.h which the + * entire damn world #includes */ +struct config_group *o2hb_alloc_hb_set(void) +{ + struct o2hb_heartbeat_group *hs = NULL; + struct config_group *ret = NULL; + + hs = kcalloc(1, sizeof(struct o2hb_heartbeat_group), GFP_KERNEL); + if 
(hs == NULL) + goto out; + + config_group_init_type_name(&hs->hs_group, "heartbeat", + &o2hb_heartbeat_group_type); + + ret = &hs->hs_group; +out: + if (ret == NULL) + kfree(hs); + return ret; +} + +void o2hb_free_hb_set(struct config_group *group) +{ + struct o2hb_heartbeat_group *hs = to_o2hb_heartbeat_group(group); + kfree(hs); +} + +/* hb callback registration and issueing */ + +static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type) +{ + if (type == O2HB_NUM_CB) + return ERR_PTR(-EINVAL); + + return &o2hb_callbacks[type]; +} + +void o2hb_setup_callback(struct o2hb_callback_func *hc, + enum o2hb_callback_type type, + o2hb_cb_func *func, + void *data, + int priority) +{ + INIT_LIST_HEAD(&hc->hc_item); + hc->hc_func = func; + hc->hc_data = data; + hc->hc_priority = priority; + hc->hc_type = type; + hc->hc_magic = O2HB_CB_MAGIC; +} +EXPORT_SYMBOL_GPL(o2hb_setup_callback); + +int o2hb_register_callback(struct o2hb_callback_func *hc) +{ + struct o2hb_callback_func *tmp; + struct list_head *iter; + struct o2hb_callback *hbcall; + int ret; + + BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); + BUG_ON(!list_empty(&hc->hc_item)); + + hbcall = hbcall_from_type(hc->hc_type); + if (IS_ERR(hbcall)) { + ret = PTR_ERR(hbcall); + goto out; + } + + down_write(&o2hb_callback_sem); + + list_for_each(iter, &hbcall->list) { + tmp = list_entry(iter, struct o2hb_callback_func, hc_item); + if (hc->hc_priority < tmp->hc_priority) { + list_add_tail(&hc->hc_item, iter); + break; + } + } + if (list_empty(&hc->hc_item)) + list_add_tail(&hc->hc_item, &hbcall->list); + + up_write(&o2hb_callback_sem); + ret = 0; +out: + mlog(ML_HEARTBEAT, "returning %d on behalf of %p for funcs %p\n", + ret, __builtin_return_address(0), hc); + return ret; +} +EXPORT_SYMBOL_GPL(o2hb_register_callback); + +int o2hb_unregister_callback(struct o2hb_callback_func *hc) +{ + BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); + + mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n", + __builtin_return_address(0), hc); + 
+ if (list_empty(&hc->hc_item)) + return 0; + + down_write(&o2hb_callback_sem); + + list_del_init(&hc->hc_item); + + up_write(&o2hb_callback_sem); + + return 0; +} +EXPORT_SYMBOL_GPL(o2hb_unregister_callback); + +int o2hb_check_node_heartbeating(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + o2hb_fill_node_map(testing_map, sizeof(testing_map)); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); + +int o2hb_check_node_heartbeating_from_callback(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + + o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); + +/* Makes sure our local node is configured with a node number, and is + * heartbeating. */ +int o2hb_check_local_node_heartbeating(void) +{ + u8 node_num; + + /* if this node was set then we have networking */ + node_num = o2nm_this_node(); + if (node_num == O2NM_MAX_NODES) { + mlog(ML_HEARTBEAT, "this node has not been configured.\n"); + return 0; + } + + return o2hb_check_node_heartbeating(node_num); +} +EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); + +/* + * this is just a hack until we get the plumbing which flips file systems + * read only and drops the hb ref instead of killing the node dead. 
+ */ +void o2hb_stop_all_regions(void) +{ + struct o2hb_region *reg; + + mlog(ML_ERROR, "stopping heartbeat on all active regions.\n"); + + spin_lock(&o2hb_live_lock); + + list_for_each_entry(reg, &o2hb_all_regions, hr_all_item) + reg->hr_unclean_stop = 1; + + spin_unlock(&o2hb_live_lock); +} +EXPORT_SYMBOL_GPL(o2hb_stop_all_regions); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h new file mode 100644 index 000000000000..cac6223206a9 --- /dev/null +++ b/fs/ocfs2/cluster/heartbeat.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_HEARTBEAT_H +#define O2CLUSTER_HEARTBEAT_H + +#include "ocfs2_heartbeat.h" + +#define O2HB_REGION_TIMEOUT_MS 2000 + +/* number of changes to be seen as live */ +#define O2HB_LIVE_THRESHOLD 2 +/* number of equal samples to be seen as dead */ +extern unsigned int o2hb_dead_threshold; +#define O2HB_DEFAULT_DEAD_THRESHOLD 7 +/* Otherwise MAX_WRITE_TIMEOUT will be zero... 
*/ +#define O2HB_MIN_DEAD_THRESHOLD 2 +#define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) + +#define O2HB_CB_MAGIC 0x51d1e4ec + +/* callback stuff */ +enum o2hb_callback_type { + O2HB_NODE_DOWN_CB = 0, + O2HB_NODE_UP_CB, + O2HB_NUM_CB +}; + +struct o2nm_node; +typedef void (o2hb_cb_func)(struct o2nm_node *, int, void *); + +struct o2hb_callback_func { + u32 hc_magic; + struct list_head hc_item; + o2hb_cb_func *hc_func; + void *hc_data; + int hc_priority; + enum o2hb_callback_type hc_type; +}; + +struct config_group *o2hb_alloc_hb_set(void); +void o2hb_free_hb_set(struct config_group *group); + +void o2hb_setup_callback(struct o2hb_callback_func *hc, + enum o2hb_callback_type type, + o2hb_cb_func *func, + void *data, + int priority); +int o2hb_register_callback(struct o2hb_callback_func *hc); +int o2hb_unregister_callback(struct o2hb_callback_func *hc); +void o2hb_fill_node_map(unsigned long *map, + unsigned bytes); +void o2hb_init(void); +int o2hb_check_node_heartbeating(u8 node_num); +int o2hb_check_node_heartbeating_from_callback(u8 node_num); +int o2hb_check_local_node_heartbeating(void); +void o2hb_stop_all_regions(void); + +#endif /* O2CLUSTER_HEARTBEAT_H */ diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c new file mode 100644 index 000000000000..fd741cea5705 --- /dev/null +++ b/fs/ocfs2/cluster/masklog.c @@ -0,0 +1,166 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/string.h> +#include <asm/uaccess.h> + +#include "masklog.h" + +struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); +EXPORT_SYMBOL_GPL(mlog_and_bits); +struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK); +EXPORT_SYMBOL_GPL(mlog_not_bits); + +static ssize_t mlog_mask_show(u64 mask, char *buf) +{ + char *state; + + if (__mlog_test_u64(mask, mlog_and_bits)) + state = "allow"; + else if (__mlog_test_u64(mask, mlog_not_bits)) + state = "deny"; + else + state = "off"; + + return snprintf(buf, PAGE_SIZE, "%s\n", state); +} + +static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) +{ + if (!strnicmp(buf, "allow", 5)) { + __mlog_set_u64(mask, mlog_and_bits); + __mlog_clear_u64(mask, mlog_not_bits); + } else if (!strnicmp(buf, "deny", 4)) { + __mlog_set_u64(mask, mlog_not_bits); + __mlog_clear_u64(mask, mlog_and_bits); + } else if (!strnicmp(buf, "off", 3)) { + __mlog_clear_u64(mask, mlog_not_bits); + __mlog_clear_u64(mask, mlog_and_bits); + } else + return -EINVAL; + + return count; +} + +struct mlog_attribute { + struct attribute attr; + u64 mask; +}; + +#define to_mlog_attr(_attr) container_of(_attr, struct mlog_attribute, attr) + +#define define_mask(_name) { \ + .attr = { \ + .name = #_name, \ + .mode = S_IRUGO | S_IWUSR, \ + }, \ + .mask = ML_##_name, \ +} + +static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { 
+ define_mask(ENTRY), + define_mask(EXIT), + define_mask(TCP), + define_mask(MSG), + define_mask(SOCKET), + define_mask(HEARTBEAT), + define_mask(HB_BIO), + define_mask(DLMFS), + define_mask(DLM), + define_mask(DLM_DOMAIN), + define_mask(DLM_THREAD), + define_mask(DLM_MASTER), + define_mask(DLM_RECOVERY), + define_mask(AIO), + define_mask(JOURNAL), + define_mask(DISK_ALLOC), + define_mask(SUPER), + define_mask(FILE_IO), + define_mask(EXTENT_MAP), + define_mask(DLM_GLUE), + define_mask(BH_IO), + define_mask(UPTODATE), + define_mask(NAMEI), + define_mask(INODE), + define_mask(VOTE), + define_mask(DCACHE), + define_mask(CONN), + define_mask(QUORUM), + define_mask(EXPORT), + define_mask(ERROR), + define_mask(NOTICE), + define_mask(KTHREAD), +}; + +static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; + +static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, + char *buf) +{ + struct mlog_attribute *mlog_attr = to_mlog_attr(attr); + + return mlog_mask_show(mlog_attr->mask, buf); +} + +static ssize_t mlog_store(struct kobject *obj, struct attribute *attr, + const char *buf, size_t count) +{ + struct mlog_attribute *mlog_attr = to_mlog_attr(attr); + + return mlog_mask_store(mlog_attr->mask, buf, count); +} + +static struct sysfs_ops mlog_attr_ops = { + .show = mlog_show, + .store = mlog_store, +}; + +static struct kobj_type mlog_ktype = { + .default_attrs = mlog_attr_ptrs, + .sysfs_ops = &mlog_attr_ops, +}; + +static struct kset mlog_kset = { + .kobj = {.name = "logmask", .ktype = &mlog_ktype}, +}; + +int mlog_sys_init(struct subsystem *o2cb_subsys) +{ + int i = 0; + + while (mlog_attrs[i].attr.mode) { + mlog_attr_ptrs[i] = &mlog_attrs[i].attr; + i++; + } + mlog_attr_ptrs[i] = NULL; + + mlog_kset.subsys = o2cb_subsys; + return kset_register(&mlog_kset); +} + +void mlog_sys_shutdown(void) +{ + kset_unregister(&mlog_kset); +} diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h new file mode 100644 index 000000000000..e8c56a3d9c64 
--- /dev/null +++ b/fs/ocfs2/cluster/masklog.h @@ -0,0 +1,274 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_MASKLOG_H +#define O2CLUSTER_MASKLOG_H + +/* + * For now this is a trivial wrapper around printk() that gives the critical + * ability to enable sets of debugging output at run-time. In the future this + * will almost certainly be redirected to relayfs so that it can pay a + * substantially lower heisenberg tax. + * + * Callers associate the message with a bitmask and a global bitmask is + * maintained with help from /proc. If any of the bits match the message is + * output. + * + * We must have efficient bit tests on i386 and it seems gcc still emits crazy + * code for the 64bit compare. It emits very good code for the dual unsigned + * long tests, though, completely avoiding tests that can never pass if the + * caller gives a constant bitmask that fills one of the longs with all 0s. So + * the desire is to have almost all of the calls decided on by comparing just + * one of the longs. This leads to having infrequently given bits that are + * frequently matched in the high bits. 
+ * + * _ERROR and _NOTICE are used for messages that always go to the console and + * have appropriate KERN_ prefixes. We wrap these in our function instead of + * just calling printk() so that this can eventually make its way through + * relayfs along with the debugging messages. Everything else gets KERN_INFO. + * The inline tests and macro dance give GCC the opportunity to quite cleverly + * only emit the appropriate printk() when the caller passes in a constant + * mask, as is almost always the case. + * + * All this bitmask nonsense is hidden from the /proc interface so that Joel + * doesn't have an aneurism. Reading the file gives a straightforward + * indication of which bits are on or off: + * ENTRY off + * EXIT off + * TCP off + * MSG off + * SOCKET off + * ERROR off + * NOTICE on + * + * Writing changes the state of a given bit and requires a strictly formatted + * single write() call: + * + * write(fd, "ENTRY on", 8); + * + * would turn the entry bit on. "1" is also accepted in the place of "on", and + * "off" and "0" behave as expected. + * + * Some trivial shell can flip all the bits on or off: + * + * log_mask="/proc/fs/ocfs2_nodemanager/log_mask" + * cat $log_mask | ( + * while read bit status; do + * # $1 is "on" or "off", say + * echo "$bit $1" > $log_mask + * done + * ) + */ + +/* for task_struct */ +#include <linux/sched.h> + +/* bits that are frequently given and infrequently matched in the low word */ +/* NOTE: If you add a flag, you need to also update masklog.c! 
*/ +#define ML_ENTRY 0x0000000000000001ULL /* func call entry */ +#define ML_EXIT 0x0000000000000002ULL /* func call exit */ +#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ +#define ML_MSG 0x0000000000000008ULL /* net network messages */ +#define ML_SOCKET 0x0000000000000010ULL /* net socket lifetime */ +#define ML_HEARTBEAT 0x0000000000000020ULL /* hb all heartbeat tracking */ +#define ML_HB_BIO 0x0000000000000040ULL /* hb io tracing */ +#define ML_DLMFS 0x0000000000000080ULL /* dlm user dlmfs */ +#define ML_DLM 0x0000000000000100ULL /* dlm general debugging */ +#define ML_DLM_DOMAIN 0x0000000000000200ULL /* dlm domain debugging */ +#define ML_DLM_THREAD 0x0000000000000400ULL /* dlm domain thread */ +#define ML_DLM_MASTER 0x0000000000000800ULL /* dlm master functions */ +#define ML_DLM_RECOVERY 0x0000000000001000ULL /* dlm master functions */ +#define ML_AIO 0x0000000000002000ULL /* ocfs2 aio read and write */ +#define ML_JOURNAL 0x0000000000004000ULL /* ocfs2 journalling functions */ +#define ML_DISK_ALLOC 0x0000000000008000ULL /* ocfs2 disk allocation */ +#define ML_SUPER 0x0000000000010000ULL /* ocfs2 mount / umount */ +#define ML_FILE_IO 0x0000000000020000ULL /* ocfs2 file I/O */ +#define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */ +#define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */ +#define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */ +#define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 caching sequence #'s */ +#define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */ +#define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */ +#define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */ +#define ML_DCACHE 0x0000000002000000ULL /* ocfs2 dcache operations */ +#define ML_CONN 0x0000000004000000ULL /* net connection management */ +#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ +#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ +/* 
/*
 * Helpers for testing, setting and clearing a 64bit log mask against a
 * struct mlog_bits, plus a static initializer for such a struct.
 *
 * On 32bit longs the mask is split into a low word (words[0]) and a high
 * word (words[1]); on 64bit longs a single word holds the whole mask.
 *
 * Every macro argument is parenthesized so that callers may pass compound
 * expressions (e.g. "ML_ERROR | ML_NOTICE") without precedence surprises.
 * Note that @mask is still evaluated more than once in the 32bit variants,
 * so it must not have side effects.
 */
#if BITS_PER_LONG == 32

#define __mlog_test_u64(mask, bits)				\
	( (u32)((mask) & 0xffffffff) & (bits).words[0] ||	\
	  ((u64)(mask) >> 32) & (bits).words[1] )
#define __mlog_set_u64(mask, bits) do {				\
	(bits).words[0] |= (u32)((mask) & 0xffffffff);		\
	(bits).words[1] |= (u64)(mask) >> 32;			\
} while (0)
#define __mlog_clear_u64(mask, bits) do {			\
	(bits).words[0] &= ~((u32)((mask) & 0xffffffff));	\
	(bits).words[1] &= ~((u64)(mask) >> 32);		\
} while (0)
#define MLOG_BITS_RHS(mask) {					\
	{							\
		[0] = (u32)((mask) & 0xffffffff),		\
		[1] = (u64)(mask) >> 32,			\
	}							\
}

#else /* 32bit long above, 64bit long below */

#define __mlog_test_u64(mask, bits)	((mask) & (bits).words[0])
#define __mlog_set_u64(mask, bits) do {		\
	(bits).words[0] |= (mask);		\
} while (0)
#define __mlog_clear_u64(mask, bits) do {	\
	(bits).words[0] &= ~(mask);		\
} while (0)
#define MLOG_BITS_RHS(mask) { { (mask) } }

#endif
Otherwise, gcc 2.95 will eat the + * previous token if args expands to nothing. + */ +#define __mlog_printk(level, fmt, args...) \ + printk(level "(%u,%lu):%s:%d " fmt, current->pid, \ + __mlog_cpu_guess, __PRETTY_FUNCTION__, __LINE__ , \ + ##args) + +#define mlog(mask, fmt, args...) do { \ + u64 __m = MLOG_MASK_PREFIX | (mask); \ + if (__mlog_test_u64(__m, mlog_and_bits) && \ + !__mlog_test_u64(__m, mlog_not_bits)) { \ + if (__m & ML_ERROR) \ + __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ + else if (__m & ML_NOTICE) \ + __mlog_printk(KERN_NOTICE, fmt , ##args); \ + else __mlog_printk(KERN_INFO, fmt , ##args); \ + } \ +} while (0) + +#define mlog_errno(st) do { \ + int _st = (st); \ + if (_st != -ERESTARTSYS && _st != -EINTR && \ + _st != AOP_TRUNCATED_PAGE) \ + mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ +} while (0) + +#define mlog_entry(fmt, args...) do { \ + mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \ +} while (0) + +#define mlog_entry_void() do { \ + mlog(ML_ENTRY, "ENTRY:\n"); \ +} while (0) + +/* + * We disable this for sparse. 
+ */ +#if !defined(__CHECKER__) +#define mlog_exit(st) do { \ + if (__builtin_types_compatible_p(typeof(st), unsigned long)) \ + mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \ + else if (__builtin_types_compatible_p(typeof(st), signed long)) \ + mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \ + else if (__builtin_types_compatible_p(typeof(st), unsigned int) \ + || __builtin_types_compatible_p(typeof(st), unsigned short) \ + || __builtin_types_compatible_p(typeof(st), unsigned char)) \ + mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \ + else if (__builtin_types_compatible_p(typeof(st), signed int) \ + || __builtin_types_compatible_p(typeof(st), signed short) \ + || __builtin_types_compatible_p(typeof(st), signed char)) \ + mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \ + else if (__builtin_types_compatible_p(typeof(st), long long)) \ + mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \ + else \ + mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \ +} while (0) +#else +#define mlog_exit(st) do { \ + mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \ +} while (0) +#endif + +#define mlog_exit_ptr(ptr) do { \ + mlog(ML_EXIT, "EXIT: %p\n", ptr); \ +} while (0) + +#define mlog_exit_void() do { \ + mlog(ML_EXIT, "EXIT\n"); \ +} while (0) + +#define mlog_bug_on_msg(cond, fmt, args...) 
do { \ + if (cond) { \ + mlog(ML_ERROR, "bug expression: " #cond "\n"); \ + mlog(ML_ERROR, fmt, ##args); \ + BUG(); \ + } \ +} while (0) + +#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) +#define MLFi64 "lld" +#define MLFu64 "llu" +#define MLFx64 "llx" +#else +#define MLFi64 "ld" +#define MLFu64 "lu" +#define MLFx64 "lx" +#endif + +#include <linux/kobject.h> +#include <linux/sysfs.h> +int mlog_sys_init(struct subsystem *o2cb_subsys); +void mlog_sys_shutdown(void); + +#endif /* O2CLUSTER_MASKLOG_H */ diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c new file mode 100644 index 000000000000..cf7828f23361 --- /dev/null +++ b/fs/ocfs2/cluster/nodemanager.c @@ -0,0 +1,791 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sysctl.h> +#include <linux/configfs.h> + +#include "endian.h" +#include "tcp.h" +#include "nodemanager.h" +#include "heartbeat.h" +#include "masklog.h" +#include "sys.h" +#include "ver.h" + +/* for now we operate under the assertion that there can be only one + * cluster active at a time. 
Changing this will require trickling + * cluster references throughout where nodes are looked up */ +static struct o2nm_cluster *o2nm_single_cluster = NULL; + +#define OCFS2_MAX_HB_CTL_PATH 256 +static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; + +static ctl_table ocfs2_nm_table[] = { + { + .ctl_name = 1, + .procname = "hb_ctl_path", + .data = ocfs2_hb_ctl_path, + .maxlen = OCFS2_MAX_HB_CTL_PATH, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { .ctl_name = 0 } +}; + +static ctl_table ocfs2_mod_table[] = { + { + .ctl_name = KERN_OCFS2_NM, + .procname = "nm", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_nm_table + }, + { .ctl_name = 0} +}; + +static ctl_table ocfs2_kern_table[] = { + { + .ctl_name = KERN_OCFS2, + .procname = "ocfs2", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_mod_table + }, + { .ctl_name = 0} +}; + +static ctl_table ocfs2_root_table[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ocfs2_kern_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *ocfs2_table_header = NULL; + +const char *o2nm_get_hb_ctl_path(void) +{ + return ocfs2_hb_ctl_path; +} +EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path); + +struct o2nm_cluster { + struct config_group cl_group; + unsigned cl_has_local:1; + u8 cl_local_node; + rwlock_t cl_nodes_lock; + struct o2nm_node *cl_nodes[O2NM_MAX_NODES]; + struct rb_root cl_node_ip_tree; + /* this bitmap is part of a hack for disk bitmap.. will go eventually. 
- zab */ + unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +struct o2nm_node *o2nm_get_node_by_num(u8 node_num) +{ + struct o2nm_node *node = NULL; + + if (node_num >= O2NM_MAX_NODES || o2nm_single_cluster == NULL) + goto out; + + read_lock(&o2nm_single_cluster->cl_nodes_lock); + node = o2nm_single_cluster->cl_nodes[node_num]; + if (node) + config_item_get(&node->nd_item); + read_unlock(&o2nm_single_cluster->cl_nodes_lock); +out: + return node; +} +EXPORT_SYMBOL_GPL(o2nm_get_node_by_num); + +int o2nm_configured_node_map(unsigned long *map, unsigned bytes) +{ + struct o2nm_cluster *cluster = o2nm_single_cluster; + + BUG_ON(bytes < (sizeof(cluster->cl_nodes_bitmap))); + + if (cluster == NULL) + return -EINVAL; + + read_lock(&cluster->cl_nodes_lock); + memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + read_unlock(&cluster->cl_nodes_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(o2nm_configured_node_map); + +static struct o2nm_node *o2nm_node_ip_tree_lookup(struct o2nm_cluster *cluster, + __be32 ip_needle, + struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &cluster->cl_node_ip_tree.rb_node; + struct rb_node *parent = NULL; + struct o2nm_node *node, *ret = NULL; + + while (*p) { + parent = *p; + node = rb_entry(parent, struct o2nm_node, nd_ip_node); + + if (memcmp(&ip_needle, &node->nd_ipv4_address, + sizeof(ip_needle)) < 0) + p = &(*p)->rb_left; + else if (memcmp(&ip_needle, &node->nd_ipv4_address, + sizeof(ip_needle)) > 0) + p = &(*p)->rb_right; + else { + ret = node; + break; + } + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + + return ret; +} + +struct o2nm_node *o2nm_get_node_by_ip(__be32 addr) +{ + struct o2nm_node *node = NULL; + struct o2nm_cluster *cluster = o2nm_single_cluster; + + if (cluster == NULL) + goto out; + + read_lock(&cluster->cl_nodes_lock); + node = o2nm_node_ip_tree_lookup(cluster, addr, NULL, NULL); + if (node) + 
config_item_get(&node->nd_item); + read_unlock(&cluster->cl_nodes_lock); + +out: + return node; +} +EXPORT_SYMBOL_GPL(o2nm_get_node_by_ip); + +void o2nm_node_put(struct o2nm_node *node) +{ + config_item_put(&node->nd_item); +} +EXPORT_SYMBOL_GPL(o2nm_node_put); + +void o2nm_node_get(struct o2nm_node *node) +{ + config_item_get(&node->nd_item); +} +EXPORT_SYMBOL_GPL(o2nm_node_get); + +u8 o2nm_this_node(void) +{ + u8 node_num = O2NM_MAX_NODES; + + if (o2nm_single_cluster && o2nm_single_cluster->cl_has_local) + node_num = o2nm_single_cluster->cl_local_node; + + return node_num; +} +EXPORT_SYMBOL_GPL(o2nm_this_node); + +/* node configfs bits */ + +static struct o2nm_cluster *to_o2nm_cluster(struct config_item *item) +{ + return item ? + container_of(to_config_group(item), struct o2nm_cluster, + cl_group) + : NULL; +} + +static struct o2nm_node *to_o2nm_node(struct config_item *item) +{ + return item ? container_of(item, struct o2nm_node, nd_item) : NULL; +} + +static void o2nm_node_release(struct config_item *item) +{ + struct o2nm_node *node = to_o2nm_node(item); + kfree(node); +} + +static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%d\n", node->nd_num); +} + +static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) +{ + /* through the first node_set .parent + * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); +} + +enum { + O2NM_NODE_ATTR_NUM = 0, + O2NM_NODE_ATTR_PORT, + O2NM_NODE_ATTR_ADDRESS, + O2NM_NODE_ATTR_LOCAL, +}; + +static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp >= O2NM_MAX_NODES) + return -ERANGE; + + /* once we're in the cl_nodes tree networking can 
look us up by + * node number and try to use our address and port attributes + * to connect to this node.. make sure that they've been set + * before writing the node attribute? */ + if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EINVAL; /* XXX */ + + write_lock(&cluster->cl_nodes_lock); + if (cluster->cl_nodes[tmp]) + p = NULL; + else { + cluster->cl_nodes[tmp] = node; + node->nd_num = tmp; + set_bit(tmp, cluster->cl_nodes_bitmap); + } + write_unlock(&cluster->cl_nodes_lock); + if (p == NULL) + return -EEXIST; + + return count; +} +static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); +} + +static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, + const char *page, size_t count) +{ + unsigned long tmp; + char *p = (char *)page; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + if (tmp == 0) + return -EINVAL; + if (tmp >= (u16)-1) + return -ERANGE; + + node->nd_ipv4_port = htons(tmp); + + return count; +} + +static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%u.%u.%u.%u\n", NIPQUAD(node->nd_ipv4_address)); +} + +static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, + const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + int ret, i; + struct rb_node **p, *parent; + unsigned int octets[4]; + __be32 ipv4_addr = 0; + + ret = sscanf(page, "%3u.%3u.%3u.%3u", &octets[3], &octets[2], + &octets[1], &octets[0]); + if (ret != 4) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(octets); i++) { + if (octets[i] > 255) + return -ERANGE; + be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); + } + + ret = 0; + write_lock(&cluster->cl_nodes_lock); + if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) + ret = -EEXIST; + else { + 
rb_link_node(&node->nd_ip_node, parent, p); + rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); + } + write_unlock(&cluster->cl_nodes_lock); + if (ret) + return ret; + + memcpy(&node->nd_ipv4_address, &ipv4_addr, sizeof(ipv4_addr)); + + return count; +} + +static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page) +{ + return sprintf(page, "%d\n", node->nd_local); +} + +static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, + size_t count) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + unsigned long tmp; + char *p = (char *)page; + ssize_t ret; + + tmp = simple_strtoul(p, &p, 0); + if (!p || (*p && (*p != '\n'))) + return -EINVAL; + + tmp = !!tmp; /* boolean of whether this node wants to be local */ + + /* setting local turns on networking rx for now so we require having + * set everything else first */ + if (!test_bit(O2NM_NODE_ATTR_ADDRESS, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_NUM, &node->nd_set_attributes) || + !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EINVAL; /* XXX */ + + /* the only failure case is trying to set a new local node + * when a different one is already set */ + if (tmp && tmp == cluster->cl_has_local && + cluster->cl_local_node != node->nd_num) + return -EBUSY; + + /* bring up the rx thread if we're setting the new local node. 
*/ + if (tmp && !cluster->cl_has_local) { + ret = o2net_start_listening(node); + if (ret) + return ret; + } + + if (!tmp && cluster->cl_has_local && + cluster->cl_local_node == node->nd_num) { + o2net_stop_listening(node); + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + } + + node->nd_local = tmp; + if (node->nd_local) { + cluster->cl_has_local = tmp; + cluster->cl_local_node = node->nd_num; + } + + return count; +} + +struct o2nm_node_attribute { + struct configfs_attribute attr; + ssize_t (*show)(struct o2nm_node *, char *); + ssize_t (*store)(struct o2nm_node *, const char *, size_t); +}; + +static struct o2nm_node_attribute o2nm_node_attr_num = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "num", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_num_read, + .store = o2nm_node_num_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "ipv4_port", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_ipv4_port_read, + .store = o2nm_node_ipv4_port_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "ipv4_address", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_ipv4_address_read, + .store = o2nm_node_ipv4_address_write, +}; + +static struct o2nm_node_attribute o2nm_node_attr_local = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "local", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_node_local_read, + .store = o2nm_node_local_write, +}; + +static struct configfs_attribute *o2nm_node_attrs[] = { + [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr, + [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr, + [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr, + [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr, + NULL, +}; + +static int o2nm_attr_index(struct configfs_attribute *attr) +{ + int i; + for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) { + if (attr == 
o2nm_node_attrs[i]) + return i; + } + BUG(); + return 0; +} + +static ssize_t o2nm_node_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_node_attribute *o2nm_node_attr = + container_of(attr, struct o2nm_node_attribute, attr); + ssize_t ret = 0; + + if (o2nm_node_attr->show) + ret = o2nm_node_attr->show(node, page); + return ret; +} + +static ssize_t o2nm_node_store(struct config_item *item, + struct configfs_attribute *attr, + const char *page, size_t count) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_node_attribute *o2nm_node_attr = + container_of(attr, struct o2nm_node_attribute, attr); + ssize_t ret; + int attr_index = o2nm_attr_index(attr); + + if (o2nm_node_attr->store == NULL) { + ret = -EINVAL; + goto out; + } + + if (test_bit(attr_index, &node->nd_set_attributes)) + return -EBUSY; + + ret = o2nm_node_attr->store(node, page, count); + if (ret < count) + goto out; + + set_bit(attr_index, &node->nd_set_attributes); +out: + return ret; +} + +static struct configfs_item_operations o2nm_node_item_ops = { + .release = o2nm_node_release, + .show_attribute = o2nm_node_show, + .store_attribute = o2nm_node_store, +}; + +static struct config_item_type o2nm_node_type = { + .ct_item_ops = &o2nm_node_item_ops, + .ct_attrs = o2nm_node_attrs, + .ct_owner = THIS_MODULE, +}; + +/* node set */ + +struct o2nm_node_group { + struct config_group ns_group; + /* some stuff? */ +}; + +#if 0 +static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) +{ + return group ? 
+ container_of(group, struct o2nm_node_group, ns_group) + : NULL; +} +#endif + +static struct config_item *o2nm_node_group_make_item(struct config_group *group, + const char *name) +{ + struct o2nm_node *node = NULL; + struct config_item *ret = NULL; + + if (strlen(name) > O2NM_MAX_NAME_LEN) + goto out; /* ENAMETOOLONG */ + + node = kcalloc(1, sizeof(struct o2nm_node), GFP_KERNEL); + if (node == NULL) + goto out; /* ENOMEM */ + + strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */ + config_item_init_type_name(&node->nd_item, name, &o2nm_node_type); + spin_lock_init(&node->nd_lock); + + ret = &node->nd_item; + +out: + if (ret == NULL) + kfree(node); + + return ret; +} + +static void o2nm_node_group_drop_item(struct config_group *group, + struct config_item *item) +{ + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); + + o2net_disconnect_node(node); + + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } + + /* XXX call into net to stop this node from trading messages */ + + write_lock(&cluster->cl_nodes_lock); + + /* XXX sloppy */ + if (node->nd_ipv4_address) + rb_erase(&node->nd_ip_node, &cluster->cl_node_ip_tree); + + /* nd_num might be 0 if the node number hasn't been set.. 
*/ + if (cluster->cl_nodes[node->nd_num] == node) { + cluster->cl_nodes[node->nd_num] = NULL; + clear_bit(node->nd_num, cluster->cl_nodes_bitmap); + } + write_unlock(&cluster->cl_nodes_lock); + + config_item_put(item); +} + +static struct configfs_group_operations o2nm_node_group_group_ops = { + .make_item = o2nm_node_group_make_item, + .drop_item = o2nm_node_group_drop_item, +}; + +static struct config_item_type o2nm_node_group_type = { + .ct_group_ops = &o2nm_node_group_group_ops, + .ct_owner = THIS_MODULE, +}; + +/* cluster */ + +static void o2nm_cluster_release(struct config_item *item) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + + kfree(cluster->cl_group.default_groups); + kfree(cluster); +} + +static struct configfs_item_operations o2nm_cluster_item_ops = { + .release = o2nm_cluster_release, +}; + +static struct config_item_type o2nm_cluster_type = { + .ct_item_ops = &o2nm_cluster_item_ops, + .ct_owner = THIS_MODULE, +}; + +/* cluster set */ + +struct o2nm_cluster_group { + struct configfs_subsystem cs_subsys; + /* some stuff? */ +}; + +#if 0 +static struct o2nm_cluster_group *to_o2nm_cluster_group(struct config_group *group) +{ + return group ? 
+ container_of(to_configfs_subsystem(group), struct o2nm_cluster_group, cs_subsys) + : NULL; +} +#endif + +static struct config_group *o2nm_cluster_group_make_group(struct config_group *group, + const char *name) +{ + struct o2nm_cluster *cluster = NULL; + struct o2nm_node_group *ns = NULL; + struct config_group *o2hb_group = NULL, *ret = NULL; + void *defs = NULL; + + /* this runs under the parent dir's i_mutex; there can be only + * one caller in here at a time */ + if (o2nm_single_cluster) + goto out; /* ENOSPC */ + + cluster = kcalloc(1, sizeof(struct o2nm_cluster), GFP_KERNEL); + ns = kcalloc(1, sizeof(struct o2nm_node_group), GFP_KERNEL); + defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); + o2hb_group = o2hb_alloc_hb_set(); + if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + goto out; + + config_group_init_type_name(&cluster->cl_group, name, + &o2nm_cluster_type); + config_group_init_type_name(&ns->ns_group, "node", + &o2nm_node_group_type); + + cluster->cl_group.default_groups = defs; + cluster->cl_group.default_groups[0] = &ns->ns_group; + cluster->cl_group.default_groups[1] = o2hb_group; + cluster->cl_group.default_groups[2] = NULL; + rwlock_init(&cluster->cl_nodes_lock); + cluster->cl_node_ip_tree = RB_ROOT; + + ret = &cluster->cl_group; + o2nm_single_cluster = cluster; + +out: + if (ret == NULL) { + kfree(cluster); + kfree(ns); + o2hb_free_hb_set(o2hb_group); + kfree(defs); + } + + return ret; +} + +static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) +{ + struct o2nm_cluster *cluster = to_o2nm_cluster(item); + int i; + struct config_item *killme; + + BUG_ON(o2nm_single_cluster != cluster); + o2nm_single_cluster = NULL; + + for (i = 0; cluster->cl_group.default_groups[i]; i++) { + killme = &cluster->cl_group.default_groups[i]->cg_item; + cluster->cl_group.default_groups[i] = NULL; + config_item_put(killme); + } + + config_item_put(item); +} + +static struct 
configfs_group_operations o2nm_cluster_group_group_ops = { + .make_group = o2nm_cluster_group_make_group, + .drop_item = o2nm_cluster_group_drop_item, +}; + +static struct config_item_type o2nm_cluster_group_type = { + .ct_group_ops = &o2nm_cluster_group_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct o2nm_cluster_group o2nm_cluster_group = { + .cs_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "cluster", + .ci_type = &o2nm_cluster_group_type, + }, + }, + }, +}; + +static void __exit exit_o2nm(void) +{ + if (ocfs2_table_header) + unregister_sysctl_table(ocfs2_table_header); + + /* XXX sync with hb callbacks and shut down hb? */ + o2net_unregister_hb_callbacks(); + configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); + o2cb_sys_shutdown(); + + o2net_exit(); +} + +static int __init init_o2nm(void) +{ + int ret = -1; + + cluster_print_version(); + + o2hb_init(); + o2net_init(); + + ocfs2_table_header = register_sysctl_table(ocfs2_root_table, 0); + if (!ocfs2_table_header) { + printk(KERN_ERR "nodemanager: unable to register sysctl\n"); + ret = -ENOMEM; /* or something. 
*/ + goto out; + } + + ret = o2net_register_hb_callbacks(); + if (ret) + goto out_sysctl; + + config_group_init(&o2nm_cluster_group.cs_subsys.su_group); + init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem); + ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys); + if (ret) { + printk(KERN_ERR "nodemanager: Registration returned %d\n", ret); + goto out_callbacks; + } + + ret = o2cb_sys_init(); + if (!ret) + goto out; + + configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys); +out_callbacks: + o2net_unregister_hb_callbacks(); +out_sysctl: + unregister_sysctl_table(ocfs2_table_header); +out: + return ret; +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(init_o2nm) +module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h new file mode 100644 index 000000000000..fce8033c310f --- /dev/null +++ b/fs/ocfs2/cluster/nodemanager.h @@ -0,0 +1,64 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * nodemanager.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef O2CLUSTER_NODEMANAGER_H +#define O2CLUSTER_NODEMANAGER_H + +#include "ocfs2_nodemanager.h" + +/* This totally doesn't belong here. */ +#include <linux/configfs.h> +#include <linux/rbtree.h> + +#define KERN_OCFS2 988 +#define KERN_OCFS2_NM 1 + +const char *o2nm_get_hb_ctl_path(void); + +struct o2nm_node { + spinlock_t nd_lock; + struct config_item nd_item; + char nd_name[O2NM_MAX_NAME_LEN+1]; /* replace? */ + __u8 nd_num; + /* only one address per node, as attributes, for now. */ + __be32 nd_ipv4_address; + __be16 nd_ipv4_port; + struct rb_node nd_ip_node; + /* there can be only one local node for now */ + int nd_local; + + unsigned long nd_set_attributes; +}; + +u8 o2nm_this_node(void); + +int o2nm_configured_node_map(unsigned long *map, unsigned bytes); +struct o2nm_node *o2nm_get_node_by_num(u8 node_num); +struct o2nm_node *o2nm_get_node_by_ip(__be32 addr); +void o2nm_node_get(struct o2nm_node *node); +void o2nm_node_put(struct o2nm_node *node); + +#endif /* O2CLUSTER_NODEMANAGER_H */ diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h new file mode 100644 index 000000000000..94096069cb43 --- /dev/null +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -0,0 +1,37 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_heartbeat.h + * + * On-disk structures for ocfs2_heartbeat + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_HEARTBEAT_H +#define _OCFS2_HEARTBEAT_H + +struct o2hb_disk_heartbeat_block { + __le64 hb_seq; + __u8 hb_node; + __u8 hb_pad1[3]; + __le32 hb_cksum; + __le64 hb_generation; +}; + +#endif /* _OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h new file mode 100644 index 000000000000..5b9854bad571 --- /dev/null +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -0,0 +1,39 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_nodemanager.h + * + * Header describing the interface between userspace and the kernel + * for the ocfs2_nodemanager module. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef _OCFS2_NODEMANAGER_H +#define _OCFS2_NODEMANAGER_H + +#define O2NM_API_VERSION 5 + +#define O2NM_MAX_NODES 255 +#define O2NM_INVALID_NODE_NUM 255 + +/* host name, group name, cluster name all 64 bytes */ +#define O2NM_MAX_NAME_LEN 64 // __NEW_UTS_LEN + +#endif /* _OCFS2_NODEMANAGER_H */ diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c new file mode 100644 index 000000000000..7bba98fbfc15 --- /dev/null +++ b/fs/ocfs2/cluster/quorum.c @@ -0,0 +1,315 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* This quorum hack is only here until we transition to some more rational + * approach that is driven from userspace. Honest. No foolin'. + * + * Imagine two nodes lose network connectivity to each other but they're still + * up and operating in every other way. Presumably a network timeout indicates + * that a node is broken and should be recovered. They can't both recover each + * other and both carry on without serialising their access to the file system. + * They need to decide who is authoritative. Now extend that problem to + * arbitrary groups of nodes losing connectivity between each other. 
+ * + * So we declare that a node which has given up on connecting to a majority + * of nodes who are still heartbeating will fence itself. + * + * There are huge opportunities for races here. After we give up on a node's + * connection we need to wait long enough to give heartbeat an opportunity + * to declare the node as truly dead. We also need to be careful with the + * race between when we see a node start heartbeating and when we connect + * to it. + * + * So nodes that are in this transition put a hold on the quorum decision + * with a counter. As they fall out of this transition they drop the count + * and if they're the last, they fire off the decision. + */ +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +#include "heartbeat.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_QUORUM +#include "masklog.h" +#include "quorum.h" + +static struct o2quo_state { + spinlock_t qs_lock; + struct work_struct qs_work; + int qs_pending; + int qs_heartbeating; + unsigned long qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int qs_connected; + unsigned long qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int qs_holds; + unsigned long qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; +} o2quo_state; + +/* this is horribly heavy-handed. It should instead flip the file + * system RO and call some userspace script. */ +static void o2quo_fence_self(void) +{ + /* panic spins with interrupts enabled. with preempt + * threads can still schedule, etc, etc */ + o2hb_stop_all_regions(); + panic("ocfs2 is very sorry to be fencing this system by panicing\n"); +} + +/* Indicate that a timeout occurred on a heartbeat region write. The + * other nodes in the cluster may consider us dead at that time so we + * want to "fence" ourselves so that we don't scribble on the disk + * after they think they've recovered us. This can't solve all + * problems related to writeout after recovery but this hack can at + * least close some of those gaps.
When we have real fencing, this can + * go away as our node would be fenced externally before other nodes + * begin recovery. */ +void o2quo_disk_timeout(void) +{ + o2quo_fence_self(); +} + +static void o2quo_make_decision(void *arg) +{ + int quorum; + int lowest_hb, lowest_reachable = 0, fence = 0; + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); + if (lowest_hb != O2NM_MAX_NODES) + lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm); + + mlog(0, "heartbeating: %d, connected: %d, " + "lowest: %d (%sreachable)\n", qs->qs_heartbeating, + qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un"); + + if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) || + qs->qs_heartbeating == 1) + goto out; + + if (qs->qs_heartbeating & 1) { + /* the odd numbered cluster case is straight forward -- + * if we can't talk to the majority we're hosed */ + quorum = (qs->qs_heartbeating + 1)/2; + if (qs->qs_connected < quorum) { + mlog(ML_ERROR, "fencing this node because it is " + "only connected to %u nodes and %u is needed " + "to make a quorum out of %u heartbeating nodes\n", + qs->qs_connected, quorum, + qs->qs_heartbeating); + fence = 1; + } + } else { + /* the even numbered cluster adds the possibility of each half + * of the cluster being able to talk amongst themselves.. 
in + * that case we're hosed if we can't talk to the group that has + * the lowest numbered node */ + quorum = qs->qs_heartbeating / 2; + if (qs->qs_connected < quorum) { + mlog(ML_ERROR, "fencing this node because it is " + "only connected to %u nodes and %u is needed " + "to make a quorum out of %u heartbeating nodes\n", + qs->qs_connected, quorum, + qs->qs_heartbeating); + fence = 1; + } + else if ((qs->qs_connected == quorum) && + !lowest_reachable) { + mlog(ML_ERROR, "fencing this node because it is " + "connected to a half-quorum of %u out of %u " + "nodes which doesn't include the lowest active " + "node %u\n", quorum, qs->qs_heartbeating, + lowest_hb); + fence = 1; + } + } + +out: + spin_unlock(&qs->qs_lock); + if (fence) + o2quo_fence_self(); +} + +static void o2quo_set_hold(struct o2quo_state *qs, u8 node) +{ + assert_spin_locked(&qs->qs_lock); + + if (!test_and_set_bit(node, qs->qs_hold_bm)) { + qs->qs_holds++; + mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES, + "node %u\n", node); + mlog(0, "node %u, %d total\n", node, qs->qs_holds); + } +} + +static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) +{ + assert_spin_locked(&qs->qs_lock); + + if (test_and_clear_bit(node, qs->qs_hold_bm)) { + mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1); + if (--qs->qs_holds == 0) { + if (qs->qs_pending) { + qs->qs_pending = 0; + schedule_work(&qs->qs_work); + } + } + mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n", + node, qs->qs_holds); + } +} + +/* as a node comes up we delay the quorum decision until we know the fate of + * the connection. the hold will be droped in conn_up or hb_down. it might be + * perpetuated by con_err until hb_down. if we already have a conn, we might + * be dropping a hold that conn_up got. 
*/ +void o2quo_hb_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_heartbeating++; + mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, + "node %u\n", node); + mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node); + set_bit(node, qs->qs_hb_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); + + if (!test_bit(node, qs->qs_conn_bm)) + o2quo_set_hold(qs, node); + else + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* hb going down releases any holds we might have had due to this node from + * conn_up, conn_err, or hb_up */ +void o2quo_hb_down(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_heartbeating--; + mlog_bug_on_msg(qs->qs_heartbeating < 0, + "node %u, %d heartbeating\n", + node, qs->qs_heartbeating); + mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node); + clear_bit(node, qs->qs_hb_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating); + + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* this tells us that we've decided that the node is still heartbeating + * even though we've lost it's conn. it must only be called after conn_err + * and indicates that we must now make a quorum decision in the future, + * though we might be doing so after waiting for holds to drain. Here + * we'll be dropping the hold from conn_err. */ +void o2quo_hb_still_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + mlog(0, "node %u\n", node); + + qs->qs_pending = 1; + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* This is analagous to hb_up. as a node's connection comes up we delay the + * quorum decision until we see it heartbeating. the hold will be droped in + * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if + * it's already heartbeating we we might be dropping a hold that conn_up got. 
+ * */ +void o2quo_conn_up(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + qs->qs_connected++; + mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, + "node %u\n", node); + mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node); + set_bit(node, qs->qs_conn_bm); + + mlog(0, "node %u, %d total\n", node, qs->qs_connected); + + if (!test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); + else + o2quo_clear_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +/* we've decided that we won't ever be connecting to the node again. if it's + * still heartbeating we grab a hold that will delay decisions until either the + * node stops heartbeating from hb_down or the caller decides that the node is + * still up and calls still_up */ +void o2quo_conn_err(u8 node) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock(&qs->qs_lock); + + if (test_bit(node, qs->qs_conn_bm)) { + qs->qs_connected--; + mlog_bug_on_msg(qs->qs_connected < 0, + "node %u, connected %d\n", + node, qs->qs_connected); + + clear_bit(node, qs->qs_conn_bm); + } + + mlog(0, "node %u, %d total\n", node, qs->qs_connected); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); + + spin_unlock(&qs->qs_lock); +} + +void o2quo_init(void) +{ + struct o2quo_state *qs = &o2quo_state; + + spin_lock_init(&qs->qs_lock); + INIT_WORK(&qs->qs_work, o2quo_make_decision, NULL); +} + +void o2quo_exit(void) +{ + flush_scheduled_work(); +} diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h new file mode 100644 index 000000000000..6649cc6f67c9 --- /dev/null +++ b/fs/ocfs2/cluster/quorum.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef O2CLUSTER_QUORUM_H +#define O2CLUSTER_QUORUM_H + +void o2quo_init(void); +void o2quo_exit(void); + +void o2quo_hb_up(u8 node); +void o2quo_hb_down(u8 node); +void o2quo_hb_still_up(u8 node); +void o2quo_conn_up(u8 node); +void o2quo_conn_err(u8 node); +void o2quo_disk_timeout(void); + +#endif /* O2CLUSTER_QUORUM_H */ diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c new file mode 100644 index 000000000000..1d9f6acafa2e --- /dev/null +++ b/fs/ocfs2/cluster/sys.c @@ -0,0 +1,124 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sys.c + * + * OCFS2 cluster sysfs interface + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, + * version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> + +#include "ocfs2_nodemanager.h" +#include "masklog.h" +#include "sys.h" + +struct o2cb_attribute { + struct attribute attr; + ssize_t (*show)(char *buf); + ssize_t (*store)(const char *buf, size_t count); +}; + +#define O2CB_ATTR(_name, _mode, _show, _store) \ +struct o2cb_attribute o2cb_attr_##_name = __ATTR(_name, _mode, _show, _store) + +#define to_o2cb_subsys(k) container_of(to_kset(k), struct subsystem, kset) +#define to_o2cb_attr(_attr) container_of(_attr, struct o2cb_attribute, attr) + +static ssize_t o2cb_interface_revision_show(char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); +} + +static O2CB_ATTR(interface_revision, S_IFREG | S_IRUGO, o2cb_interface_revision_show, NULL); + +static struct attribute *o2cb_attrs[] = { + &o2cb_attr_interface_revision.attr, + NULL, +}; + +static ssize_t +o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer); +static ssize_t +o2cb_store(struct kobject * kobj, struct attribute * attr, + const char * buffer, size_t count); +static struct sysfs_ops o2cb_sysfs_ops = { + .show = o2cb_show, + .store = o2cb_store, +}; + +static struct kobj_type o2cb_subsys_type = { + .default_attrs = o2cb_attrs, + .sysfs_ops = &o2cb_sysfs_ops, +}; + +/* gives us o2cb_subsys */ +static decl_subsys(o2cb, NULL, NULL); + +static ssize_t +o2cb_show(struct kobject * kobj, struct attribute * attr, char * buffer) +{ + struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); + struct subsystem *sbs = to_o2cb_subsys(kobj); + + BUG_ON(sbs != &o2cb_subsys); + + if (o2cb_attr->show) + return o2cb_attr->show(buffer); + return -EIO; +} + +static ssize_t +o2cb_store(struct kobject * 
kobj, struct attribute * attr, + const char * buffer, size_t count) +{ + struct o2cb_attribute *o2cb_attr = to_o2cb_attr(attr); + struct subsystem *sbs = to_o2cb_subsys(kobj); + + BUG_ON(sbs != &o2cb_subsys); + + if (o2cb_attr->store) + return o2cb_attr->store(buffer, count); + return -EIO; +} + +void o2cb_sys_shutdown(void) +{ + mlog_sys_shutdown(); + subsystem_unregister(&o2cb_subsys); +} + +int o2cb_sys_init(void) +{ + int ret; + + o2cb_subsys.kset.kobj.ktype = &o2cb_subsys_type; + ret = subsystem_register(&o2cb_subsys); + if (ret) + return ret; + + ret = mlog_sys_init(&o2cb_subsys); + if (ret) + subsystem_unregister(&o2cb_subsys); + return ret; +} diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h new file mode 100644 index 000000000000..d66b8ab0045e --- /dev/null +++ b/fs/ocfs2/cluster/sys.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sys.h + * + * Function prototypes for o2cb sysfs interface + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation, + * version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef O2CLUSTER_SYS_H +#define O2CLUSTER_SYS_H + +void o2cb_sys_shutdown(void); +int o2cb_sys_init(void); + +#endif /* O2CLUSTER_SYS_H */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c new file mode 100644 index 000000000000..35d92c01a972 --- /dev/null +++ b/fs/ocfs2/cluster/tcp.c @@ -0,0 +1,1829 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * ---- + * + * Callers for this were originally written against a very simple synchronous + * API. This implementation reflects those simple callers. Some day I'm sure + * we'll need to move to a more robust posting/callback mechanism. + * + * Transmit calls pass in kernel virtual addresses and block copying this into + * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting + * for a failed socket to time out. TX callers can also pass in a pointer to an + * 'int' which gets filled with an errno off the wire in response to the + * message they send. + * + * Handlers for unsolicited messages are registered. Each socket has a page + * that incoming data is copied into. First the header, then the data.
+ * Handlers are called from only one thread with a reference to this per-socket + * page. This page is destroyed after the handler call, so it can't be + * referenced beyond the call. Handlers may block but are discouraged from + * doing so. + * + * Any framing errors (bad magic, large payload lengths) close a connection. + * + * Our sock_container holds the state we associate with a socket. Its current + * framing state is held there as well as the refcounting we do around when it + * is safe to tear down the socket. The socket is only finally torn down from + * the container when the container loses all of its references -- so as long + * as you hold a ref on the container you can trust that the socket is valid + * for use with kernel socket APIs. + * + * Connections are initiated between a pair of nodes when the node with the + * higher node number gets a heartbeat callback which indicates that the lower + * numbered node has started heartbeating. The lower numbered node is passive + * and only accepts the connection if the higher numbered node is heartbeating. + */ + +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/slab.h> +#include <linux/idr.h> +#include <linux/kref.h> +#include <net/tcp.h> + +#include <asm/uaccess.h> + +#include "heartbeat.h" +#include "tcp.h" +#include "nodemanager.h" +#define MLOG_MASK_PREFIX ML_TCP +#include "masklog.h" +#include "quorum.h" + +#include "tcp_internal.h" + +/* + * The linux network stack isn't sparse endian clean.. It has macros like + * ntohs() which perform the endian checks and structs like sockaddr_in + * which aren't annotated. So __force is found here to get the build + * clean. When they emerge from the dark ages and annotate the code + * we can remove these.
+ */ + +#define SC_NODEF_FMT "node %s (num %u) at %u.%u.%u.%u:%u" +#define SC_NODEF_ARGS(sc) sc->sc_node->nd_name, sc->sc_node->nd_num, \ + NIPQUAD(sc->sc_node->nd_ipv4_address), \ + ntohs(sc->sc_node->nd_ipv4_port) + +/* + * In the following two log macros, the whitespace after the ',' just + * before ##args is intentional. Otherwise, gcc 2.95 will eat the + * previous token if args expands to nothing. + */ +#define msglog(hdr, fmt, args...) do { \ + typeof(hdr) __hdr = (hdr); \ + mlog(ML_MSG, "[mag %u len %u typ %u stat %d sys_stat %d " \ + "key %08x num %u] " fmt, \ + be16_to_cpu(__hdr->magic), be16_to_cpu(__hdr->data_len), \ + be16_to_cpu(__hdr->msg_type), be32_to_cpu(__hdr->status), \ + be32_to_cpu(__hdr->sys_status), be32_to_cpu(__hdr->key), \ + be32_to_cpu(__hdr->msg_num) , ##args); \ +} while (0) + +#define sclog(sc, fmt, args...) do { \ + typeof(sc) __sc = (sc); \ + mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ + "pg_off %zu] " fmt, __sc, \ + atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ + __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ + ##args); \ +} while (0) + +static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED; +static struct rb_root o2net_handler_tree = RB_ROOT; + +static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; + +/* XXX someday we'll need better accounting */ +static struct socket *o2net_listen_sock = NULL; + +/* + * listen work is only queued by the listening socket callbacks on the + * o2net_wq. teardown detaches the callbacks before destroying the workqueue. + * quorum work is queued as sock containers are shutdown.. stop_listening + * tears down all the node's sock containers, preventing future shutdowns + * and queued quroum work, before canceling delayed quorum work and + * destroying the work queue. 
+ */ +static struct workqueue_struct *o2net_wq; +static struct work_struct o2net_listen_work; + +static struct o2hb_callback_func o2net_hb_up, o2net_hb_down; +#define O2NET_HB_PRI 0x1 + +static struct o2net_handshake *o2net_hand; +static struct o2net_msg *o2net_keep_req, *o2net_keep_resp; + +static int o2net_sys_err_translations[O2NET_ERR_MAX] = + {[O2NET_ERR_NONE] = 0, + [O2NET_ERR_NO_HNDLR] = -ENOPROTOOPT, + [O2NET_ERR_OVERFLOW] = -EOVERFLOW, + [O2NET_ERR_DIED] = -EHOSTDOWN,}; + +/* can't quite avoid *all* internal declarations :/ */ +static void o2net_sc_connect_completed(void *arg); +static void o2net_rx_until_empty(void *arg); +static void o2net_shutdown_sc(void *arg); +static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_sc_send_keep_req(void *arg); +static void o2net_idle_timer(unsigned long data); +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); + +static inline int o2net_sys_err_to_errno(enum o2net_system_error err) +{ + int trans; + BUG_ON(err >= O2NET_ERR_MAX); + trans = o2net_sys_err_translations[err]; + + /* Just in case we mess up the translation table above */ + BUG_ON(err != O2NET_ERR_NONE && trans == 0); + return trans; +} + +static struct o2net_node * o2net_nn_from_num(u8 node_num) +{ + BUG_ON(node_num >= ARRAY_SIZE(o2net_nodes)); + return &o2net_nodes[node_num]; +} + +static u8 o2net_num_from_nn(struct o2net_node *nn) +{ + BUG_ON(nn == NULL); + return nn - o2net_nodes; +} + +/* ------------------------------------------------------------ */ + +static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) +{ + int ret = 0; + + do { + if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { + ret = -EAGAIN; + break; + } + spin_lock(&nn->nn_lock); + ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); + if (ret == 0) + list_add_tail(&nsw->ns_node_item, + &nn->nn_status_list); + spin_unlock(&nn->nn_lock); + } while (ret == -EAGAIN); + + if (ret == 0) { + 
init_waitqueue_head(&nsw->ns_wq); + nsw->ns_sys_status = O2NET_ERR_NONE; + nsw->ns_status = 0; + } + + return ret; +} + +static void o2net_complete_nsw_locked(struct o2net_node *nn, + struct o2net_status_wait *nsw, + enum o2net_system_error sys_status, + s32 status) +{ + assert_spin_locked(&nn->nn_lock); + + if (!list_empty(&nsw->ns_node_item)) { + list_del_init(&nsw->ns_node_item); + nsw->ns_sys_status = sys_status; + nsw->ns_status = status; + idr_remove(&nn->nn_status_idr, nsw->ns_id); + wake_up(&nsw->ns_wq); + } +} + +static void o2net_complete_nsw(struct o2net_node *nn, + struct o2net_status_wait *nsw, + u64 id, enum o2net_system_error sys_status, + s32 status) +{ + spin_lock(&nn->nn_lock); + if (nsw == NULL) { + if (id > INT_MAX) + goto out; + + nsw = idr_find(&nn->nn_status_idr, id); + if (nsw == NULL) + goto out; + } + + o2net_complete_nsw_locked(nn, nsw, sys_status, status); + +out: + spin_unlock(&nn->nn_lock); + return; +} + +static void o2net_complete_nodes_nsw(struct o2net_node *nn) +{ + struct list_head *iter, *tmp; + unsigned int num_kills = 0; + struct o2net_status_wait *nsw; + + assert_spin_locked(&nn->nn_lock); + + list_for_each_safe(iter, tmp, &nn->nn_status_list) { + nsw = list_entry(iter, struct o2net_status_wait, ns_node_item); + o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); + num_kills++; + } + + mlog(0, "completed %d messages for node %u\n", num_kills, + o2net_num_from_nn(nn)); +} + +static int o2net_nsw_completed(struct o2net_node *nn, + struct o2net_status_wait *nsw) +{ + int completed; + spin_lock(&nn->nn_lock); + completed = list_empty(&nsw->ns_node_item); + spin_unlock(&nn->nn_lock); + return completed; +} + +/* ------------------------------------------------------------ */ + +static void sc_kref_release(struct kref *kref) +{ + struct o2net_sock_container *sc = container_of(kref, + struct o2net_sock_container, sc_kref); + sclog(sc, "releasing\n"); + + if (sc->sc_sock) { + sock_release(sc->sc_sock); + sc->sc_sock = NULL; + } + + 
o2nm_node_put(sc->sc_node); + sc->sc_node = NULL; + + kfree(sc); +} + +static void sc_put(struct o2net_sock_container *sc) +{ + sclog(sc, "put\n"); + kref_put(&sc->sc_kref, sc_kref_release); +} +static void sc_get(struct o2net_sock_container *sc) +{ + sclog(sc, "get\n"); + kref_get(&sc->sc_kref); +} +static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) +{ + struct o2net_sock_container *sc, *ret = NULL; + struct page *page = NULL; + + page = alloc_page(GFP_NOFS); + sc = kcalloc(1, sizeof(*sc), GFP_NOFS); + if (sc == NULL || page == NULL) + goto out; + + kref_init(&sc->sc_kref); + o2nm_node_get(node); + sc->sc_node = node; + + INIT_WORK(&sc->sc_connect_work, o2net_sc_connect_completed, sc); + INIT_WORK(&sc->sc_rx_work, o2net_rx_until_empty, sc); + INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc, sc); + INIT_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req, sc); + + init_timer(&sc->sc_idle_timeout); + sc->sc_idle_timeout.function = o2net_idle_timer; + sc->sc_idle_timeout.data = (unsigned long)sc; + + sclog(sc, "alloced\n"); + + ret = sc; + sc->sc_page = page; + sc = NULL; + page = NULL; + +out: + if (page) + __free_page(page); + kfree(sc); + + return ret; +} + +/* ------------------------------------------------------------ */ + +static void o2net_sc_queue_work(struct o2net_sock_container *sc, + struct work_struct *work) +{ + sc_get(sc); + if (!queue_work(o2net_wq, work)) + sc_put(sc); +} +static void o2net_sc_queue_delayed_work(struct o2net_sock_container *sc, + struct work_struct *work, + int delay) +{ + sc_get(sc); + if (!queue_delayed_work(o2net_wq, work, delay)) + sc_put(sc); +} +static void o2net_sc_cancel_delayed_work(struct o2net_sock_container *sc, + struct work_struct *work) +{ + if (cancel_delayed_work(work)) + sc_put(sc); +} + +static void o2net_set_nn_state(struct o2net_node *nn, + struct o2net_sock_container *sc, + unsigned valid, int err) +{ + int was_valid = nn->nn_sc_valid; + int was_err = nn->nn_persistent_error; + struct 
o2net_sock_container *old_sc = nn->nn_sc; + + assert_spin_locked(&nn->nn_lock); + + /* the node num comparison and single connect/accept path should stop + * an non-null sc from being overwritten with another */ + BUG_ON(sc && nn->nn_sc && nn->nn_sc != sc); + mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); + mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); + + /* we won't reconnect after our valid conn goes away for + * this hb iteration.. here so it shows up in the logs */ + if (was_valid && !valid && err == 0) + err = -ENOTCONN; + + mlog(ML_CONN, "node %u sc: %p -> %p, valid %u -> %u, err %d -> %d\n", + o2net_num_from_nn(nn), nn->nn_sc, sc, nn->nn_sc_valid, valid, + nn->nn_persistent_error, err); + + nn->nn_sc = sc; + nn->nn_sc_valid = valid ? 1 : 0; + nn->nn_persistent_error = err; + + /* mirrors o2net_tx_can_proceed() */ + if (nn->nn_persistent_error || nn->nn_sc_valid) + wake_up(&nn->nn_sc_wq); + + if (!was_err && nn->nn_persistent_error) { + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + } + + if (was_valid && !valid) { + mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n", + SC_NODEF_ARGS(old_sc)); + o2net_complete_nodes_nsw(nn); + } + + if (!was_valid && valid) { + o2quo_conn_up(o2net_num_from_nn(nn)); + /* this is a bit of a hack. we only try reconnecting + * when heartbeating starts until we get a connection. + * if that connection then dies we don't try reconnecting. + * the only way to start connecting again is to down + * heartbeat and bring it back up. */ + cancel_delayed_work(&nn->nn_connect_expired); + mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n", + o2nm_this_node() > sc->sc_node->nd_num ? + "connected to" : "accepted connection from", + SC_NODEF_ARGS(sc)); + } + + /* trigger the connecting worker func as long as we're not valid, + * it will back off if it shouldn't connect. 
This can be called + * from node config teardown and so needs to be careful about + * the work queue actually being up. */ + if (!valid && o2net_wq) { + unsigned long delay; + /* delay if we're withing a RECONNECT_DELAY of the + * last attempt */ + delay = (nn->nn_last_connect_attempt + + msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS)) + - jiffies; + if (delay > msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS)) + delay = 0; + mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); + queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); + } + + /* keep track of the nn's sc ref for the caller */ + if ((old_sc == NULL) && sc) + sc_get(sc); + if (old_sc && (old_sc != sc)) { + o2net_sc_queue_work(old_sc, &old_sc->sc_shutdown_work); + sc_put(old_sc); + } +} + +/* see o2net_register_callbacks() */ +static void o2net_data_ready(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); + + read_lock(&sk->sk_callback_lock); + if (sk->sk_user_data) { + struct o2net_sock_container *sc = sk->sk_user_data; + sclog(sc, "data_ready hit\n"); + do_gettimeofday(&sc->sc_tv_data_ready); + o2net_sc_queue_work(sc, &sc->sc_rx_work); + ready = sc->sc_data_ready; + } else { + ready = sk->sk_data_ready; + } + read_unlock(&sk->sk_callback_lock); + + ready(sk, bytes); +} + +/* see o2net_register_callbacks() */ +static void o2net_state_change(struct sock *sk) +{ + void (*state_change)(struct sock *sk); + struct o2net_sock_container *sc; + + read_lock(&sk->sk_callback_lock); + sc = sk->sk_user_data; + if (sc == NULL) { + state_change = sk->sk_state_change; + goto out; + } + + sclog(sc, "state_change to %d\n", sk->sk_state); + + state_change = sc->sc_state_change; + + switch(sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + o2net_sc_queue_work(sc, &sc->sc_connect_work); + break; + default: + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); + break; + } +out: + 
read_unlock(&sk->sk_callback_lock); + state_change(sk); +} + +/* + * we register callbacks so we can queue work on events before calling + * the original callbacks. our callbacks our careful to test user_data + * to discover when they've reaced with o2net_unregister_callbacks(). + */ +static void o2net_register_callbacks(struct sock *sk, + struct o2net_sock_container *sc) +{ + write_lock_bh(&sk->sk_callback_lock); + + /* accepted sockets inherit the old listen socket data ready */ + if (sk->sk_data_ready == o2net_listen_data_ready) { + sk->sk_data_ready = sk->sk_user_data; + sk->sk_user_data = NULL; + } + + BUG_ON(sk->sk_user_data != NULL); + sk->sk_user_data = sc; + sc_get(sc); + + sc->sc_data_ready = sk->sk_data_ready; + sc->sc_state_change = sk->sk_state_change; + sk->sk_data_ready = o2net_data_ready; + sk->sk_state_change = o2net_state_change; + + write_unlock_bh(&sk->sk_callback_lock); +} + +static int o2net_unregister_callbacks(struct sock *sk, + struct o2net_sock_container *sc) +{ + int ret = 0; + + write_lock_bh(&sk->sk_callback_lock); + if (sk->sk_user_data == sc) { + ret = 1; + sk->sk_user_data = NULL; + sk->sk_data_ready = sc->sc_data_ready; + sk->sk_state_change = sc->sc_state_change; + } + write_unlock_bh(&sk->sk_callback_lock); + + return ret; +} + +/* + * this is a little helper that is called by callers who have seen a problem + * with an sc and want to detach it from the nn if someone already hasn't beat + * them to it. if an error is given then the shutdown will be persistent + * and pending transmits will be canceled. + */ +static void o2net_ensure_shutdown(struct o2net_node *nn, + struct o2net_sock_container *sc, + int err) +{ + spin_lock(&nn->nn_lock); + if (nn->nn_sc == sc) + o2net_set_nn_state(nn, NULL, 0, err); + spin_unlock(&nn->nn_lock); +} + +/* + * This work queue function performs the blocking parts of socket shutdown. A + * few paths lead here. set_nn_state will trigger this callback if it sees an + * sc detached from the nn. 
state_change will also trigger this callback + * directly when it sees errors. In that case we need to call set_nn_state + * ourselves as state_change couldn't get the nn_lock and call set_nn_state + * itself. + */ +static void o2net_shutdown_sc(void *arg) +{ + struct o2net_sock_container *sc = arg; + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + sclog(sc, "shutting down\n"); + + /* drop the callbacks ref and call shutdown only once */ + if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { + /* we shouldn't flush as we're in the thread, the + * races with pending sc work structs are harmless */ + del_timer_sync(&sc->sc_idle_timeout); + o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); + sc_put(sc); + sc->sc_sock->ops->shutdown(sc->sc_sock, + RCV_SHUTDOWN|SEND_SHUTDOWN); + } + + /* not fatal so failed connects before the other guy has our + * heartbeat can be retried */ + o2net_ensure_shutdown(nn, sc, 0); + sc_put(sc); +} + +/* ------------------------------------------------------------ */ + +static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type, + u32 key) +{ + int ret = memcmp(&nmh->nh_key, &key, sizeof(key)); + + if (ret == 0) + ret = memcmp(&nmh->nh_msg_type, &msg_type, sizeof(msg_type)); + + return ret; +} + +static struct o2net_msg_handler * +o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &o2net_handler_tree.rb_node; + struct rb_node *parent = NULL; + struct o2net_msg_handler *nmh, *ret = NULL; + int cmp; + + while (*p) { + parent = *p; + nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); + cmp = o2net_handler_cmp(nmh, msg_type, key); + + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { + ret = nmh; + break; + } + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + + return ret; +} + +static void o2net_handler_kref_release(struct kref *kref) 
+{ + struct o2net_msg_handler *nmh; + nmh = container_of(kref, struct o2net_msg_handler, nh_kref); + + kfree(nmh); +} + +static void o2net_handler_put(struct o2net_msg_handler *nmh) +{ + kref_put(&nmh->nh_kref, o2net_handler_kref_release); +} + +/* max_len is protection for the handler func. incoming messages won't + * be given to the handler if their payload is longer than the max. */ +int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, + o2net_msg_handler_func *func, void *data, + struct list_head *unreg_list) +{ + struct o2net_msg_handler *nmh = NULL; + struct rb_node **p, *parent; + int ret = 0; + + if (max_len > O2NET_MAX_PAYLOAD_BYTES) { + mlog(0, "max_len for message handler out of range: %u\n", + max_len); + ret = -EINVAL; + goto out; + } + + if (!msg_type) { + mlog(0, "no message type provided: %u, %p\n", msg_type, func); + ret = -EINVAL; + goto out; + + } + if (!func) { + mlog(0, "no message handler provided: %u, %p\n", + msg_type, func); + ret = -EINVAL; + goto out; + } + + nmh = kcalloc(1, sizeof(struct o2net_msg_handler), GFP_NOFS); + if (nmh == NULL) { + ret = -ENOMEM; + goto out; + } + + nmh->nh_func = func; + nmh->nh_func_data = data; + nmh->nh_msg_type = msg_type; + nmh->nh_max_len = max_len; + nmh->nh_key = key; + /* the tree and list get this ref.. they're both removed in + * unregister when this ref is dropped */ + kref_init(&nmh->nh_kref); + INIT_LIST_HEAD(&nmh->nh_unregister_item); + + write_lock(&o2net_handler_lock); + if (o2net_handler_tree_lookup(msg_type, key, &p, &parent)) + ret = -EEXIST; + else { + rb_link_node(&nmh->nh_node, parent, p); + rb_insert_color(&nmh->nh_node, &o2net_handler_tree); + list_add_tail(&nmh->nh_unregister_item, unreg_list); + + mlog(ML_TCP, "registered handler func %p type %u key %08x\n", + func, msg_type, key); + /* we've had some trouble with handlers seemingly vanishing. 
*/ + mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, + &parent) == NULL, + "couldn't find handler we *just* registerd " + "for type %u key %08x\n", msg_type, key); + } + write_unlock(&o2net_handler_lock); + if (ret) + goto out; + +out: + if (ret) + kfree(nmh); + + return ret; +} +EXPORT_SYMBOL_GPL(o2net_register_handler); + +void o2net_unregister_handler_list(struct list_head *list) +{ + struct list_head *pos, *n; + struct o2net_msg_handler *nmh; + + write_lock(&o2net_handler_lock); + list_for_each_safe(pos, n, list) { + nmh = list_entry(pos, struct o2net_msg_handler, + nh_unregister_item); + mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", + nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); + rb_erase(&nmh->nh_node, &o2net_handler_tree); + list_del_init(&nmh->nh_unregister_item); + kref_put(&nmh->nh_kref, o2net_handler_kref_release); + } + write_unlock(&o2net_handler_lock); +} +EXPORT_SYMBOL_GPL(o2net_unregister_handler_list); + +static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) +{ + struct o2net_msg_handler *nmh; + + read_lock(&o2net_handler_lock); + nmh = o2net_handler_tree_lookup(msg_type, key, NULL, NULL); + if (nmh) + kref_get(&nmh->nh_kref); + read_unlock(&o2net_handler_lock); + + return nmh; +} + +/* ------------------------------------------------------------ */ + +static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) +{ + int ret; + mm_segment_t oldfs; + struct kvec vec = { + .iov_len = len, + .iov_base = data, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&vec, + .msg_flags = MSG_DONTWAIT, + }; + + oldfs = get_fs(); + set_fs(get_ds()); + ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); + set_fs(oldfs); + + return ret; +} + +static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, + size_t veclen, size_t total) +{ + int ret; + mm_segment_t oldfs; + struct msghdr msg = { + .msg_iov = (struct iovec *)vec, + .msg_iovlen = veclen, + }; + + if 
(sock == NULL) { + ret = -EINVAL; + goto out; + } + + oldfs = get_fs(); + set_fs(get_ds()); + ret = sock_sendmsg(sock, &msg, total); + set_fs(oldfs); + if (ret != total) { + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, + total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ + goto out; + } + + ret = 0; +out: + if (ret < 0) + mlog(0, "returning error: %d\n", ret); + return ret; +} + +static void o2net_sendpage(struct o2net_sock_container *sc, + void *kmalloced_virt, + size_t size) +{ + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + ssize_t ret; + + + ret = sc->sc_sock->ops->sendpage(sc->sc_sock, + virt_to_page(kmalloced_virt), + (long)kmalloced_virt & ~PAGE_MASK, + size, MSG_DONTWAIT); + if (ret != size) { + mlog(ML_ERROR, "sendpage of size %zu to " SC_NODEF_FMT + " failed with %zd\n", size, SC_NODEF_ARGS(sc), ret); + o2net_ensure_shutdown(nn, sc, 0); + } +} + +static void o2net_init_msg(struct o2net_msg *msg, u16 data_len, u16 msg_type, u32 key) +{ + memset(msg, 0, sizeof(struct o2net_msg)); + msg->magic = cpu_to_be16(O2NET_MSG_MAGIC); + msg->data_len = cpu_to_be16(data_len); + msg->msg_type = cpu_to_be16(msg_type); + msg->sys_status = cpu_to_be32(O2NET_ERR_NONE); + msg->status = 0; + msg->key = cpu_to_be32(key); +} + +static int o2net_tx_can_proceed(struct o2net_node *nn, + struct o2net_sock_container **sc_ret, + int *error) +{ + int ret = 0; + + spin_lock(&nn->nn_lock); + if (nn->nn_persistent_error) { + ret = 1; + *sc_ret = NULL; + *error = nn->nn_persistent_error; + } else if (nn->nn_sc_valid) { + kref_get(&nn->nn_sc->sc_kref); + + ret = 1; + *sc_ret = nn->nn_sc; + *error = 0; + } + spin_unlock(&nn->nn_lock); + + return ret; +} + +int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, + size_t caller_veclen, u8 target_node, int *status) +{ + int ret, error = 0; + struct o2net_msg *msg = NULL; + size_t veclen, caller_bytes = 0; + struct kvec *vec = NULL; + struct o2net_sock_container *sc = 
NULL; + struct o2net_node *nn = o2net_nn_from_num(target_node); + struct o2net_status_wait nsw = { + .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), + }; + + if (o2net_wq == NULL) { + mlog(0, "attempt to tx without o2netd running\n"); + ret = -ESRCH; + goto out; + } + + if (caller_veclen == 0) { + mlog(0, "bad kvec array length\n"); + ret = -EINVAL; + goto out; + } + + caller_bytes = iov_length((struct iovec *)caller_vec, caller_veclen); + if (caller_bytes > O2NET_MAX_PAYLOAD_BYTES) { + mlog(0, "total payload len %zu too large\n", caller_bytes); + ret = -EINVAL; + goto out; + } + + if (target_node == o2nm_this_node()) { + ret = -ELOOP; + goto out; + } + + ret = wait_event_interruptible(nn->nn_sc_wq, + o2net_tx_can_proceed(nn, &sc, &error)); + if (!ret && error) + ret = error; + if (ret) + goto out; + + veclen = caller_veclen + 1; + vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); + if (vec == NULL) { + mlog(0, "failed to %zu element kvec!\n", veclen); + ret = -ENOMEM; + goto out; + } + + msg = kmalloc(sizeof(struct o2net_msg), GFP_ATOMIC); + if (!msg) { + mlog(0, "failed to allocate a o2net_msg!\n"); + ret = -ENOMEM; + goto out; + } + + o2net_init_msg(msg, caller_bytes, msg_type, key); + + vec[0].iov_len = sizeof(struct o2net_msg); + vec[0].iov_base = msg; + memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); + + ret = o2net_prep_nsw(nn, &nsw); + if (ret) + goto out; + + msg->msg_num = cpu_to_be32(nsw.ns_id); + + /* finally, convert the message header to network byte-order + * and send */ + ret = o2net_send_tcp_msg(sc->sc_sock, vec, veclen, + sizeof(struct o2net_msg) + caller_bytes); + msglog(msg, "sending returned %d\n", ret); + if (ret < 0) { + mlog(0, "error returned from o2net_send_tcp_msg=%d\n", ret); + goto out; + } + + /* wait on other node's handler */ + wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); + + /* Note that we avoid overwriting the callers status return + * variable if a system error was reported on the other + * 
side. Callers beware. */ + ret = o2net_sys_err_to_errno(nsw.ns_sys_status); + if (status && !ret) + *status = nsw.ns_status; + + mlog(0, "woken, returning system status %d, user status %d\n", + ret, nsw.ns_status); +out: + if (sc) + sc_put(sc); + if (vec) + kfree(vec); + if (msg) + kfree(msg); + o2net_complete_nsw(nn, &nsw, 0, 0, 0); + return ret; +} +EXPORT_SYMBOL_GPL(o2net_send_message_vec); + +int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, + u8 target_node, int *status) +{ + struct kvec vec = { + .iov_base = data, + .iov_len = len, + }; + return o2net_send_message_vec(msg_type, key, &vec, 1, + target_node, status); +} +EXPORT_SYMBOL_GPL(o2net_send_message); + +static int o2net_send_status_magic(struct socket *sock, struct o2net_msg *hdr, + enum o2net_system_error syserr, int err) +{ + struct kvec vec = { + .iov_base = hdr, + .iov_len = sizeof(struct o2net_msg), + }; + + BUG_ON(syserr >= O2NET_ERR_MAX); + + /* leave other fields intact from the incoming message, msg_num + * in particular */ + hdr->sys_status = cpu_to_be32(syserr); + hdr->status = cpu_to_be32(err); + hdr->magic = cpu_to_be16(O2NET_MSG_STATUS_MAGIC); // twiddle the magic + hdr->data_len = 0; + + msglog(hdr, "about to send status magic %d\n", err); + /* hdr has been in host byteorder this whole time */ + return o2net_send_tcp_msg(sock, &vec, 1, sizeof(struct o2net_msg)); +} + +/* this returns -errno if the header was unknown or too large, etc. 
+ * after this is called the buffer us reused for the next message */ +static int o2net_process_message(struct o2net_sock_container *sc, + struct o2net_msg *hdr) +{ + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + int ret = 0, handler_status; + enum o2net_system_error syserr; + struct o2net_msg_handler *nmh = NULL; + + msglog(hdr, "processing message\n"); + + o2net_sc_postpone_idle(sc); + + switch(be16_to_cpu(hdr->magic)) { + case O2NET_MSG_STATUS_MAGIC: + /* special type for returning message status */ + o2net_complete_nsw(nn, NULL, + be32_to_cpu(hdr->msg_num), + be32_to_cpu(hdr->sys_status), + be32_to_cpu(hdr->status)); + goto out; + case O2NET_MSG_KEEP_REQ_MAGIC: + o2net_sendpage(sc, o2net_keep_resp, + sizeof(*o2net_keep_resp)); + goto out; + case O2NET_MSG_KEEP_RESP_MAGIC: + goto out; + case O2NET_MSG_MAGIC: + break; + default: + msglog(hdr, "bad magic\n"); + ret = -EINVAL; + goto out; + break; + } + + /* find a handler for it */ + handler_status = 0; + nmh = o2net_handler_get(be16_to_cpu(hdr->msg_type), + be32_to_cpu(hdr->key)); + if (!nmh) { + mlog(ML_TCP, "couldn't find handler for type %u key %08x\n", + be16_to_cpu(hdr->msg_type), be32_to_cpu(hdr->key)); + syserr = O2NET_ERR_NO_HNDLR; + goto out_respond; + } + + syserr = O2NET_ERR_NONE; + + if (be16_to_cpu(hdr->data_len) > nmh->nh_max_len) + syserr = O2NET_ERR_OVERFLOW; + + if (syserr != O2NET_ERR_NONE) + goto out_respond; + + do_gettimeofday(&sc->sc_tv_func_start); + sc->sc_msg_key = be32_to_cpu(hdr->key); + sc->sc_msg_type = be16_to_cpu(hdr->msg_type); + handler_status = (nmh->nh_func)(hdr, sizeof(struct o2net_msg) + + be16_to_cpu(hdr->data_len), + nmh->nh_func_data); + do_gettimeofday(&sc->sc_tv_func_stop); + +out_respond: + /* this destroys the hdr, so don't use it after this */ + ret = o2net_send_status_magic(sc->sc_sock, hdr, syserr, + handler_status); + hdr = NULL; + mlog(0, "sending handler status %d, syserr %d returned %d\n", + handler_status, syserr, ret); + +out: + if (nmh) + 
o2net_handler_put(nmh); + return ret; +} + +static int o2net_check_handshake(struct o2net_sock_container *sc) +{ + struct o2net_handshake *hand = page_address(sc->sc_page); + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { + mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " + "version %llu but %llu is required, disconnecting\n", + SC_NODEF_ARGS(sc), + (unsigned long long)be64_to_cpu(hand->protocol_version), + O2NET_PROTOCOL_VERSION); + + /* don't bother reconnecting if its the wrong version. */ + o2net_ensure_shutdown(nn, sc, -ENOTCONN); + return -1; + } + + sc->sc_handshake_ok = 1; + + spin_lock(&nn->nn_lock); + /* set valid and queue the idle timers only if it hasn't been + * shut down already */ + if (nn->nn_sc == sc) { + o2net_sc_postpone_idle(sc); + o2net_set_nn_state(nn, sc, 1, 0); + } + spin_unlock(&nn->nn_lock); + + /* shift everything up as though it wasn't there */ + sc->sc_page_off -= sizeof(struct o2net_handshake); + if (sc->sc_page_off) + memmove(hand, hand + 1, sc->sc_page_off); + + return 0; +} + +/* this demuxes the queued rx bytes into header or payload bits and calls + * handlers as each full message is read off the socket. it returns -error, + * == 0 eof, or > 0 for progress made.*/ +static int o2net_advance_rx(struct o2net_sock_container *sc) +{ + struct o2net_msg *hdr; + int ret = 0; + void *data; + size_t datalen; + + sclog(sc, "receiving\n"); + do_gettimeofday(&sc->sc_tv_advance_start); + + /* do we need more header? 
*/ + if (sc->sc_page_off < sizeof(struct o2net_msg)) { + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = sizeof(struct o2net_msg) - sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) { + sc->sc_page_off += ret; + + /* this working relies on the handshake being + * smaller than the normal message header */ + if (sc->sc_page_off >= sizeof(struct o2net_handshake)&& + !sc->sc_handshake_ok && o2net_check_handshake(sc)) { + ret = -EPROTO; + goto out; + } + + /* only swab incoming here.. we can + * only get here once as we cross from + * being under to over */ + if (sc->sc_page_off == sizeof(struct o2net_msg)) { + hdr = page_address(sc->sc_page); + if (be16_to_cpu(hdr->data_len) > + O2NET_MAX_PAYLOAD_BYTES) + ret = -EOVERFLOW; + } + } + if (ret <= 0) + goto out; + } + + if (sc->sc_page_off < sizeof(struct o2net_msg)) { + /* oof, still don't have a header */ + goto out; + } + + /* this was swabbed above when we first read it */ + hdr = page_address(sc->sc_page); + + msglog(hdr, "at page_off %zu\n", sc->sc_page_off); + + /* do we need more payload? */ + if (sc->sc_page_off - sizeof(struct o2net_msg) < be16_to_cpu(hdr->data_len)) { + /* need more payload */ + data = page_address(sc->sc_page) + sc->sc_page_off; + datalen = (sizeof(struct o2net_msg) + be16_to_cpu(hdr->data_len)) - + sc->sc_page_off; + ret = o2net_recv_tcp_msg(sc->sc_sock, data, datalen); + if (ret > 0) + sc->sc_page_off += ret; + if (ret <= 0) + goto out; + } + + if (sc->sc_page_off - sizeof(struct o2net_msg) == be16_to_cpu(hdr->data_len)) { + /* we can only get here once, the first time we read + * the payload.. so set ret to progress if the handler + * works out. after calling this the message is toast */ + ret = o2net_process_message(sc, hdr); + if (ret == 0) + ret = 1; + sc->sc_page_off = 0; + } + +out: + sclog(sc, "ret = %d\n", ret); + do_gettimeofday(&sc->sc_tv_advance_stop); + return ret; +} + +/* this work func is triggerd by data ready. 
it reads until it can read no + * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing + * our work the work struct will be marked and we'll be called again. */ +static void o2net_rx_until_empty(void *arg) +{ + struct o2net_sock_container *sc = arg; + int ret; + + do { + ret = o2net_advance_rx(sc); + } while (ret > 0); + + if (ret <= 0 && ret != -EAGAIN) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + sclog(sc, "saw error %d, closing\n", ret); + /* not permanent so read failed handshake can retry */ + o2net_ensure_shutdown(nn, sc, 0); + } + + sc_put(sc); +} + +static int o2net_set_nodelay(struct socket *sock) +{ + int ret, val = 1; + mm_segment_t oldfs; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Dear unsuspecting programmer, + * + * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level + * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will + * silently turn into SO_DEBUG. + * + * Yours, + * Keeper of hilariously fragile interfaces. + */ + ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char __user *)&val, sizeof(val)); + + set_fs(oldfs); + return ret; +} + +/* ------------------------------------------------------------ */ + +/* called when a connect completes and after a sock is accepted. the + * rx path will see the response and mark the sc valid */ +static void o2net_sc_connect_completed(void *arg) +{ + struct o2net_sock_container *sc = arg; + + mlog(ML_MSG, "sc sending handshake with ver %llu id %llx\n", + (unsigned long long)O2NET_PROTOCOL_VERSION, + (unsigned long long)be64_to_cpu(o2net_hand->connector_id)); + + o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); + sc_put(sc); +} + +/* this is called as a work_struct func. 
*/ +static void o2net_sc_send_keep_req(void *arg) +{ + struct o2net_sock_container *sc = arg; + + o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req)); + sc_put(sc); +} + +/* socket shutdown does a del_timer_sync against this as it tears down. + * we can't start this timer until we've got to the point in sc buildup + * where shutdown is going to be involved */ +static void o2net_idle_timer(unsigned long data) +{ + struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct timeval now; + + do_gettimeofday(&now); + + mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 " + "seconds, shutting it down.\n", SC_NODEF_ARGS(sc)); + mlog(ML_NOTICE, "here are some times that might help debug the " + "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " + "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", + sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec, + now.tv_sec, now.tv_usec, + sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec, + sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec, + sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec, + sc->sc_msg_key, sc->sc_msg_type, + sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec, + sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec); + + o2net_sc_queue_work(sc, &sc->sc_shutdown_work); +} + +static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) +{ + o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); + o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, + O2NET_KEEPALIVE_DELAY_SECS * HZ); + do_gettimeofday(&sc->sc_tv_timer); + mod_timer(&sc->sc_idle_timeout, + jiffies + (O2NET_IDLE_TIMEOUT_SECS * HZ)); +} + +/* this work func is kicked whenever a path sets the nn state which doesn't + * have valid set. This includes seeing hb come up, losing a connection, + * having a connect attempt fail, etc. 
This centralizes the logic which decides + * if a connect attempt should be made or if we should give up and all future + * transmit attempts should fail */ +static void o2net_start_connect(void *arg) +{ + struct o2net_node *nn = arg; + struct o2net_sock_container *sc = NULL; + struct o2nm_node *node = NULL; + struct socket *sock = NULL; + struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; + int ret = 0; + + /* if we're greater we initiate tx, otherwise we accept */ + if (o2nm_this_node() <= o2net_num_from_nn(nn)) + goto out; + + /* watch for racing with tearing a node down */ + node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); + if (node == NULL) { + ret = 0; + goto out; + } + + spin_lock(&nn->nn_lock); + /* see if we already have one pending or have given up */ + if (nn->nn_sc || nn->nn_persistent_error) + arg = NULL; + spin_unlock(&nn->nn_lock); + if (arg == NULL) /* *shrug*, needed some indicator */ + goto out; + + nn->nn_last_connect_attempt = jiffies; + + sc = sc_alloc(node); + if (sc == NULL) { + mlog(0, "couldn't allocate sc\n"); + ret = -ENOMEM; + goto out; + } + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) { + mlog(0, "can't create socket: %d\n", ret); + goto out; + } + sc->sc_sock = sock; /* freed by sc_kref_release */ + + sock->sk->sk_allocation = GFP_ATOMIC; + + myaddr.sin_family = AF_INET; + myaddr.sin_port = (__force u16)htons(0); /* any port */ + + ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, + sizeof(myaddr)); + if (ret) { + mlog(0, "bind failed: %d\n", ret); + goto out; + } + + ret = o2net_set_nodelay(sc->sc_sock); + if (ret) { + mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); + goto out; + } + + o2net_register_callbacks(sc->sc_sock->sk, sc); + + spin_lock(&nn->nn_lock); + /* handshake completion will set nn->nn_sc_valid */ + o2net_set_nn_state(nn, sc, 0, 0); + spin_unlock(&nn->nn_lock); + + remoteaddr.sin_family = AF_INET; + remoteaddr.sin_addr.s_addr = (__force 
u32)node->nd_ipv4_address; + remoteaddr.sin_port = (__force u16)node->nd_ipv4_port; + + ret = sc->sc_sock->ops->connect(sc->sc_sock, + (struct sockaddr *)&remoteaddr, + sizeof(remoteaddr), + O_NONBLOCK); + if (ret == -EINPROGRESS) + ret = 0; /* a non-blocking connect that is still in progress is not a failure */ + +out: /* NOTE(review): sc is still NULL here when sc_alloc() failed (ret == -ENOMEM); confirm SC_NODEF_ARGS() copes with a NULL sc */ + if (ret) { + mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " + "with errno %d\n", SC_NODEF_ARGS(sc), ret); + /* 0 err so that another will be queued and attempted + * from set_nn_state */ + if (sc) + o2net_ensure_shutdown(nn, sc, 0); + } + if (sc) + sc_put(sc); + if (node) + o2nm_node_put(node); + + return; +} + /* Work handler for nn_connect_expired (see INIT_WORK in o2net_init): if the node still has no valid connection (nn_sc_valid clear), log it and pass -ENOTCONN to o2net_set_nn_state(). */ +static void o2net_connect_expired(void *arg) +{ + struct o2net_node *nn = arg; + + spin_lock(&nn->nn_lock); + if (!nn->nn_sc_valid) { + mlog(ML_ERROR, "no connection established with node %u after " + "%u seconds, giving up and returning errors.\n", + o2net_num_from_nn(nn), O2NET_IDLE_TIMEOUT_SECS); + + o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + } + spin_unlock(&nn->nn_lock); +} + /* Work handler for nn_still_up: forwards this node's number to o2quo_hb_still_up(). */ +static void o2net_still_up(void *arg) +{ + struct o2net_node *nn = arg; + + o2quo_hb_still_up(o2net_num_from_nn(nn)); +} + +/* ------------------------------------------------------------ */ + /* Drop connection state for @node: pass -ENOTCONN to o2net_set_nn_state() under nn_lock, then, if o2net_wq exists, cancel the node's three delayed work items and flush the workqueue. 
*/ +void o2net_disconnect_node(struct o2nm_node *node) +{ + struct o2net_node *nn = o2net_nn_from_num(node->nd_num); + + /* don't reconnect until it's heartbeating again */ + spin_lock(&nn->nn_lock); + o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + spin_unlock(&nn->nn_lock); + + if (o2net_wq) { + cancel_delayed_work(&nn->nn_connect_expired); + cancel_delayed_work(&nn->nn_connect_work); + cancel_delayed_work(&nn->nn_still_up); + flush_workqueue(o2net_wq); + } +} + /* Heartbeat node-down callback (set up with O2HB_NODE_DOWN_CB): reports to o2quo_hb_down() and disconnects @node unless it is the local node. */ +static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, + void *data) +{ + o2quo_hb_down(node_num); + + if (node_num != o2nm_this_node()) + o2net_disconnect_node(node); +} + /* Heartbeat node-up callback (set up with O2HB_NODE_UP_CB): reports to o2quo_hb_up(), backdates nn_last_connect_attempt and, for remote nodes, queues the connect-expired work and clears any persistent error. */ +static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, + void *data) +{ + struct o2net_node *nn = o2net_nn_from_num(node_num); + + o2quo_hb_up(node_num); + + /* 
ensure an immediate connect attempt */ + nn->nn_last_connect_attempt = jiffies - + (msecs_to_jiffies(O2NET_RECONNECT_DELAY_MS) + 1); + + if (node_num != o2nm_this_node()) { + /* heartbeat doesn't work unless a local node number is + * configured and doing so brings up the o2net_wq, so we can + * use it.. */ + queue_delayed_work(o2net_wq, &nn->nn_connect_expired, + O2NET_IDLE_TIMEOUT_SECS * HZ); + + /* believe it or not, accept and node heartbeating testing + * can succeed for this node before we got here.. so + * only use set_nn_state to clear the persistent error + * if that hasn't already happened */ + spin_lock(&nn->nn_lock); + if (nn->nn_persistent_error) + o2net_set_nn_state(nn, NULL, 0, 0); + spin_unlock(&nn->nn_lock); + } +} + /* Unregister both heartbeat callbacks; a failing unregister is only logged. */ +void o2net_unregister_hb_callbacks(void) +{ + int ret; + + ret = o2hb_unregister_callback(&o2net_hb_up); + if (ret < 0) + mlog(ML_ERROR, "Status return %d unregistering heartbeat up " + "callback!\n", ret); + + ret = o2hb_unregister_callback(&o2net_hb_down); + if (ret < 0) + mlog(ML_ERROR, "Status return %d unregistering heartbeat down " + "callback!\n", ret); +} + /* Set up and register the node-up and node-down heartbeat callbacks. Returns 0 on success; if either registration fails, o2net_unregister_hb_callbacks() is called and the error is returned. */ +int o2net_register_hb_callbacks(void) +{ + int ret; + + o2hb_setup_callback(&o2net_hb_down, O2HB_NODE_DOWN_CB, + o2net_hb_node_down_cb, NULL, O2NET_HB_PRI); + o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB, + o2net_hb_node_up_cb, NULL, O2NET_HB_PRI); + + ret = o2hb_register_callback(&o2net_hb_up); + if (ret == 0) + ret = o2hb_register_callback(&o2net_hb_down); + + if (ret) + o2net_unregister_hb_callbacks(); + + return ret; +} + +/* ------------------------------------------------------------ */ + /* Accept one pending connection on @sock. 
The peer is rejected if it is not a configured node, if our node number is greater than its number, if it is not heartbeating, or if its o2net_node already has an sc; otherwise an sc is allocated, attached and sent the handshake. Returns 0 on success or a negative errno. */ +static int o2net_accept_one(struct socket *sock) +{ + int ret, slen; + struct sockaddr_in sin; + struct socket *new_sock = NULL; + struct o2nm_node *node = NULL; + struct o2net_sock_container *sc = NULL; + struct o2net_node *nn; + + BUG_ON(sock == NULL); + ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, + sock->sk->sk_protocol, &new_sock); + if (ret) + goto out; 
+ + new_sock->type = sock->type; + new_sock->ops = sock->ops; + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + if (ret < 0) + goto out; + + new_sock->sk->sk_allocation = GFP_ATOMIC; + + ret = o2net_set_nodelay(new_sock); + if (ret) { + mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); + goto out; + } + + slen = sizeof(sin); + ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, + &slen, 1); + if (ret < 0) + goto out; + + node = o2nm_get_node_by_ip((__force __be32)sin.sin_addr.s_addr); + if (node == NULL) { + mlog(ML_NOTICE, "attempt to connect from unknown node at " + "%u.%u.%u.%u:%d\n", NIPQUAD(sin.sin_addr.s_addr), + ntohs((__force __be16)sin.sin_port)); + ret = -EINVAL; + goto out; + } + + if (o2nm_this_node() > node->nd_num) { + mlog(ML_NOTICE, "unexpected connect attempted from a lower " + "numbered node '%s' at " "%u.%u.%u.%u:%d with num %u\n", + node->nd_name, NIPQUAD(sin.sin_addr.s_addr), + ntohs((__force __be16)sin.sin_port), node->nd_num); + ret = -EINVAL; + goto out; + } + + /* this happens all the time when the other node sees our heartbeat + * and tries to connect before we see their heartbeat */ + if (!o2hb_check_node_heartbeating_from_callback(node->nd_num)) { + mlog(ML_CONN, "attempt to connect from node '%s' at " + "%u.%u.%u.%u:%d but it isn't heartbeating\n", + node->nd_name, NIPQUAD(sin.sin_addr.s_addr), + ntohs((__force __be16)sin.sin_port)); + ret = -EINVAL; + goto out; + } + + nn = o2net_nn_from_num(node->nd_num); + + spin_lock(&nn->nn_lock); + if (nn->nn_sc) + ret = -EBUSY; + else + ret = 0; + spin_unlock(&nn->nn_lock); + if (ret) { + mlog(ML_NOTICE, "attempt to connect from node '%s' at " + "%u.%u.%u.%u:%d but it already has an open connection\n", + node->nd_name, NIPQUAD(sin.sin_addr.s_addr), + ntohs((__force __be16)sin.sin_port)); + goto out; + } + + sc = sc_alloc(node); + if (sc == NULL) { + ret = -ENOMEM; + goto out; + } + + sc->sc_sock = new_sock; + new_sock = NULL; /* the socket now hangs off the sc, so the out: 
path must not sock_release() it */ + + spin_lock(&nn->nn_lock); + 
o2net_set_nn_state(nn, sc, 0, 0); + spin_unlock(&nn->nn_lock); + + o2net_register_callbacks(sc->sc_sock->sk, sc); + o2net_sc_queue_work(sc, &sc->sc_rx_work); + + o2net_sendpage(sc, o2net_hand, sizeof(*o2net_hand)); + +out: + if (new_sock) + sock_release(new_sock); + if (node) + o2nm_node_put(node); + if (sc) + sc_put(sc); + return ret; +} + /* Work handler for o2net_listen_work: keeps calling o2net_accept_one() until it returns non-zero, yielding with cond_resched() between accepts. */ +static void o2net_accept_many(void *arg) +{ + struct socket *sock = arg; + while (o2net_accept_one(sock) == 0) + cond_resched(); +} + /* sk_data_ready replacement installed on the listening socket: queues o2net_listen_work when @sk is in TCP_LISTEN, then chains to the original callback that was saved in sk_user_data. */ +static void o2net_listen_data_ready(struct sock *sk, int bytes) +{ + void (*ready)(struct sock *sk, int bytes); + + read_lock(&sk->sk_callback_lock); + ready = sk->sk_user_data; + if (ready == NULL) { /* check for teardown race */ + ready = sk->sk_data_ready; + goto out; + } + + /* ->sk_data_ready is also called for a newly established child socket + * before it has been accepted and the acceptor has set up their + * data_ready.. we only want to queue listen work for our listening + * socket */ + if (sk->sk_state == TCP_LISTEN) { + mlog(ML_TCP, "bytes: %d\n", bytes); + queue_work(o2net_wq, &o2net_listen_work); + } + +out: + read_unlock(&sk->sk_callback_lock); + ready(sk, bytes); +} + /* Create a TCP socket, hook its data_ready callback, bind it to INADDR_ANY:@port and listen on it. The socket is published in o2net_listen_sock. Returns 0 or a negative errno; on failure o2net_listen_sock is reset and the socket released. 
*/ +static int o2net_open_listening_sock(__be16 port) +{ + struct socket *sock = NULL; + int ret; + struct sockaddr_in sin = { + .sin_family = PF_INET, + .sin_addr = { .s_addr = (__force u32)htonl(INADDR_ANY) }, + .sin_port = (__force u16)port, + }; + + ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (ret < 0) { + mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); + goto out; + } + + sock->sk->sk_allocation = GFP_ATOMIC; + + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_user_data = sock->sk->sk_data_ready; + sock->sk->sk_data_ready = o2net_listen_data_ready; + write_unlock_bh(&sock->sk->sk_callback_lock); + + o2net_listen_sock = sock; + INIT_WORK(&o2net_listen_work, o2net_accept_many, sock); + + sock->sk->sk_reuse = 1; + ret = sock->ops->bind(sock, (struct sockaddr *)&sin, 
sizeof(sin)); + if (ret < 0) { + mlog(ML_ERROR, "unable to bind socket to port %d, ret=%d\n", + ntohs(port), ret); + goto out; + } + + ret = sock->ops->listen(sock, 64); + if (ret < 0) { + mlog(ML_ERROR, "unable to listen on port %d, ret=%d\n", + ntohs(port), ret); + } + +out: + if (ret) { + o2net_listen_sock = NULL; + if (sock) + sock_release(sock); + } + return ret; +} + +/* + * called from node manager when we should bring up our network listening + * socket. node manager handles all the serialization to only call this + * once and to match it with o2net_stop_listening(). note, + * o2nm_this_node() doesn't work yet as we're being called while it + * is being set up. + */ +int o2net_start_listening(struct o2nm_node *node) +{ + int ret = 0; + + BUG_ON(o2net_wq != NULL); + BUG_ON(o2net_listen_sock != NULL); + + mlog(ML_KTHREAD, "starting o2net thread...\n"); + o2net_wq = create_singlethread_workqueue("o2net"); + if (o2net_wq == NULL) { + mlog(ML_ERROR, "unable to launch o2net thread\n"); + return -ENOMEM; /* ? 
*/ + } + + ret = o2net_open_listening_sock(node->nd_ipv4_port); + if (ret) { + destroy_workqueue(o2net_wq); + o2net_wq = NULL; + } else + o2quo_conn_up(node->nd_num); + + return ret; +} + +/* again, o2nm_this_node() doesn't work here as we're involved in + * tearing it down */ +void o2net_stop_listening(struct o2nm_node *node) +{ + struct socket *sock = o2net_listen_sock; + size_t i; + + BUG_ON(o2net_wq == NULL); + BUG_ON(o2net_listen_sock == NULL); + + /* stop the listening socket from generating work */ + write_lock_bh(&sock->sk->sk_callback_lock); + sock->sk->sk_data_ready = sock->sk->sk_user_data; + sock->sk->sk_user_data = NULL; + write_unlock_bh(&sock->sk->sk_callback_lock); + + for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { + struct o2nm_node *node = o2nm_get_node_by_num(i); /* NOTE(review): shadows the @node parameter inside this loop; the parameter is used again after the loop */ + if (node) { + o2net_disconnect_node(node); + o2nm_node_put(node); + } + } + + /* finish all work and tear down the work queue */ + mlog(ML_KTHREAD, "waiting for o2net thread to exit....\n"); + destroy_workqueue(o2net_wq); + o2net_wq = NULL; + + sock_release(o2net_listen_sock); + o2net_listen_sock = NULL; + + o2quo_conn_err(node->nd_num); +} + +/* ------------------------------------------------------------ */ + /* Allocate the shared handshake and keepalive request/response messages, fill in their fixed fields, and initialize every o2net_node slot. Returns 0 on success or -ENOMEM. 
*/ +int o2net_init(void) +{ + unsigned long i; + + o2quo_init(); + + o2net_hand = kcalloc(1, sizeof(struct o2net_handshake), GFP_KERNEL); + o2net_keep_req = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL); + o2net_keep_resp = kcalloc(1, sizeof(struct o2net_msg), GFP_KERNEL); + if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); + return -ENOMEM; /* NOTE(review): o2quo_init() has already run and is not undone on this path; confirm whether o2quo_exit() is needed */ + } + + o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); + o2net_hand->connector_id = cpu_to_be64(1); + + o2net_keep_req->magic = cpu_to_be16(O2NET_MSG_KEEP_REQ_MAGIC); + o2net_keep_resp->magic = cpu_to_be16(O2NET_MSG_KEEP_RESP_MAGIC); + + for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { + struct o2net_node *nn = o2net_nn_from_num(i); + + 
spin_lock_init(&nn->nn_lock); + INIT_WORK(&nn->nn_connect_work, o2net_start_connect, nn); + INIT_WORK(&nn->nn_connect_expired, o2net_connect_expired, nn); + INIT_WORK(&nn->nn_still_up, o2net_still_up, nn); + /* until we see hb from a node we'll return -ENOTCONN */ + nn->nn_persistent_error = -ENOTCONN; + init_waitqueue_head(&nn->nn_sc_wq); + idr_init(&nn->nn_status_idr); + INIT_LIST_HEAD(&nn->nn_status_list); + } + + return 0; +} + /* Counterpart of o2net_init(): calls o2quo_exit() and frees the three preallocated messages. */ +void o2net_exit(void) +{ + o2quo_exit(); + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); +} diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h new file mode 100644 index 000000000000..a6f4585501c8 --- /dev/null +++ b/fs/ocfs2/cluster/tcp.h @@ -0,0 +1,113 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * tcp.h + * + * Function prototypes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef O2CLUSTER_TCP_H +#define O2CLUSTER_TCP_H + +#include <linux/socket.h> +#ifdef __KERNEL__ +#include <net/sock.h> +#include <linux/tcp.h> +#else +#include <sys/socket.h> +#endif +#include <linux/inet.h> +#include <linux/in.h> + /* Message header. All multi-byte fields are declared big-endian (__be16/__be32); buf[] is a zero-length trailing array, presumably where the payload starts (see O2NET_MAX_PAYLOAD_BYTES) -- TODO confirm. */ +struct o2net_msg +{ + __be16 magic; + __be16 data_len; + __be16 msg_type; + __be16 pad1; + __be32 sys_status; + __be32 status; + __be32 key; + __be32 msg_num; + __u8 buf[0]; +}; + +typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data); + +#define O2NET_MAX_PAYLOAD_BYTES (4096 - sizeof(struct o2net_msg)) + +/* Returns 1 when the link should be treated as down: @sock is non-NULL and in neither TCP_ESTABLISHED nor TCP_CLOSE_WAIT, or @err is one of the negative errnos listed in the switch below. Returns 0 otherwise. TODO: figure this out.... */ +static inline int o2net_link_down(int err, struct socket *sock) +{ + if (sock) { + if (sock->sk->sk_state != TCP_ESTABLISHED && + sock->sk->sk_state != TCP_CLOSE_WAIT) + return 1; + } + + if (err >= 0) + return 0; + switch (err) { + /* ????????????????????????? */ + case -ERESTARTSYS: + case -EBADF: + /* When the server has died, an ICMP port unreachable + * message prompts ECONNREFUSED. 
*/ + case -ECONNREFUSED: + case -ENOTCONN: + case -ECONNRESET: + case -EPIPE: + return 1; + } + return 0; +} + +enum { + O2NET_DRIVER_UNINITED, + O2NET_DRIVER_READY, +}; + +int o2net_init_tcp_sock(struct inode *inode); +int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, + u8 target_node, int *status); +int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, + size_t veclen, u8 target_node, int *status); +int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len, + struct inode *group); + +int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, + o2net_msg_handler_func *func, void *data, + struct list_head *unreg_list); +void o2net_unregister_handler_list(struct list_head *list); + +struct o2nm_node; +int o2net_register_hb_callbacks(void); +void o2net_unregister_hb_callbacks(void); +int o2net_start_listening(struct o2nm_node *node); +void o2net_stop_listening(struct o2nm_node *node); +void o2net_disconnect_node(struct o2nm_node *node); + +int o2net_init(void); +void o2net_exit(void); +int o2net_proc_init(struct proc_dir_entry *parent); +void o2net_proc_exit(struct proc_dir_entry *parent); + +#endif /* O2CLUSTER_TCP_H */ diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h new file mode 100644 index 000000000000..ff9e2e2104c2 --- /dev/null +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -0,0 +1,174 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_TCP_INTERNAL_H +#define O2CLUSTER_TCP_INTERNAL_H + /* 16-bit magic values; they look like values for the __be16 magic field of struct o2net_msg -- TODO confirm against the senders */ +#define O2NET_MSG_MAGIC ((u16)0xfa55) +#define O2NET_MSG_STATUS_MAGIC ((u16)0xfa56) +#define O2NET_MSG_KEEP_REQ_MAGIC ((u16)0xfa57) +#define O2NET_MSG_KEEP_RESP_MAGIC ((u16)0xfa58) + +/* same as hb delay, we're waiting for another node to recognize our hb */ +#define O2NET_RECONNECT_DELAY_MS O2HB_REGION_TIMEOUT_MS + +/* we're delaying our quorum decision so that heartbeat will have timed + * out truly dead nodes by the time we come around to making decisions + * on their number */ +#define O2NET_QUORUM_DELAY_MS ((o2hb_dead_threshold + 2) * O2HB_REGION_TIMEOUT_MS) + +#define O2NET_KEEPALIVE_DELAY_SECS 5 +#define O2NET_IDLE_TIMEOUT_SECS 10 + +/* + * This version number represents quite a lot, unfortunately. It not + * only represents the raw network message protocol on the wire but also + * locking semantics of the file system using the protocol. It should + * be somewhere else, I'm sure, but right now it isn't. 
+ * + * New in version 2: + * - full 64 bit i_size in the metadata lock lvbs + * - introduction of "rw" lock and pushing meta/data locking down + */ +#define O2NET_PROTOCOL_VERSION 2ULL +struct o2net_handshake { /* both fields are declared big-endian (__be64) */ + __be64 protocol_version; + __be64 connector_id; +}; + /* Per-node connection state: the current sock container, error state, waiters and the delayed work items that drive connecting. */ +struct o2net_node { + /* this is never called from int/bh */ + spinlock_t nn_lock; + + /* set the moment an sc is allocated and a connect is started */ + struct o2net_sock_container *nn_sc; + /* _valid is only set after the handshake passes and tx can happen */ + unsigned nn_sc_valid:1; + /* if this is set tx just returns it */ + int nn_persistent_error; + + /* threads waiting for an sc to arrive wait on the wq for generation + * to increase. 
it is increased when a connecting socket succeeds + * or fails or when an accepted socket is attached. */ + wait_queue_head_t nn_sc_wq; + + struct idr nn_status_idr; + struct list_head nn_status_list; + + /* connects are attempted from when heartbeat comes up until either hb + * goes down, the node is unconfigured, no connect attempts succeed + * before O2NET_CONN_IDLE_DELAY (NOTE(review): not defined in this header; O2NET_IDLE_TIMEOUT_SECS looks like the intended name), or a connect succeeds. connect_work + * is queued from set_nn_state both from hb up and from itself if a + * connect attempt fails and so can be self-arming. shutdown is + * careful to first mark the nn such that no connects will be attempted + * before canceling delayed connect work and flushing the queue. */ + struct work_struct nn_connect_work; + unsigned long nn_last_connect_attempt; + + /* this is queued as nodes come up and is canceled when a connection is + * established. this expiring gives up on the node and errors out + * transmits */ + struct work_struct nn_connect_expired; + + /* after we give up on a socket we wait a while before deciding + * that it is still heartbeating and that we should do some + * quorum work */ + struct work_struct nn_still_up; +}; + /* One socket together with the node it belongs to, reference counted through sc_kref. */ +struct o2net_sock_container { + struct kref sc_kref; + /* the next two are valid for the life time of the sc */ + struct socket *sc_sock; + struct o2nm_node *sc_node; + + /* all of these sc work structs hold refs on the sc while they are + * queued. they should not be able to ref a freed sc. the teardown + * race is with o2net_wq destruction in o2net_stop_listening() */ + + /* rx and connect work are generated from socket callbacks. sc + * shutdown removes the callbacks and then flushes the work queue */ + struct work_struct sc_rx_work; + struct work_struct sc_connect_work; + /* shutdown work is triggered in two ways. 
the simple way is + * for a code path to call ensure_shutdown which gets a lock, removes + * the sc from the nn, and queues the work. in this case the + * work is single-shot. 
the work is also queued from a sock + * callback, though, and in this case the work will find the sc + * still on the nn and will call ensure_shutdown itself.. this + * ends up triggering the shutdown work again, though nothing + * will be done in that second iteration. so work queue teardown + * has to be careful to remove the sc from the nn before waiting + * on the work queue so that the shutdown work doesn't remove the + * sc and rearm itself. + */ + struct work_struct sc_shutdown_work; + + struct timer_list sc_idle_timeout; + struct work_struct sc_keepalive_work; + + unsigned sc_handshake_ok:1; + + struct page *sc_page; + size_t sc_page_off; + + /* original handlers for the sockets */ + void (*sc_state_change)(struct sock *sk); + void (*sc_data_ready)(struct sock *sk, int bytes); + + struct timeval sc_tv_timer; + struct timeval sc_tv_data_ready; + struct timeval sc_tv_advance_start; + struct timeval sc_tv_advance_stop; + struct timeval sc_tv_func_start; + struct timeval sc_tv_func_stop; + u32 sc_msg_key; + u16 sc_msg_type; +}; + /* Record for one registered message handler; embeds rb-tree, kref and list linkage. */ +struct o2net_msg_handler { + struct rb_node nh_node; + u32 nh_max_len; + u32 nh_msg_type; + u32 nh_key; + o2net_msg_handler_func *nh_func; + o2net_msg_handler_func *nh_func_data; /* NOTE(review): declared with the handler function type, while o2net_register_handler() takes its data argument as void * -- confirm the intended type */ + struct kref nh_kref; + struct list_head nh_unregister_item; +}; + +enum o2net_system_error { + O2NET_ERR_NONE = 0, + O2NET_ERR_NO_HNDLR, + O2NET_ERR_OVERFLOW, + O2NET_ERR_DIED, + O2NET_ERR_MAX +}; + +struct o2net_status_wait { + enum o2net_system_error ns_sys_status; + s32 ns_status; + int ns_id; + wait_queue_head_t ns_wq; + struct list_head ns_node_item; +}; + +#endif /* O2CLUSTER_TCP_INTERNAL_H */ diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c new file mode 100644 index 000000000000..7286c48bb30d --- /dev/null +++ 
b/fs/ocfs2/cluster/ver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include "ver.h" + +#define CLUSTER_BUILD_VERSION "1.3.3" + +#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION + /* Print VERSION_STR, followed by a newline, at KERN_INFO level. */ +void cluster_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h new file mode 100644 index 000000000000..32554c3382c2 --- /dev/null +++ b/fs/ocfs2/cluster/ver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef O2CLUSTER_VER_H +#define O2CLUSTER_VER_H + +void cluster_print_version(void); + +#endif /* O2CLUSTER_VER_H */ diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c new file mode 100644 index 000000000000..bd85182e97bc --- /dev/null +++ b/fs/ocfs2/dcache.c @@ -0,0 +1,91 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.c + * + * dentry cache handling code + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/namei.h> + +#define MLOG_MASK_PREFIX ML_DCACHE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "file.h" +#include "inode.h" + /* d_revalidate hook. Returns 1 when the dentry may keep being used and 0 otherwise. 0 is returned for a negative dentry and, for any inode other than the root inode, when OCFS2_INODE_DELETED is set in ip_flags or i_nlink is zero. */ +static int ocfs2_dentry_revalidate(struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + int ret = 0; /* if all else fails, just return false */ + struct ocfs2_super *osb; + + mlog_entry("(0x%p, '%.*s')\n", dentry, + dentry->d_name.len, dentry->d_name.name); + + /* Never trust a negative dentry - force a new lookup. */ + if (inode == NULL) { + mlog(0, "negative dentry: %.*s\n", dentry->d_name.len, + dentry->d_name.name); + goto bail; + } + + osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!osb); + + if (inode != osb->root_inode) { + spin_lock(&OCFS2_I(inode)->ip_lock); + /* did we or someone else delete this inode? */ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + mlog(0, "inode (%"MLFu64") deleted, returning false\n", + OCFS2_I(inode)->ip_blkno); + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (!inode->i_nlink) { + mlog(0, "Inode %"MLFu64" orphaned, returning false " + "dir = %d\n", OCFS2_I(inode)->ip_blkno, + S_ISDIR(inode->i_mode)); + goto bail; + } + } + + ret = 1; + +bail: + mlog_exit(ret); + + return ret; +} + /* dentry operations table; only d_revalidate is set. 
*/ +struct dentry_operations ocfs2_dentry_ops = { + .d_revalidate = ocfs2_dentry_revalidate, +}; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h new file mode 100644 index 000000000000..90072771114b --- /dev/null +++ b/fs/ocfs2/dcache.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dcache.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_DCACHE_H +#define OCFS2_DCACHE_H + +extern struct dentry_operations ocfs2_dentry_ops; + +#endif /* OCFS2_DCACHE_H */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c new file mode 100644 index 000000000000..57158fa75d91 --- /dev/null +++ b/fs/ocfs2/dir.c @@ -0,0 +1,618 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.c + * + * Creates, reads, walks and deletes directory-nodes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_NAMEI +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + /* Maps an on-disk file_type value (indexed only after checking it is below OCFS2_FT_MAX) to the DT_* value handed to filldir. */ +static unsigned char ocfs2_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int ocfs2_extend_dir(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct buffer_head **new_de_bh); +/* + * ocfs2_readdir() + * + * Walks the directory block by block starting at filp->f_pos, under the meta lock, and hands every entry with a non-zero inode to filldir. Returns the ocfs2_meta_lock() error when locking fails, the number of entries already emitted from the current block when a bad entry is found, and 0 otherwise. 
+ */ +int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + int error = 0; + unsigned long offset, blk; + int i, num, stored; /* NOTE(review): num is only ever assigned (in the readahead loop), never read */ + struct buffer_head * bh, * tmp; + struct ocfs2_dir_entry * de; + int err; + struct inode *inode = filp->f_dentry->d_inode; + struct super_block * sb = inode->i_sb; + int have_disk_lock = 0; + + mlog_entry("dirino=%"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + stored = 0; + bh = NULL; + + error = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (error < 0) { + if (error != -ENOENT) + mlog_errno(error); + /* we haven't got any yet, so propagate the error. 
*/ + stored = error; + goto bail; + } + have_disk_lock = 1; + + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < i_size_read(inode)) { + blk = (filp->f_pos) >> sb->s_blocksize_bits; + bh = ocfs2_bread(inode, blk, &err, 0); + if (!bh) { + mlog(ML_ERROR, "directory #%"MLFu64" contains a hole " + "at offset %lld\n", + OCFS2_I(inode)->ip_blkno, + filp->f_pos); + filp->f_pos += sb->s_blocksize - offset; + continue; + } + + /* + * Do the readahead (8k) + */ + if (!offset) { + for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0; + i > 0; i--) { + tmp = ocfs2_bread(inode, ++blk, &err, 1); + if (tmp) + brelse(tmp); + } + } + +revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ocfs2_dir_entry *) (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + OCFS2_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < i_size_read(inode) + && offset < sb->s_blocksize) { + de = (struct ocfs2_dir_entry *) (bh->b_data + offset); + if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse(bh); + goto bail; + } + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. 
So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + unsigned long version = filp->f_version; + unsigned char d_type = DT_UNKNOWN; + + if (de->file_type < OCFS2_FT_MAX) + d_type = ocfs2_filetype_table[de->file_type]; + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + ino_from_blkno(sb, le64_to_cpu(de->inode)), + d_type); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored ++; + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse(bh); + } + + stored = 0; +bail: + if (have_disk_lock) + ocfs2_meta_unlock(inode, 0); + + mlog_exit(stored); + + return stored; +} + +/* + * Look up @name in directory @inode via ocfs2_find_entry(). On success returns 0 with *blkno set from the entry's inode field and *dirent_bh / *dirent left pointing at the entry; otherwise returns -ENOENT with *dirent set to NULL and any buffer released. + * NOTE: this should always be called with parent dir i_mutex taken. + */ +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct buffer_head **dirent_bh, + struct ocfs2_dir_entry **dirent) +{ + int status = -ENOENT; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry("(osb=%p, parent=%"MLFu64", name='%.*s', blkno=%p, " + "inode=%p)\n", + osb, OCFS2_I(inode)->ip_blkno, namelen, name, blkno, inode); + + *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); + if (!*dirent_bh || !*dirent) { + status = -ENOENT; + goto leave; + } + + *blkno = le64_to_cpu((*dirent)->inode); + + status = 0; +leave: + if (status < 0) { + *dirent = NULL; + if (*dirent_bh) { + brelse(*dirent_bh); + *dirent_bh = NULL; + } + } + + mlog_exit(status); + return status; +} + +/* Check for a name within a directory. 
+ * + * Return 0 if the name does not exist + * Return -EEXIST if the directory contains the name + * + * Callers should have i_mutex + a cluster lock on dir + */ +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen) +{ + int ret; + struct buffer_head *dirent_bh = NULL; + struct ocfs2_dir_entry *dirent = NULL; + + mlog_entry("dir %"MLFu64", name '%.*s'\n", OCFS2_I(dir)->ip_blkno, + namelen, name); + + ret = -EEXIST; + dirent_bh = ocfs2_find_entry(name, namelen, dir, &dirent); + if (dirent_bh) + goto bail; + + ret = 0; +bail: + if (dirent_bh) + brelse(dirent_bh); + + mlog_exit(ret); + return ret; +} + +/* + * routine to check that the specified directory is empty (for rmdir) + */ +int ocfs2_empty_dir(struct inode *inode) +{ + unsigned long offset; + struct buffer_head * bh; + struct ocfs2_dir_entry * de, * de1; + struct super_block * sb; + int err; + + sb = inode->i_sb; + if ((i_size_read(inode) < + (OCFS2_DIR_REC_LEN(1) + OCFS2_DIR_REC_LEN(2))) || + !(bh = ocfs2_bread(inode, 0, &err, 0))) { + mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - " + "no data block\n", + OCFS2_I(inode)->ip_blkno); + return 1; + } + + de = (struct ocfs2_dir_entry *) bh->b_data; + de1 = (struct ocfs2_dir_entry *) + ((char *)de + le16_to_cpu(de->rec_len)); + if ((le64_to_cpu(de->inode) != OCFS2_I(inode)->ip_blkno) || + !le64_to_cpu(de1->inode) || + strcmp(".", de->name) || + strcmp("..", de1->name)) { + mlog(ML_ERROR, "bad directory (dir #%"MLFu64") - " + "no `.' 
or `..'\n", + OCFS2_I(inode)->ip_blkno); + brelse(bh); + return 1; + } + offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); + de = (struct ocfs2_dir_entry *)((char *)de1 + le16_to_cpu(de1->rec_len)); + while (offset < i_size_read(inode) ) { + if (!bh || (void *)de >= (void *)(bh->b_data + sb->s_blocksize)) { + brelse(bh); + bh = ocfs2_bread(inode, + offset >> sb->s_blocksize_bits, &err, 0); + if (!bh) { + mlog(ML_ERROR, "directory #%"MLFu64" contains " + "a hole at offset %lu\n", + OCFS2_I(inode)->ip_blkno, offset); + offset += sb->s_blocksize; + continue; + } + de = (struct ocfs2_dir_entry *) bh->b_data; + } + if (!ocfs2_check_dir_entry(inode, de, bh, offset)) { + brelse(bh); + return 1; + } + if (le64_to_cpu(de->inode)) { + brelse(bh); + return 0; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *) + ((char *)de + le16_to_cpu(de->rec_len)); + } + brelse(bh); + return 1; +} + +/* returns a bh of the 1st new block in the allocation. */ +int ocfs2_do_extend_dir(struct super_block *sb, + struct ocfs2_journal_handle *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh) +{ + int status; + int extend; + u64 p_blkno; + + spin_lock(&OCFS2_I(dir)->ip_lock); + extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); + spin_unlock(&OCFS2_I(dir)->ip_lock); + + if (extend) { + status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, + parent_fe_bh, handle, + data_ac, meta_ac, NULL); + BUG_ON(status == -EAGAIN); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> + (sb->s_blocksize_bits - 9)), + 1, &p_blkno, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *new_bh = sb_getblk(sb, p_blkno); + if (!*new_bh) { + status = -EIO; + mlog_errno(status); + goto bail; + } + status = 0; +bail: + 
mlog_exit(status); + return status; +} + +/* assumes you already have a cluster lock on the directory. */ +static int ocfs2_extend_dir(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct buffer_head **new_de_bh) +{ + int status = 0; + int credits, num_free_extents; + loff_t dir_i_size; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry * de; + struct super_block *sb = osb->sb; + + mlog_entry_void(); + + dir_i_size = i_size_read(dir); + mlog(0, "extending dir %"MLFu64" (i_size = %lld)\n", + OCFS2_I(dir)->ip_blkno, dir_i_size); + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* dir->i_size is always block aligned. */ + spin_lock(&OCFS2_I(dir)->ip_lock); + if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { + spin_unlock(&OCFS2_I(dir)->ip_lock); + num_free_extents = ocfs2_num_free_extents(osb, dir, fe); + if (num_free_extents < 0) { + status = num_free_extents; + mlog_errno(status); + goto bail; + } + + if (!num_free_extents) { + status = ocfs2_reserve_new_metadata(osb, handle, + fe, &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + credits = ocfs2_calc_extend_credits(sb, fe, 1); + } else { + spin_unlock(&OCFS2_I(dir)->ip_lock); + credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; + } + + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_do_extend_dir(osb->sb, handle, dir, parent_fe_bh, + data_ac, 
meta_ac, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(dir, new_bh); + + status = ocfs2_journal_access(handle, dir, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, sb->s_blocksize); + de = (struct ocfs2_dir_entry *) new_bh->b_data; + de->inode = 0; + de->rec_len = cpu_to_le16(sb->s_blocksize); + status = ocfs2_journal_dirty(handle, new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + dir_i_size += dir->i_sb->s_blocksize; + i_size_write(dir, dir_i_size); + dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); + status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *new_de_bh = new_bh; + get_bh(*new_de_bh); +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + if (new_bh) + brelse(new_bh); + + mlog_exit(status); + return status; +} + +/* + * Search the dir for a good spot, extending it if necessary. The + * block containing an appropriate record is returned in ret_de_bh. 
+ */ +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct buffer_head **ret_de_bh) +{ + unsigned long offset; + struct buffer_head * bh = NULL; + unsigned short rec_len; + struct ocfs2_dinode *fe; + struct ocfs2_dir_entry *de; + struct super_block *sb; + int status; + + mlog_entry_void(); + + mlog(0, "getting ready to insert namelen %d into dir %"MLFu64"\n", + namelen, OCFS2_I(dir)->ip_blkno); + + BUG_ON(!S_ISDIR(dir->i_mode)); + fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + BUG_ON(le64_to_cpu(fe->i_size) != i_size_read(dir)); + + sb = dir->i_sb; + + if (!namelen) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + bh = ocfs2_bread(dir, 0, &status, 0); + if (!bh) { + mlog_errno(status); + goto bail; + } + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) bh->b_data; + while (1) { + if ((char *)de >= sb->s_blocksize + bh->b_data) { + brelse(bh); + bh = NULL; + + if (i_size_read(dir) <= offset) { + status = ocfs2_extend_dir(osb, + dir, + parent_fe_bh, + &bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + BUG_ON(!bh); + *ret_de_bh = bh; + get_bh(*ret_de_bh); + goto bail; + } + bh = ocfs2_bread(dir, + offset >> sb->s_blocksize_bits, + &status, + 0); + if (!bh) { + mlog_errno(status); + goto bail; + } + /* move to next block */ + de = (struct ocfs2_dir_entry *) bh->b_data; + } + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + status = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + status = -EEXIST; + goto bail; + } + if (((le64_to_cpu(de->inode) == 0) && + (le16_to_cpu(de->rec_len) >= rec_len)) || + (le16_to_cpu(de->rec_len) >= + (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { + /* Ok, we found a spot. Return this bh and let + * the caller actually fill it in. 
*/ + *ret_de_bh = bh; + get_bh(*ret_de_bh); + status = 0; + goto bail; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); + } + + status = 0; +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h new file mode 100644 index 000000000000..5f614ec9649c --- /dev/null +++ b/fs/ocfs2/dir.h @@ -0,0 +1,54 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dir.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_DIR_H +#define OCFS2_DIR_H + +int ocfs2_check_dir_for_entry(struct inode *dir, + const char *name, + int namelen); +int ocfs2_empty_dir(struct inode *inode); /* FIXME: to namei.c */ +int ocfs2_find_files_on_disk(const char *name, + int namelen, + u64 *blkno, + struct inode *inode, + struct buffer_head **dirent_bh, + struct ocfs2_dir_entry **dirent); +int ocfs2_readdir(struct file *filp, void *dirent, filldir_t filldir); +int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, + struct inode *dir, + struct buffer_head *parent_fe_bh, + const char *name, + int namelen, + struct buffer_head **ret_de_bh); +struct ocfs2_alloc_context; +int ocfs2_do_extend_dir(struct super_block *sb, + struct ocfs2_journal_handle *handle, + struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct buffer_head **new_bh); +#endif /* OCFS2_DIR_H */ diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile new file mode 100644 index 000000000000..ce3f7c29d270 --- /dev/null +++ b/fs/ocfs2/dlm/Makefile @@ -0,0 +1,8 @@ +EXTRA_CFLAGS += -Ifs/ocfs2 + +obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o + +ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ + dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o + +ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o diff --git a/fs/ocfs2/dlm/dlmapi.h b/fs/ocfs2/dlm/dlmapi.h new file mode 100644 index 000000000000..53652f51c0e1 --- /dev/null +++ b/fs/ocfs2/dlm/dlmapi.h @@ -0,0 +1,214 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmapi.h + * + * externally exported dlm interfaces + * + * Copyright (C) 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMAPI_H +#define DLMAPI_H + +struct dlm_lock; +struct dlm_ctxt; + +/* NOTE: changes made to this enum should be reflected in dlmdebug.c */ +enum dlm_status { + DLM_NORMAL = 0, /* 0: request in progress */ + DLM_GRANTED, /* 1: request granted */ + DLM_DENIED, /* 2: request denied */ + DLM_DENIED_NOLOCKS, /* 3: request denied, out of system resources */ + DLM_WORKING, /* 4: async request in progress */ + DLM_BLOCKED, /* 5: lock request blocked */ + DLM_BLOCKED_ORPHAN, /* 6: lock request blocked by a orphan lock*/ + DLM_DENIED_GRACE_PERIOD, /* 7: topological change in progress */ + DLM_SYSERR, /* 8: system error */ + DLM_NOSUPPORT, /* 9: unsupported */ + DLM_CANCELGRANT, /* 10: can't cancel convert: already granted */ + DLM_IVLOCKID, /* 11: bad lockid */ + DLM_SYNC, /* 12: synchronous request granted */ + DLM_BADTYPE, /* 13: bad resource type */ + DLM_BADRESOURCE, /* 14: bad resource handle */ + DLM_MAXHANDLES, /* 15: no more resource handles */ + DLM_NOCLINFO, /* 16: can't contact cluster manager */ + DLM_NOLOCKMGR, /* 17: can't contact lock manager */ + DLM_NOPURGED, /* 18: can't contact purge daemon */ + DLM_BADARGS, /* 19: bad api args */ + DLM_VOID, /* 20: no status */ + DLM_NOTQUEUED, /* 21: NOQUEUE was specified and 
request failed */ + DLM_IVBUFLEN, /* 22: invalid resource name length */ + DLM_CVTUNGRANT, /* 23: attempted to convert ungranted lock */ + DLM_BADPARAM, /* 24: invalid lock mode specified */ + DLM_VALNOTVALID, /* 25: value block has been invalidated */ + DLM_REJECTED, /* 26: request rejected, unrecognized client */ + DLM_ABORT, /* 27: blocked lock request cancelled */ + DLM_CANCEL, /* 28: conversion request cancelled */ + DLM_IVRESHANDLE, /* 29: invalid resource handle */ + DLM_DEADLOCK, /* 30: deadlock recovery refused this request */ + DLM_DENIED_NOASTS, /* 31: failed to allocate AST */ + DLM_FORWARD, /* 32: request must wait for primary's response */ + DLM_TIMEOUT, /* 33: timeout value for lock has expired */ + DLM_IVGROUPID, /* 34: invalid group specification */ + DLM_VERS_CONFLICT, /* 35: version conflicts prevent request handling */ + DLM_BAD_DEVICE_PATH, /* 36: Locks device does not exist or path wrong */ + DLM_NO_DEVICE_PERMISSION, /* 37: Client has insufficient pers for device */ + DLM_NO_CONTROL_DEVICE, /* 38: Cannot set options on opened device */ + + DLM_RECOVERING, /* 39: extension, allows caller to fail a lock + request if it is being recovered */ + DLM_MIGRATING, /* 40: extension, allows caller to fail a lock + request if it is being migrated */ + DLM_MAXSTATS, /* 41: upper limit for return code validation */ +}; + +/* for pretty-printing dlm_status error messages */ +const char *dlm_errmsg(enum dlm_status err); +/* for pretty-printing dlm_status error names */ +const char *dlm_errname(enum dlm_status err); + +/* Eventually the DLM will use standard errno values, but in the + * meantime this lets us track dlm errors as they bubble up. When we + * bring its error reporting into line with the rest of the stack, + * these can just be replaced with calls to mlog_errno. 
*/ +#define dlm_error(st) do { \ + if ((st) != DLM_RECOVERING && \ + (st) != DLM_MIGRATING && \ + (st) != DLM_FORWARD) \ + mlog(ML_ERROR, "dlm status = %s\n", dlm_errname((st))); \ +} while (0) + +#define DLM_LKSB_UNUSED1 0x01 +#define DLM_LKSB_PUT_LVB 0x02 +#define DLM_LKSB_GET_LVB 0x04 +#define DLM_LKSB_UNUSED2 0x08 +#define DLM_LKSB_UNUSED3 0x10 +#define DLM_LKSB_UNUSED4 0x20 +#define DLM_LKSB_UNUSED5 0x40 +#define DLM_LKSB_UNUSED6 0x80 + +#define DLM_LVB_LEN 64 + +/* Callers are only allowed access to the lvb and status members of + * this struct. */ +struct dlm_lockstatus { + enum dlm_status status; + u32 flags; + struct dlm_lock *lockid; + char lvb[DLM_LVB_LEN]; +}; + +/* Valid lock modes. */ +#define LKM_IVMODE (-1) /* invalid mode */ +#define LKM_NLMODE 0 /* null lock */ +#define LKM_CRMODE 1 /* concurrent read unsupported */ +#define LKM_CWMODE 2 /* concurrent write unsupported */ +#define LKM_PRMODE 3 /* protected read */ +#define LKM_PWMODE 4 /* protected write unsupported */ +#define LKM_EXMODE 5 /* exclusive */ +#define LKM_MAXMODE 5 +#define LKM_MODEMASK 0xff + +/* Flags passed to dlmlock and dlmunlock: + * reserved: flags used by the "real" dlm + * only a few are supported by this dlm + * (U) = unsupported by ocfs2 dlm */ +#define LKM_ORPHAN 0x00000010 /* this lock is orphanable (U) */ +#define LKM_PARENTABLE 0x00000020 /* this lock was orphaned (U) */ +#define LKM_BLOCK 0x00000040 /* blocking lock request (U) */ +#define LKM_LOCAL 0x00000080 /* local lock request */ +#define LKM_VALBLK 0x00000100 /* lock value block request */ +#define LKM_NOQUEUE 0x00000200 /* non blocking request */ +#define LKM_CONVERT 0x00000400 /* conversion request */ +#define LKM_NODLCKWT 0x00000800 /* this lock wont deadlock (U) */ +#define LKM_UNLOCK 0x00001000 /* deallocate this lock */ +#define LKM_CANCEL 0x00002000 /* cancel conversion request */ +#define LKM_DEQALL 0x00004000 /* remove all locks held by proc (U) */ +#define LKM_INVVALBLK 0x00008000 /* invalidate lock 
value block */ +#define LKM_SYNCSTS 0x00010000 /* return synchronous status if poss (U) */ +#define LKM_TIMEOUT 0x00020000 /* lock request contains timeout (U) */ +#define LKM_SNGLDLCK 0x00040000 /* request can self-deadlock (U) */ +#define LKM_FINDLOCAL 0x00080000 /* find local lock request (U) */ +#define LKM_PROC_OWNED 0x00100000 /* owned by process, not group (U) */ +#define LKM_XID 0x00200000 /* use transaction id for deadlock (U) */ +#define LKM_XID_CONFLICT 0x00400000 /* do not allow lock inheritance (U) */ +#define LKM_FORCE 0x00800000 /* force unlock flag */ +#define LKM_REVVALBLK 0x01000000 /* temporary solution: re-validate + lock value block (U) */ +/* unused */ +#define LKM_UNUSED1 0x00000001 /* unused */ +#define LKM_UNUSED2 0x00000002 /* unused */ +#define LKM_UNUSED3 0x00000004 /* unused */ +#define LKM_UNUSED4 0x00000008 /* unused */ +#define LKM_UNUSED5 0x02000000 /* unused */ +#define LKM_UNUSED6 0x04000000 /* unused */ +#define LKM_UNUSED7 0x08000000 /* unused */ + +/* ocfs2 extensions: internal only + * should never be used by caller */ +#define LKM_MIGRATION 0x10000000 /* extension: lockres is to be migrated + to another node */ +#define LKM_PUT_LVB 0x20000000 /* extension: lvb is being passed + should be applied to lockres */ +#define LKM_GET_LVB 0x40000000 /* extension: lvb should be copied + from lockres when lock is granted */ +#define LKM_RECOVERY 0x80000000 /* extension: flag for recovery lock + used to avoid recovery rwsem */ + + +typedef void (dlm_astlockfunc_t)(void *); +typedef void (dlm_bastlockfunc_t)(void *, int); +typedef void (dlm_astunlockfunc_t)(void *, enum dlm_status); + +enum dlm_status dlmlock(struct dlm_ctxt *dlm, + int mode, + struct dlm_lockstatus *lksb, + int flags, + const char *name, + dlm_astlockfunc_t *ast, + void *data, + dlm_bastlockfunc_t *bast); + +enum dlm_status dlmunlock(struct dlm_ctxt *dlm, + struct dlm_lockstatus *lksb, + int flags, + dlm_astunlockfunc_t *unlockast, + void *data); + +struct dlm_ctxt * 
dlm_register_domain(const char *domain, u32 key); + +void dlm_unregister_domain(struct dlm_ctxt *dlm); + +void dlm_print_one_lock(struct dlm_lock *lockid); + +typedef void (dlm_eviction_func)(int, void *); +struct dlm_eviction_cb { + struct list_head ec_item; + dlm_eviction_func *ec_func; + void *ec_data; +}; +void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, + dlm_eviction_func *f, + void *data); +void dlm_register_eviction_cb(struct dlm_ctxt *dlm, + struct dlm_eviction_cb *cb); +void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb); + +#endif /* DLMAPI_H */ diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c new file mode 100644 index 000000000000..8d17d28ef91c --- /dev/null +++ b/fs/ocfs2/dlm/dlmast.c @@ -0,0 +1,466 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmast.c + * + * AST and BAST functionality for local and remote nodes + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/random.h> +#include <linux/blkdev.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/spinlock.h> + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" +#include "cluster/endian.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock); +static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); + +/* Should be called as an ast gets queued to see if the new + * lock level will obsolete a pending bast. + * For example, if dlm_thread queued a bast for an EX lock that + * was blocking another EX, but before sending the bast the + * lock owner downconverted to NL, the bast is now obsolete. + * Only the ast should be sent. + * This is needed because the lock and convert paths can queue + * asts out-of-band (not waiting for dlm_thread) in order to + * allow for LKM_NOQUEUE to get immediate responses. 
*/ +static int dlm_should_cancel_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + assert_spin_locked(&dlm->ast_lock); + assert_spin_locked(&lock->spinlock); + + if (lock->ml.highest_blocked == LKM_IVMODE) + return 0; + BUG_ON(lock->ml.highest_blocked == LKM_NLMODE); + + if (lock->bast_pending && + list_empty(&lock->bast_list)) + /* old bast already sent, ok */ + return 0; + + if (lock->ml.type == LKM_EXMODE) + /* EX blocks anything left, any bast still valid */ + return 0; + else if (lock->ml.type == LKM_NLMODE) + /* NL blocks nothing, no reason to send any bast, cancel it */ + return 1; + else if (lock->ml.highest_blocked != LKM_EXMODE) + /* PR only blocks EX */ + return 1; + + return 0; +} + +static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + mlog_entry_void(); + + BUG_ON(!dlm); + BUG_ON(!lock); + + assert_spin_locked(&dlm->ast_lock); + if (!list_empty(&lock->ast_list)) { + mlog(ML_ERROR, "ast list not empty!! pending=%d, newlevel=%d\n", + lock->ast_pending, lock->ml.type); + BUG(); + } + BUG_ON(!list_empty(&lock->ast_list)); + if (lock->ast_pending) + mlog(0, "lock has an ast getting flushed right now\n"); + + /* putting lock on list, add a ref */ + dlm_lock_get(lock); + spin_lock(&lock->spinlock); + + /* check to see if this ast obsoletes the bast */ + if (dlm_should_cancel_bast(dlm, lock)) { + struct dlm_lock_resource *res = lock->lockres; + mlog(0, "%s: cancelling bast for %.*s\n", + dlm->name, res->lockname.len, res->lockname.name); + lock->bast_pending = 0; + list_del_init(&lock->bast_list); + lock->ml.highest_blocked = LKM_IVMODE; + /* removing lock from list, remove a ref. guaranteed + * this won't be the last ref because of the get above, + * so res->spinlock will not be taken here */ + dlm_lock_put(lock); + /* free up the reserved bast that we are cancelling. + * guaranteed that this will not be the last reserved + * ast because *both* an ast and a bast were reserved + * to get to this point. 
the res->spinlock will not be + * taken here */ + dlm_lockres_release_ast(dlm, res); + } + list_add_tail(&lock->ast_list, &dlm->pending_asts); + lock->ast_pending = 1; + spin_unlock(&lock->spinlock); +} + +void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + mlog_entry_void(); + + BUG_ON(!dlm); + BUG_ON(!lock); + + spin_lock(&dlm->ast_lock); + __dlm_queue_ast(dlm, lock); + spin_unlock(&dlm->ast_lock); +} + + +static void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + mlog_entry_void(); + + BUG_ON(!dlm); + BUG_ON(!lock); + assert_spin_locked(&dlm->ast_lock); + + BUG_ON(!list_empty(&lock->bast_list)); + if (lock->bast_pending) + mlog(0, "lock has a bast getting flushed right now\n"); + + /* putting lock on list, add a ref */ + dlm_lock_get(lock); + spin_lock(&lock->spinlock); + list_add_tail(&lock->bast_list, &dlm->pending_basts); + lock->bast_pending = 1; + spin_unlock(&lock->spinlock); +} + +void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + mlog_entry_void(); + + BUG_ON(!dlm); + BUG_ON(!lock); + + spin_lock(&dlm->ast_lock); + __dlm_queue_bast(dlm, lock); + spin_unlock(&dlm->ast_lock); +} + +static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + struct dlm_lockstatus *lksb = lock->lksb; + BUG_ON(!lksb); + + /* only updates if this node masters the lockres */ + if (res->owner == dlm->node_num) { + + spin_lock(&res->spinlock); + /* check the lksb flags for the direction */ + if (lksb->flags & DLM_LKSB_GET_LVB) { + mlog(0, "getting lvb from lockres for %s node\n", + lock->ml.node == dlm->node_num ? "master" : + "remote"); + memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); + } else if (lksb->flags & DLM_LKSB_PUT_LVB) { + mlog(0, "setting lvb from lockres for %s node\n", + lock->ml.node == dlm->node_num ? 
"master" : + "remote"); + memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); + } + spin_unlock(&res->spinlock); + } + + /* reset any lvb flags on the lksb */ + lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); +} + +void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + dlm_astlockfunc_t *fn; + struct dlm_lockstatus *lksb; + + mlog_entry_void(); + + lksb = lock->lksb; + fn = lock->ast; + BUG_ON(lock->ml.node != dlm->node_num); + + dlm_update_lvb(dlm, res, lock); + (*fn)(lock->astdata); +} + + +int dlm_do_remote_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + int ret; + struct dlm_lockstatus *lksb; + int lksbflags; + + mlog_entry_void(); + + lksb = lock->lksb; + BUG_ON(lock->ml.node == dlm->node_num); + + lksbflags = lksb->flags; + dlm_update_lvb(dlm, res, lock); + + /* lock request came from another node + * go do the ast over there */ + ret = dlm_send_proxy_ast(dlm, res, lock, lksbflags); + return ret; +} + +void dlm_do_local_bast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock, int blocked_type) +{ + dlm_bastlockfunc_t *fn = lock->bast; + + mlog_entry_void(); + BUG_ON(lock->ml.node != dlm->node_num); + + (*fn)(lock->astdata, blocked_type); +} + + + +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data) +{ + int ret; + unsigned int locklen; + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; + char *name; + struct list_head *iter, *head=NULL; + u64 cookie; + u32 flags; + + if (!dlm_grab(dlm)) { + dlm_error(DLM_REJECTED); + return DLM_REJECTED; + } + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + name = past->name; + locklen = past->namelen; + cookie = be64_to_cpu(past->cookie); + flags = be32_to_cpu(past->flags); + + if (locklen > DLM_LOCKID_NAME_MAX) { + ret = 
DLM_IVBUFLEN; + mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); + goto leave; + } + + if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == + (LKM_PUT_LVB|LKM_GET_LVB)) { + mlog(ML_ERROR, "both PUT and GET lvb specified\n"); + ret = DLM_BADARGS; + goto leave; + } + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : + (flags & LKM_GET_LVB ? "get lvb" : "none")); + + mlog(0, "type=%d, blocked_type=%d\n", past->type, past->blocked_type); + + if (past->type != DLM_AST && + past->type != DLM_BAST) { + mlog(ML_ERROR, "Unknown ast type! %d, cookie=%"MLFu64", " + "name=%.*s\n", past->type, cookie, locklen, name); + ret = DLM_IVLOCKID; + goto leave; + } + + res = dlm_lookup_lockres(dlm, name, locklen); + if (!res) { + mlog(ML_ERROR, "got %sast for unknown lockres! " + "cookie=%"MLFu64", name=%.*s, namelen=%u\n", + past->type == DLM_AST ? "" : "b", + cookie, locklen, name, locklen); + ret = DLM_IVLOCKID; + goto leave; + } + + /* cannot get a proxy ast message if this node owns it */ + BUG_ON(res->owner == dlm->node_num); + + mlog(0, "lockres %.*s\n", res->lockname.len, res->lockname.name); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(0, "responding with DLM_RECOVERING!\n"); + ret = DLM_RECOVERING; + goto unlock_out; + } + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "responding with DLM_MIGRATING!\n"); + ret = DLM_MIGRATING; + goto unlock_out; + } + /* try convert queue for both ast/bast */ + head = &res->converting; + lock = NULL; + list_for_each(iter, head) { + lock = list_entry (iter, struct dlm_lock, list); + if (be64_to_cpu(lock->ml.cookie) == cookie) + goto do_ast; + } + + /* if not on convert, try blocked for ast, granted for bast */ + if (past->type == DLM_AST) + head = &res->blocked; + else + head = &res->granted; + + list_for_each(iter, head) { + lock = list_entry (iter, struct dlm_lock, list); + if (be64_to_cpu(lock->ml.cookie) == cookie) + goto do_ast; + } + + mlog(ML_ERROR, "got %sast for unknown lock! 
cookie=%"MLFu64", " + "name=%.*s, namelen=%u\n", + past->type == DLM_AST ? "" : "b", cookie, locklen, name, locklen); + + ret = DLM_NORMAL; +unlock_out: + spin_unlock(&res->spinlock); + goto leave; + +do_ast: + ret = DLM_NORMAL; + if (past->type == DLM_AST) { + /* do not alter lock refcount. switching lists. */ + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->granted); + mlog(0, "ast: adding to granted list... type=%d, " + "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); + if (lock->ml.convert_type != LKM_IVMODE) { + lock->ml.type = lock->ml.convert_type; + lock->ml.convert_type = LKM_IVMODE; + } else { + // should already be there.... + } + + lock->lksb->status = DLM_NORMAL; + + /* if we requested the lvb, fetch it into our lksb now */ + if (flags & LKM_GET_LVB) { + BUG_ON(!(lock->lksb->flags & DLM_LKSB_GET_LVB)); + memcpy(lock->lksb->lvb, past->lvb, DLM_LVB_LEN); + } + } + spin_unlock(&res->spinlock); + + if (past->type == DLM_AST) + dlm_do_local_ast(dlm, res, lock); + else + dlm_do_local_bast(dlm, res, lock, past->blocked_type); + +leave: + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + return ret; +} + + + +int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_lock *lock, int msg_type, + int blocked_type, int flags) +{ + int ret = 0; + struct dlm_proxy_ast past; + struct kvec vec[2]; + size_t veclen = 1; + int status; + + mlog_entry("res %.*s, to=%u, type=%d, blocked_type=%d\n", + res->lockname.len, res->lockname.name, lock->ml.node, + msg_type, blocked_type); + + memset(&past, 0, sizeof(struct dlm_proxy_ast)); + past.node_idx = dlm->node_num; + past.type = msg_type; + past.blocked_type = blocked_type; + past.namelen = res->lockname.len; + memcpy(past.name, res->lockname.name, past.namelen); + past.cookie = lock->ml.cookie; + + vec[0].iov_len = sizeof(struct dlm_proxy_ast); + vec[0].iov_base = &past; + if (flags & DLM_LKSB_GET_LVB) { + mlog(0, "returning requested LVB data\n"); + 
be32_add_cpu(&past.flags, LKM_GET_LVB); + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + ret = o2net_send_message_vec(DLM_PROXY_AST_MSG, dlm->key, vec, veclen, + lock->ml.node, &status); + if (ret < 0) + mlog_errno(ret); + else { + if (status == DLM_RECOVERING) { + mlog(ML_ERROR, "sent AST to node %u, it thinks this " + "node is dead!\n", lock->ml.node); + BUG(); + } else if (status == DLM_MIGRATING) { + mlog(ML_ERROR, "sent AST to node %u, it returned " + "DLM_MIGRATING!\n", lock->ml.node); + BUG(); + } else if (status != DLM_NORMAL) { + mlog(ML_ERROR, "AST to node %u returned %d!\n", + lock->ml.node, status); + /* ignore it */ + } + ret = 0; + } + return ret; +} diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h new file mode 100644 index 000000000000..3fecba0a6023 --- /dev/null +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -0,0 +1,884 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmcommon.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef DLMCOMMON_H +#define DLMCOMMON_H + +#include <linux/kref.h> + +#define DLM_HB_NODE_DOWN_PRI (0xf000000) +#define DLM_HB_NODE_UP_PRI (0x8000000) + +#define DLM_LOCKID_NAME_MAX 32 + +#define DLM_DOMAIN_NAME_MAX_LEN 255 +#define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES +#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes +#define DLM_THREAD_MS 200 // flush at least every 200 ms + +#define DLM_HASH_BITS 7 +#define DLM_HASH_SIZE (1 << DLM_HASH_BITS) +#define DLM_HASH_MASK (DLM_HASH_SIZE - 1) + +enum dlm_ast_type { + DLM_AST = 0, + DLM_BAST, + DLM_ASTUNLOCK +}; + + +#define LKM_VALID_FLAGS (LKM_VALBLK | LKM_CONVERT | LKM_UNLOCK | \ + LKM_CANCEL | LKM_INVVALBLK | LKM_FORCE | \ + LKM_RECOVERY | LKM_LOCAL | LKM_NOQUEUE) + +#define DLM_RECOVERY_LOCK_NAME "$RECOVERY" +#define DLM_RECOVERY_LOCK_NAME_LEN 9 + +static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) +{ + if (name_len == DLM_RECOVERY_LOCK_NAME_LEN && + memcmp(lock_name, DLM_RECOVERY_LOCK_NAME, name_len)==0) + return 1; + return 0; +} + +#define DLM_RECO_STATE_ACTIVE 0x0001 + +struct dlm_recovery_ctxt +{ + struct list_head resources; + struct list_head received; + struct list_head node_data; + u8 new_master; + u8 dead_node; + u16 state; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + wait_queue_head_t event; +}; + +enum dlm_ctxt_state { + DLM_CTXT_NEW = 0, + DLM_CTXT_JOINED, + DLM_CTXT_IN_SHUTDOWN, + DLM_CTXT_LEAVING, +}; + +struct dlm_ctxt +{ + struct list_head list; + struct list_head *resources; + struct list_head dirty_list; + struct list_head purge_list; + struct list_head pending_asts; + struct list_head pending_basts; + unsigned int purge_count; + spinlock_t spinlock; + spinlock_t ast_lock; + char *name; + u8 node_num; + u32 key; + u8 joining_node; + wait_queue_head_t dlm_join_events; + unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long 
recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + struct dlm_recovery_ctxt reco; + spinlock_t master_lock; + struct list_head master_list; + struct list_head mle_hb_events; + + /* these give a really vague idea of the system load */ + atomic_t local_resources; + atomic_t remote_resources; + atomic_t unknown_resources; + + /* NOTE: Next three are protected by dlm_domain_lock */ + struct kref dlm_refs; + enum dlm_ctxt_state dlm_state; + unsigned int num_joins; + + struct o2hb_callback_func dlm_hb_up; + struct o2hb_callback_func dlm_hb_down; + struct task_struct *dlm_thread_task; + struct task_struct *dlm_reco_thread_task; + wait_queue_head_t dlm_thread_wq; + wait_queue_head_t dlm_reco_thread_wq; + wait_queue_head_t ast_wq; + wait_queue_head_t migration_wq; + + struct work_struct dispatched_work; + struct list_head work_list; + spinlock_t work_lock; + struct list_head dlm_domain_handlers; + struct list_head dlm_eviction_callbacks; +}; + +/* these keventd work queue items are for less-frequently + * called functions that cannot be directly called from the + * net message handlers for some reason, usually because + * they need to send net messages of their own. 
*/ +void dlm_dispatch_work(void *data); + +struct dlm_lock_resource; +struct dlm_work_item; + +typedef void (dlm_workfunc_t)(struct dlm_work_item *, void *); + +struct dlm_request_all_locks_priv +{ + u8 reco_master; + u8 dead_node; +}; + +struct dlm_mig_lockres_priv +{ + struct dlm_lock_resource *lockres; + u8 real_master; +}; + +struct dlm_assert_master_priv +{ + struct dlm_lock_resource *lockres; + u8 request_from; + u32 flags; + unsigned ignore_higher:1; +}; + + +struct dlm_work_item +{ + struct list_head list; + dlm_workfunc_t *func; + struct dlm_ctxt *dlm; + void *data; + union { + struct dlm_request_all_locks_priv ral; + struct dlm_mig_lockres_priv ml; + struct dlm_assert_master_priv am; + } u; +}; + +static inline void dlm_init_work_item(struct dlm_ctxt *dlm, + struct dlm_work_item *i, + dlm_workfunc_t *f, void *data) +{ + memset(i, 0, sizeof(*i)); + i->func = f; + INIT_LIST_HEAD(&i->list); + i->data = data; + i->dlm = dlm; /* must have already done a dlm_grab on this! */ +} + + + +static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, + u8 node) +{ + assert_spin_locked(&dlm->spinlock); + + dlm->joining_node = node; + wake_up(&dlm->dlm_join_events); +} + +#define DLM_LOCK_RES_UNINITED 0x00000001 +#define DLM_LOCK_RES_RECOVERING 0x00000002 +#define DLM_LOCK_RES_READY 0x00000004 +#define DLM_LOCK_RES_DIRTY 0x00000008 +#define DLM_LOCK_RES_IN_PROGRESS 0x00000010 +#define DLM_LOCK_RES_MIGRATING 0x00000020 + +#define DLM_PURGE_INTERVAL_MS (8 * 1000) + +struct dlm_lock_resource +{ + /* WARNING: Please see the comment in dlm_init_lockres before + * adding fields here. 
*/ + struct list_head list; + struct kref refs; + + /* please keep these next 3 in this order + * some funcs want to iterate over all lists */ + struct list_head granted; + struct list_head converting; + struct list_head blocked; + + struct list_head dirty; + struct list_head recovering; // dlm_recovery_ctxt.resources list + + /* unused lock resources have their last_used stamped and are + * put on a list for the dlm thread to run. */ + struct list_head purge; + unsigned long last_used; + + unsigned migration_pending:1; + atomic_t asts_reserved; + spinlock_t spinlock; + wait_queue_head_t wq; + u8 owner; //node which owns the lock resource, or unknown + u16 state; + struct qstr lockname; + char lvb[DLM_LVB_LEN]; +}; + +struct dlm_migratable_lock +{ + __be64 cookie; + + /* these 3 are just padding for the in-memory structure, but + * list and flags are actually used when sent over the wire */ + __be16 pad1; + u8 list; // 0=granted, 1=converting, 2=blocked + u8 flags; + + s8 type; + s8 convert_type; + s8 highest_blocked; + u8 node; +}; // 16 bytes + +struct dlm_lock +{ + struct dlm_migratable_lock ml; + + struct list_head list; + struct list_head ast_list; + struct list_head bast_list; + struct dlm_lock_resource *lockres; + spinlock_t spinlock; + struct kref lock_refs; + + // ast and bast must be callable while holding a spinlock! 
+ dlm_astlockfunc_t *ast; + dlm_bastlockfunc_t *bast; + void *astdata; + struct dlm_lockstatus *lksb; + unsigned ast_pending:1, + bast_pending:1, + convert_pending:1, + lock_pending:1, + cancel_pending:1, + unlock_pending:1, + lksb_kernel_allocated:1; +}; + + +#define DLM_LKSB_UNUSED1 0x01 +#define DLM_LKSB_PUT_LVB 0x02 +#define DLM_LKSB_GET_LVB 0x04 +#define DLM_LKSB_UNUSED2 0x08 +#define DLM_LKSB_UNUSED3 0x10 +#define DLM_LKSB_UNUSED4 0x20 +#define DLM_LKSB_UNUSED5 0x40 +#define DLM_LKSB_UNUSED6 0x80 + + +enum dlm_lockres_list { + DLM_GRANTED_LIST = 0, + DLM_CONVERTING_LIST, + DLM_BLOCKED_LIST +}; + +static inline struct list_head * +dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) +{ + struct list_head *ret = NULL; + if (idx == DLM_GRANTED_LIST) + ret = &res->granted; + else if (idx == DLM_CONVERTING_LIST) + ret = &res->converting; + else if (idx == DLM_BLOCKED_LIST) + ret = &res->blocked; + else + BUG(); + return ret; +} + + + + +struct dlm_node_iter +{ + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int curnode; +}; + + +enum { + DLM_MASTER_REQUEST_MSG = 500, + DLM_UNUSED_MSG1, /* 501 */ + DLM_ASSERT_MASTER_MSG, /* 502 */ + DLM_CREATE_LOCK_MSG, /* 503 */ + DLM_CONVERT_LOCK_MSG, /* 504 */ + DLM_PROXY_AST_MSG, /* 505 */ + DLM_UNLOCK_LOCK_MSG, /* 506 */ + DLM_UNUSED_MSG2, /* 507 */ + DLM_MIGRATE_REQUEST_MSG, /* 508 */ + DLM_MIG_LOCKRES_MSG, /* 509 */ + DLM_QUERY_JOIN_MSG, /* 510 */ + DLM_ASSERT_JOINED_MSG, /* 511 */ + DLM_CANCEL_JOIN_MSG, /* 512 */ + DLM_EXIT_DOMAIN_MSG, /* 513 */ + DLM_MASTER_REQUERY_MSG, /* 514 */ + DLM_LOCK_REQUEST_MSG, /* 515 */ + DLM_RECO_DATA_DONE_MSG, /* 516 */ + DLM_BEGIN_RECO_MSG, /* 517 */ + DLM_FINALIZE_RECO_MSG /* 518 */ +}; + +struct dlm_reco_node_data +{ + int state; + u8 node_num; + struct list_head list; +}; + +enum { + DLM_RECO_NODE_DATA_DEAD = -1, + DLM_RECO_NODE_DATA_INIT = 0, + DLM_RECO_NODE_DATA_REQUESTING, + DLM_RECO_NODE_DATA_REQUESTED, + DLM_RECO_NODE_DATA_RECEIVING, + 
DLM_RECO_NODE_DATA_DONE, + DLM_RECO_NODE_DATA_FINALIZE_SENT, +}; + + +enum { + DLM_MASTER_RESP_NO = 0, + DLM_MASTER_RESP_YES, + DLM_MASTER_RESP_MAYBE, + DLM_MASTER_RESP_ERROR +}; + + +struct dlm_master_request +{ + u8 node_idx; + u8 namelen; + __be16 pad1; + __be32 flags; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +#define DLM_ASSERT_MASTER_MLE_CLEANUP 0x00000001 +#define DLM_ASSERT_MASTER_REQUERY 0x00000002 +#define DLM_ASSERT_MASTER_FINISH_MIGRATION 0x00000004 +struct dlm_assert_master +{ + u8 node_idx; + u8 namelen; + __be16 pad1; + __be32 flags; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_migrate_request +{ + u8 master; + u8 new_master; + u8 namelen; + u8 pad1; + __be32 pad2; + u8 name[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_master_requery +{ + u8 pad1; + u8 pad2; + u8 node_idx; + u8 namelen; + __be32 pad3; + u8 name[O2NM_MAX_NAME_LEN]; +}; + +#define DLM_MRES_RECOVERY 0x01 +#define DLM_MRES_MIGRATION 0x02 +#define DLM_MRES_ALL_DONE 0x04 + +/* + * We would like to get one whole lockres into a single network + * message whenever possible. Generally speaking, there will be + * at most one dlm_lock on a lockres for each node in the cluster, + * plus (infrequently) any additional locks coming in from userdlm. 
+ * + * struct _dlm_lockres_page + * { + * dlm_migratable_lockres mres; + * dlm_migratable_lock ml[DLM_MAX_MIGRATABLE_LOCKS]; + * u8 pad[DLM_MIG_LOCKRES_RESERVED]; + * }; + * + * from ../cluster/tcp.h + * NET_MAX_PAYLOAD_BYTES (4096 - sizeof(net_msg)) + * (roughly 4080 bytes) + * and sizeof(dlm_migratable_lockres) = 112 bytes + * and sizeof(dlm_migratable_lock) = 16 bytes + * + * Choosing DLM_MAX_MIGRATABLE_LOCKS=240 and + * DLM_MIG_LOCKRES_RESERVED=128 means we have this: + * + * (DLM_MAX_MIGRATABLE_LOCKS * sizeof(dlm_migratable_lock)) + + * sizeof(dlm_migratable_lockres) + DLM_MIG_LOCKRES_RESERVED = + * NET_MAX_PAYLOAD_BYTES + * (240 * 16) + 112 + 128 = 4080 + * + * So a lockres would need more than 240 locks before it would + * use more than one network packet to recover. Not too bad. + */ +#define DLM_MAX_MIGRATABLE_LOCKS 240 + +struct dlm_migratable_lockres +{ + u8 master; + u8 lockname_len; + u8 num_locks; // locks sent in this structure + u8 flags; + __be32 total_locks; // locks to be sent for this migration cookie + __be64 mig_cookie; // cookie for this lockres migration + // or zero if not needed + // 16 bytes + u8 lockname[DLM_LOCKID_NAME_MAX]; + // 48 bytes + u8 lvb[DLM_LVB_LEN]; + // 112 bytes + struct dlm_migratable_lock ml[0]; // 16 bytes each, begins at byte 112 +}; +#define DLM_MIG_LOCKRES_MAX_LEN \ + (sizeof(struct dlm_migratable_lockres) + \ + (sizeof(struct dlm_migratable_lock) * \ + DLM_MAX_MIGRATABLE_LOCKS) ) + +/* from above, 128 bytes + * for some undetermined future use */ +#define DLM_MIG_LOCKRES_RESERVED (NET_MAX_PAYLOAD_BYTES - \ + DLM_MIG_LOCKRES_MAX_LEN) + +struct dlm_create_lock +{ + __be64 cookie; + + __be32 flags; + u8 pad1; + u8 node_idx; + s8 requested_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_convert_lock +{ + __be64 cookie; + + __be32 flags; + u8 pad1; + u8 node_idx; + s8 requested_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_CONVERT_LOCK_MAX_LEN (sizeof(struct 
dlm_convert_lock)+DLM_LVB_LEN) + +struct dlm_unlock_lock +{ + __be64 cookie; + + __be32 flags; + __be16 pad1; + u8 node_idx; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_UNLOCK_LOCK_MAX_LEN (sizeof(struct dlm_unlock_lock)+DLM_LVB_LEN) + +struct dlm_proxy_ast +{ + __be64 cookie; + + __be32 flags; + u8 node_idx; + u8 type; + u8 blocked_type; + u8 namelen; + + u8 name[O2NM_MAX_NAME_LEN]; + + s8 lvb[0]; +}; +#define DLM_PROXY_AST_MAX_LEN (sizeof(struct dlm_proxy_ast)+DLM_LVB_LEN) + +#define DLM_MOD_KEY (0x666c6172) +enum dlm_query_join_response { + JOIN_DISALLOW = 0, + JOIN_OK, + JOIN_OK_NO_MAP, +}; + +struct dlm_lock_request +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + +struct dlm_reco_data_done +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; + + /* unused for now */ + /* eventually we can use this to attempt + * lvb recovery based on each node's info */ + u8 reco_lvb[DLM_LVB_LEN]; +}; + +struct dlm_begin_reco +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + + +struct dlm_query_join_request +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_assert_joined +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_cancel_join +{ + u8 node_idx; + u8 pad1[2]; + u8 name_len; + u8 domain[O2NM_MAX_NAME_LEN]; +}; + +struct dlm_exit_domain +{ + u8 node_idx; + u8 pad1[3]; +}; + +struct dlm_finalize_reco +{ + u8 node_idx; + u8 dead_node; + __be16 pad1; + __be32 pad2; +}; + +static inline enum dlm_status +__dlm_lockres_state_to_status(struct dlm_lock_resource *res) +{ + enum dlm_status status = DLM_NORMAL; + + assert_spin_locked(&res->spinlock); + + if (res->state & DLM_LOCK_RES_RECOVERING) + status = DLM_RECOVERING; + else if (res->state & DLM_LOCK_RES_MIGRATING) + status = DLM_MIGRATING; + else if (res->state & DLM_LOCK_RES_IN_PROGRESS) + status = DLM_FORWARD; + + return status; +} + +struct dlm_lock * 
dlm_new_lock(int type, u8 node, u64 cookie, + struct dlm_lockstatus *lksb); +void dlm_lock_get(struct dlm_lock *lock); +void dlm_lock_put(struct dlm_lock *lock); + +void dlm_lock_attach_lockres(struct dlm_lock *lock, + struct dlm_lock_resource *res); + +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data); + +void dlm_revert_pending_convert(struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_revert_pending_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock); + +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data); +void dlm_commit_pending_cancel(struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_commit_pending_unlock(struct dlm_lock_resource *res, + struct dlm_lock *lock); + +int dlm_launch_thread(struct dlm_ctxt *dlm); +void dlm_complete_thread(struct dlm_ctxt *dlm); +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); +void dlm_wait_for_recovery(struct dlm_ctxt *dlm); + +void dlm_put(struct dlm_ctxt *dlm); +struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); +int dlm_domain_fully_joined(struct dlm_ctxt *dlm); + +void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void dlm_purge_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres); +void dlm_lockres_get(struct dlm_lock_resource *res); +void dlm_lockres_put(struct dlm_lock_resource *res); +void __dlm_unhash_lockres(struct dlm_lock_resource *res); +void __dlm_insert_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len); +struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, + 
const char *name, + unsigned int len); + +int dlm_is_host_down(int errno); +void dlm_change_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner); +struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, + const char *lockid, + int flags); +struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int namelen); + +void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); +void dlm_do_local_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock); +int dlm_do_remote_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock); +void dlm_do_local_bast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int blocked_type); +int dlm_send_proxy_ast_msg(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int msg_type, + int blocked_type, int flags); +static inline int dlm_send_proxy_bast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int blocked_type) +{ + return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_BAST, + blocked_type, 0); +} + +static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + int flags) +{ + return dlm_send_proxy_ast_msg(dlm, res, lock, DLM_AST, + 0, flags); +} + +void dlm_print_one_lock_resource(struct dlm_lock_resource *res); +void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); + +u8 dlm_nm_this_node(struct dlm_ctxt *dlm); +void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); + + +int dlm_nm_init(struct dlm_ctxt *dlm); +int dlm_heartbeat_init(struct dlm_ctxt *dlm); +void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); +void dlm_hb_node_up_cb(struct o2nm_node *node, 
int idx, void *data); + +int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); +int dlm_migrate_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target); +int dlm_finish_migration(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 old_master); +void dlm_lockres_release_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res); + +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data); +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data); + +int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int ignore_higher, + u8 request_from, + u32 flags); + + +int dlm_send_one_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres, + u8 send_to, + u8 flags); +void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); + +/* will exit holding res->spinlock, but may drop in function */ +void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); +void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); + +/* will exit holding res->spinlock, but may drop in function */ +static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) +{ + __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| + DLM_LOCK_RES_RECOVERING| + 
DLM_LOCK_RES_MIGRATING)); +} + + +int dlm_init_mle_cache(void); +void dlm_destroy_mle_cache(void); +void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); +void dlm_clean_master_list(struct dlm_ctxt *dlm, + u8 dead_node); +int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); + + +static inline const char * dlm_lock_mode_name(int mode) +{ + switch (mode) { + case LKM_EXMODE: + return "EX"; + case LKM_PRMODE: + return "PR"; + case LKM_NLMODE: + return "NL"; + } + return "UNKNOWN"; +} + + +static inline int dlm_lock_compatible(int existing, int request) +{ + /* NO_LOCK compatible with all */ + if (request == LKM_NLMODE || + existing == LKM_NLMODE) + return 1; + + /* EX incompatible with all non-NO_LOCK */ + if (request == LKM_EXMODE) + return 0; + + /* request must be PR, which is compatible with PR */ + if (existing == LKM_PRMODE) + return 1; + + return 0; +} + +static inline int dlm_lock_on_list(struct list_head *head, + struct dlm_lock *lock) +{ + struct list_head *iter; + struct dlm_lock *tmplock; + + list_for_each(iter, head) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (tmplock == lock) + return 1; + } + return 0; +} + + +static inline enum dlm_status dlm_err_to_dlm_status(int err) +{ + enum dlm_status ret; + if (err == -ENOMEM) + ret = DLM_SYSERR; + else if (err == -ETIMEDOUT || o2net_link_down(err, NULL)) + ret = DLM_NOLOCKMGR; + else if (err == -EINVAL) + ret = DLM_BADPARAM; + else if (err == -ENAMETOOLONG) + ret = DLM_IVBUFLEN; + else + ret = DLM_BADARGS; + return ret; +} + + +static inline void dlm_node_iter_init(unsigned long *map, + struct dlm_node_iter *iter) +{ + memcpy(iter->node_map, map, sizeof(iter->node_map)); + iter->curnode = -1; +} + +static inline int dlm_node_iter_next(struct dlm_node_iter *iter) +{ + int bit; + bit = find_next_bit(iter->node_map, O2NM_MAX_NODES, iter->curnode+1); + if (bit >= O2NM_MAX_NODES) { + iter->curnode = O2NM_MAX_NODES; + return -ENOENT; + } + iter->curnode = 
bit; + return bit; +} + + + +#endif /* DLMCOMMON_H */ diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c new file mode 100644 index 000000000000..6001b22a997d --- /dev/null +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -0,0 +1,530 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmconvert.c + * + * underlying calls for lock conversion + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/random.h> +#include <linux/blkdev.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/spinlock.h> + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmconvert.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +/* NOTE: __dlmconvert_master is the only function in here that + * needs a spinlock held on entry (res->spinlock) and it is the + * only one that holds a lock on exit (res->spinlock). 
+ * All other functions in here need no locks and drop all of + * the locks that they acquire. */ +static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, + int type, int *call_ast, + int *kick_thread); +static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); + +/* + * this is only called directly by dlmlock(), and only when the + * local node is the owner of the lockres + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: see __dlmconvert_master + */ +enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + int call_ast = 0, kick_thread = 0; + enum dlm_status status; + + spin_lock(&res->spinlock); + /* we are not in a network handler, this is fine */ + __dlm_wait_on_lockres(res); + __dlm_lockres_reserve_ast(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + + status = __dlmconvert_master(dlm, res, lock, flags, type, + &call_ast, &kick_thread); + + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + if (status != DLM_NORMAL && status != DLM_NOTQUEUED) + dlm_error(status); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + if (kick_thread) + dlm_kick_thread(dlm, res); + + return status; +} + +/* performs lock conversion at the lockres master site + * locking: + * caller needs: res->spinlock + * taken: takes and drops lock->spinlock + * held on exit: res->spinlock + * returns: DLM_NORMAL, DLM_NOTQUEUED, DLM_DENIED + * call_ast: whether ast should be called for this lock + * kick_thread: whether dlm_kick_thread should be called + */ +static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource 
*res, + struct dlm_lock *lock, int flags, + int type, int *call_ast, + int *kick_thread) +{ + enum dlm_status status = DLM_NORMAL; + struct list_head *iter; + struct dlm_lock *tmplock=NULL; + + assert_spin_locked(&res->spinlock); + + mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n", + lock->ml.type, lock->ml.convert_type, type); + + spin_lock(&lock->spinlock); + + /* already converting? */ + if (lock->ml.convert_type != LKM_IVMODE) { + mlog(ML_ERROR, "attempted to convert a lock with a lock " + "conversion pending\n"); + status = DLM_DENIED; + goto unlock_exit; + } + + /* must be on grant queue to convert */ + if (!dlm_lock_on_list(&res->granted, lock)) { + mlog(ML_ERROR, "attempted to convert a lock not on grant " + "queue\n"); + status = DLM_DENIED; + goto unlock_exit; + } + + if (flags & LKM_VALBLK) { + switch (lock->ml.type) { + case LKM_EXMODE: + /* EX + LKM_VALBLK + convert == set lvb */ + mlog(0, "will set lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + lock->lksb->flags |= DLM_LKSB_PUT_LVB; + break; + case LKM_PRMODE: + case LKM_NLMODE: + /* refetch if new level is not NL */ + if (type > LKM_NLMODE) { + mlog(0, "will fetch new value into " + "lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } else { + mlog(0, "will NOT fetch new value " + "into lvb: converting %s->%s\n", + dlm_lock_mode_name(lock->ml.type), + dlm_lock_mode_name(type)); + flags &= ~(LKM_VALBLK); + } + break; + } + } + + + /* in-place downconvert? 
*/ + if (type <= lock->ml.type) + goto grant; + + /* upconvert from here on */ + status = DLM_NORMAL; + list_for_each(iter, &res->granted) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (tmplock == lock) + continue; + if (!dlm_lock_compatible(tmplock->ml.type, type)) + goto switch_queues; + } + + list_for_each(iter, &res->converting) { + tmplock = list_entry(iter, struct dlm_lock, list); + if (!dlm_lock_compatible(tmplock->ml.type, type)) + goto switch_queues; + /* existing conversion requests take precedence */ + if (!dlm_lock_compatible(tmplock->ml.convert_type, type)) + goto switch_queues; + } + + /* fall thru to grant */ + +grant: + mlog(0, "res %.*s, granting %s lock\n", res->lockname.len, + res->lockname.name, dlm_lock_mode_name(type)); + /* immediately grant the new lock type */ + lock->lksb->status = DLM_NORMAL; + if (lock->ml.node == dlm->node_num) + mlog(0, "doing in-place convert for nonlocal lock\n"); + lock->ml.type = type; + status = DLM_NORMAL; + *call_ast = 1; + goto unlock_exit; + +switch_queues: + if (flags & LKM_NOQUEUE) { + mlog(0, "failed to convert NOQUEUE lock %.*s from " + "%d to %d...\n", res->lockname.len, res->lockname.name, + lock->ml.type, type); + status = DLM_NOTQUEUED; + goto unlock_exit; + } + mlog(0, "res %.*s, queueing...\n", res->lockname.len, + res->lockname.name); + + lock->ml.convert_type = type; + /* do not alter lock refcount. switching lists. */ + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->converting); + +unlock_exit: + spin_unlock(&lock->spinlock); + if (status == DLM_DENIED) { + __dlm_print_one_lock_resource(res); + } + if (status == DLM_NORMAL) + *kick_thread = 1; + return status; +} + +void dlm_revert_pending_convert(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* do not alter lock refcount. switching lists. 
*/ + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->granted); + lock->ml.convert_type = LKM_IVMODE; + lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); +} + +/* messages the master site to do lock conversion + * locking: + * caller needs: none + * taken: takes and drops res->spinlock, uses DLM_LOCK_RES_IN_PROGRESS + * held on exit: none + * returns: DLM_NORMAL, DLM_RECOVERING, status from remote node + */ +enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + enum dlm_status status; + + mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type, + lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS); + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(0, "bailing out early since res is RECOVERING " + "on secondary queue\n"); + /* __dlm_print_one_lock_resource(res); */ + status = DLM_RECOVERING; + goto bail; + } + /* will exit this call with spinlock held */ + __dlm_wait_on_lockres(res); + + if (lock->ml.convert_type != LKM_IVMODE) { + __dlm_print_one_lock_resource(res); + mlog(ML_ERROR, "converting a remote lock that is already " + "converting! (cookie=%"MLFu64", conv=%d)\n", + lock->ml.cookie, lock->ml.convert_type); + status = DLM_DENIED; + goto bail; + } + res->state |= DLM_LOCK_RES_IN_PROGRESS; + /* move lock to local convert queue */ + /* do not alter lock refcount. switching lists. */ + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->converting); + lock->convert_pending = 1; + lock->ml.convert_type = type; + + if (flags & LKM_VALBLK) { + if (lock->ml.type == LKM_EXMODE) { + flags |= LKM_PUT_LVB; + lock->lksb->flags |= DLM_LKSB_PUT_LVB; + } else { + if (lock->ml.convert_type == LKM_NLMODE) + flags &= ~LKM_VALBLK; + else { + flags |= LKM_GET_LVB; + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } + } + } + spin_unlock(&res->spinlock); + + /* no locks held here. 
+ * need to wait for a reply as to whether it got queued or not. */ + status = dlm_send_remote_convert_request(dlm, res, lock, flags, type); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + lock->convert_pending = 0; + /* if it failed, move it back to granted queue */ + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + dlm_revert_pending_convert(res, lock); + } +bail: + spin_unlock(&res->spinlock); + + /* TODO: should this be a wake_one? */ + /* wake up any IN_PROGRESS waiters */ + wake_up(&res->wq); + + return status; +} + +/* sends DLM_CONVERT_LOCK_MSG to master site + * locking: + * caller needs: none + * taken: none + * held on exit: none + * returns: DLM_NOLOCKMGR, status from remote node + */ +static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type) +{ + struct dlm_convert_lock convert; + int tmpret; + enum dlm_status ret; + int status = 0; + struct kvec vec[2]; + size_t veclen = 1; + + mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + + memset(&convert, 0, sizeof(struct dlm_convert_lock)); + convert.node_idx = dlm->node_num; + convert.requested_type = type; + convert.cookie = lock->ml.cookie; + convert.namelen = res->lockname.len; + convert.flags = cpu_to_be32(flags); + memcpy(convert.name, res->lockname.name, convert.namelen); + + vec[0].iov_len = sizeof(struct dlm_convert_lock); + vec[0].iov_base = &convert; + + if (flags & LKM_PUT_LVB) { + /* extra data to send if we are updating lvb */ + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + tmpret = o2net_send_message_vec(DLM_CONVERT_LOCK_MSG, dlm->key, + vec, veclen, res->owner, &status); + if (tmpret >= 0) { + // successfully sent and received + ret = status; // this is already a dlm_status + if (ret == DLM_RECOVERING) { + mlog(0, "node %u returned DLM_RECOVERING from convert " + "message!\n", 
res->owner); + } else if (ret == DLM_MIGRATING) { + mlog(0, "node %u returned DLM_MIGRATING from convert " + "message!\n", res->owner); + } else if (ret == DLM_FORWARD) { + mlog(0, "node %u returned DLM_FORWARD from convert " + "message!\n", res->owner); + } else if (ret != DLM_NORMAL && ret != DLM_NOTQUEUED) + dlm_error(ret); + } else { + mlog_errno(tmpret); + if (dlm_is_host_down(tmpret)) { + ret = DLM_RECOVERING; + mlog(0, "node %u died so returning DLM_RECOVERING " + "from convert message!\n", res->owner); + } else { + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +/* handler for DLM_CONVERT_LOCK_MSG on master site + * locking: + * caller needs: none + * taken: takes and drop res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_IVLOCKID, DLM_BADARGS, + * status from __dlmconvert_master + */ +int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct list_head *iter; + struct dlm_lock *lock = NULL; + struct dlm_lockstatus *lksb; + enum dlm_status status = DLM_NORMAL; + u32 flags; + int call_ast = 0, kick_thread = 0; + + if (!dlm_grab(dlm)) { + dlm_error(DLM_REJECTED); + return DLM_REJECTED; + } + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + if (cnv->namelen > DLM_LOCKID_NAME_MAX) { + status = DLM_IVBUFLEN; + dlm_error(status); + goto leave; + } + + flags = be32_to_cpu(cnv->flags); + + if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == + (LKM_PUT_LVB|LKM_GET_LVB)) { + mlog(ML_ERROR, "both PUT and GET lvb specified\n"); + status = DLM_BADARGS; + goto leave; + } + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : + (flags & LKM_GET_LVB ? 
"get lvb" : "none")); + + status = DLM_IVLOCKID; + res = dlm_lookup_lockres(dlm, cnv->name, cnv->namelen); + if (!res) { + dlm_error(status); + goto leave; + } + + spin_lock(&res->spinlock); + list_for_each(iter, &res->granted) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.cookie == cnv->cookie && + lock->ml.node == cnv->node_idx) { + dlm_lock_get(lock); + break; + } + lock = NULL; + } + spin_unlock(&res->spinlock); + if (!lock) { + status = DLM_IVLOCKID; + dlm_error(status); + goto leave; + } + + /* found the lock */ + lksb = lock->lksb; + + /* see if caller needed to get/put lvb */ + if (flags & LKM_PUT_LVB) { + BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + lksb->flags |= DLM_LKSB_PUT_LVB; + memcpy(&lksb->lvb[0], &cnv->lvb[0], DLM_LVB_LEN); + } else if (flags & LKM_GET_LVB) { + BUG_ON(lksb->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + lksb->flags |= DLM_LKSB_GET_LVB; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status == DLM_NORMAL) { + __dlm_lockres_reserve_ast(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + status = __dlmconvert_master(dlm, res, lock, flags, + cnv->requested_type, + &call_ast, &kick_thread); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + } + spin_unlock(&res->spinlock); + + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); + } + +leave: + if (!lock) + mlog(ML_ERROR, "did not find lock to convert on grant queue! 
" + "cookie=%"MLFu64"\n", + cnv->cookie); + else + dlm_lock_put(lock); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + if (kick_thread) + dlm_kick_thread(dlm, res); + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} diff --git a/fs/ocfs2/dlm/dlmconvert.h b/fs/ocfs2/dlm/dlmconvert.h new file mode 100644 index 000000000000..b2e3677df878 --- /dev/null +++ b/fs/ocfs2/dlm/dlmconvert.h @@ -0,0 +1,35 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmconvert.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef DLMCONVERT_H +#define DLMCONVERT_H + +enum dlm_status dlmconvert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); +enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags, int type); + +#endif diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c new file mode 100644 index 000000000000..f339fe27975a --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -0,0 +1,246 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.c + * + * debug functionality for the dlm + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/sysctl.h> +#include <linux/spinlock.h> + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdebug.h" + +#include "dlmdomain.h" +#include "dlmdebug.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +void dlm_print_one_lock_resource(struct dlm_lock_resource *res) +{ + mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", + res->lockname.len, res->lockname.name, + res->owner, res->state); + spin_lock(&res->spinlock); + __dlm_print_one_lock_resource(res); + spin_unlock(&res->spinlock); +} + +void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) +{ + struct list_head *iter2; + struct dlm_lock *lock; + + assert_spin_locked(&res->spinlock); + + mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", + res->lockname.len, res->lockname.name, + res->owner, res->state); + mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", + res->last_used, list_empty(&res->purge) ? "no" : "yes"); + mlog(ML_NOTICE, " granted queue: \n"); + list_for_each(iter2, &res->granted) { + lock = list_entry(iter2, struct dlm_lock, list); + spin_lock(&lock->spinlock); + mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " + "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", + lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, + list_empty(&lock->ast_list) ? 'y' : 'n', + lock->ast_pending ? 'y' : 'n', + list_empty(&lock->bast_list) ? 'y' : 'n', + lock->bast_pending ? 
'y' : 'n'); + spin_unlock(&lock->spinlock); + } + mlog(ML_NOTICE, " converting queue: \n"); + list_for_each(iter2, &res->converting) { + lock = list_entry(iter2, struct dlm_lock, list); + spin_lock(&lock->spinlock); + mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " + "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", + lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, + list_empty(&lock->ast_list) ? 'y' : 'n', + lock->ast_pending ? 'y' : 'n', + list_empty(&lock->bast_list) ? 'y' : 'n', + lock->bast_pending ? 'y' : 'n'); + spin_unlock(&lock->spinlock); + } + mlog(ML_NOTICE, " blocked queue: \n"); + list_for_each(iter2, &res->blocked) { + lock = list_entry(iter2, struct dlm_lock, list); + spin_lock(&lock->spinlock); + mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " + "cookie=%"MLFu64", ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", + lock->ml.type, lock->ml.convert_type, lock->ml.node, lock->ml.cookie, + list_empty(&lock->ast_list) ? 'y' : 'n', + lock->ast_pending ? 'y' : 'n', + list_empty(&lock->bast_list) ? 'y' : 'n', + lock->bast_pending ? 
'y' : 'n'); + spin_unlock(&lock->spinlock); + } +} + +void dlm_print_one_lock(struct dlm_lock *lockid) +{ + dlm_print_one_lock_resource(lockid->lockres); +} +EXPORT_SYMBOL_GPL(dlm_print_one_lock); + +void dlm_dump_lock_resources(struct dlm_ctxt *dlm) +{ + struct dlm_lock_resource *res; + struct list_head *iter; + struct list_head *bucket; + int i; + + mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n", + dlm->name, dlm->node_num, dlm->key); + if (!dlm || !dlm->name) { + mlog(ML_ERROR, "dlm=%p\n", dlm); + return; + } + + spin_lock(&dlm->spinlock); + for (i=0; i<DLM_HASH_SIZE; i++) { + bucket = &(dlm->resources[i]); + list_for_each(iter, bucket) { + res = list_entry(iter, struct dlm_lock_resource, list); + dlm_print_one_lock_resource(res); + } + } + spin_unlock(&dlm->spinlock); +} + +static const char *dlm_errnames[] = { + [DLM_NORMAL] = "DLM_NORMAL", + [DLM_GRANTED] = "DLM_GRANTED", + [DLM_DENIED] = "DLM_DENIED", + [DLM_DENIED_NOLOCKS] = "DLM_DENIED_NOLOCKS", + [DLM_WORKING] = "DLM_WORKING", + [DLM_BLOCKED] = "DLM_BLOCKED", + [DLM_BLOCKED_ORPHAN] = "DLM_BLOCKED_ORPHAN", + [DLM_DENIED_GRACE_PERIOD] = "DLM_DENIED_GRACE_PERIOD", + [DLM_SYSERR] = "DLM_SYSERR", + [DLM_NOSUPPORT] = "DLM_NOSUPPORT", + [DLM_CANCELGRANT] = "DLM_CANCELGRANT", + [DLM_IVLOCKID] = "DLM_IVLOCKID", + [DLM_SYNC] = "DLM_SYNC", + [DLM_BADTYPE] = "DLM_BADTYPE", + [DLM_BADRESOURCE] = "DLM_BADRESOURCE", + [DLM_MAXHANDLES] = "DLM_MAXHANDLES", + [DLM_NOCLINFO] = "DLM_NOCLINFO", + [DLM_NOLOCKMGR] = "DLM_NOLOCKMGR", + [DLM_NOPURGED] = "DLM_NOPURGED", + [DLM_BADARGS] = "DLM_BADARGS", + [DLM_VOID] = "DLM_VOID", + [DLM_NOTQUEUED] = "DLM_NOTQUEUED", + [DLM_IVBUFLEN] = "DLM_IVBUFLEN", + [DLM_CVTUNGRANT] = "DLM_CVTUNGRANT", + [DLM_BADPARAM] = "DLM_BADPARAM", + [DLM_VALNOTVALID] = "DLM_VALNOTVALID", + [DLM_REJECTED] = "DLM_REJECTED", + [DLM_ABORT] = "DLM_ABORT", + [DLM_CANCEL] = "DLM_CANCEL", + [DLM_IVRESHANDLE] = "DLM_IVRESHANDLE", + [DLM_DEADLOCK] = "DLM_DEADLOCK", + [DLM_DENIED_NOASTS] = 
"DLM_DENIED_NOASTS", + [DLM_FORWARD] = "DLM_FORWARD", + [DLM_TIMEOUT] = "DLM_TIMEOUT", + [DLM_IVGROUPID] = "DLM_IVGROUPID", + [DLM_VERS_CONFLICT] = "DLM_VERS_CONFLICT", + [DLM_BAD_DEVICE_PATH] = "DLM_BAD_DEVICE_PATH", + [DLM_NO_DEVICE_PERMISSION] = "DLM_NO_DEVICE_PERMISSION", + [DLM_NO_CONTROL_DEVICE ] = "DLM_NO_CONTROL_DEVICE ", + [DLM_RECOVERING] = "DLM_RECOVERING", + [DLM_MIGRATING] = "DLM_MIGRATING", + [DLM_MAXSTATS] = "DLM_MAXSTATS", +}; + +static const char *dlm_errmsgs[] = { + [DLM_NORMAL] = "request in progress", + [DLM_GRANTED] = "request granted", + [DLM_DENIED] = "request denied", + [DLM_DENIED_NOLOCKS] = "request denied, out of system resources", + [DLM_WORKING] = "async request in progress", + [DLM_BLOCKED] = "lock request blocked", + [DLM_BLOCKED_ORPHAN] = "lock request blocked by a orphan lock", + [DLM_DENIED_GRACE_PERIOD] = "topological change in progress", + [DLM_SYSERR] = "system error", + [DLM_NOSUPPORT] = "unsupported", + [DLM_CANCELGRANT] = "can't cancel convert: already granted", + [DLM_IVLOCKID] = "bad lockid", + [DLM_SYNC] = "synchronous request granted", + [DLM_BADTYPE] = "bad resource type", + [DLM_BADRESOURCE] = "bad resource handle", + [DLM_MAXHANDLES] = "no more resource handles", + [DLM_NOCLINFO] = "can't contact cluster manager", + [DLM_NOLOCKMGR] = "can't contact lock manager", + [DLM_NOPURGED] = "can't contact purge daemon", + [DLM_BADARGS] = "bad api args", + [DLM_VOID] = "no status", + [DLM_NOTQUEUED] = "NOQUEUE was specified and request failed", + [DLM_IVBUFLEN] = "invalid resource name length", + [DLM_CVTUNGRANT] = "attempted to convert ungranted lock", + [DLM_BADPARAM] = "invalid lock mode specified", + [DLM_VALNOTVALID] = "value block has been invalidated", + [DLM_REJECTED] = "request rejected, unrecognized client", + [DLM_ABORT] = "blocked lock request cancelled", + [DLM_CANCEL] = "conversion request cancelled", + [DLM_IVRESHANDLE] = "invalid resource handle", + [DLM_DEADLOCK] = "deadlock recovery refused this request", + 
[DLM_DENIED_NOASTS] = "failed to allocate AST", + [DLM_FORWARD] = "request must wait for primary's response", + [DLM_TIMEOUT] = "timeout value for lock has expired", + [DLM_IVGROUPID] = "invalid group specification", + [DLM_VERS_CONFLICT] = "version conflicts prevent request handling", + [DLM_BAD_DEVICE_PATH] = "Locks device does not exist or path wrong", + [DLM_NO_DEVICE_PERMISSION] = "Client has insufficient perms for device", + [DLM_NO_CONTROL_DEVICE] = "Cannot set options on opened device ", + [DLM_RECOVERING] = "lock resource being recovered", + [DLM_MIGRATING] = "lock resource being migrated", + [DLM_MAXSTATS] = "invalid error number", +}; + +const char *dlm_errmsg(enum dlm_status err) +{ + if (err >= DLM_MAXSTATS || err < 0) + return dlm_errmsgs[DLM_MAXSTATS]; + return dlm_errmsgs[err]; +} +EXPORT_SYMBOL_GPL(dlm_errmsg); + +const char *dlm_errname(enum dlm_status err) +{ + if (err >= DLM_MAXSTATS || err < 0) + return dlm_errnames[DLM_MAXSTATS]; + return dlm_errnames[err]; +} +EXPORT_SYMBOL_GPL(dlm_errname); diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h new file mode 100644 index 000000000000..6858510c3ccd --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdebug.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + +#ifndef DLMDEBUG_H +#define DLMDEBUG_H + +void dlm_dump_lock_resources(struct dlm_ctxt *dlm); + +#endif diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c new file mode 100644 index 000000000000..da3c22045f89 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -0,0 +1,1469 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdomain.c + * + * defines domain join / leave apis + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/err.h> + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmdebug.h" +#include "dlmdomain.h" + +#include "dlmver.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) +#include "cluster/masklog.h" + +/* + * + * spinlock lock ordering: if multiple locks are needed, obey this ordering: + * dlm_domain_lock + * struct dlm_ctxt->spinlock + * struct dlm_lock_resource->spinlock + * struct dlm_ctxt->master_lock + * struct dlm_ctxt->ast_lock + * dlm_master_list_entry->spinlock + * dlm_lock->spinlock + * + */ + +spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(dlm_domains); +static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); + +#define DLM_DOMAIN_BACKOFF_MS 200 + +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data); +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data); +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data); +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data); + +static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); + +void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) +{ + list_del_init(&lockres->list); + dlm_lockres_put(lockres); +} + +void __dlm_insert_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct list_head *bucket; + struct qstr *q; + + assert_spin_locked(&dlm->spinlock); + + q = &res->lockname; + q->hash = full_name_hash(q->name, q->len); + bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]); + + /* get a reference for our hashtable */ + dlm_lockres_get(res); + + list_add_tail(&res->list, bucket); +} + +struct dlm_lock_resource * 
__dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len) +{ + unsigned int hash; + struct list_head *iter; + struct dlm_lock_resource *tmpres=NULL; + struct list_head *bucket; + + mlog_entry("%.*s\n", len, name); + + assert_spin_locked(&dlm->spinlock); + + hash = full_name_hash(name, len); + + bucket = &(dlm->resources[hash & DLM_HASH_MASK]); + + /* check for pre-existing lock */ + list_for_each(iter, bucket) { + tmpres = list_entry(iter, struct dlm_lock_resource, list); + if (tmpres->lockname.len == len && + memcmp(tmpres->lockname.name, name, len) == 0) { + dlm_lockres_get(tmpres); + break; + } + + tmpres = NULL; + } + return tmpres; +} + +struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int len) +{ + struct dlm_lock_resource *res; + + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, len); + spin_unlock(&dlm->spinlock); + return res; +} + +static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) +{ + struct dlm_ctxt *tmp = NULL; + struct list_head *iter; + + assert_spin_locked(&dlm_domain_lock); + + /* tmp->name here is always NULL terminated, + * but domain may not be! 
*/ + list_for_each(iter, &dlm_domains) { + tmp = list_entry (iter, struct dlm_ctxt, list); + if (strlen(tmp->name) == len && + memcmp(tmp->name, domain, len)==0) + break; + tmp = NULL; + } + + return tmp; +} + +/* For null terminated domain strings ONLY */ +static struct dlm_ctxt * __dlm_lookup_domain(const char *domain) +{ + assert_spin_locked(&dlm_domain_lock); + + return __dlm_lookup_domain_full(domain, strlen(domain)); +} + + +/* returns true on one of two conditions: + * 1) the domain does not exist + * 2) the domain exists and it's state is "joined" */ +static int dlm_wait_on_domain_helper(const char *domain) +{ + int ret = 0; + struct dlm_ctxt *tmp = NULL; + + spin_lock(&dlm_domain_lock); + + tmp = __dlm_lookup_domain(domain); + if (!tmp) + ret = 1; + else if (tmp->dlm_state == DLM_CTXT_JOINED) + ret = 1; + + spin_unlock(&dlm_domain_lock); + return ret; +} + +static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) +{ + if (dlm->resources) + free_page((unsigned long) dlm->resources); + + if (dlm->name) + kfree(dlm->name); + + kfree(dlm); +} + +/* A little strange - this function will be called while holding + * dlm_domain_lock and is expected to be holding it on the way out. We + * will however drop and reacquire it multiple times */ +static void dlm_ctxt_release(struct kref *kref) +{ + struct dlm_ctxt *dlm; + + dlm = container_of(kref, struct dlm_ctxt, dlm_refs); + + BUG_ON(dlm->num_joins); + BUG_ON(dlm->dlm_state == DLM_CTXT_JOINED); + + /* we may still be in the list if we hit an error during join. 
*/ + list_del_init(&dlm->list); + + spin_unlock(&dlm_domain_lock); + + mlog(0, "freeing memory from domain %s\n", dlm->name); + + wake_up(&dlm_domain_events); + + dlm_free_ctxt_mem(dlm); + + spin_lock(&dlm_domain_lock); +} + +void dlm_put(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm_domain_lock); + kref_put(&dlm->dlm_refs, dlm_ctxt_release); + spin_unlock(&dlm_domain_lock); +} + +static void __dlm_get(struct dlm_ctxt *dlm) +{ + kref_get(&dlm->dlm_refs); +} + +/* given a questionable reference to a dlm object, gets a reference if + * it can find it in the list, otherwise returns NULL in which case + * you shouldn't trust your pointer. */ +struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) +{ + struct list_head *iter; + struct dlm_ctxt *target = NULL; + + spin_lock(&dlm_domain_lock); + + list_for_each(iter, &dlm_domains) { + target = list_entry (iter, struct dlm_ctxt, list); + + if (target == dlm) { + __dlm_get(target); + break; + } + + target = NULL; + } + + spin_unlock(&dlm_domain_lock); + + return target; +} + +int dlm_domain_fully_joined(struct dlm_ctxt *dlm) +{ + int ret; + + spin_lock(&dlm_domain_lock); + ret = (dlm->dlm_state == DLM_CTXT_JOINED) || + (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN); + spin_unlock(&dlm_domain_lock); + + return ret; +} + +static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) +{ + dlm_unregister_domain_handlers(dlm); + dlm_complete_thread(dlm); + dlm_complete_recovery_thread(dlm); + + /* We've left the domain. Now we can take ourselves out of the + * list and allow the kref stuff to help us free the + * memory. 
*/ + spin_lock(&dlm_domain_lock); + list_del_init(&dlm->list); + spin_unlock(&dlm_domain_lock); + + /* Wake up anyone waiting for us to remove this domain */ + wake_up(&dlm_domain_events); +} + +static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) +{ + int i; + struct dlm_lock_resource *res; + + mlog(0, "Migrating locks from domain %s\n", dlm->name); +restart: + spin_lock(&dlm->spinlock); + for (i=0; i<DLM_HASH_SIZE; i++) { + while (!list_empty(&dlm->resources[i])) { + res = list_entry(dlm->resources[i].next, + struct dlm_lock_resource, list); + /* need reference when manually grabbing lockres */ + dlm_lockres_get(res); + /* this should unhash the lockres + * and exit with dlm->spinlock */ + mlog(0, "purging res=%p\n", res); + if (dlm_lockres_is_dirty(dlm, res)) { + /* HACK! this should absolutely go. + * need to figure out why some empty + * lockreses are still marked dirty */ + mlog(ML_ERROR, "lockres %.*s dirty!\n", + res->lockname.len, res->lockname.name); + + spin_unlock(&dlm->spinlock); + dlm_kick_thread(dlm, res); + wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); + dlm_lockres_put(res); + goto restart; + } + dlm_purge_lockres(dlm, res); + dlm_lockres_put(res); + } + } + spin_unlock(&dlm->spinlock); + + mlog(0, "DONE Migrating locks from domain %s\n", dlm->name); +} + +static int dlm_no_joining_node(struct dlm_ctxt *dlm) +{ + int ret; + + spin_lock(&dlm->spinlock); + ret = dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN; + spin_unlock(&dlm->spinlock); + + return ret; +} + +static void dlm_mark_domain_leaving(struct dlm_ctxt *dlm) +{ + /* Yikes, a double spinlock! I need domain_lock for the dlm + * state and the dlm spinlock for join state... Sorry! 
*/ +again: + spin_lock(&dlm_domain_lock); + spin_lock(&dlm->spinlock); + + if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "Node %d is joining, we wait on it.\n", + dlm->joining_node); + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + + wait_event(dlm->dlm_join_events, dlm_no_joining_node(dlm)); + goto again; + } + + dlm->dlm_state = DLM_CTXT_LEAVING; + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); +} + +static void __dlm_print_nodes(struct dlm_ctxt *dlm) +{ + int node = -1; + + assert_spin_locked(&dlm->spinlock); + + mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name); + + while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + mlog(ML_NOTICE, " node %d\n", node); + } +} + +static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + unsigned int node; + struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; + + mlog_entry("%p %u %p", msg, len, data); + + if (!dlm_grab(dlm)) + return 0; + + node = exit_msg->node_idx; + + mlog(0, "Node %u leaves domain %s\n", node, dlm->name); + + spin_lock(&dlm->spinlock); + clear_bit(node, dlm->domain_map); + __dlm_print_nodes(dlm); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, node, 0); + + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); + + return 0; +} + +static int dlm_send_one_domain_exit(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_exit_domain leave_msg; + + mlog(0, "Asking node %u if we can leave the domain %s me = %u\n", + node, dlm->name, dlm->node_num); + + memset(&leave_msg, 0, sizeof(leave_msg)); + leave_msg.node_idx = dlm->node_num; + + status = o2net_send_message(DLM_EXIT_DOMAIN_MSG, dlm->key, + &leave_msg, sizeof(leave_msg), node, + NULL); + + mlog(0, "status return %d from o2net_send_message\n", status); + + return status; +} + + +static void dlm_leave_domain(struct 
dlm_ctxt *dlm) +{ + int node, clear_node, status; + + /* At this point we've migrated away all our locks and won't + * accept mastership of new ones. The dlm is responsible for + * almost nothing now. We make sure not to confuse any joining + * nodes and then commence shutdown procedure. */ + + spin_lock(&dlm->spinlock); + /* Clear ourselves from the domain map */ + clear_bit(dlm->node_num, dlm->domain_map); + while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, + 0)) < O2NM_MAX_NODES) { + /* Drop the dlm spinlock. This is safe wrt the domain_map. + * -nodes cannot be added now as the + * query_join_handlers knows to respond with OK_NO_MAP + * -we catch the right network errors if a node is + * removed from the map while we're sending him the + * exit message. */ + spin_unlock(&dlm->spinlock); + + clear_node = 1; + + status = dlm_send_one_domain_exit(dlm, node); + if (status < 0 && + status != -ENOPROTOOPT && + status != -ENOTCONN) { + mlog(ML_NOTICE, "Error %d sending domain exit message " + "to node %d\n", status, node); + + /* Not sure what to do here but lets sleep for + * a bit in case this was a transient + * error... */ + msleep(DLM_DOMAIN_BACKOFF_MS); + clear_node = 0; + } + + spin_lock(&dlm->spinlock); + /* If we're not clearing the node bit then we intend + * to loop back around to try again. 
*/ + if (clear_node) + clear_bit(node, dlm->domain_map); + } + spin_unlock(&dlm->spinlock); +} + +int dlm_joined(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + + if (dlm->dlm_state == DLM_CTXT_JOINED) + ret = 1; + + spin_unlock(&dlm_domain_lock); + + return ret; +} + +int dlm_shutting_down(struct dlm_ctxt *dlm) +{ + int ret = 0; + + spin_lock(&dlm_domain_lock); + + if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN) + ret = 1; + + spin_unlock(&dlm_domain_lock); + + return ret; +} + +void dlm_unregister_domain(struct dlm_ctxt *dlm) +{ + int leave = 0; + + spin_lock(&dlm_domain_lock); + BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); + BUG_ON(!dlm->num_joins); + + dlm->num_joins--; + if (!dlm->num_joins) { + /* We mark it "in shutdown" now so new register + * requests wait until we've completely left the + * domain. Don't use DLM_CTXT_LEAVING yet as we still + * want new domain joins to communicate with us at + * least until we've completed migration of our + * resources. */ + dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN; + leave = 1; + } + spin_unlock(&dlm_domain_lock); + + if (leave) { + mlog(0, "shutting down domain %s\n", dlm->name); + + /* We changed dlm state, notify the thread */ + dlm_kick_thread(dlm, NULL); + + dlm_migrate_all_locks(dlm); + dlm_mark_domain_leaving(dlm); + dlm_leave_domain(dlm); + dlm_complete_dlm_shutdown(dlm); + } + dlm_put(dlm); +} +EXPORT_SYMBOL_GPL(dlm_unregister_domain); + +static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_query_join_request *query; + enum dlm_query_join_response response; + struct dlm_ctxt *dlm = NULL; + + query = (struct dlm_query_join_request *) msg->buf; + + mlog(0, "node %u wants to join domain %s\n", query->node_idx, + query->domain); + + /* + * If heartbeat doesn't consider the node live, tell it + * to back off and try again. This gives heartbeat a chance + * to catch up. 
+ */ + if (!o2hb_check_node_heartbeating(query->node_idx)) { + mlog(0, "node %u is not in our live map yet\n", + query->node_idx); + + response = JOIN_DISALLOW; + goto respond; + } + + response = JOIN_OK_NO_MAP; + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(query->domain, query->name_len); + /* Once the dlm ctxt is marked as leaving then we don't want + * to be put in someone's domain map. */ + if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { + spin_lock(&dlm->spinlock); + + if (dlm->dlm_state == DLM_CTXT_NEW && + dlm->joining_node == DLM_LOCK_RES_OWNER_UNKNOWN) { + /*If this is a brand new context and we + * haven't started our join process yet, then + * the other node won the race. */ + response = JOIN_OK_NO_MAP; + } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* Disallow parallel joins. */ + response = JOIN_DISALLOW; + } else { + /* Alright we're fully a part of this domain + * so we keep some state as to who's joining + * and indicate to him that needs to be fixed + * up. */ + response = JOIN_OK; + __dlm_set_joining_node(dlm, query->node_idx); + } + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + +respond: + mlog(0, "We respond with %u\n", response); + + return response; +} + +static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_assert_joined *assert; + struct dlm_ctxt *dlm = NULL; + + assert = (struct dlm_assert_joined *) msg->buf; + + mlog(0, "node %u asserts join on domain %s\n", assert->node_idx, + assert->domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(assert->domain, assert->name_len); + /* XXX should we consider no dlm ctxt an error? */ + if (dlm) { + spin_lock(&dlm->spinlock); + + /* Alright, this node has officially joined our + * domain. Set him in the map and clean up our + * leftover join state. 
*/ + BUG_ON(dlm->joining_node != assert->node_idx); + set_bit(assert->node_idx, dlm->domain_map); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + + __dlm_print_nodes(dlm); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, assert->node_idx, 1); + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + + return 0; +} + +static int dlm_cancel_join_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_cancel_join *cancel; + struct dlm_ctxt *dlm = NULL; + + cancel = (struct dlm_cancel_join *) msg->buf; + + mlog(0, "node %u cancels join on domain %s\n", cancel->node_idx, + cancel->domain); + + spin_lock(&dlm_domain_lock); + dlm = __dlm_lookup_domain_full(cancel->domain, cancel->name_len); + + if (dlm) { + spin_lock(&dlm->spinlock); + + /* Yikes, this guy wants to cancel his join. No + * problem, we simply cleanup our join state. */ + BUG_ON(dlm->joining_node != cancel->node_idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + + spin_unlock(&dlm->spinlock); + } + spin_unlock(&dlm_domain_lock); + + return 0; +} + +static int dlm_send_one_join_cancel(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_cancel_join cancel_msg; + + memset(&cancel_msg, 0, sizeof(cancel_msg)); + cancel_msg.node_idx = dlm->node_num; + cancel_msg.name_len = strlen(dlm->name); + memcpy(cancel_msg.domain, dlm->name, cancel_msg.name_len); + + status = o2net_send_message(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + &cancel_msg, sizeof(cancel_msg), node, + NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + return status; +} + +/* map_size should be in bytes. 
*/ +static int dlm_send_join_cancels(struct dlm_ctxt *dlm, + unsigned long *node_map, + unsigned int map_size) +{ + int status, tmpstat; + unsigned int node; + + if (map_size != (BITS_TO_LONGS(O2NM_MAX_NODES) * + sizeof(unsigned long))) { + mlog(ML_ERROR, + "map_size %u != BITS_TO_LONGS(O2NM_MAX_NODES) %u\n", + map_size, BITS_TO_LONGS(O2NM_MAX_NODES)); + return -EINVAL; + } + + status = 0; + node = -1; + while ((node = find_next_bit(node_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + tmpstat = dlm_send_one_join_cancel(dlm, node); + if (tmpstat) { + mlog(ML_ERROR, "Error return %d cancelling join on " + "node %d\n", tmpstat, node); + if (!status) + status = tmpstat; + } + } + + if (status) + mlog_errno(status); + return status; +} + +static int dlm_request_join(struct dlm_ctxt *dlm, + int node, + enum dlm_query_join_response *response) +{ + int status, retval; + struct dlm_query_join_request join_msg; + + mlog(0, "querying node %d\n", node); + + memset(&join_msg, 0, sizeof(join_msg)); + join_msg.node_idx = dlm->node_num; + join_msg.name_len = strlen(dlm->name); + memcpy(join_msg.domain, dlm->name, join_msg.name_len); + + status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg, + sizeof(join_msg), node, &retval); + if (status < 0 && status != -ENOPROTOOPT) { + mlog_errno(status); + goto bail; + } + + /* -ENOPROTOOPT from the net code means the other side isn't + listening for our message type -- that's fine, it means + his dlm isn't up, so we can consider him a 'yes' but not + joined into the domain. 
*/ + if (status == -ENOPROTOOPT) { + status = 0; + *response = JOIN_OK_NO_MAP; + } else if (retval == JOIN_DISALLOW || + retval == JOIN_OK || + retval == JOIN_OK_NO_MAP) { + *response = retval; + } else { + status = -EINVAL; + mlog(ML_ERROR, "invalid response %d from node %u\n", retval, + node); + } + + mlog(0, "status %d, node %d response is %d\n", status, node, + *response); + +bail: + return status; +} + +static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, + unsigned int node) +{ + int status; + struct dlm_assert_joined assert_msg; + + mlog(0, "Sending join assert to node %u\n", node); + + memset(&assert_msg, 0, sizeof(assert_msg)); + assert_msg.node_idx = dlm->node_num; + assert_msg.name_len = strlen(dlm->name); + memcpy(assert_msg.domain, dlm->name, assert_msg.name_len); + + status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + &assert_msg, sizeof(assert_msg), node, + NULL); + if (status < 0) + mlog_errno(status); + + return status; +} + +static void dlm_send_join_asserts(struct dlm_ctxt *dlm, + unsigned long *node_map) +{ + int status, node, live; + + status = 0; + node = -1; + while ((node = find_next_bit(node_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + do { + /* It is very important that this message be + * received so we spin until either the node + * has died or it gets the message. */ + status = dlm_send_one_join_assert(dlm, node); + + spin_lock(&dlm->spinlock); + live = test_bit(node, dlm->live_nodes_map); + spin_unlock(&dlm->spinlock); + + if (status) { + mlog(ML_ERROR, "Error return %d asserting " + "join on node %d\n", status, node); + + /* give us some time between errors... 
*/ + if (live) + msleep(DLM_DOMAIN_BACKOFF_MS); + } + } while (status && live); + } +} + +struct domain_join_ctxt { + unsigned long live_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long yes_resp_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +static int dlm_should_restart_join(struct dlm_ctxt *dlm, + struct domain_join_ctxt *ctxt, + enum dlm_query_join_response response) +{ + int ret; + + if (response == JOIN_DISALLOW) { + mlog(0, "Latest response of disallow -- should restart\n"); + return 1; + } + + spin_lock(&dlm->spinlock); + /* For now, we restart the process if the node maps have + * changed at all */ + ret = memcmp(ctxt->live_map, dlm->live_nodes_map, + sizeof(dlm->live_nodes_map)); + spin_unlock(&dlm->spinlock); + + if (ret) + mlog(0, "Node maps changed -- should restart\n"); + + return ret; +} + +static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) +{ + int status = 0, tmpstat, node; + struct domain_join_ctxt *ctxt; + enum dlm_query_join_response response; + + mlog_entry("%p", dlm); + + ctxt = kcalloc(1, sizeof(*ctxt), GFP_KERNEL); + if (!ctxt) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* group sem locking should work for us here -- we're already + * registered for heartbeat events so filling this should be + * atomic wrt getting those handlers called. */ + o2hb_fill_node_map(dlm->live_nodes_map, sizeof(dlm->live_nodes_map)); + + spin_lock(&dlm->spinlock); + memcpy(ctxt->live_map, dlm->live_nodes_map, sizeof(ctxt->live_map)); + + __dlm_set_joining_node(dlm, dlm->node_num); + + spin_unlock(&dlm->spinlock); + + node = -1; + while ((node = find_next_bit(ctxt->live_map, O2NM_MAX_NODES, + node + 1)) < O2NM_MAX_NODES) { + if (node == dlm->node_num) + continue; + + status = dlm_request_join(dlm, node, &response); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Ok, either we got a response or the node doesn't have a + * dlm up. 
*/ + if (response == JOIN_OK) + set_bit(node, ctxt->yes_resp_map); + + if (dlm_should_restart_join(dlm, ctxt, response)) { + status = -EAGAIN; + goto bail; + } + } + + mlog(0, "Yay, done querying nodes!\n"); + + /* Yay, everyone agree's we can join the domain. My domain is + * comprised of all nodes who were put in the + * yes_resp_map. Copy that into our domain map and send a join + * assert message to clean up everyone elses state. */ + spin_lock(&dlm->spinlock); + memcpy(dlm->domain_map, ctxt->yes_resp_map, + sizeof(ctxt->yes_resp_map)); + set_bit(dlm->node_num, dlm->domain_map); + spin_unlock(&dlm->spinlock); + + dlm_send_join_asserts(dlm, ctxt->yes_resp_map); + + /* Joined state *must* be set before the joining node + * information, otherwise the query_join handler may read no + * current joiner but a state of NEW and tell joining nodes + * we're not in the domain. */ + spin_lock(&dlm_domain_lock); + dlm->dlm_state = DLM_CTXT_JOINED; + dlm->num_joins++; + spin_unlock(&dlm_domain_lock); + +bail: + spin_lock(&dlm->spinlock); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + if (!status) + __dlm_print_nodes(dlm); + spin_unlock(&dlm->spinlock); + + if (ctxt) { + /* Do we need to send a cancel message to any nodes? 
*/ + if (status < 0) { + tmpstat = dlm_send_join_cancels(dlm, + ctxt->yes_resp_map, + sizeof(ctxt->yes_resp_map)); + if (tmpstat < 0) + mlog_errno(tmpstat); + } + kfree(ctxt); + } + + mlog(0, "returning %d\n", status); + return status; +} + +static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm) +{ + o2hb_unregister_callback(&dlm->dlm_hb_up); + o2hb_unregister_callback(&dlm->dlm_hb_down); + o2net_unregister_handler_list(&dlm->dlm_domain_handlers); +} + +static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) +{ + int status; + + mlog(0, "registering handlers.\n"); + + o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, + dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); + status = o2hb_register_callback(&dlm->dlm_hb_down); + if (status) + goto bail; + + o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, + dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); + status = o2hb_register_callback(&dlm->dlm_hb_up); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MASTER_REQUEST_MSG, dlm->key, + sizeof(struct dlm_master_request), + dlm_master_request_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key, + sizeof(struct dlm_assert_master), + dlm_assert_master_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CREATE_LOCK_MSG, dlm->key, + sizeof(struct dlm_create_lock), + dlm_create_lock_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_CONVERT_LOCK_MSG, dlm->key, + DLM_CONVERT_LOCK_MAX_LEN, + dlm_convert_lock_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_UNLOCK_LOCK_MSG, dlm->key, + DLM_UNLOCK_LOCK_MAX_LEN, + dlm_unlock_lock_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_PROXY_AST_MSG, dlm->key, + 
DLM_PROXY_AST_MAX_LEN, + dlm_proxy_ast_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_EXIT_DOMAIN_MSG, dlm->key, + sizeof(struct dlm_exit_domain), + dlm_exit_domain_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MIGRATE_REQUEST_MSG, dlm->key, + sizeof(struct dlm_migrate_request), + dlm_migrate_request_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MIG_LOCKRES_MSG, dlm->key, + DLM_MIG_LOCKRES_MAX_LEN, + dlm_mig_lockres_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_MASTER_REQUERY_MSG, dlm->key, + sizeof(struct dlm_master_requery), + dlm_master_requery_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_LOCK_REQUEST_MSG, dlm->key, + sizeof(struct dlm_lock_request), + dlm_request_all_locks_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_RECO_DATA_DONE_MSG, dlm->key, + sizeof(struct dlm_reco_data_done), + dlm_reco_data_done_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_BEGIN_RECO_MSG, dlm->key, + sizeof(struct dlm_begin_reco), + dlm_begin_reco_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_FINALIZE_RECO_MSG, dlm->key, + sizeof(struct dlm_finalize_reco), + dlm_finalize_reco_handler, + dlm, &dlm->dlm_domain_handlers); + if (status) + goto bail; + +bail: + if (status) + dlm_unregister_domain_handlers(dlm); + + return status; +} + +static int dlm_join_domain(struct dlm_ctxt *dlm) +{ + int status; + + BUG_ON(!dlm); + + mlog(0, "Join domain %s\n", dlm->name); + + status = dlm_register_domain_handlers(dlm); + if (status) { + mlog_errno(status); + goto bail; + } + + status = 
dlm_launch_thread(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = dlm_launch_recovery_thread(dlm); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + do { + unsigned int backoff; + status = dlm_try_to_join_domain(dlm); + + /* If we're racing another node to the join, then we + * need to back off temporarily and let them + * complete. */ + if (status == -EAGAIN) { + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + /* + * <chip> After you! + * <dale> No, after you! + * <chip> I insist! + * <dale> But you first! + * ... + */ + backoff = (unsigned int)(jiffies & 0x3); + backoff *= DLM_DOMAIN_BACKOFF_MS; + mlog(0, "backoff %d\n", backoff); + msleep(backoff); + } + } while (status == -EAGAIN); + + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + wake_up(&dlm_domain_events); + + if (status) { + dlm_unregister_domain_handlers(dlm); + dlm_complete_thread(dlm); + dlm_complete_recovery_thread(dlm); + } + + return status; +} + +static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, + u32 key) +{ + int i; + struct dlm_ctxt *dlm = NULL; + + dlm = kcalloc(1, sizeof(*dlm), GFP_KERNEL); + if (!dlm) { + mlog_errno(-ENOMEM); + goto leave; + } + + dlm->name = kmalloc(strlen(domain) + 1, GFP_KERNEL); + if (dlm->name == NULL) { + mlog_errno(-ENOMEM); + kfree(dlm); + dlm = NULL; + goto leave; + } + + dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL); + if (!dlm->resources) { + mlog_errno(-ENOMEM); + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + goto leave; + } + memset(dlm->resources, 0, PAGE_SIZE); + + for (i=0; i<DLM_HASH_SIZE; i++) + INIT_LIST_HEAD(&dlm->resources[i]); + + strcpy(dlm->name, domain); + dlm->key = key; + dlm->node_num = o2nm_this_node(); + + spin_lock_init(&dlm->spinlock); + spin_lock_init(&dlm->master_lock); + spin_lock_init(&dlm->ast_lock); + INIT_LIST_HEAD(&dlm->list); + INIT_LIST_HEAD(&dlm->dirty_list); + INIT_LIST_HEAD(&dlm->reco.resources); 
+ INIT_LIST_HEAD(&dlm->reco.received); + INIT_LIST_HEAD(&dlm->reco.node_data); + INIT_LIST_HEAD(&dlm->purge_list); + INIT_LIST_HEAD(&dlm->dlm_domain_handlers); + dlm->reco.state = 0; + + INIT_LIST_HEAD(&dlm->pending_asts); + INIT_LIST_HEAD(&dlm->pending_basts); + + mlog(0, "dlm->recovery_map=%p, &(dlm->recovery_map[0])=%p\n", + dlm->recovery_map, &(dlm->recovery_map[0])); + + memset(dlm->recovery_map, 0, sizeof(dlm->recovery_map)); + memset(dlm->live_nodes_map, 0, sizeof(dlm->live_nodes_map)); + memset(dlm->domain_map, 0, sizeof(dlm->domain_map)); + + dlm->dlm_thread_task = NULL; + dlm->dlm_reco_thread_task = NULL; + init_waitqueue_head(&dlm->dlm_thread_wq); + init_waitqueue_head(&dlm->dlm_reco_thread_wq); + init_waitqueue_head(&dlm->reco.event); + init_waitqueue_head(&dlm->ast_wq); + init_waitqueue_head(&dlm->migration_wq); + INIT_LIST_HEAD(&dlm->master_list); + INIT_LIST_HEAD(&dlm->mle_hb_events); + + dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; + init_waitqueue_head(&dlm->dlm_join_events); + + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + atomic_set(&dlm->local_resources, 0); + atomic_set(&dlm->remote_resources, 0); + atomic_set(&dlm->unknown_resources, 0); + + spin_lock_init(&dlm->work_lock); + INIT_LIST_HEAD(&dlm->work_list); + INIT_WORK(&dlm->dispatched_work, dlm_dispatch_work, dlm); + + kref_init(&dlm->dlm_refs); + dlm->dlm_state = DLM_CTXT_NEW; + + INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks); + + mlog(0, "context init: refcount %u\n", + atomic_read(&dlm->dlm_refs.refcount)); + +leave: + return dlm; +} + +/* + * dlm_register_domain: one-time setup per "domain" + */ +struct dlm_ctxt * dlm_register_domain(const char *domain, + u32 key) +{ + int ret; + struct dlm_ctxt *dlm = NULL; + struct dlm_ctxt *new_ctxt = NULL; + + if (strlen(domain) > O2NM_MAX_NAME_LEN) { + ret = -ENAMETOOLONG; + mlog(ML_ERROR, "domain name length too long\n"); + goto leave; + } + + if (!o2hb_check_local_node_heartbeating()) { + 
mlog(ML_ERROR, "the local node has not been configured, or is " + "not heartbeating\n"); + ret = -EPROTO; + goto leave; + } + + mlog(0, "register called for domain \"%s\"\n", domain); + +retry: + dlm = NULL; + if (signal_pending(current)) { + ret = -ERESTARTSYS; + mlog_errno(ret); + goto leave; + } + + spin_lock(&dlm_domain_lock); + + dlm = __dlm_lookup_domain(domain); + if (dlm) { + if (dlm->dlm_state != DLM_CTXT_JOINED) { + spin_unlock(&dlm_domain_lock); + + mlog(0, "This ctxt is not joined yet!\n"); + wait_event_interruptible(dlm_domain_events, + dlm_wait_on_domain_helper( + domain)); + goto retry; + } + + __dlm_get(dlm); + dlm->num_joins++; + + spin_unlock(&dlm_domain_lock); + + ret = 0; + goto leave; + } + + /* doesn't exist */ + if (!new_ctxt) { + spin_unlock(&dlm_domain_lock); + + new_ctxt = dlm_alloc_ctxt(domain, key); + if (new_ctxt) + goto retry; + + ret = -ENOMEM; + mlog_errno(ret); + goto leave; + } + + /* a little variable switch-a-roo here... */ + dlm = new_ctxt; + new_ctxt = NULL; + + /* add the new domain */ + list_add_tail(&dlm->list, &dlm_domains); + spin_unlock(&dlm_domain_lock); + + ret = dlm_join_domain(dlm); + if (ret) { + mlog_errno(ret); + dlm_put(dlm); + goto leave; + } + + ret = 0; +leave: + if (new_ctxt) + dlm_free_ctxt_mem(new_ctxt); + + if (ret < 0) + dlm = ERR_PTR(ret); + + return dlm; +} +EXPORT_SYMBOL_GPL(dlm_register_domain); + +static LIST_HEAD(dlm_join_handlers); + +static void dlm_unregister_net_handlers(void) +{ + o2net_unregister_handler_list(&dlm_join_handlers); +} + +static int dlm_register_net_handlers(void) +{ + int status = 0; + + status = o2net_register_handler(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, + sizeof(struct dlm_query_join_request), + dlm_query_join_handler, + NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = o2net_register_handler(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, + sizeof(struct dlm_assert_joined), + dlm_assert_joined_handler, + NULL, &dlm_join_handlers); + if (status) + goto bail; + + status = 
o2net_register_handler(DLM_CANCEL_JOIN_MSG, DLM_MOD_KEY, + sizeof(struct dlm_cancel_join), + dlm_cancel_join_handler, + NULL, &dlm_join_handlers); + +bail: + if (status < 0) + dlm_unregister_net_handlers(); + + return status; +} + +/* Domain eviction callback handling. + * + * The file system requires notification of node death *before* the + * dlm completes it's recovery work, otherwise it may be able to + * acquire locks on resources requiring recovery. Since the dlm can + * evict a node from it's domain *before* heartbeat fires, a similar + * mechanism is required. */ + +/* Eviction is not expected to happen often, so a per-domain lock is + * not necessary. Eviction callbacks are allowed to sleep for short + * periods of time. */ +static DECLARE_RWSEM(dlm_callback_sem); + +void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, + int node_num) +{ + struct list_head *iter; + struct dlm_eviction_cb *cb; + + down_read(&dlm_callback_sem); + list_for_each(iter, &dlm->dlm_eviction_callbacks) { + cb = list_entry(iter, struct dlm_eviction_cb, ec_item); + + cb->ec_func(node_num, cb->ec_data); + } + up_read(&dlm_callback_sem); +} + +void dlm_setup_eviction_cb(struct dlm_eviction_cb *cb, + dlm_eviction_func *f, + void *data) +{ + INIT_LIST_HEAD(&cb->ec_item); + cb->ec_func = f; + cb->ec_data = data; +} +EXPORT_SYMBOL_GPL(dlm_setup_eviction_cb); + +void dlm_register_eviction_cb(struct dlm_ctxt *dlm, + struct dlm_eviction_cb *cb) +{ + down_write(&dlm_callback_sem); + list_add_tail(&cb->ec_item, &dlm->dlm_eviction_callbacks); + up_write(&dlm_callback_sem); +} +EXPORT_SYMBOL_GPL(dlm_register_eviction_cb); + +void dlm_unregister_eviction_cb(struct dlm_eviction_cb *cb) +{ + down_write(&dlm_callback_sem); + list_del_init(&cb->ec_item); + up_write(&dlm_callback_sem); +} +EXPORT_SYMBOL_GPL(dlm_unregister_eviction_cb); + +static int __init dlm_init(void) +{ + int status; + + dlm_print_version(); + + status = dlm_init_mle_cache(); + if (status) + return -1; + + status = 
dlm_register_net_handlers(); + if (status) { + dlm_destroy_mle_cache(); + return -1; + } + + return 0; +} + +static void __exit dlm_exit (void) +{ + dlm_unregister_net_handlers(); + dlm_destroy_mle_cache(); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(dlm_init); +module_exit(dlm_exit); diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h new file mode 100644 index 000000000000..2f7f60bfeb3b --- /dev/null +++ b/fs/ocfs2/dlm/dlmdomain.h @@ -0,0 +1,36 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmdomain.h + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + +#ifndef DLMDOMAIN_H +#define DLMDOMAIN_H + +extern spinlock_t dlm_domain_lock; +extern struct list_head dlm_domains; + +int dlm_joined(struct dlm_ctxt *dlm); +int dlm_shutting_down(struct dlm_ctxt *dlm); +void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, + int node_num); + +#endif diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c new file mode 100644 index 000000000000..dd2d24dc25e0 --- /dev/null +++ b/fs/ocfs2/dlm/dlmfs.c @@ -0,0 +1,640 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfs.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. This file handles the virtual file system + * used for communication with userspace. Credit should go to ramfs, + * which was a template for the fs side of this module. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +/* Simple VFS hooks based on: */ +/* + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. 
+ */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> + +#include <asm/uaccess.h> + + +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" + +#include "userdlm.h" + +#include "dlmfsver.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + +static struct super_operations dlmfs_ops; +static struct file_operations dlmfs_file_operations; +static struct inode_operations dlmfs_dir_inode_operations; +static struct inode_operations dlmfs_root_inode_operations; +static struct inode_operations dlmfs_file_inode_operations; +static kmem_cache_t *dlmfs_inode_cache; + +struct workqueue_struct *user_dlm_worker; + +/* + * decodes a set of open flags into a valid lock level and a set of flags. + * returns < 0 if we have invalid flags + * flags which mean something to us: + * O_RDONLY -> PRMODE level + * O_WRONLY -> EXMODE level + * + * O_NONBLOCK -> LKM_NOQUEUE + */ +static int dlmfs_decode_open_flags(int open_flags, + int *level, + int *flags) +{ + if (open_flags & (O_WRONLY|O_RDWR)) + *level = LKM_EXMODE; + else + *level = LKM_PRMODE; + + *flags = 0; + if (open_flags & O_NONBLOCK) + *flags |= LKM_NOQUEUE; + + return 0; +} + +static int dlmfs_file_open(struct inode *inode, + struct file *file) +{ + int status, level, flags; + struct dlmfs_filp_private *fp = NULL; + struct dlmfs_inode_private *ip; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "open called on inode %lu, flags 0x%x\n", inode->i_ino, + file->f_flags); + + status = dlmfs_decode_open_flags(file->f_flags, &level, &flags); + if (status < 0) + goto bail; + + /* We don't want to honor O_APPEND at read/write time as it + * doesn't make sense for LVB writes. 
*/ + file->f_flags &= ~O_APPEND; + + fp = kmalloc(sizeof(*fp), GFP_KERNEL); + if (!fp) { + status = -ENOMEM; + goto bail; + } + fp->fp_lock_level = level; + + ip = DLMFS_I(inode); + + status = user_dlm_cluster_lock(&ip->ip_lockres, level, flags); + if (status < 0) { + /* this is a strange error to return here but I want + * to be able userspace to be able to distinguish a + * valid lock request from one that simply couldn't be + * granted. */ + if (flags & LKM_NOQUEUE && status == -EAGAIN) + status = -ETXTBSY; + kfree(fp); + goto bail; + } + + file->private_data = fp; +bail: + return status; +} + +static int dlmfs_file_release(struct inode *inode, + struct file *file) +{ + int level, status; + struct dlmfs_inode_private *ip = DLMFS_I(inode); + struct dlmfs_filp_private *fp = + (struct dlmfs_filp_private *) file->private_data; + + if (S_ISDIR(inode->i_mode)) + BUG(); + + mlog(0, "close called on inode %lu\n", inode->i_ino); + + status = 0; + if (fp) { + level = fp->fp_lock_level; + if (level != LKM_IVMODE) + user_dlm_cluster_unlock(&ip->ip_lockres, level); + + kfree(fp); + file->private_data = NULL; + } + + return 0; +} + +static ssize_t dlmfs_file_read(struct file *filp, + char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t readlen; + char *lvb_buf; + struct inode *inode = filp->f_dentry->d_inode; + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return 0; + + if (!count) + return 0; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + /* don't read past the lvb */ + if ((count + *ppos) > i_size_read(inode)) + readlen = i_size_read(inode) - *ppos; + else + readlen = count - *ppos; + + lvb_buf = kmalloc(readlen, GFP_KERNEL); + if (!lvb_buf) + return -ENOMEM; + + user_dlm_read_lvb(inode, lvb_buf, readlen); + bytes_left = __copy_to_user(buf, lvb_buf, readlen); + readlen -= bytes_left; + + kfree(lvb_buf); + + *ppos = *ppos + readlen; + + mlog(0, "read 
%zd bytes\n", readlen); + return readlen; +} + +static ssize_t dlmfs_file_write(struct file *filp, + const char __user *buf, + size_t count, + loff_t *ppos) +{ + int bytes_left; + ssize_t writelen; + char *lvb_buf; + struct inode *inode = filp->f_dentry->d_inode; + + mlog(0, "inode %lu, count = %zu, *ppos = %llu\n", + inode->i_ino, count, *ppos); + + if (*ppos >= i_size_read(inode)) + return -ENOSPC; + + if (!count) + return 0; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* don't write past the lvb */ + if ((count + *ppos) > i_size_read(inode)) + writelen = i_size_read(inode) - *ppos; + else + writelen = count - *ppos; + + lvb_buf = kmalloc(writelen, GFP_KERNEL); + if (!lvb_buf) + return -ENOMEM; + + bytes_left = copy_from_user(lvb_buf, buf, writelen); + writelen -= bytes_left; + if (writelen) + user_dlm_write_lvb(inode, lvb_buf, writelen); + + kfree(lvb_buf); + + *ppos = *ppos + writelen; + mlog(0, "wrote %zd bytes\n", writelen); + return writelen; +} + +static void dlmfs_init_once(void *foo, + kmem_cache_t *cachep, + unsigned long flags) +{ + struct dlmfs_inode_private *ip = + (struct dlmfs_inode_private *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + ip->ip_dlm = NULL; + ip->ip_parent = NULL; + + inode_init_once(&ip->ip_vfs_inode); + } +} + +static struct inode *dlmfs_alloc_inode(struct super_block *sb) +{ + struct dlmfs_inode_private *ip; + + ip = kmem_cache_alloc(dlmfs_inode_cache, SLAB_NOFS); + if (!ip) + return NULL; + + return &ip->ip_vfs_inode; +} + +static void dlmfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(dlmfs_inode_cache, DLMFS_I(inode)); +} + +static void dlmfs_clear_inode(struct inode *inode) +{ + int status; + struct dlmfs_inode_private *ip; + + if (!inode) + return; + + mlog(0, "inode %lu\n", inode->i_ino); + + ip = DLMFS_I(inode); + + if (S_ISREG(inode->i_mode)) { + status = user_dlm_destroy_lock(&ip->ip_lockres); + if (status < 0) + mlog_errno(status); + 
iput(ip->ip_parent); + goto clear_fields; + } + + mlog(0, "we're a directory, ip->ip_dlm = 0x%p\n", ip->ip_dlm); + /* we must be a directory. If required, lets unregister the + * dlm context now. */ + if (ip->ip_dlm) + user_dlm_unregister_context(ip->ip_dlm); +clear_fields: + ip->ip_parent = NULL; + ip->ip_dlm = NULL; +} + +static struct backing_dev_info dlmfs_backing_dev_info = { + .ra_pages = 0, /* No readahead */ + .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, +}; + +static struct inode *dlmfs_get_root_inode(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + int mode = S_IFDIR | 0755; + struct dlmfs_inode_private *ip; + + if (inode) { + ip = DLMFS_I(inode); + + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_nlink++; + + inode->i_fop = &simple_dir_operations; + inode->i_op = &dlmfs_root_inode_operations; + } + + return inode; +} + +static struct inode *dlmfs_get_inode(struct inode *parent, + struct dentry *dentry, + int mode) +{ + struct super_block *sb = parent->i_sb; + struct inode * inode = new_inode(sb); + struct dlmfs_inode_private *ip; + + if (!inode) + return NULL; + + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + + ip = DLMFS_I(inode); + ip->ip_dlm = DLMFS_I(parent)->ip_dlm; + + switch (mode & S_IFMT) { + default: + /* for now we don't support anything other than + * directories and regular files. 
*/ + BUG(); + break; + case S_IFREG: + inode->i_op = &dlmfs_file_inode_operations; + inode->i_fop = &dlmfs_file_operations; + + i_size_write(inode, DLM_LVB_LEN); + + user_dlm_lock_res_init(&ip->ip_lockres, dentry); + + /* released at clear_inode time, this insures that we + * get to drop the dlm reference on each lock *before* + * we call the unregister code for releasing parent + * directories. */ + ip->ip_parent = igrab(parent); + BUG_ON(!ip->ip_parent); + break; + case S_IFDIR: + inode->i_op = &dlmfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == + * 2 (for "." entry) */ + inode->i_nlink++; + break; + } + + if (parent->i_mode & S_ISGID) { + inode->i_gid = parent->i_gid; + if (S_ISDIR(mode)) + inode->i_mode |= S_ISGID; + } + + return inode; +} + +/* + * File creation. Allocate an inode, and we're done.. + */ +/* SMP-safe */ +static int dlmfs_mkdir(struct inode * dir, + struct dentry * dentry, + int mode) +{ + int status; + struct inode *inode = NULL; + struct qstr *domain = &dentry->d_name; + struct dlmfs_inode_private *ip; + struct dlm_ctxt *dlm; + + mlog(0, "mkdir %.*s\n", domain->len, domain->name); + + /* verify that we have a proper domain */ + if (domain->len >= O2NM_MAX_NAME_LEN) { + status = -EINVAL; + mlog(ML_ERROR, "invalid domain name for directory.\n"); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFDIR); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + ip = DLMFS_I(inode); + + dlm = user_dlm_register_context(domain); + if (IS_ERR(dlm)) { + status = PTR_ERR(dlm); + mlog(ML_ERROR, "Error %d could not register domain \"%.*s\"\n", + status, domain->len, domain->name); + goto bail; + } + ip->ip_dlm = dlm; + + dir->i_nlink++; + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ + + status = 0; +bail: + if (status < 0) + iput(inode); + return status; +} + +static int dlmfs_create(struct inode *dir, + 
struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + int status = 0; + struct inode *inode; + struct qstr *name = &dentry->d_name; + + mlog(0, "create %.*s\n", name->len, name->name); + + /* verify name is valid and doesn't contain any dlm reserved + * characters */ + if (name->len >= USER_DLM_LOCK_ID_MAX_LEN || + name->name[0] == '$') { + status = -EINVAL; + mlog(ML_ERROR, "invalid lock name, %.*s\n", name->len, + name->name); + goto bail; + } + + inode = dlmfs_get_inode(dir, dentry, mode | S_IFREG); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ +bail: + return status; +} + +static int dlmfs_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + struct inode *inode = dentry->d_inode; + + mlog(0, "unlink inode %lu\n", inode->i_ino); + + /* if there are no current holders, or none that are waiting + * to acquire a lock, this basically destroys our lockres. 
*/ + status = user_dlm_destroy_lock(&DLMFS_I(inode)->ip_lockres); + if (status < 0) { + mlog(ML_ERROR, "unlink %.*s, error %d from destroy\n", + dentry->d_name.len, dentry->d_name.name, status); + goto bail; + } + status = simple_unlink(dir, dentry); +bail: + return status; +} + +static int dlmfs_fill_super(struct super_block * sb, + void * data, + int silent) +{ + struct inode * inode; + struct dentry * root; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = DLMFS_MAGIC; + sb->s_op = &dlmfs_ops; + inode = dlmfs_get_root_inode(sb); + if (!inode) + return -ENOMEM; + + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + sb->s_root = root; + return 0; +} + +static struct file_operations dlmfs_file_operations = { + .open = dlmfs_file_open, + .release = dlmfs_file_release, + .read = dlmfs_file_read, + .write = dlmfs_file_write, +}; + +static struct inode_operations dlmfs_dir_inode_operations = { + .create = dlmfs_create, + .lookup = simple_lookup, + .unlink = dlmfs_unlink, +}; + +/* this way we can restrict mkdir to only the toplevel of the fs. 
*/ +static struct inode_operations dlmfs_root_inode_operations = { + .lookup = simple_lookup, + .mkdir = dlmfs_mkdir, + .rmdir = simple_rmdir, +}; + +static struct super_operations dlmfs_ops = { + .statfs = simple_statfs, + .alloc_inode = dlmfs_alloc_inode, + .destroy_inode = dlmfs_destroy_inode, + .clear_inode = dlmfs_clear_inode, + .drop_inode = generic_delete_inode, +}; + +static struct inode_operations dlmfs_file_inode_operations = { + .getattr = simple_getattr, +}; + +static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super); +} + +static struct file_system_type dlmfs_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2_dlmfs", + .get_sb = dlmfs_get_sb, + .kill_sb = kill_litter_super, +}; + +static int __init init_dlmfs_fs(void) +{ + int status; + int cleanup_inode = 0, cleanup_worker = 0; + + dlmfs_print_version(); + + dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", + sizeof(struct dlmfs_inode_private), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, + dlmfs_init_once, NULL); + if (!dlmfs_inode_cache) + return -ENOMEM; + cleanup_inode = 1; + + user_dlm_worker = create_singlethread_workqueue("user_dlm"); + if (!user_dlm_worker) { + status = -ENOMEM; + goto bail; + } + cleanup_worker = 1; + + status = register_filesystem(&dlmfs_fs_type); +bail: + if (status) { + if (cleanup_inode) + kmem_cache_destroy(dlmfs_inode_cache); + if (cleanup_worker) + destroy_workqueue(user_dlm_worker); + } else + printk("OCFS2 User DLM kernel interface loaded\n"); + return status; +} + +static void __exit exit_dlmfs_fs(void) +{ + unregister_filesystem(&dlmfs_fs_type); + + flush_workqueue(user_dlm_worker); + destroy_workqueue(user_dlm_worker); + + if (kmem_cache_destroy(dlmfs_inode_cache)) + printk(KERN_INFO "dlmfs_inode_cache: not all structures " + "were freed\n"); +} + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +module_init(init_dlmfs_fs) 
+module_exit(exit_dlmfs_fs) diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c new file mode 100644 index 000000000000..d2be3ad841f9 --- /dev/null +++ b/fs/ocfs2/dlm/dlmfsver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include "dlmfsver.h" + +#define DLM_BUILD_VERSION "1.3.3" + +#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION + +void dlmfs_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmfsver.h b/fs/ocfs2/dlm/dlmfsver.h new file mode 100644 index 000000000000..f35eadbed25c --- /dev/null +++ b/fs/ocfs2/dlm/dlmfsver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved.
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +#ifndef DLMFS_VER_H +#define DLMFS_VER_H + +void dlmfs_print_version(void); + +#endif /* DLMFS_VER_H */ diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c new file mode 100644 index 000000000000..d1a0038557a3 --- /dev/null +++ b/fs/ocfs2/dlm/dlmlock.c @@ -0,0 +1,676 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmlock.c + * + * underlying calls for lock creation + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA.
+ * + */ + + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/random.h> +#include <linux/blkdev.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/spinlock.h> +#include <linux/delay.h> + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#include "dlmconvert.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED; +static u64 dlm_next_cookie = 1; + +static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags); +static void dlm_init_lock(struct dlm_lock *newlock, int type, + u8 node, u64 cookie); +static void dlm_lock_release(struct kref *kref); +static void dlm_lock_detach_lockres(struct dlm_lock *lock); + +/* Tell us whether we can grant a new lock request. + * locking: + * caller needs: res->spinlock + * taken: none + * held on exit: none + * returns: 1 if the lock can be granted, 0 otherwise. 
+ */ +static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + struct list_head *iter; + struct dlm_lock *tmplock; + + list_for_each(iter, &res->granted) { + tmplock = list_entry(iter, struct dlm_lock, list); + + if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) + return 0; + } + + list_for_each(iter, &res->converting) { + tmplock = list_entry(iter, struct dlm_lock, list); + + if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) + return 0; + } + + return 1; +} + +/* performs lock creation at the lockres master site + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_NOTQUEUED + */ +static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + int call_ast = 0, kick_thread = 0; + enum dlm_status status = DLM_NORMAL; + + mlog_entry("type=%d\n", lock->ml.type); + + spin_lock(&res->spinlock); + /* if called from dlm_create_lock_handler, need to + * ensure it will not sleep in dlm_wait_on_lockres */ + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL && + lock->ml.node != dlm->node_num) { + /* erf. state changed after lock was dropped. */ + spin_unlock(&res->spinlock); + dlm_error(status); + return status; + } + __dlm_wait_on_lockres(res); + __dlm_lockres_reserve_ast(res); + + if (dlm_can_grant_new_lock(res, lock)) { + mlog(0, "I can grant this lock right away\n"); + /* got it right away */ + lock->lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->granted); + + /* for the recovery lock, we can't allow the ast + * to be queued since the dlmthread is already + * frozen. 
but the recovery lock is always locked + * with LKM_NOQUEUE so we do not need the ast in + * this special case */ + if (!dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + kick_thread = 1; + call_ast = 1; + } + } else { + /* for NOQUEUE request, unless we get the + * lock right away, return DLM_NOTQUEUED */ + if (flags & LKM_NOQUEUE) + status = DLM_NOTQUEUED; + else { + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->blocked); + kick_thread = 1; + } + } + + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* either queue the ast or release it */ + if (call_ast) + dlm_queue_ast(dlm, lock); + else + dlm_lockres_release_ast(dlm, res); + + dlm_lockres_calc_usage(dlm, res); + if (kick_thread) + dlm_kick_thread(dlm, res); + + return status; +} + +void dlm_revert_pending_lock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* remove from local queue if it failed */ + list_del_init(&lock->list); + lock->lksb->flags &= ~DLM_LKSB_GET_LVB; +} + + +/* + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_DENIED, DLM_RECOVERING, or net status + */ +static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + enum dlm_status status = DLM_DENIED; + + mlog_entry("type=%d\n", lock->ml.type); + mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, + res->lockname.name, flags); + + spin_lock(&res->spinlock); + + /* will exit this call with spinlock held */ + __dlm_wait_on_lockres(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + + /* add lock to local (secondary) queue */ + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->blocked); + lock->lock_pending = 1; + spin_unlock(&res->spinlock); + + /* spec seems to say that you will get DLM_NORMAL when the lock + * has been queued, meaning we need to wait for a reply here. 
*/ + status = dlm_send_remote_lock_request(dlm, res, lock, flags); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + lock->lock_pending = 0; + if (status != DLM_NORMAL) { + if (status != DLM_NOTQUEUED) + dlm_error(status); + dlm_revert_pending_lock(res, lock); + dlm_lock_put(lock); + } + spin_unlock(&res->spinlock); + + dlm_lockres_calc_usage(dlm, res); + + wake_up(&res->wq); + return status; +} + + +/* for remote lock creation. + * locking: + * caller needs: none, but need res->state & DLM_LOCK_RES_IN_PROGRESS + * taken: none + * held on exit: none + * returns: DLM_NOLOCKMGR, or net status + */ +static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, int flags) +{ + struct dlm_create_lock create; + int tmpret, status = 0; + enum dlm_status ret; + + mlog_entry_void(); + + memset(&create, 0, sizeof(create)); + create.node_idx = dlm->node_num; + create.requested_type = lock->ml.type; + create.cookie = lock->ml.cookie; + create.namelen = res->lockname.len; + create.flags = cpu_to_be32(flags); + memcpy(create.name, res->lockname.name, create.namelen); + + tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, + sizeof(create), res->owner, &status); + if (tmpret >= 0) { + // successfully sent and received + ret = status; // this is already a dlm_status + } else { + mlog_errno(tmpret); + if (dlm_is_host_down(tmpret)) { + ret = DLM_RECOVERING; + mlog(0, "node %u died so returning DLM_RECOVERING " + "from lock message!\n", res->owner); + } else { + ret = dlm_err_to_dlm_status(tmpret); + } + } + + return ret; +} + +void dlm_lock_get(struct dlm_lock *lock) +{ + kref_get(&lock->lock_refs); +} + +void dlm_lock_put(struct dlm_lock *lock) +{ + kref_put(&lock->lock_refs, dlm_lock_release); +} + +static void dlm_lock_release(struct kref *kref) +{ + struct dlm_lock *lock; + + lock = container_of(kref, struct dlm_lock, lock_refs); + + BUG_ON(!list_empty(&lock->list)); + 
BUG_ON(!list_empty(&lock->ast_list)); + BUG_ON(!list_empty(&lock->bast_list)); + BUG_ON(lock->ast_pending); + BUG_ON(lock->bast_pending); + + dlm_lock_detach_lockres(lock); + + if (lock->lksb_kernel_allocated) { + mlog(0, "freeing kernel-allocated lksb\n"); + kfree(lock->lksb); + } + kfree(lock); +} + +/* associate a lock with its lockres, getting a ref on the lockres */ +void dlm_lock_attach_lockres(struct dlm_lock *lock, + struct dlm_lock_resource *res) +{ + dlm_lockres_get(res); + lock->lockres = res; +} + +/* drop ref on lockres, if there is still one associated with lock */ +static void dlm_lock_detach_lockres(struct dlm_lock *lock) +{ + struct dlm_lock_resource *res; + + res = lock->lockres; + if (res) { + lock->lockres = NULL; + mlog(0, "removing lock's lockres reference\n"); + dlm_lockres_put(res); + } +} + +static void dlm_init_lock(struct dlm_lock *newlock, int type, + u8 node, u64 cookie) +{ + INIT_LIST_HEAD(&newlock->list); + INIT_LIST_HEAD(&newlock->ast_list); + INIT_LIST_HEAD(&newlock->bast_list); + spin_lock_init(&newlock->spinlock); + newlock->ml.type = type; + newlock->ml.convert_type = LKM_IVMODE; + newlock->ml.highest_blocked = LKM_IVMODE; + newlock->ml.node = node; + newlock->ml.pad1 = 0; + newlock->ml.list = 0; + newlock->ml.flags = 0; + newlock->ast = NULL; + newlock->bast = NULL; + newlock->astdata = NULL; + newlock->ml.cookie = cpu_to_be64(cookie); + newlock->ast_pending = 0; + newlock->bast_pending = 0; + newlock->convert_pending = 0; + newlock->lock_pending = 0; + newlock->unlock_pending = 0; + newlock->cancel_pending = 0; + newlock->lksb_kernel_allocated = 0; + + kref_init(&newlock->lock_refs); +} + +struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, + struct dlm_lockstatus *lksb) +{ + struct dlm_lock *lock; + int kernel_allocated = 0; + + lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); + if (!lock) + return NULL; + + if (!lksb) { + /* zero memory only if kernel-allocated */ + lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); + if
(!lksb) { + kfree(lock); + return NULL; + } + kernel_allocated = 1; + } + + dlm_init_lock(lock, type, node, cookie); + if (kernel_allocated) + lock->lksb_kernel_allocated = 1; + lock->lksb = lksb; + lksb->lockid = lock; + return lock; +} + +/* handler for lock creation net message + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_SYSERR, DLM_IVLOCKID, DLM_NOTQUEUED + */ +int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_create_lock *create = (struct dlm_create_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *newlock = NULL; + struct dlm_lockstatus *lksb = NULL; + enum dlm_status status = DLM_NORMAL; + char *name; + unsigned int namelen; + + BUG_ON(!dlm); + + mlog_entry_void(); + + if (!dlm_grab(dlm)) + return DLM_REJECTED; + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + name = create->name; + namelen = create->namelen; + + status = DLM_IVBUFLEN; + if (namelen > DLM_LOCKID_NAME_MAX) { + dlm_error(status); + goto leave; + } + + status = DLM_SYSERR; + newlock = dlm_new_lock(create->requested_type, + create->node_idx, + be64_to_cpu(create->cookie), NULL); + if (!newlock) { + dlm_error(status); + goto leave; + } + + lksb = newlock->lksb; + + if (be32_to_cpu(create->flags) & LKM_GET_LVB) { + lksb->flags |= DLM_LKSB_GET_LVB; + mlog(0, "set DLM_LKSB_GET_LVB flag\n"); + } + + status = DLM_IVLOCKID; + res = dlm_lookup_lockres(dlm, name, namelen); + if (!res) { + dlm_error(status); + goto leave; + } + + spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + spin_unlock(&res->spinlock); + + if (status != DLM_NORMAL) { + mlog(0, "lockres recovering/migrating/in-progress\n"); + goto leave; + } + + dlm_lock_attach_lockres(newlock, res); + + status = dlmlock_master(dlm, res, newlock, be32_to_cpu(create->flags)); +leave: + if (status != 
DLM_NORMAL) + if (newlock) + dlm_lock_put(newlock); + + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} + + +/* fetch next node-local (u8 nodenum + u56 cookie) into u64 */ +static inline void dlm_get_next_cookie(u8 node_num, u64 *cookie) +{ + u64 tmpnode = node_num; + + /* shift single byte of node num into top 8 bits */ + tmpnode <<= 56; + + spin_lock(&dlm_cookie_lock); + *cookie = (dlm_next_cookie | tmpnode); + if (++dlm_next_cookie & 0xff00000000000000ull) { + mlog(0, "This node's cookie will now wrap!\n"); + dlm_next_cookie = 1; + } + spin_unlock(&dlm_cookie_lock); +} + +enum dlm_status dlmlock(struct dlm_ctxt *dlm, int mode, + struct dlm_lockstatus *lksb, int flags, + const char *name, dlm_astlockfunc_t *ast, void *data, + dlm_bastlockfunc_t *bast) +{ + enum dlm_status status; + struct dlm_lock_resource *res = NULL; + struct dlm_lock *lock = NULL; + int convert = 0, recovery = 0; + + /* yes this function is a mess. + * TODO: clean this up. lots of common code in the + * lock and convert paths, especially in the retry blocks */ + if (!lksb) { + dlm_error(DLM_BADARGS); + return DLM_BADARGS; + } + + status = DLM_BADPARAM; + if (mode != LKM_EXMODE && mode != LKM_PRMODE && mode != LKM_NLMODE) { + dlm_error(status); + goto error; + } + + if (flags & ~LKM_VALID_FLAGS) { + dlm_error(status); + goto error; + } + + convert = (flags & LKM_CONVERT); + recovery = (flags & LKM_RECOVERY); + + if (recovery && + (!dlm_is_recovery_lock(name, strlen(name)) || convert) ) { + dlm_error(status); + goto error; + } + if (convert && (flags & LKM_LOCAL)) { + mlog(ML_ERROR, "strange LOCAL convert request!\n"); + goto error; + } + + if (convert) { + /* CONVERT request */ + + /* if converting, must pass in a valid dlm_lock */ + lock = lksb->lockid; + if (!lock) { + mlog(ML_ERROR, "NULL lock pointer in convert " + "request\n"); + goto error; + } + + res = lock->lockres; + if (!res) { + mlog(ML_ERROR, "NULL lockres pointer in convert " + "request\n"); + goto error; 
+ } + dlm_lockres_get(res); + + /* XXX: for ocfs2 purposes, the ast/bast/astdata/lksb are + * static after the original lock call. convert requests will + * ensure that everything is the same, or return DLM_BADARGS. + * this means that DLM_DENIED_NOASTS will never be returned. + */ + if (lock->lksb != lksb || lock->ast != ast || + lock->bast != bast || lock->astdata != data) { + status = DLM_BADARGS; + mlog(ML_ERROR, "new args: lksb=%p, ast=%p, bast=%p, " + "astdata=%p\n", lksb, ast, bast, data); + mlog(ML_ERROR, "orig args: lksb=%p, ast=%p, bast=%p, " + "astdata=%p\n", lock->lksb, lock->ast, + lock->bast, lock->astdata); + goto error; + } +retry_convert: + dlm_wait_for_recovery(dlm); + + if (res->owner == dlm->node_num) + status = dlmconvert_master(dlm, res, lock, flags, mode); + else + status = dlmconvert_remote(dlm, res, lock, flags, mode); + if (status == DLM_RECOVERING || status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* for now, see how this works without sleeping + * and just retry right away. 
I suspect the reco + * or migration will complete fast enough that + * no waiting will be necessary */ + mlog(0, "retrying convert with migration/recovery/" + "in-progress\n"); + msleep(100); + goto retry_convert; + } + } else { + u64 tmpcookie; + + /* LOCK request */ + status = DLM_BADARGS; + if (!name) { + dlm_error(status); + goto error; + } + + status = DLM_IVBUFLEN; + if (strlen(name) > DLM_LOCKID_NAME_MAX || strlen(name) < 1) { + dlm_error(status); + goto error; + } + + dlm_get_next_cookie(dlm->node_num, &tmpcookie); + lock = dlm_new_lock(mode, dlm->node_num, tmpcookie, lksb); + if (!lock) { + dlm_error(status); + goto error; + } + + if (!recovery) + dlm_wait_for_recovery(dlm); + + /* find or create the lock resource */ + res = dlm_get_lock_resource(dlm, name, flags); + if (!res) { + status = DLM_IVLOCKID; + dlm_error(status); + goto error; + } + + mlog(0, "type=%d, flags = 0x%x\n", mode, flags); + mlog(0, "creating lock: lock=%p res=%p\n", lock, res); + + dlm_lock_attach_lockres(lock, res); + lock->ast = ast; + lock->bast = bast; + lock->astdata = data; + +retry_lock: + if (flags & LKM_VALBLK) { + mlog(0, "LKM_VALBLK passed by caller\n"); + + /* LVB requests for non PR, PW or EX locks are + * ignored. 
*/ + if (mode < LKM_PRMODE) + flags &= ~LKM_VALBLK; + else { + flags |= LKM_GET_LVB; + lock->lksb->flags |= DLM_LKSB_GET_LVB; + } + } + + if (res->owner == dlm->node_num) + status = dlmlock_master(dlm, res, lock, flags); + else + status = dlmlock_remote(dlm, res, lock, flags); + + if (status == DLM_RECOVERING || status == DLM_MIGRATING || + status == DLM_FORWARD) { + mlog(0, "retrying lock with migration/" + "recovery/in progress\n"); + msleep(100); + dlm_wait_for_recovery(dlm); + goto retry_lock; + } + + if (status != DLM_NORMAL) { + lock->lksb->flags &= ~DLM_LKSB_GET_LVB; + if (status != DLM_NOTQUEUED) + dlm_error(status); + goto error; + } + } + +error: + if (status != DLM_NORMAL) { + if (lock && !convert) + dlm_lock_put(lock); + // this is kind of unnecessary + lksb->status = status; + } + + /* put lockres ref from the convert path + * or from dlm_get_lock_resource */ + if (res) + dlm_lockres_put(res); + + return status; +} +EXPORT_SYMBOL_GPL(dlmlock); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c new file mode 100644 index 000000000000..27e984f7e4cd --- /dev/null +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -0,0 +1,2664 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmmod.c + * + * standalone DLM module + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/random.h> +#include <linux/blkdev.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/spinlock.h> +#include <linux/delay.h> + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdebug.h" +#include "dlmdomain.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) +#include "cluster/masklog.h" + +enum dlm_mle_type { + DLM_MLE_BLOCK, + DLM_MLE_MASTER, + DLM_MLE_MIGRATION +}; + +struct dlm_lock_name +{ + u8 len; + u8 name[DLM_LOCKID_NAME_MAX]; +}; + +struct dlm_master_list_entry +{ + struct list_head list; + struct list_head hb_events; + struct dlm_ctxt *dlm; + spinlock_t spinlock; + wait_queue_head_t wq; + atomic_t woken; + struct kref mle_refs; + unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + u8 master; + u8 new_master; + enum dlm_mle_type type; + struct o2hb_callback_func mle_hb_up; + struct o2hb_callback_func mle_hb_down; + union { + struct dlm_lock_resource *res; + struct dlm_lock_name name; + } u; +}; + +static void dlm_mle_node_down(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, + int idx); +static void dlm_mle_node_up(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, + int idx); + +static void dlm_assert_master_worker(struct 
dlm_work_item *item, void *data); +static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, + unsigned int namelen, void *nodemap, + u32 flags); + +static inline int dlm_mle_equal(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + const char *name, + unsigned int namelen) +{ + struct dlm_lock_resource *res; + + if (dlm != mle->dlm) + return 0; + + if (mle->type == DLM_MLE_BLOCK || + mle->type == DLM_MLE_MIGRATION) { + if (namelen != mle->u.name.len || + memcmp(name, mle->u.name.name, namelen)!=0) + return 0; + } else { + res = mle->u.res; + if (namelen != res->lockname.len || + memcmp(res->lockname.name, name, namelen) != 0) + return 0; + } + return 1; +} + +#if 0 +/* Code here is included but defined out as it aids debugging */ + +void dlm_print_one_mle(struct dlm_master_list_entry *mle) +{ + int i = 0, refs; + char *type; + char attached; + u8 master; + unsigned int namelen; + const char *name; + struct kref *k; + + k = &mle->mle_refs; + if (mle->type == DLM_MLE_BLOCK) + type = "BLK"; + else if (mle->type == DLM_MLE_MASTER) + type = "MAS"; + else + type = "MIG"; + refs = atomic_read(&k->refcount); + master = mle->master; + attached = (list_empty(&mle->hb_events) ? 'N' : 'Y'); + + if (mle->type != DLM_MLE_MASTER) { + namelen = mle->u.name.len; + name = mle->u.name.name; + } else { + namelen = mle->u.res->lockname.len; + name = mle->u.res->lockname.name; + } + + mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", + i, type, refs, master, mle->new_master, attached, + namelen, namelen, name); +} + +static void dlm_dump_mles(struct dlm_ctxt *dlm) +{ + struct dlm_master_list_entry *mle; + struct list_head *iter; + + mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); + mlog(ML_NOTICE, " ####: type refs owner new events? 
lockname nodemap votemap respmap maybemap\n"); + spin_lock(&dlm->master_lock); + list_for_each(iter, &dlm->master_list) { + mle = list_entry(iter, struct dlm_master_list_entry, list); + dlm_print_one_mle(mle); + } + spin_unlock(&dlm->master_lock); +} + +int dlm_dump_all_mles(const char __user *data, unsigned int len) +{ + struct list_head *iter; + struct dlm_ctxt *dlm; + + spin_lock(&dlm_domain_lock); + list_for_each(iter, &dlm_domains) { + dlm = list_entry (iter, struct dlm_ctxt, list); + mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); + dlm_dump_mles(dlm); + } + spin_unlock(&dlm_domain_lock); + return len; +} +EXPORT_SYMBOL_GPL(dlm_dump_all_mles); + +#endif /* 0 */ + + +static kmem_cache_t *dlm_mle_cache = NULL; + + +static void dlm_mle_release(struct kref *kref); +static void dlm_init_mle(struct dlm_master_list_entry *mle, + enum dlm_mle_type type, + struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, + unsigned int namelen); +static void dlm_put_mle(struct dlm_master_list_entry *mle); +static void __dlm_put_mle(struct dlm_master_list_entry *mle); +static int dlm_find_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry **mle, + char *name, unsigned int namelen); + +static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to); + + +static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int *blocked); +static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int blocked); +static int dlm_add_migration_mle(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + struct dlm_master_list_entry **oldmle, + const char *name, unsigned int namelen, + u8 new_master, u8 master); + +static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res); +static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, + 
struct dlm_lock_resource *res); +static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target); + + +int dlm_is_host_down(int errno) +{ + switch (errno) { + case -EBADF: + case -ECONNREFUSED: + case -ENOTCONN: + case -ECONNRESET: + case -EPIPE: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ETIMEDOUT: + case -ECONNABORTED: + case -ENETDOWN: + case -ENETUNREACH: + case -ENETRESET: + case -ESHUTDOWN: + case -ENOPROTOOPT: + case -EINVAL: /* if returned from our tcp code, + this means there is no socket */ + return 1; + } + return 0; +} + + +/* + * MASTER LIST FUNCTIONS + */ + + +/* + * regarding master list entries and heartbeat callbacks: + * + * in order to avoid sleeping and allocation that occurs in + * heartbeat, master list entries are simply attached to the + * dlm's established heartbeat callbacks. the mle is attached + * when it is created, and since the dlm->spinlock is held at + * that time, any heartbeat event will be properly discovered + * by the mle. the mle needs to be detached from the + * dlm->mle_hb_events list as soon as heartbeat events are no + * longer useful to the mle, and before the mle is freed. + * + * as a general rule, heartbeat events are no longer needed by + * the mle once an "answer" regarding the lock master has been + * received. 
+ */ +static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + assert_spin_locked(&dlm->spinlock); + + list_add_tail(&mle->hb_events, &dlm->mle_hb_events); +} + + +static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + if (!list_empty(&mle->hb_events)) + list_del_init(&mle->hb_events); +} + + +static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle) +{ + spin_lock(&dlm->spinlock); + __dlm_mle_detach_hb_events(dlm, mle); + spin_unlock(&dlm->spinlock); +} + +/* remove from list and free */ +static void __dlm_put_mle(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + BUG_ON(!atomic_read(&mle->mle_refs.refcount)); + + kref_put(&mle->mle_refs, dlm_mle_release); +} + + +/* must not have any spinlocks coming in */ +static void dlm_put_mle(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); +} + +static inline void dlm_get_mle(struct dlm_master_list_entry *mle) +{ + kref_get(&mle->mle_refs); +} + +static void dlm_init_mle(struct dlm_master_list_entry *mle, + enum dlm_mle_type type, + struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, + unsigned int namelen) +{ + assert_spin_locked(&dlm->spinlock); + + mle->dlm = dlm; + mle->type = type; + INIT_LIST_HEAD(&mle->list); + INIT_LIST_HEAD(&mle->hb_events); + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + spin_lock_init(&mle->spinlock); + init_waitqueue_head(&mle->wq); + atomic_set(&mle->woken, 0); + kref_init(&mle->mle_refs); + memset(mle->response_map, 0, sizeof(mle->response_map)); + mle->master = O2NM_MAX_NODES; + mle->new_master = O2NM_MAX_NODES; + + if 
(mle->type == DLM_MLE_MASTER) { + BUG_ON(!res); + mle->u.res = res; + } else if (mle->type == DLM_MLE_BLOCK) { + BUG_ON(!name); + memcpy(mle->u.name.name, name, namelen); + mle->u.name.len = namelen; + } else /* DLM_MLE_MIGRATION */ { + BUG_ON(!name); + memcpy(mle->u.name.name, name, namelen); + mle->u.name.len = namelen; + } + + /* copy off the node_map and register hb callbacks on our copy */ + memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map)); + memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map)); + clear_bit(dlm->node_num, mle->vote_map); + clear_bit(dlm->node_num, mle->node_map); + + /* attach the mle to the domain node up/down events */ + __dlm_mle_attach_hb_events(dlm, mle); +} + + +/* returns 1 if found, 0 if not */ +static int dlm_find_mle(struct dlm_ctxt *dlm, + struct dlm_master_list_entry **mle, + char *name, unsigned int namelen) +{ + struct dlm_master_list_entry *tmpmle; + struct list_head *iter; + + assert_spin_locked(&dlm->master_lock); + + list_for_each(iter, &dlm->master_list) { + tmpmle = list_entry(iter, struct dlm_master_list_entry, list); + if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) + continue; + dlm_get_mle(tmpmle); + *mle = tmpmle; + return 1; + } + return 0; +} + +void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) +{ + struct dlm_master_list_entry *mle; + struct list_head *iter; + + assert_spin_locked(&dlm->spinlock); + + list_for_each(iter, &dlm->mle_hb_events) { + mle = list_entry(iter, struct dlm_master_list_entry, + hb_events); + if (node_up) + dlm_mle_node_up(dlm, mle, NULL, idx); + else + dlm_mle_node_down(dlm, mle, NULL, idx); + } +} + +static void dlm_mle_node_down(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, int idx) +{ + spin_lock(&mle->spinlock); + + if (!test_bit(idx, mle->node_map)) + mlog(0, "node %u already removed from nodemap!\n", idx); + else + clear_bit(idx, mle->node_map); + + spin_unlock(&mle->spinlock); +} + +static void 
dlm_mle_node_up(struct dlm_ctxt *dlm, + struct dlm_master_list_entry *mle, + struct o2nm_node *node, int idx) +{ + spin_lock(&mle->spinlock); + + if (test_bit(idx, mle->node_map)) + mlog(0, "node %u already in node map!\n", idx); + else + set_bit(idx, mle->node_map); + + spin_unlock(&mle->spinlock); +} + + +int dlm_init_mle_cache(void) +{ + dlm_mle_cache = kmem_cache_create("dlm_mle_cache", + sizeof(struct dlm_master_list_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (dlm_mle_cache == NULL) + return -ENOMEM; + return 0; +} + +void dlm_destroy_mle_cache(void) +{ + if (dlm_mle_cache) + kmem_cache_destroy(dlm_mle_cache); +} + +static void dlm_mle_release(struct kref *kref) +{ + struct dlm_master_list_entry *mle; + struct dlm_ctxt *dlm; + + mlog_entry_void(); + + mle = container_of(kref, struct dlm_master_list_entry, mle_refs); + dlm = mle->dlm; + + if (mle->type != DLM_MLE_MASTER) { + mlog(0, "calling mle_release for %.*s, type %d\n", + mle->u.name.len, mle->u.name.name, mle->type); + } else { + mlog(0, "calling mle_release for %.*s, type %d\n", + mle->u.res->lockname.len, + mle->u.res->lockname.name, mle->type); + } + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + /* remove from list if not already */ + if (!list_empty(&mle->list)) + list_del_init(&mle->list); + + /* detach the mle from the domain node up/down events */ + __dlm_mle_detach_hb_events(dlm, mle); + + /* NOTE: kfree under spinlock here. + * if this is bad, we can move this to a freelist. 
*/ + kmem_cache_free(dlm_mle_cache, mle); +} + + +/* + * LOCK RESOURCE FUNCTIONS + */ + +static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 owner) +{ + assert_spin_locked(&res->spinlock); + + mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner); + + if (owner == dlm->node_num) + atomic_inc(&dlm->local_resources); + else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN) + atomic_inc(&dlm->unknown_resources); + else + atomic_inc(&dlm->remote_resources); + + res->owner = owner; +} + +void dlm_change_lockres_owner(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 owner) +{ + assert_spin_locked(&res->spinlock); + + if (owner == res->owner) + return; + + if (res->owner == dlm->node_num) + atomic_dec(&dlm->local_resources); + else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) + atomic_dec(&dlm->unknown_resources); + else + atomic_dec(&dlm->remote_resources); + + dlm_set_lockres_owner(dlm, res, owner); +} + + +static void dlm_lockres_release(struct kref *kref) +{ + struct dlm_lock_resource *res; + + res = container_of(kref, struct dlm_lock_resource, refs); + + /* This should not happen -- all lockres' have a name + * associated with them at init time. */ + BUG_ON(!res->lockname.name); + + mlog(0, "destroying lockres %.*s\n", res->lockname.len, + res->lockname.name); + + /* By the time we're ready to blow this guy away, we shouldn't + * be on any lists. 
*/ + BUG_ON(!list_empty(&res->list)); + BUG_ON(!list_empty(&res->granted)); + BUG_ON(!list_empty(&res->converting)); + BUG_ON(!list_empty(&res->blocked)); + BUG_ON(!list_empty(&res->dirty)); + BUG_ON(!list_empty(&res->recovering)); + BUG_ON(!list_empty(&res->purge)); + + kfree(res->lockname.name); + + kfree(res); +} + +void dlm_lockres_get(struct dlm_lock_resource *res) +{ + kref_get(&res->refs); +} + +void dlm_lockres_put(struct dlm_lock_resource *res) +{ + kref_put(&res->refs, dlm_lockres_release); +} + +static void dlm_init_lockres(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + const char *name, unsigned int namelen) +{ + char *qname; + + /* If we memset here, we lose our reference to the kmalloc'd + * res->lockname.name, so be sure to init every field + * correctly! */ + + qname = (char *) res->lockname.name; + memcpy(qname, name, namelen); + + res->lockname.len = namelen; + res->lockname.hash = full_name_hash(name, namelen); + + init_waitqueue_head(&res->wq); + spin_lock_init(&res->spinlock); + INIT_LIST_HEAD(&res->list); + INIT_LIST_HEAD(&res->granted); + INIT_LIST_HEAD(&res->converting); + INIT_LIST_HEAD(&res->blocked); + INIT_LIST_HEAD(&res->dirty); + INIT_LIST_HEAD(&res->recovering); + INIT_LIST_HEAD(&res->purge); + atomic_set(&res->asts_reserved, 0); + res->migration_pending = 0; + + kref_init(&res->refs); + + /* just for consistency */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN); + spin_unlock(&res->spinlock); + + res->state = DLM_LOCK_RES_IN_PROGRESS; + + res->last_used = 0; + + memset(res->lvb, 0, DLM_LVB_LEN); +} + +struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, + const char *name, + unsigned int namelen) +{ + struct dlm_lock_resource *res; + + res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); + if (!res) + return NULL; + + res->lockname.name = kmalloc(namelen, GFP_KERNEL); + if (!res->lockname.name) { + kfree(res); + return NULL; + } + + dlm_init_lockres(dlm, res, 
name, namelen); + return res; +} + +/* + * lookup a lock resource by name. + * may already exist in the hashtable. + * lockid is null terminated + * + * if not, allocate enough for the lockres and for + * the temporary structure used in doing the mastering. + * + * also, do a lookup in the dlm->master_list to see + * if another node has begun mastering the same lock. + * if so, there should be a block entry in there + * for this name, and we should *not* attempt to master + * the lock here. need to wait around for that node + * to assert_master (or die). + * + */ +struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, + const char *lockid, + int flags) +{ + struct dlm_lock_resource *tmpres=NULL, *res=NULL; + struct dlm_master_list_entry *mle = NULL; + struct dlm_master_list_entry *alloc_mle = NULL; + int blocked = 0; + int ret, nodenum; + struct dlm_node_iter iter; + unsigned int namelen; + int tries = 0; + + BUG_ON(!lockid); + + namelen = strlen(lockid); + + mlog(0, "get lockres %s (len %d)\n", lockid, namelen); + +lookup: + spin_lock(&dlm->spinlock); + tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); + if (tmpres) { + spin_unlock(&dlm->spinlock); + mlog(0, "found in hash!\n"); + if (res) + dlm_lockres_put(res); + res = tmpres; + goto leave; + } + + if (!res) { + spin_unlock(&dlm->spinlock); + mlog(0, "allocating a new resource\n"); + /* nothing found and we need to allocate one. */ + alloc_mle = (struct dlm_master_list_entry *) + kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + if (!alloc_mle) + goto leave; + res = dlm_new_lockres(dlm, lockid, namelen); + if (!res) + goto leave; + goto lookup; + } + + mlog(0, "no lockres found, allocated our own: %p\n", res); + + if (flags & LKM_LOCAL) { + /* caller knows it's safe to assume it's not mastered elsewhere + * DONE! 
return right away */ + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, dlm->node_num); + __dlm_insert_lockres(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + /* lockres still marked IN_PROGRESS */ + goto wake_waiters; + } + + /* check master list to see if another node has started mastering it */ + spin_lock(&dlm->master_lock); + + /* if we found a block, wait for lock to be mastered by another node */ + blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); + if (blocked) { + if (mle->type == DLM_MLE_MASTER) { + mlog(ML_ERROR, "master entry for nonexistent lock!\n"); + BUG(); + } else if (mle->type == DLM_MLE_MIGRATION) { + /* migration is in progress! */ + /* the good news is that we now know the + * "current" master (mle->master). */ + + spin_unlock(&dlm->master_lock); + assert_spin_locked(&dlm->spinlock); + + /* set the lockres owner and hash it */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, mle->master); + __dlm_insert_lockres(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + + /* master is known, detach */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + mle = NULL; + goto wake_waiters; + } + } else { + /* go ahead and try to master lock on this node */ + mle = alloc_mle; + /* make sure this does not get freed below */ + alloc_mle = NULL; + dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); + set_bit(dlm->node_num, mle->maybe_map); + list_add(&mle->list, &dlm->master_list); + } + + /* at this point there is either a DLM_MLE_BLOCK or a + * DLM_MLE_MASTER on the master list, so it's safe to add the + * lockres to the hashtable. anyone who finds the lock will + * still have to wait on the IN_PROGRESS. 
*/ + + /* finally add the lockres to its hash bucket */ + __dlm_insert_lockres(dlm, res); + /* get an extra ref on the mle in case this is a BLOCK + * if so, the creator of the BLOCK may try to put the last + * ref at this time in the assert master handler, so we + * need an extra one to keep from a bad ptr deref. */ + dlm_get_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + /* must wait for lock to be mastered elsewhere */ + if (blocked) + goto wait; + +redo_request: + ret = -EINVAL; + dlm_node_iter_init(mle->vote_map, &iter); + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + ret = dlm_do_master_request(mle, nodenum); + if (ret < 0) + mlog_errno(ret); + if (mle->master != O2NM_MAX_NODES) { + /* found a master ! */ + break; + } + } + +wait: + /* keep going until the response map includes all nodes */ + ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); + if (ret < 0) { + mlog(0, "%s:%.*s: node map changed, redo the " + "master request now, blocked=%d\n", + dlm->name, res->lockname.len, + res->lockname.name, blocked); + if (++tries > 20) { + mlog(ML_ERROR, "%s:%.*s: spinning on " + "dlm_wait_for_lock_mastery, blocked=%d\n", + dlm->name, res->lockname.len, + res->lockname.name, blocked); + dlm_print_one_lock_resource(res); + /* dlm_print_one_mle(mle); */ + tries = 0; + } + goto redo_request; + } + + mlog(0, "lockres mastered by %u\n", res->owner); + /* make sure we never continue without this */ + BUG_ON(res->owner == O2NM_MAX_NODES); + + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + /* put the extra ref */ + dlm_put_mle(mle); + +wake_waiters: + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + +leave: + /* need to free the unused mle */ + if (alloc_mle) + kmem_cache_free(dlm_mle_cache, alloc_mle); + + return res; +} + + +#define DLM_MASTERY_TIMEOUT_MS 5000 + +static int 
dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int *blocked) +{ + u8 m; + int ret, bit; + int map_changed, voting_done; + int assert, sleep; + +recheck: + ret = 0; + assert = 0; + + /* check if another node has already become the owner */ + spin_lock(&res->spinlock); + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + spin_unlock(&res->spinlock); + goto leave; + } + spin_unlock(&res->spinlock); + + spin_lock(&mle->spinlock); + m = mle->master; + map_changed = (memcmp(mle->vote_map, mle->node_map, + sizeof(mle->vote_map)) != 0); + voting_done = (memcmp(mle->vote_map, mle->response_map, + sizeof(mle->vote_map)) == 0); + + /* restart if we hit any errors */ + if (map_changed) { + int b; + mlog(0, "%s: %.*s: node map changed, restarting\n", + dlm->name, res->lockname.len, res->lockname.name); + ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked); + b = (mle->type == DLM_MLE_BLOCK); + if ((*blocked && !b) || (!*blocked && b)) { + mlog(0, "%s:%.*s: status change: old=%d new=%d\n", + dlm->name, res->lockname.len, res->lockname.name, + *blocked, b); + *blocked = b; + } + spin_unlock(&mle->spinlock); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + mlog(0, "%s:%.*s: restart lock mastery succeeded, " + "rechecking now\n", dlm->name, res->lockname.len, + res->lockname.name); + goto recheck; + } + + if (m != O2NM_MAX_NODES) { + /* another node has done an assert! + * all done! */ + sleep = 0; + } else { + sleep = 1; + /* have all nodes responded? */ + if (voting_done && !*blocked) { + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (dlm->node_num <= bit) { + /* my node number is lowest. + * now tell other nodes that I am + * mastering this. 
*/ + mle->master = dlm->node_num; + assert = 1; + sleep = 0; + } + /* if voting is done, but we have not received + * an assert master yet, we must sleep */ + } + } + + spin_unlock(&mle->spinlock); + + /* sleep if we haven't finished voting yet */ + if (sleep) { + unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); + + /* + if (atomic_read(&mle->mle_refs.refcount) < 2) + mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, + atomic_read(&mle->mle_refs.refcount), + res->lockname.len, res->lockname.name); + */ + atomic_set(&mle->woken, 0); + (void)wait_event_timeout(mle->wq, + (atomic_read(&mle->woken) == 1), + timeo); + if (res->owner == O2NM_MAX_NODES) { + mlog(0, "waiting again\n"); + goto recheck; + } + mlog(0, "done waiting, master is %u\n", res->owner); + ret = 0; + goto leave; + } + + ret = 0; /* done */ + if (assert) { + m = dlm->node_num; + mlog(0, "about to master %.*s here, this=%u\n", + res->lockname.len, res->lockname.name, m); + ret = dlm_do_assert_master(dlm, res->lockname.name, + res->lockname.len, mle->vote_map, 0); + if (ret) { + /* This is a failure in the network path, + * not in the response to the assert_master + * (any nonzero response is a BUG on this node). + * Most likely a socket just got disconnected + * due to node death. */ + mlog_errno(ret); + } + /* no longer need to restart lock mastery. + * all living nodes have been contacted. 
*/ + ret = 0; + } + + /* set the lockres owner */ + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, m); + spin_unlock(&res->spinlock); + +leave: + return ret; +} + +struct dlm_bitmap_diff_iter +{ + int curnode; + unsigned long *orig_bm; + unsigned long *cur_bm; + unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)]; +}; + +enum dlm_node_state_change +{ + NODE_DOWN = -1, + NODE_NO_CHANGE = 0, + NODE_UP +}; + +static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter, + unsigned long *orig_bm, + unsigned long *cur_bm) +{ + unsigned long p1, p2; + int i; + + iter->curnode = -1; + iter->orig_bm = orig_bm; + iter->cur_bm = cur_bm; + + for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) { + p1 = *(iter->orig_bm + i); + p2 = *(iter->cur_bm + i); + iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1); + } +} + +static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter, + enum dlm_node_state_change *state) +{ + int bit; + + if (iter->curnode >= O2NM_MAX_NODES) + return -ENOENT; + + bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES, + iter->curnode+1); + if (bit >= O2NM_MAX_NODES) { + iter->curnode = O2NM_MAX_NODES; + return -ENOENT; + } + + /* if it was there in the original then this node died */ + if (test_bit(bit, iter->orig_bm)) + *state = NODE_DOWN; + else + *state = NODE_UP; + + iter->curnode = bit; + return bit; +} + + +static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + int blocked) +{ + struct dlm_bitmap_diff_iter bdi; + enum dlm_node_state_change sc; + int node; + int ret = 0; + + mlog(0, "something happened such that the " + "master process may need to be restarted!\n"); + + assert_spin_locked(&mle->spinlock); + + dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map); + node = dlm_bitmap_diff_iter_next(&bdi, &sc); + while (node >= 0) { + if (sc == NODE_UP) { + /* a node came up. easy. 
might not even need + * to talk to it if its node number is higher + * or if we are already blocked. */ + mlog(0, "node up! %d\n", node); + if (blocked) + goto next; + + if (node > dlm->node_num) { + mlog(0, "node > this node. skipping.\n"); + goto next; + } + + /* redo the master request, but only for the new node */ + mlog(0, "sending request to new node\n"); + clear_bit(node, mle->response_map); + set_bit(node, mle->vote_map); + } else { + mlog(ML_ERROR, "node down! %d\n", node); + + /* if the node wasn't involved in mastery skip it, + * but clear it out from the maps so that it will + * not affect mastery of this lockres */ + clear_bit(node, mle->response_map); + clear_bit(node, mle->vote_map); + if (!test_bit(node, mle->maybe_map)) + goto next; + + /* if we're already blocked on lock mastery, and the + * dead node wasn't the expected master, or there is + * another node in the maybe_map, keep waiting */ + if (blocked) { + int lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, 0); + + /* act like it was never there */ + clear_bit(node, mle->maybe_map); + + if (node != lowest) + goto next; + + mlog(ML_ERROR, "expected master %u died while " + "this node was blocked waiting on it!\n", + node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "still blocked. waiting " + "on %u now\n", lowest); + goto next; + } + + /* mle is an MLE_BLOCK, but there is now + * nothing left to block on. we need to return + * all the way back out and try again with + * an MLE_MASTER. dlm_do_local_recovery_cleanup + * has already run, so the mle refcount is ok */ + mlog(0, "no longer blocking. 
we can " + "try to master this here\n"); + mle->type = DLM_MLE_MASTER; + memset(mle->maybe_map, 0, + sizeof(mle->maybe_map)); + memset(mle->response_map, 0, + sizeof(mle->maybe_map)); + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + mle->u.res = res; + set_bit(dlm->node_num, mle->maybe_map); + + ret = -EAGAIN; + goto next; + } + + clear_bit(node, mle->maybe_map); + if (node > dlm->node_num) + goto next; + + mlog(0, "dead node in map!\n"); + /* yuck. go back and re-contact all nodes + * in the vote_map, removing this node. */ + memset(mle->response_map, 0, + sizeof(mle->response_map)); + } + ret = -EAGAIN; +next: + node = dlm_bitmap_diff_iter_next(&bdi, &sc); + } + return ret; +} + + +/* + * DLM_MASTER_REQUEST_MSG + * + * returns: 0 on success, + * -errno on a network error + * + * on error, the caller should assume the target node is "dead" + * + */ + +static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to) +{ + struct dlm_ctxt *dlm = mle->dlm; + struct dlm_master_request request; + int ret, response=0, resend; + + memset(&request, 0, sizeof(request)); + request.node_idx = dlm->node_num; + + BUG_ON(mle->type == DLM_MLE_MIGRATION); + + if (mle->type != DLM_MLE_MASTER) { + request.namelen = mle->u.name.len; + memcpy(request.name, mle->u.name.name, request.namelen); + } else { + request.namelen = mle->u.res->lockname.len; + memcpy(request.name, mle->u.res->lockname.name, + request.namelen); + } + +again: + ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request, + sizeof(request), to, &response); + if (ret < 0) { + if (ret == -ESRCH) { + /* should never happen */ + mlog(ML_ERROR, "TCP stack not ready!\n"); + BUG(); + } else if (ret == -EINVAL) { + mlog(ML_ERROR, "bad args passed to o2net!\n"); + BUG(); + } else if (ret == -ENOMEM) { + mlog(ML_ERROR, "out of memory while trying to send " + "network message! 
retrying\n"); + /* this is totally crude */ + msleep(50); + goto again; + } else if (!dlm_is_host_down(ret)) { + /* not a network error. bad. */ + mlog_errno(ret); + mlog(ML_ERROR, "unhandled error!"); + BUG(); + } + /* all other errors should be network errors, + * and likely indicate node death */ + mlog(ML_ERROR, "link to %d went down!\n", to); + goto out; + } + + ret = 0; + resend = 0; + spin_lock(&mle->spinlock); + switch (response) { + case DLM_MASTER_RESP_YES: + set_bit(to, mle->response_map); + mlog(0, "node %u is the master, response=YES\n", to); + mle->master = to; + break; + case DLM_MASTER_RESP_NO: + mlog(0, "node %u not master, response=NO\n", to); + set_bit(to, mle->response_map); + break; + case DLM_MASTER_RESP_MAYBE: + mlog(0, "node %u not master, response=MAYBE\n", to); + set_bit(to, mle->response_map); + set_bit(to, mle->maybe_map); + break; + case DLM_MASTER_RESP_ERROR: + mlog(0, "node %u hit an error, resending\n", to); + resend = 1; + response = 0; + break; + default: + mlog(ML_ERROR, "bad response! %u\n", response); + BUG(); + } + spin_unlock(&mle->spinlock); + if (resend) { + /* this is also totally crude */ + msleep(50); + goto again; + } + +out: + return ret; +} + +/* + * locks that can be taken here: + * dlm->spinlock + * res->spinlock + * mle->spinlock + * dlm->master_list + * + * if possible, TRIM THIS DOWN!!! 
+ */ +int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) +{ + u8 response = DLM_MASTER_RESP_MAYBE; + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res; + struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; + struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; + char *name; + unsigned int namelen; + int found, ret; + int set_maybe; + + if (!dlm_grab(dlm)) + return DLM_MASTER_RESP_NO; + + if (!dlm_domain_fully_joined(dlm)) { + response = DLM_MASTER_RESP_NO; + goto send_response; + } + + name = request->name; + namelen = request->namelen; + + if (namelen > DLM_LOCKID_NAME_MAX) { + response = DLM_IVBUFLEN; + goto send_response; + } + +way_up_top: + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, namelen); + if (res) { + spin_unlock(&dlm->spinlock); + + /* take care of the easy cases up front */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " + "being recovered\n"); + response = DLM_MASTER_RESP_ERROR; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + if (res->owner == dlm->node_num) { + u32 flags = DLM_ASSERT_MASTER_MLE_CLEANUP; + spin_unlock(&res->spinlock); + // mlog(0, "this node is the master\n"); + response = DLM_MASTER_RESP_YES; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + + /* this node is the owner. + * there is some extra work that needs to + * happen now. the requesting node has + * caused all nodes up to this one to + * create mles. this node now needs to + * go back and clean those up. 
*/ + mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", + dlm->node_num, res->lockname.len, res->lockname.name); + ret = dlm_dispatch_assert_master(dlm, res, 1, + request->node_idx, + flags); + if (ret < 0) { + mlog(ML_ERROR, "failed to dispatch assert " + "master work\n"); + response = DLM_MASTER_RESP_ERROR; + } + goto send_response; + } else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + spin_unlock(&res->spinlock); + // mlog(0, "node %u is the master\n", res->owner); + response = DLM_MASTER_RESP_NO; + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + /* ok, there is no owner. either this node is + * being blocked, or it is actively trying to + * master this lock. */ + if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { + mlog(ML_ERROR, "lock with no owner should be " + "in-progress!\n"); + BUG(); + } + + // mlog(0, "lockres is in progress...\n"); + spin_lock(&dlm->master_lock); + found = dlm_find_mle(dlm, &tmpmle, name, namelen); + if (!found) { + mlog(ML_ERROR, "no mle found for this lock!\n"); + BUG(); + } + set_maybe = 1; + spin_lock(&tmpmle->spinlock); + if (tmpmle->type == DLM_MLE_BLOCK) { + // mlog(0, "this node is waiting for " + // "lockres to be mastered\n"); + response = DLM_MASTER_RESP_NO; + } else if (tmpmle->type == DLM_MLE_MIGRATION) { + mlog(0, "node %u is master, but trying to migrate to " + "node %u.\n", tmpmle->master, tmpmle->new_master); + if (tmpmle->master == dlm->node_num) { + response = DLM_MASTER_RESP_YES; + mlog(ML_ERROR, "no owner on lockres, but this " + "node is trying to migrate it to %u?!\n", + tmpmle->new_master); + BUG(); + } else { + /* the real master can respond on its own */ + response = DLM_MASTER_RESP_NO; + } + } else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) { + set_maybe = 0; + if (tmpmle->master == dlm->node_num) + response = DLM_MASTER_RESP_YES; + else + response = DLM_MASTER_RESP_NO; + } else { + // mlog(0, "this node is attempting to " + // "master lockres\n"); + response = 
DLM_MASTER_RESP_MAYBE; + } + if (set_maybe) + set_bit(request->node_idx, tmpmle->maybe_map); + spin_unlock(&tmpmle->spinlock); + + spin_unlock(&dlm->master_lock); + spin_unlock(&res->spinlock); + + /* keep the mle attached to heartbeat events */ + dlm_put_mle(tmpmle); + if (mle) + kmem_cache_free(dlm_mle_cache, mle); + goto send_response; + } + + /* + * lockres doesn't exist on this node + * if there is an MLE_BLOCK, return NO + * if there is an MLE_MASTER, return MAYBE + * otherwise, add an MLE_BLOCK, return NO + */ + spin_lock(&dlm->master_lock); + found = dlm_find_mle(dlm, &tmpmle, name, namelen); + if (!found) { + /* this lockid has never been seen on this node yet */ + // mlog(0, "no mle found\n"); + if (!mle) { + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + mle = (struct dlm_master_list_entry *) + kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + if (!mle) { + // bad bad bad... this sucks. + response = DLM_MASTER_RESP_ERROR; + goto send_response; + } + spin_lock(&dlm->spinlock); + dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, + name, namelen); + spin_unlock(&dlm->spinlock); + goto way_up_top; + } + + // mlog(0, "this is second time thru, already allocated, " + // "add the block.\n"); + set_bit(request->node_idx, mle->maybe_map); + list_add(&mle->list, &dlm->master_list); + response = DLM_MASTER_RESP_NO; + } else { + // mlog(0, "mle was found\n"); + set_maybe = 1; + spin_lock(&tmpmle->spinlock); + if (tmpmle->type == DLM_MLE_BLOCK) + response = DLM_MASTER_RESP_NO; + else if (tmpmle->type == DLM_MLE_MIGRATION) { + mlog(0, "migration mle was found (%u->%u)\n", + tmpmle->master, tmpmle->new_master); + if (tmpmle->master == dlm->node_num) { + mlog(ML_ERROR, "no lockres, but migration mle " + "says that this node is master!\n"); + BUG(); + } + /* real master can respond on its own */ + response = DLM_MASTER_RESP_NO; + } else { + if (tmpmle->master == dlm->node_num) { + response = DLM_MASTER_RESP_YES; + set_maybe = 0; + } else + response = 
DLM_MASTER_RESP_MAYBE; + } + if (set_maybe) + set_bit(request->node_idx, tmpmle->maybe_map); + spin_unlock(&tmpmle->spinlock); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (found) { + /* keep the mle attached to heartbeat events */ + dlm_put_mle(tmpmle); + } +send_response: + dlm_put(dlm); + return response; +} + +/* + * DLM_ASSERT_MASTER_MSG + */ + + +/* + * NOTE: this can be used for debugging + * can periodically run all locks owned by this node + * and re-assert across the cluster... + */ +static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, + unsigned int namelen, void *nodemap, + u32 flags) +{ + struct dlm_assert_master assert; + int to, tmpret; + struct dlm_node_iter iter; + int ret = 0; + + BUG_ON(namelen > O2NM_MAX_NAME_LEN); + + /* note that if this nodemap is empty, it returns 0 */ + dlm_node_iter_init(nodemap, &iter); + while ((to = dlm_node_iter_next(&iter)) >= 0) { + int r = 0; + mlog(0, "sending assert master to %d (%.*s)\n", to, + namelen, lockname); + memset(&assert, 0, sizeof(assert)); + assert.node_idx = dlm->node_num; + assert.namelen = namelen; + memcpy(assert.name, lockname, namelen); + assert.flags = cpu_to_be32(flags); + + tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, + &assert, sizeof(assert), to, &r); + if (tmpret < 0) { + mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); + if (!dlm_is_host_down(tmpret)) { + mlog(ML_ERROR, "unhandled error!\n"); + BUG(); + } + /* a node died. finish out the rest of the nodes. */ + mlog(ML_ERROR, "link to %d went down!\n", to); + /* any nonzero status return will do */ + ret = tmpret; + } else if (r < 0) { + /* ok, something horribly messed. kill thyself. 
*/ + mlog(ML_ERROR,"during assert master of %.*s to %u, " + "got %d.\n", namelen, lockname, to, r); + dlm_dump_lock_resources(dlm); + BUG(); + } + } + + return ret; +} + +/* + * locks that can be taken here: + * dlm->spinlock + * res->spinlock + * mle->spinlock + * dlm->master_list + * + * if possible, TRIM THIS DOWN!!! + */ +int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_master_list_entry *mle = NULL; + struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; + struct dlm_lock_resource *res = NULL; + char *name; + unsigned int namelen; + u32 flags; + + if (!dlm_grab(dlm)) + return 0; + + name = assert->name; + namelen = assert->namelen; + flags = be32_to_cpu(assert->flags); + + if (namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length!"); + goto done; + } + + spin_lock(&dlm->spinlock); + + if (flags) + mlog(0, "assert_master with flags: %u\n", flags); + + /* find the MLE */ + spin_lock(&dlm->master_lock); + if (!dlm_find_mle(dlm, &mle, name, namelen)) { + /* not an error, could be master just re-asserting */ + mlog(0, "just got an assert_master from %u, but no " + "MLE for it! (%.*s)\n", assert->node_idx, + namelen, name); + } else { + int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES) { + /* not necessarily an error, though less likely. + * could be master just re-asserting. */ + mlog(ML_ERROR, "no bits set in the maybe_map, but %u " + "is asserting! (%.*s)\n", assert->node_idx, + namelen, name); + } else if (bit != assert->node_idx) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "master %u was found, %u should " + "back off\n", assert->node_idx, bit); + } else { + /* with the fix for bug 569, a higher node + * number winning the mastery will respond + * YES to mastery requests, but this node + * had no way of knowing. let it pass. */ + mlog(ML_ERROR, "%u is the lowest node, " + "%u is asserting. 
(%.*s) %u must " + "have begun after %u won.\n", bit, + assert->node_idx, namelen, name, bit, + assert->node_idx); + } + } + } + spin_unlock(&dlm->master_lock); + + /* ok everything checks out with the MLE + * now check to see if there is a lockres */ + res = __dlm_lookup_lockres(dlm, name, namelen); + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + mlog(ML_ERROR, "%u asserting but %.*s is " + "RECOVERING!\n", assert->node_idx, namelen, name); + goto kill; + } + if (!mle) { + if (res->owner != assert->node_idx) { + mlog(ML_ERROR, "assert_master from " + "%u, but current owner is " + "%u! (%.*s)\n", + assert->node_idx, res->owner, + namelen, name); + goto kill; + } + } else if (mle->type != DLM_MLE_MIGRATION) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { + /* owner is just re-asserting */ + if (res->owner == assert->node_idx) { + mlog(0, "owner %u re-asserting on " + "lock %.*s\n", assert->node_idx, + namelen, name); + goto ok; + } + mlog(ML_ERROR, "got assert_master from " + "node %u, but %u is the owner! " + "(%.*s)\n", assert->node_idx, + res->owner, namelen, name); + goto kill; + } + if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) { + mlog(ML_ERROR, "got assert from %u, but lock " + "with no owner should be " + "in-progress! (%.*s)\n", + assert->node_idx, + namelen, name); + goto kill; + } + } else /* mle->type == DLM_MLE_MIGRATION */ { + /* should only be getting an assert from new master */ + if (assert->node_idx != mle->new_master) { + mlog(ML_ERROR, "got assert from %u, but " + "new master is %u, and old master " + "was %u (%.*s)\n", + assert->node_idx, mle->new_master, + mle->master, namelen, name); + goto kill; + } + + } +ok: + spin_unlock(&res->spinlock); + } + spin_unlock(&dlm->spinlock); + + // mlog(0, "woo! 
got an assert_master from node %u!\n", + // assert->node_idx); + if (mle) { + int extra_ref; + + spin_lock(&mle->spinlock); + extra_ref = !!(mle->type == DLM_MLE_BLOCK + || mle->type == DLM_MLE_MIGRATION); + mle->master = assert->node_idx; + atomic_set(&mle->woken, 1); + wake_up(&mle->wq); + spin_unlock(&mle->spinlock); + + if (mle->type == DLM_MLE_MIGRATION && res) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + spin_unlock(&res->spinlock); + } + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + + if (extra_ref) { + /* the assert master message now balances the extra + * ref given by the master / migration request message. + * if this is the last put, it will be removed + * from the list. */ + dlm_put_mle(mle); + } + } + +done: + if (res) + dlm_lockres_put(res); + dlm_put(dlm); + return 0; + +kill: + /* kill the caller! */ + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + dlm_lockres_put(res); + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! 
This node is OK and can continue.\n"); + dlm_dump_lock_resources(dlm); + dlm_put(dlm); + return -EINVAL; +} + +int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int ignore_higher, u8 request_from, u32 flags) +{ + struct dlm_work_item *item; + item = kcalloc(1, sizeof(*item), GFP_KERNEL); + if (!item) + return -ENOMEM; + + + /* queue up work for dlm_assert_master_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL); + item->u.am.lockres = res; /* already have a ref */ + /* can optionally ignore node numbers higher than this node */ + item->u.am.ignore_higher = ignore_higher; + item->u.am.request_from = request_from; + item->u.am.flags = flags; + + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + + schedule_work(&dlm->dispatched_work); + return 0; +} + +static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm = data; + int ret = 0; + struct dlm_lock_resource *res; + unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int ignore_higher; + int bit; + u8 request_from; + u32 flags; + + dlm = item->dlm; + res = item->u.am.lockres; + ignore_higher = item->u.am.ignore_higher; + request_from = item->u.am.request_from; + flags = item->u.am.flags; + + spin_lock(&dlm->spinlock); + memcpy(nodemap, dlm->domain_map, sizeof(nodemap)); + spin_unlock(&dlm->spinlock); + + clear_bit(dlm->node_num, nodemap); + if (ignore_higher) { + /* if is this just to clear up mles for nodes below + * this node, do not send the message to the original + * caller or any node number higher than this */ + clear_bit(request_from, nodemap); + bit = dlm->node_num; + while (1) { + bit = find_next_bit(nodemap, O2NM_MAX_NODES, + bit+1); + if (bit >= O2NM_MAX_NODES) + break; + clear_bit(bit, nodemap); + } + } + + /* this call now finishes out the nodemap + * even if one or more nodes die 
*/ + mlog(0, "worker about to master %.*s here, this=%u\n", + res->lockname.len, res->lockname.name, dlm->node_num); + ret = dlm_do_assert_master(dlm, res->lockname.name, + res->lockname.len, + nodemap, flags); + if (ret < 0) { + /* no need to restart, we are done */ + mlog_errno(ret); + } + + dlm_lockres_put(res); + + mlog(0, "finished with dlm_assert_master_worker\n"); +} + + +/* + * DLM_MIGRATE_LOCKRES + */ + + +int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 target) +{ + struct dlm_master_list_entry *mle = NULL; + struct dlm_master_list_entry *oldmle = NULL; + struct dlm_migratable_lockres *mres = NULL; + int ret = -EINVAL; + const char *name; + unsigned int namelen; + int mle_added = 0; + struct list_head *queue, *iter; + int i; + struct dlm_lock *lock; + int empty = 1; + + if (!dlm_grab(dlm)) + return -EINVAL; + + name = res->lockname.name; + namelen = res->lockname.len; + + mlog(0, "migrating %.*s to %u\n", namelen, name, target); + + /* + * ensure this lockres is a proper candidate for migration + */ + spin_lock(&res->spinlock); + if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "cannot migrate lockres with unknown owner!\n"); + spin_unlock(&res->spinlock); + goto leave; + } + if (res->owner != dlm->node_num) { + mlog(0, "cannot migrate lockres this node doesn't own!\n"); + spin_unlock(&res->spinlock); + goto leave; + } + mlog(0, "checking queues...\n"); + queue = &res->granted; + for (i=0; i<3; i++) { + list_for_each(iter, queue) { + lock = list_entry (iter, struct dlm_lock, list); + empty = 0; + if (lock->ml.node == dlm->node_num) { + mlog(0, "found a lock owned by this node " + "still on the %s queue! will not " + "migrate this lockres\n", + i==0 ? "granted" : + (i==1 ? "converting" : "blocked")); + spin_unlock(&res->spinlock); + ret = -ENOTEMPTY; + goto leave; + } + } + queue++; + } + mlog(0, "all locks on this lockres are nonlocal. 
continuing\n"); + spin_unlock(&res->spinlock); + + /* no work to do */ + if (empty) { + mlog(0, "no locks were found on this lockres! done!\n"); + ret = 0; + goto leave; + } + + /* + * preallocate up front + * if this fails, abort + */ + + ret = -ENOMEM; + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); + if (!mres) { + mlog_errno(ret); + goto leave; + } + + mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, + GFP_KERNEL); + if (!mle) { + mlog_errno(ret); + goto leave; + } + ret = 0; + + /* + * find a node to migrate the lockres to + */ + + mlog(0, "picking a migration node\n"); + spin_lock(&dlm->spinlock); + /* pick a new node */ + if (!test_bit(target, dlm->domain_map) || + target >= O2NM_MAX_NODES) { + target = dlm_pick_migration_target(dlm, res); + } + mlog(0, "node %u chosen for migration\n", target); + + if (target >= O2NM_MAX_NODES || + !test_bit(target, dlm->domain_map)) { + /* target chosen is not alive */ + ret = -EINVAL; + } + + if (ret) { + spin_unlock(&dlm->spinlock); + goto fail; + } + + mlog(0, "continuing with target = %u\n", target); + + /* + * clear any existing master requests and + * add the migration mle to the list + */ + spin_lock(&dlm->master_lock); + ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, + namelen, target, dlm->node_num); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (ret == -EEXIST) { + mlog(0, "another process is already migrating it\n"); + goto fail; + } + mle_added = 1; + + /* + * set the MIGRATING flag and flush asts + * if we fail after this we need to re-dirty the lockres + */ + if (dlm_mark_lockres_migrating(dlm, res, target) < 0) { + mlog(ML_ERROR, "tried to migrate %.*s to %u, but " + "the target went down.\n", res->lockname.len, + res->lockname.name, target); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + ret = -EINVAL; + } + +fail: + if (oldmle) { + /* master is known, detach if not 
already detached */ + dlm_mle_detach_hb_events(dlm, oldmle); + dlm_put_mle(oldmle); + } + + if (ret < 0) { + if (mle_added) { + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + } else if (mle) { + kmem_cache_free(dlm_mle_cache, mle); + } + goto leave; + } + + /* + * at this point, we have a migration target, an mle + * in the master list, and the MIGRATING flag set on + * the lockres + */ + + + /* get an extra reference on the mle. + * otherwise the assert_master from the new + * master will destroy this. + * also, make sure that all callers of dlm_get_mle + * take both dlm->spinlock and dlm->master_lock */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + dlm_get_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + /* notify new node and send all lock state */ + /* call send_one_lockres with migration flag. + * this serves as notice to the target node that a + * migration is starting. */ + ret = dlm_send_one_lockres(dlm, res, mres, target, + DLM_MRES_MIGRATION); + + if (ret < 0) { + mlog(0, "migration to node %u failed with %d\n", + target, ret); + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle(mle); + goto leave; + } + + /* at this point, the target sends a message to all nodes, + * (using dlm_do_migrate_request). this node is skipped since + * we had to put an mle in the list to begin the process. this + * node now waits for target to do an assert master. this node + * will be the last one notified, ensuring that the migration + * is complete everywhere. if the target dies while this is + * going on, some nodes could potentially see the target as the + * master, so it is important that my recovery finds the migration + * mle and sets the master to UNKNONWN. 
*/ + + + /* wait for new node to assert master */ + while (1) { + ret = wait_event_interruptible_timeout(mle->wq, + (atomic_read(&mle->woken) == 1), + msecs_to_jiffies(5000)); + + if (ret >= 0) { + if (atomic_read(&mle->woken) == 1 || + res->owner == target) + break; + + mlog(0, "timed out during migration\n"); + } + if (ret == -ERESTARTSYS) { + /* migration failed, detach and clean up mle */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + dlm_put_mle(mle); + goto leave; + } + /* TODO: if node died: stop, clean up, return error */ + } + + /* all done, set the owner, clear the flag */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, target); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_remove_nonlocal_locks(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, mle); + dlm_put_mle(mle); + ret = 0; + + dlm_lockres_calc_usage(dlm, res); + +leave: + /* re-dirty the lockres if we failed */ + if (ret < 0) + dlm_kick_thread(dlm, res); + + /* TODO: cleanup */ + if (mres) + free_page((unsigned long)mres); + + dlm_put(dlm); + + mlog(0, "returning %d\n", ret); + return ret; +} +EXPORT_SYMBOL_GPL(dlm_migrate_lockres); + +int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) +{ + int ret; + spin_lock(&dlm->ast_lock); + spin_lock(&lock->spinlock); + ret = (list_empty(&lock->bast_list) && !lock->bast_pending); + spin_unlock(&lock->spinlock); + spin_unlock(&dlm->ast_lock); + return ret; +} + +static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 mig_target) +{ + int can_proceed; + spin_lock(&res->spinlock); + can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING); + spin_unlock(&res->spinlock); + + /* target has died, so make the caller break out of the + * wait_event, but caller must recheck the domain_map */ + spin_lock(&dlm->spinlock); + if (!test_bit(mig_target, dlm->domain_map)) + can_proceed 
= 1; + spin_unlock(&dlm->spinlock); + return can_proceed; +} + +int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + int ret; + spin_lock(&res->spinlock); + ret = !!(res->state & DLM_LOCK_RES_DIRTY); + spin_unlock(&res->spinlock); + return ret; +} + + +static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 target) +{ + int ret = 0; + + mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n", + res->lockname.len, res->lockname.name, dlm->node_num, + target); + /* need to set MIGRATING flag on lockres. this is done by + * ensuring that all asts have been flushed for this lockres. */ + spin_lock(&res->spinlock); + BUG_ON(res->migration_pending); + res->migration_pending = 1; + /* strategy is to reserve an extra ast then release + * it below, letting the release do all of the work */ + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + + /* now flush all the pending asts.. hang out for a bit */ + dlm_kick_thread(dlm, res); + wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); + dlm_lockres_release_ast(dlm, res); + + mlog(0, "about to wait on migration_wq, dirty=%s\n", + res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + /* if the extra ref we just put was the final one, this + * will pass thru immediately. otherwise, we need to wait + * for the last ast to finish. */ +again: + ret = wait_event_interruptible_timeout(dlm->migration_wq, + dlm_migration_can_proceed(dlm, res, target), + msecs_to_jiffies(1000)); + if (ret < 0) { + mlog(0, "woken again: migrating? %s, dead? %s\n", + res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", + test_bit(target, dlm->domain_map) ? "no":"yes"); + } else { + mlog(0, "all is well: migrating? %s, dead? %s\n", + res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no", + test_bit(target, dlm->domain_map) ? 
"no":"yes"); + } + if (!dlm_migration_can_proceed(dlm, res, target)) { + mlog(0, "trying again...\n"); + goto again; + } + + /* did the target go down or die? */ + spin_lock(&dlm->spinlock); + if (!test_bit(target, dlm->domain_map)) { + mlog(ML_ERROR, "aha. migration target %u just went down\n", + target); + ret = -EHOSTDOWN; + } + spin_unlock(&dlm->spinlock); + + /* + * at this point: + * + * o the DLM_LOCK_RES_MIGRATING flag is set + * o there are no pending asts on this lockres + * o all processes trying to reserve an ast on this + * lockres must wait for the MIGRATING flag to clear + */ + return ret; +} + +/* last step in the migration process. + * original master calls this to free all of the dlm_lock + * structures that used to be for other nodes. */ +static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct list_head *iter, *iter2; + struct list_head *queue = &res->granted; + int i; + struct dlm_lock *lock; + + assert_spin_locked(&res->spinlock); + + BUG_ON(res->owner == dlm->node_num); + + for (i=0; i<3; i++) { + list_for_each_safe(iter, iter2, queue) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node != dlm->node_num) { + mlog(0, "putting lock for node %u\n", + lock->ml.node); + /* be extra careful */ + BUG_ON(!list_empty(&lock->ast_list)); + BUG_ON(!list_empty(&lock->bast_list)); + BUG_ON(lock->ast_pending); + BUG_ON(lock->bast_pending); + list_del_init(&lock->list); + dlm_lock_put(lock); + } + } + queue++; + } +} + +/* for now this is not too intelligent. we will + * need stats to make this do the right thing. + * this just finds the first lock on one of the + * queues and uses that node as the target. 
*/ +static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int i; + struct list_head *queue = &res->granted; + struct list_head *iter; + struct dlm_lock *lock; + int nodenum; + + assert_spin_locked(&dlm->spinlock); + + spin_lock(&res->spinlock); + for (i=0; i<3; i++) { + list_for_each(iter, queue) { + /* up to the caller to make sure this node + * is alive */ + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node != dlm->node_num) { + spin_unlock(&res->spinlock); + return lock->ml.node; + } + } + queue++; + } + spin_unlock(&res->spinlock); + mlog(0, "have not found a suitable target yet! checking domain map\n"); + + /* ok now we're getting desperate. pick anyone alive. */ + nodenum = -1; + while (1) { + nodenum = find_next_bit(dlm->domain_map, + O2NM_MAX_NODES, nodenum+1); + mlog(0, "found %d in domain map\n", nodenum); + if (nodenum >= O2NM_MAX_NODES) + break; + if (nodenum != dlm->node_num) { + mlog(0, "picking %d\n", nodenum); + return nodenum; + } + } + + mlog(0, "giving up. 
no master to migrate to\n"); + return DLM_LOCK_RES_OWNER_UNKNOWN; +} + + + +/* this is called by the new master once all lockres + * data has been received */ +static int dlm_do_migrate_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 master, u8 new_master, + struct dlm_node_iter *iter) +{ + struct dlm_migrate_request migrate; + int ret, status = 0; + int nodenum; + + memset(&migrate, 0, sizeof(migrate)); + migrate.namelen = res->lockname.len; + memcpy(migrate.name, res->lockname.name, migrate.namelen); + migrate.new_master = new_master; + migrate.master = master; + + ret = 0; + + /* send message to all nodes, except the master and myself */ + while ((nodenum = dlm_node_iter_next(iter)) >= 0) { + if (nodenum == master || + nodenum == new_master) + continue; + + ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, + &migrate, sizeof(migrate), nodenum, + &status); + if (ret < 0) + mlog_errno(ret); + else if (status < 0) { + mlog(0, "migrate request (node %u) returned %d!\n", + nodenum, status); + ret = status; + } + } + + if (ret < 0) + mlog_errno(ret); + + mlog(0, "returning ret=%d\n", ret); + return ret; +} + + +/* if there is an existing mle for this lockres, we now know who the master is. + * (the one who sent us *this* message) we can clear it up right away. + * since the process that put the mle on the list still has a reference to it, + * we can unhash it now, set the master and wake the process. as a result, + * we will have no mle in the list to start with. now we can add an mle for + * the migration and this should be the only one found for those scanning the + * list. 
*/ +int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_lock_resource *res = NULL; + struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; + struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; + const char *name; + unsigned int namelen; + int ret = 0; + + if (!dlm_grab(dlm)) + return -EINVAL; + + name = migrate->name; + namelen = migrate->namelen; + + /* preallocate.. if this fails, abort */ + mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, + GFP_KERNEL); + + if (!mle) { + ret = -ENOMEM; + goto leave; + } + + /* check for pre-existing lock */ + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, name, namelen); + spin_lock(&dlm->master_lock); + + if (res) { + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + /* if all is working ok, this can only mean that we got + * a migrate request from a node that we now see as + * dead. what can we do here? drop it to the floor? */ + spin_unlock(&res->spinlock); + mlog(ML_ERROR, "Got a migrate request, but the " + "lockres is marked as recovering!"); + kmem_cache_free(dlm_mle_cache, mle); + ret = -EINVAL; /* need a better solution */ + goto unlock; + } + res->state |= DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + } + + /* ignore status. only nonzero status would BUG. 
*/ + ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, + name, namelen, + migrate->new_master, + migrate->master); + +unlock: + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + + if (oldmle) { + /* master is known, detach if not already detached */ + dlm_mle_detach_hb_events(dlm, oldmle); + dlm_put_mle(oldmle); + } + + if (res) + dlm_lockres_put(res); +leave: + dlm_put(dlm); + return ret; +} + +/* must be holding dlm->spinlock and dlm->master_lock + * when adding a migration mle, we can clear any other mles + * in the master list because we know with certainty that + * the master is "master". so we remove any old mle from + * the list after setting it's master field, and then add + * the new migration mle. this way we can hold with the rule + * of having only one mle for a given lock name at all times. */ +static int dlm_add_migration_mle(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_master_list_entry *mle, + struct dlm_master_list_entry **oldmle, + const char *name, unsigned int namelen, + u8 new_master, u8 master) +{ + int found; + int ret = 0; + + *oldmle = NULL; + + mlog_entry_void(); + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + + /* caller is responsible for any ref taken here on oldmle */ + found = dlm_find_mle(dlm, oldmle, (char *)name, namelen); + if (found) { + struct dlm_master_list_entry *tmp = *oldmle; + spin_lock(&tmp->spinlock); + if (tmp->type == DLM_MLE_MIGRATION) { + if (master == dlm->node_num) { + /* ah another process raced me to it */ + mlog(0, "tried to migrate %.*s, but some " + "process beat me to it\n", + namelen, name); + ret = -EEXIST; + } else { + /* bad. 2 NODES are trying to migrate! 
*/ + mlog(ML_ERROR, "migration error mle: " + "master=%u new_master=%u // request: " + "master=%u new_master=%u // " + "lockres=%.*s\n", + tmp->master, tmp->new_master, + master, new_master, + namelen, name); + BUG(); + } + } else { + /* this is essentially what assert_master does */ + tmp->master = master; + atomic_set(&tmp->woken, 1); + wake_up(&tmp->wq); + /* remove it from the list so that only one + * mle will be found */ + list_del_init(&tmp->list); + } + spin_unlock(&tmp->spinlock); + } + + /* now add a migration mle to the tail of the list */ + dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); + mle->new_master = new_master; + mle->master = master; + /* do this for consistency with other mle types */ + set_bit(new_master, mle->maybe_map); + list_add(&mle->list, &dlm->master_list); + + return ret; +} + + +void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct list_head *iter, *iter2; + struct dlm_master_list_entry *mle; + struct dlm_lock_resource *res; + + mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); +top: + assert_spin_locked(&dlm->spinlock); + + /* clean the master list */ + spin_lock(&dlm->master_lock); + list_for_each_safe(iter, iter2, &dlm->master_list) { + mle = list_entry(iter, struct dlm_master_list_entry, list); + + BUG_ON(mle->type != DLM_MLE_BLOCK && + mle->type != DLM_MLE_MASTER && + mle->type != DLM_MLE_MIGRATION); + + /* MASTER mles are initiated locally. the waiting + * process will notice the node map change + * shortly. let that happen as normal. */ + if (mle->type == DLM_MLE_MASTER) + continue; + + + /* BLOCK mles are initiated by other nodes. + * need to clean up if the dead node would have + * been the master. 
*/ + if (mle->type == DLM_MLE_BLOCK) { + int bit; + + spin_lock(&mle->spinlock); + bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); + if (bit != dead_node) { + mlog(0, "mle found, but dead node %u would " + "not have been master\n", dead_node); + spin_unlock(&mle->spinlock); + } else { + /* must drop the refcount by one since the + * assert_master will never arrive. this + * may result in the mle being unlinked and + * freed, but there may still be a process + * waiting in the dlmlock path which is fine. */ + mlog(ML_ERROR, "node %u was expected master\n", + dead_node); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); + wake_up(&mle->wq); + /* final put will take care of list removal */ + __dlm_put_mle(mle); + } + continue; + } + + /* everything else is a MIGRATION mle */ + + /* the rule for MIGRATION mles is that the master + * becomes UNKNOWN if *either* the original or + * the new master dies. all UNKNOWN lockreses + * are sent to whichever node becomes the recovery + * master. the new master is responsible for + * determining if there is still a master for + * this lockres, or if he needs to take over + * mastery. either way, this node should expect + * another message to resolve this. */ + if (mle->master != dead_node && + mle->new_master != dead_node) + continue; + + /* if we have reached this point, this mle needs to + * be removed from the list and freed. */ + + /* remove from the list early. 
NOTE: unlinking + * list_head while in list_for_each_safe */ + spin_lock(&mle->spinlock); + list_del_init(&mle->list); + atomic_set(&mle->woken, 1); + spin_unlock(&mle->spinlock); + wake_up(&mle->wq); + + mlog(0, "node %u died during migration from " + "%u to %u!\n", dead_node, + mle->master, mle->new_master); + /* if there is a lockres associated with this + * mle, find it and set its owner to UNKNOWN */ + res = __dlm_lookup_lockres(dlm, mle->u.name.name, + mle->u.name.len); + if (res) { + /* unfortunately if we hit this rare case, our + * lock ordering is messed. we need to drop + * the master lock so that we can take the + * lockres lock, meaning that we will have to + * restart from the head of list. */ + spin_unlock(&dlm->master_lock); + + /* move lockres onto recovery list */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, + DLM_LOCK_RES_OWNER_UNKNOWN); + dlm_move_lockres_to_recovery_list(dlm, res); + spin_unlock(&res->spinlock); + dlm_lockres_put(res); + + /* dump the mle */ + spin_lock(&dlm->master_lock); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + + /* restart */ + goto top; + } + + /* this may be the last reference */ + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); +} + + +int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + u8 old_master) +{ + struct dlm_node_iter iter; + int ret = 0; + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + clear_bit(old_master, iter.node_map); + clear_bit(dlm->node_num, iter.node_map); + spin_unlock(&dlm->spinlock); + + mlog(0, "now time to do a migrate request to other nodes\n"); + ret = dlm_do_migrate_request(dlm, res, old_master, + dlm->node_num, &iter); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + mlog(0, "doing assert master of %.*s to all except the original node\n", + res->lockname.len, res->lockname.name); + /* this call now finishes out the nodemap + * even if one or more nodes die */ + ret = 
dlm_do_assert_master(dlm, res->lockname.name, + res->lockname.len, iter.node_map, + DLM_ASSERT_MASTER_FINISH_MIGRATION); + if (ret < 0) { + /* no longer need to retry. all living nodes contacted. */ + mlog_errno(ret); + ret = 0; + } + + memset(iter.node_map, 0, sizeof(iter.node_map)); + set_bit(old_master, iter.node_map); + mlog(0, "doing assert master of %.*s back to %u\n", + res->lockname.len, res->lockname.name, old_master); + ret = dlm_do_assert_master(dlm, res->lockname.name, + res->lockname.len, iter.node_map, + DLM_ASSERT_MASTER_FINISH_MIGRATION); + if (ret < 0) { + mlog(0, "assert master to original master failed " + "with %d.\n", ret); + /* the only nonzero status here would be because of + * a dead original node. we're done. */ + ret = 0; + } + + /* all done, set the owner, clear the flag */ + spin_lock(&res->spinlock); + dlm_set_lockres_owner(dlm, res, dlm->node_num); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + /* re-dirty it on the new master */ + dlm_kick_thread(dlm, res); + wake_up(&res->wq); +leave: + return ret; +} + +/* + * LOCKRES AST REFCOUNT + * this is integral to migration + */ + +/* for future intent to call an ast, reserve one ahead of time. + * this should be called only after waiting on the lockres + * with dlm_wait_on_lockres, and while still holding the + * spinlock after the call. */ +void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res) +{ + assert_spin_locked(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + __dlm_print_one_lock_resource(res); + } + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + + atomic_inc(&res->asts_reserved); +} + +/* + * used to drop the reserved ast, either because it went unused, + * or because the ast/bast was actually called. + * + * also, if there is a pending migration on this lockres, + * and this was the last pending ast on the lockres, + * atomically set the MIGRATING flag before we drop the lock. 
+ * this is how we ensure that migration can proceed with no + * asts in progress. note that it is ok if the state of the + * queues is such that a lock should be granted in the future + * or that a bast should be fired, because the new master will + * shuffle the lists on this lockres as soon as it is migrated. + */ +void dlm_lockres_release_ast(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock)) + return; + + if (!res->migration_pending) { + spin_unlock(&res->spinlock); + return; + } + + BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); + res->migration_pending = 0; + res->state |= DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); + wake_up(&res->wq); + wake_up(&dlm->migration_wq); +} diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c new file mode 100644 index 000000000000..0c8eb1093f00 --- /dev/null +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -0,0 +1,2132 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmrecovery.c + * + * recovery stuff + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ * + */ + + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/random.h> +#include <linux/blkdev.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/timer.h> +#include <linux/kthread.h> + + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" +#include "dlmdomain.h" + +#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY) +#include "cluster/masklog.h" + +static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node); + +static int dlm_recovery_thread(void *data); +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); +static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); +static int dlm_do_recovery(struct dlm_ctxt *dlm); + +static int dlm_pick_recovery_master(struct dlm_ctxt *dlm); +static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node); +static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); +static int dlm_request_all_locks(struct dlm_ctxt *dlm, + u8 request_from, u8 dead_node); +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); + +static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); +static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, + const char *lockname, int namelen, + int total_locks, u64 cookie, + u8 flags, u8 master); +static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres, + u8 send_to, + struct dlm_lock_resource *res, + int total_locks); +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master); +static int dlm_process_recovery_data(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct 
dlm_migratable_lockres *mres); +static int dlm_do_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 nodenum, u8 *real_master); +static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm); +static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, + u8 dead_node, u8 send_to); +static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node); +static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, + struct list_head *list, u8 dead_node); +static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, + u8 dead_node, u8 new_master); +static void dlm_reco_ast(void *astdata); +static void dlm_reco_bast(void *astdata, int blocked_type); +static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st); +static void dlm_request_all_locks_worker(struct dlm_work_item *item, + void *data); +static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); + +static u64 dlm_get_next_mig_cookie(void); + +static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED; +static u64 dlm_mig_cookie = 1; + +static u64 dlm_get_next_mig_cookie(void) +{ + u64 c; + spin_lock(&dlm_mig_cookie_lock); + c = dlm_mig_cookie; + if (dlm_mig_cookie == (~0ULL)) + dlm_mig_cookie = 1; + else + dlm_mig_cookie++; + spin_unlock(&dlm_mig_cookie_lock); + return c; +} + +static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + clear_bit(dlm->reco.dead_node, dlm->recovery_map); + dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + spin_unlock(&dlm->spinlock); +} + +/* Worker function used during recovery. 
*/ +void dlm_dispatch_work(void *data) +{ + struct dlm_ctxt *dlm = (struct dlm_ctxt *)data; + LIST_HEAD(tmp_list); + struct list_head *iter, *iter2; + struct dlm_work_item *item; + dlm_workfunc_t *workfunc; + + spin_lock(&dlm->work_lock); + list_splice_init(&dlm->work_list, &tmp_list); + spin_unlock(&dlm->work_lock); + + list_for_each_safe(iter, iter2, &tmp_list) { + item = list_entry(iter, struct dlm_work_item, list); + workfunc = item->func; + list_del_init(&item->list); + + /* already have ref on dlm to avoid having + * it disappear. just double-check. */ + BUG_ON(item->dlm != dlm); + + /* this is allowed to sleep and + * call network stuff */ + workfunc(item, item->data); + + dlm_put(dlm); + kfree(item); + } +} + +/* + * RECOVERY THREAD + */ + +static void dlm_kick_recovery_thread(struct dlm_ctxt *dlm) +{ + /* wake the recovery thread + * this will wake the reco thread in one of three places + * 1) sleeping with no recovery happening + * 2) sleeping with recovery mastered elsewhere + * 3) recovery mastered here, waiting on reco data */ + + wake_up(&dlm->dlm_reco_thread_wq); +} + +/* Launch the recovery thread */ +int dlm_launch_recovery_thread(struct dlm_ctxt *dlm) +{ + mlog(0, "starting dlm recovery thread...\n"); + + dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm, + "dlm_reco_thread"); + if (IS_ERR(dlm->dlm_reco_thread_task)) { + mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task)); + dlm->dlm_reco_thread_task = NULL; + return -EINVAL; + } + + return 0; +} + +void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_reco_thread_task) { + mlog(0, "waiting for dlm recovery thread to exit\n"); + kthread_stop(dlm->dlm_reco_thread_task); + dlm->dlm_reco_thread_task = NULL; + } +} + + + +/* + * this is lame, but here's how recovery works... + * 1) all recovery threads cluster wide will work on recovering + * ONE node at a time + * 2) negotiate who will take over all the locks for the dead node. + * thats right... ALL the locks. 
+ * 3) once a new master is chosen, everyone scans all locks + * and moves aside those mastered by the dead guy + * 4) each of these locks should be locked until recovery is done + * 5) the new master collects up all of secondary lock queue info + * one lock at a time, forcing each node to communicate back + * before continuing + * 6) each secondary lock queue responds with the full known lock info + * 7) once the new master has run all its locks, it sends a ALLDONE! + * message to everyone + * 8) upon receiving this message, the secondary queue node unlocks + * and responds to the ALLDONE + * 9) once the new master gets responses from everyone, he unlocks + * everything and recovery for this dead node is done + *10) go back to 2) while there are still dead nodes + * + */ + + +#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) + +static int dlm_recovery_thread(void *data) +{ + int status; + struct dlm_ctxt *dlm = data; + unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS); + + mlog(0, "dlm thread running for %s...\n", dlm->name); + + while (!kthread_should_stop()) { + if (dlm_joined(dlm)) { + status = dlm_do_recovery(dlm); + if (status == -EAGAIN) { + /* do not sleep, recheck immediately. */ + continue; + } + if (status < 0) + mlog_errno(status); + } + + wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, + kthread_should_stop(), + timeout); + } + + mlog(0, "quitting DLM recovery thread\n"); + return 0; +} + +/* callers of the top-level api calls (dlmlock/dlmunlock) should + * block on the dlm->reco.event when recovery is in progress. 
+ * the dlm recovery thread will set this state when it begins + * recovering a dead node (as the new master or not) and clear + * the state and wake as soon as all affected lock resources have + * been marked with the RECOVERY flag */ +static int dlm_in_recovery(struct dlm_ctxt *dlm) +{ + int in_recovery; + spin_lock(&dlm->spinlock); + in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + spin_unlock(&dlm->spinlock); + return in_recovery; +} + + +/* sleep on dlm->reco.event until DLM_RECO_STATE_ACTIVE is clear */ +void dlm_wait_for_recovery(struct dlm_ctxt *dlm) +{ + wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); +} + +/* mark recovery as active; BUGs if it was already active */ +static void dlm_begin_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); + dlm->reco.state |= DLM_RECO_STATE_ACTIVE; + spin_unlock(&dlm->spinlock); +} + +/* clear the active flag and wake the waiters on dlm->reco.event; + * BUGs if recovery was not active */ +static void dlm_end_recovery(struct dlm_ctxt *dlm) +{ + spin_lock(&dlm->spinlock); + BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); + dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; + spin_unlock(&dlm->spinlock); + wake_up(&dlm->reco.event); +} + +/* one pass of the recovery thread. + * returns 0 when there is nothing to recover or another node masters + * the recovery (caller sleeps), -EAGAIN after mastering a recovery + * here (caller rechecks immediately). */ +static int dlm_do_recovery(struct dlm_ctxt *dlm) +{ + int status = 0; + + spin_lock(&dlm->spinlock); + + /* check to see if the new master has died */ + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && + test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "new master %u died while recovering %u!\n", + dlm->reco.new_master, dlm->reco.dead_node); + /* unset the new_master, leave dead_node */ + dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + } + + /* select a target to recover */ + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + int bit; + + /* only bits below O2NM_MAX_NODES are accepted below, + * so do not scan past them */ + bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0); + if (bit >= O2NM_MAX_NODES || bit < 0) + dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + else + dlm->reco.dead_node = bit; + } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { + /* BUG?
*/ + mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", + dlm->reco.dead_node); + dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + } + + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + // mlog(0, "nothing to recover! sleeping now!\n"); + spin_unlock(&dlm->spinlock); + /* return to main thread loop and sleep. */ + return 0; + } + mlog(0, "recovery thread found node %u in the recovery map!\n", + dlm->reco.dead_node); + spin_unlock(&dlm->spinlock); + + /* take write barrier */ + /* (stops the list reshuffling thread, proxy ast handling) */ + dlm_begin_recovery(dlm); + + if (dlm->reco.new_master == dlm->node_num) + goto master_here; + + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + /* choose a new master */ + if (!dlm_pick_recovery_master(dlm)) { + /* already notified everyone. go. */ + dlm->reco.new_master = dlm->node_num; + goto master_here; + } + mlog(0, "another node will master this recovery session.\n"); + } + mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", + dlm->name, dlm->reco.new_master, + dlm->node_num, dlm->reco.dead_node); + + /* it is safe to start everything back up here + * because all of the dead node's lock resources + * have been marked as in-recovery */ + dlm_end_recovery(dlm); + + /* sleep out in main dlm_recovery_thread loop. */ + return 0; + +master_here: + mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", + dlm->name, dlm->reco.dead_node, dlm->node_num); + + status = dlm_remaster_locks(dlm, dlm->reco.dead_node); + if (status < 0) { + mlog(ML_ERROR, "error %d remastering locks for node %u, " + "retrying.\n", status, dlm->reco.dead_node); + } else { + /* success! 
see if any other nodes need recovery */ + dlm_reset_recovery(dlm); + } + dlm_end_recovery(dlm); + + /* continue and look for another dead node */ + return -EAGAIN; +} + +static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) +{ + int status = 0; + struct dlm_reco_node_data *ndata; + struct list_head *iter; + int all_nodes_done; + int destroy = 0; + int pass = 0; + + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) + goto leave; + + /* safe to access the node data list without a lock, since this + * process is the only one to change the list */ + list_for_each(iter, &dlm->reco.node_data) { + ndata = list_entry (iter, struct dlm_reco_node_data, list); + BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); + ndata->state = DLM_RECO_NODE_DATA_REQUESTING; + + mlog(0, "requesting lock info from node %u\n", + ndata->node_num); + + if (ndata->node_num == dlm->node_num) { + ndata->state = DLM_RECO_NODE_DATA_DONE; + continue; + } + + status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) + ndata->state = DLM_RECO_NODE_DATA_DEAD; + else { + destroy = 1; + goto leave; + } + } + + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + case DLM_RECO_NODE_DATA_REQUESTED: + BUG(); + break; + case DLM_RECO_NODE_DATA_DEAD: + mlog(0, "node %u died after requesting " + "recovery info for node %u\n", + ndata->node_num, dead_node); + // start all over + destroy = 1; + status = -EAGAIN; + goto leave; + case DLM_RECO_NODE_DATA_REQUESTING: + ndata->state = DLM_RECO_NODE_DATA_REQUESTED; + mlog(0, "now receiving recovery data from " + "node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + case DLM_RECO_NODE_DATA_RECEIVING: + mlog(0, "already receiving recovery data from " + "node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + case DLM_RECO_NODE_DATA_DONE: + mlog(0, "already DONE receiving recovery data " + 
"from node %u for dead node %u\n", + ndata->node_num, dead_node); + break; + } + } + + mlog(0, "done requesting all lock info\n"); + + /* nodes should be sending reco data now + * just need to wait */ + + while (1) { + /* check all the nodes now to see if we are + * done, or if anyone died */ + all_nodes_done = 1; + spin_lock(&dlm_reco_state_lock); + list_for_each(iter, &dlm->reco.node_data) { + ndata = list_entry (iter, struct dlm_reco_node_data, list); + + mlog(0, "checking recovery state of node %u\n", + ndata->node_num); + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_REQUESTING: + mlog(ML_ERROR, "bad ndata state for " + "node %u: state=%d\n", + ndata->node_num, ndata->state); + BUG(); + break; + case DLM_RECO_NODE_DATA_DEAD: + mlog(0, "node %u died after " + "requesting recovery info for " + "node %u\n", ndata->node_num, + dead_node); + spin_unlock(&dlm_reco_state_lock); + // start all over + destroy = 1; + status = -EAGAIN; + goto leave; + case DLM_RECO_NODE_DATA_RECEIVING: + case DLM_RECO_NODE_DATA_REQUESTED: + all_nodes_done = 0; + break; + case DLM_RECO_NODE_DATA_DONE: + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + break; + } + } + spin_unlock(&dlm_reco_state_lock); + + mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass, + all_nodes_done?"yes":"no"); + if (all_nodes_done) { + int ret; + + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state + * just send a finalize message to everyone and + * clean up */ + mlog(0, "all nodes are done! 
send finalize\n"); + ret = dlm_send_finalize_reco_message(dlm); + if (ret < 0) + mlog_errno(ret); + + spin_lock(&dlm->spinlock); + dlm_finish_local_lockres_recovery(dlm, dead_node, + dlm->node_num); + spin_unlock(&dlm->spinlock); + mlog(0, "should be done with recovery!\n"); + + mlog(0, "finishing recovery of %s at %lu, " + "dead=%u, this=%u, new=%u\n", dlm->name, + jiffies, dlm->reco.dead_node, + dlm->node_num, dlm->reco.new_master); + destroy = 1; + status = ret; + /* rescan everything marked dirty along the way */ + dlm_kick_thread(dlm, NULL); + break; + } + /* wait to be signalled, with periodic timeout + * to check for node death */ + wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq, + kthread_should_stop(), + msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS)); + + } + +leave: + if (destroy) + dlm_destroy_recovery_area(dlm, dead_node); + + mlog_exit(status); + return status; +} + +static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +{ + int num=0; + struct dlm_reco_node_data *ndata; + + spin_lock(&dlm->spinlock); + memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map)); + /* nodes can only be removed (by dying) after dropping + * this lock, and death will be trapped later, so this should do */ + spin_unlock(&dlm->spinlock); + + while (1) { + num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num); + if (num >= O2NM_MAX_NODES) { + break; + } + BUG_ON(num == dead_node); + + ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); + if (!ndata) { + dlm_destroy_recovery_area(dlm, dead_node); + return -ENOMEM; + } + ndata->node_num = num; + ndata->state = DLM_RECO_NODE_DATA_INIT; + spin_lock(&dlm_reco_state_lock); + list_add_tail(&ndata->list, &dlm->reco.node_data); + spin_unlock(&dlm_reco_state_lock); + num++; + } + + return 0; +} + +static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct list_head *iter, *iter2; + struct dlm_reco_node_data *ndata; + LIST_HEAD(tmplist); + + 
spin_lock(&dlm_reco_state_lock); + list_splice_init(&dlm->reco.node_data, &tmplist); + spin_unlock(&dlm_reco_state_lock); + + list_for_each_safe(iter, iter2, &tmplist) { + ndata = list_entry (iter, struct dlm_reco_node_data, list); + list_del_init(&ndata->list); + kfree(ndata); + } +} + +/* send a DLM_LOCK_REQUEST_MSG for dead_node to node request_from. + * returns the o2net_send_message() result; a negative value is + * logged here and left for the caller to handle. */ +static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from, + u8 dead_node) +{ + struct dlm_lock_request lr; + int ret; /* must be signed: tested against < 0 below */ + + mlog(0, "\n"); + + + mlog(0, "dlm_request_all_locks: dead node is %u, sending request " + "to %u\n", dead_node, request_from); + + memset(&lr, 0, sizeof(lr)); + lr.node_idx = dlm->node_num; + lr.dead_node = dead_node; + + // send message + ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key, + &lr, sizeof(lr), request_from, NULL); + + /* negative status is handled by caller */ + if (ret < 0) + mlog_errno(ret); + + // return from here, then + // sleep until all received or error + return ret; + +} + +int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf; + char *buf = NULL; + struct dlm_work_item *item = NULL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + BUG_ON(lr->dead_node != dlm->reco.dead_node); + + item = kcalloc(1, sizeof(*item), GFP_KERNEL); + if (!item) { + dlm_put(dlm); + return -ENOMEM; + } + + /* this will get freed by dlm_request_all_locks_worker */ + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) { + kfree(item); + dlm_put(dlm); + return -ENOMEM; + } + + /* queue up work for dlm_request_all_locks_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf); + item->u.ral.reco_master = lr->node_idx; + item->u.ral.dead_node = lr->dead_node; + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + schedule_work(&dlm->dispatched_work); + +
dlm_put(dlm); + return 0; +} + +static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_migratable_lockres *mres; + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm; + LIST_HEAD(resources); + struct list_head *iter; + int ret; + u8 dead_node, reco_master; + + dlm = item->dlm; + dead_node = item->u.ral.dead_node; + reco_master = item->u.ral.reco_master; + BUG_ON(dead_node != dlm->reco.dead_node); + BUG_ON(reco_master != dlm->reco.new_master); + + mres = (struct dlm_migratable_lockres *)data; + + /* lock resources should have already been moved to the + * dlm->reco.resources list. now move items from that list + * to a temp list if the dead owner matches. note that the + * whole cluster recovers only one node at a time, so we + * can safely move UNKNOWN lock resources for each recovery + * session. */ + dlm_move_reco_locks_to_list(dlm, &resources, dead_node); + + /* now we can begin blasting lockreses without the dlm lock */ + list_for_each(iter, &resources) { + res = list_entry (iter, struct dlm_lock_resource, recovering); + ret = dlm_send_one_lockres(dlm, res, mres, reco_master, + DLM_MRES_RECOVERY); + if (ret < 0) + mlog_errno(ret); + } + + /* move the resources back to the list */ + spin_lock(&dlm->spinlock); + list_splice_init(&resources, &dlm->reco.resources); + spin_unlock(&dlm->spinlock); + + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) + mlog_errno(ret); + + free_page((unsigned long)data); +} + + +static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) +{ + int ret, tmpret; + struct dlm_reco_data_done done_msg; + + memset(&done_msg, 0, sizeof(done_msg)); + done_msg.node_idx = dlm->node_num; + done_msg.dead_node = dead_node; + mlog(0, "sending DATA DONE message to %u, " + "my node=%u, dead node=%u\n", send_to, done_msg.node_idx, + done_msg.dead_node); + + ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, + sizeof(done_msg), send_to, &tmpret); 
+ /* negative status is ignored by the caller */ + if (ret >= 0) + ret = tmpret; + return ret; +} + + +int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; + struct list_head *iter; + struct dlm_reco_node_data *ndata = NULL; + int ret = -EINVAL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); + BUG_ON(done->dead_node != dlm->reco.dead_node); + + spin_lock(&dlm_reco_state_lock); + list_for_each(iter, &dlm->reco.node_data) { + ndata = list_entry (iter, struct dlm_reco_node_data, list); + if (ndata->node_num != done->node_idx) + continue; + + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + case DLM_RECO_NODE_DATA_DEAD: + case DLM_RECO_NODE_DATA_DONE: + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(ML_ERROR, "bad ndata state for node %u:" + " state=%d\n", ndata->node_num, + ndata->state); + BUG(); + break; + case DLM_RECO_NODE_DATA_RECEIVING: + case DLM_RECO_NODE_DATA_REQUESTED: + case DLM_RECO_NODE_DATA_REQUESTING: + mlog(0, "node %u is DONE sending " + "recovery data!\n", + ndata->node_num); + + ndata->state = DLM_RECO_NODE_DATA_DONE; + ret = 0; + break; + } + } + spin_unlock(&dlm_reco_state_lock); + + /* wake the recovery thread, some node is done */ + if (!ret) + dlm_kick_recovery_thread(dlm); + + if (ret < 0) + mlog(ML_ERROR, "failed to find recovery node data for node " + "%u\n", done->node_idx); + dlm_put(dlm); + + mlog(0, "leaving reco data done handler, ret=%d\n", ret); + return ret; +} + +static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, + struct list_head *list, + u8 dead_node) +{ + struct dlm_lock_resource *res; + struct list_head *iter, *iter2; + + spin_lock(&dlm->spinlock); + list_for_each_safe(iter, iter2, &dlm->reco.resources) { + res = 
list_entry (iter, struct dlm_lock_resource, recovering); + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) + continue; + if (res->owner == dead_node) { + mlog(0, "found lockres owned by dead node while " + "doing recovery for node %u. sending it.\n", + dead_node); + list_del_init(&res->recovering); + list_add_tail(&res->recovering, list); + } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "found UNKNOWN owner while doing recovery " + "for node %u. sending it.\n", dead_node); + list_del_init(&res->recovering); + list_add_tail(&res->recovering, list); + } + } + spin_unlock(&dlm->spinlock); +} + +static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res) +{ + int total_locks = 0; + struct list_head *iter, *queue = &res->granted; + int i; + + for (i=0; i<3; i++) { + list_for_each(iter, queue) + total_locks++; + queue++; + } + return total_locks; +} + + +static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, + struct dlm_migratable_lockres *mres, + u8 send_to, + struct dlm_lock_resource *res, + int total_locks) +{ + u64 mig_cookie = be64_to_cpu(mres->mig_cookie); + int mres_total_locks = be32_to_cpu(mres->total_locks); + int sz, ret = 0, status = 0; + u8 orig_flags = mres->flags, + orig_master = mres->master; + + BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS); + if (!mres->num_locks) + return 0; + + sz = sizeof(struct dlm_migratable_lockres) + + (mres->num_locks * sizeof(struct dlm_migratable_lock)); + + /* add an all-done flag if we reached the last lock */ + orig_flags = mres->flags; + BUG_ON(total_locks > mres_total_locks); + if (total_locks == mres_total_locks) + mres->flags |= DLM_MRES_ALL_DONE; + + /* send it */ + ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres, + sz, send_to, &status); + if (ret < 0) { + /* XXX: negative status is not handled. + * this will end up killing this node. 
*/ + mlog_errno(ret); + } else { + /* might get an -ENOMEM back here */ + ret = status; + if (ret < 0) { + mlog_errno(ret); + + if (ret == -EFAULT) { + mlog(ML_ERROR, "node %u told me to kill " + "myself!\n", send_to); + BUG(); + } + } + } + + /* zero and reinit the message buffer */ + dlm_init_migratable_lockres(mres, res->lockname.name, + res->lockname.len, mres_total_locks, + mig_cookie, orig_flags, orig_master); + return ret; +} + +static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, + const char *lockname, int namelen, + int total_locks, u64 cookie, + u8 flags, u8 master) +{ + /* mres here is one full page */ + memset(mres, 0, PAGE_SIZE); + mres->lockname_len = namelen; + memcpy(mres->lockname, lockname, namelen); + mres->num_locks = 0; + mres->total_locks = cpu_to_be32(total_locks); + mres->mig_cookie = cpu_to_be64(cookie); + mres->flags = flags; + mres->master = master; +} + + +/* returns 1 if this lock fills the network structure, + * 0 otherwise */ +static int dlm_add_lock_to_array(struct dlm_lock *lock, + struct dlm_migratable_lockres *mres, int queue) +{ + struct dlm_migratable_lock *ml; + int lock_num = mres->num_locks; + + ml = &(mres->ml[lock_num]); + ml->cookie = lock->ml.cookie; + ml->type = lock->ml.type; + ml->convert_type = lock->ml.convert_type; + ml->highest_blocked = lock->ml.highest_blocked; + ml->list = queue; + if (lock->lksb) { + ml->flags = lock->lksb->flags; + /* send our current lvb */ + if (ml->type == LKM_EXMODE || + ml->type == LKM_PRMODE) { + /* if it is already set, this had better be a PR + * and it has to match */ + if (mres->lvb[0] && (ml->type == LKM_EXMODE || + memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { + mlog(ML_ERROR, "mismatched lvbs!\n"); + __dlm_print_one_lock_resource(lock->lockres); + BUG(); + } + memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN); + } + } + ml->node = lock->ml.node; + mres->num_locks++; + /* we reached the max, send this network message */ + if (mres->num_locks == 
DLM_MAX_MIGRATABLE_LOCKS) + return 1; + return 0; +} + + +int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres, + u8 send_to, u8 flags) +{ + struct list_head *queue, *iter; + int total_locks, i; + u64 mig_cookie = 0; + struct dlm_lock *lock; + int ret = 0; + + BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); + + mlog(0, "sending to %u\n", send_to); + + total_locks = dlm_num_locks_in_lockres(res); + if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) { + /* rare, but possible */ + mlog(0, "argh. lockres has %d locks. this will " + "require more than one network packet to " + "migrate\n", total_locks); + mig_cookie = dlm_get_next_mig_cookie(); + } + + dlm_init_migratable_lockres(mres, res->lockname.name, + res->lockname.len, total_locks, + mig_cookie, flags, res->owner); + + total_locks = 0; + for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each(iter, queue) { + lock = list_entry (iter, struct dlm_lock, list); + + /* add another lock. */ + total_locks++; + if (!dlm_add_lock_to_array(lock, mres, i)) + continue; + + /* this filled the lock message, + * we must send it immediately. */ + ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, + res, total_locks); + if (ret < 0) { + // TODO + mlog(ML_ERROR, "dlm_send_mig_lockres_msg " + "returned %d, TODO\n", ret); + BUG(); + } + } + } + /* flush any remaining locks */ + ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); + if (ret < 0) { + // TODO + mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " + "TODO\n", ret); + BUG(); + } + return ret; +} + + + +/* + * this message will contain no more than one page worth of + * recovery data, and it will work on only one lockres. + * there may be many locks in this page, and we may need to wait + * for additional packets to complete all the locks (rare, but + * possible). 
+ */ +/* + * NOTE: the allocation error cases here are scary + * we really cannot afford to fail an alloc in recovery + * do we spin? returning an error only delays the problem really + */ + +int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_migratable_lockres *mres = + (struct dlm_migratable_lockres *)msg->buf; + int ret = 0; + u8 real_master; + char *buf = NULL; + struct dlm_work_item *item = NULL; + struct dlm_lock_resource *res = NULL; + + if (!dlm_grab(dlm)) + return -EINVAL; + + BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); + + real_master = mres->master; + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* cannot migrate a lockres with no master */ + BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); + } + + mlog(0, "%s message received from node %u\n", + (mres->flags & DLM_MRES_RECOVERY) ? + "recovery" : "migration", mres->master); + if (mres->flags & DLM_MRES_ALL_DONE) + mlog(0, "all done flag. all lockres data received!\n"); + + ret = -ENOMEM; + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_KERNEL); + if (!buf || !item) + goto leave; + + /* lookup the lock to see if we have a secondary queue for this + * already... just add the locks in and this will have its owner + * and RECOVERY flag changed when it completes. 
*/ + res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + if (res) { + /* this will get a ref on res */ + /* mark it as recovering/migrating and hash it */ + spin_lock(&res->spinlock); + if (mres->flags & DLM_MRES_RECOVERY) { + res->state |= DLM_LOCK_RES_RECOVERING; + } else { + if (res->state & DLM_LOCK_RES_MIGRATING) { + /* this is at least the second + * lockres message */ + mlog(0, "lock %.*s is already migrating\n", + mres->lockname_len, + mres->lockname); + } else if (res->state & DLM_LOCK_RES_RECOVERING) { + /* caller should BUG */ + mlog(ML_ERROR, "node is attempting to migrate " + "lock %.*s, but marked as recovering!\n", + mres->lockname_len, mres->lockname); + ret = -EFAULT; + spin_unlock(&res->spinlock); + goto leave; + } + res->state |= DLM_LOCK_RES_MIGRATING; + } + spin_unlock(&res->spinlock); + } else { + /* need to allocate, just like if it was + * mastered here normally */ + res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); + if (!res) + goto leave; + + /* to match the ref that we would have gotten if + * dlm_lookup_lockres had succeeded */ + dlm_lockres_get(res); + + /* mark it as recovering/migrating and hash it */ + if (mres->flags & DLM_MRES_RECOVERY) + res->state |= DLM_LOCK_RES_RECOVERING; + else + res->state |= DLM_LOCK_RES_MIGRATING; + + spin_lock(&dlm->spinlock); + __dlm_insert_lockres(dlm, res); + spin_unlock(&dlm->spinlock); + + /* now that the new lockres is inserted, + * make it usable by other processes */ + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + spin_unlock(&res->spinlock); + + /* add an extra ref for just-allocated lockres + * otherwise the lockres will be purged immediately */ + dlm_lockres_get(res); + + } + + /* at this point we have allocated everything we need, + * and we have a hashed lockres with an extra ref and + * the proper res->state flags. 
*/ + ret = 0; + if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* migration cannot have an unknown master */ + BUG_ON(!(mres->flags & DLM_MRES_RECOVERY)); + mlog(0, "recovery has passed me a lockres with an " + "unknown owner.. will need to requery: " + "%.*s\n", mres->lockname_len, mres->lockname); + } else { + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, dlm->node_num); + spin_unlock(&res->spinlock); + } + + /* queue up work for dlm_mig_lockres_worker */ + dlm_grab(dlm); /* get an extra ref for the work item */ + memcpy(buf, msg->buf, be16_to_cpu(msg->data_len)); /* copy the whole message */ + dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf); + item->u.ml.lockres = res; /* already have a ref */ + item->u.ml.real_master = real_master; + spin_lock(&dlm->work_lock); + list_add_tail(&item->list, &dlm->work_list); + spin_unlock(&dlm->work_lock); + schedule_work(&dlm->dispatched_work); + +leave: + dlm_put(dlm); + if (ret < 0) { + /* kfree(NULL) is a no-op, no need to test first */ + kfree(buf); + kfree(item); + } + + mlog_exit(ret); + return ret; +} + + +/* deferred half of dlm_mig_lockres_handler. + * data is the kmalloc'ed copy of the migratable lockres message made + * by the handler (freed here); the dlm context comes from item->dlm. */ +static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) +{ + struct dlm_ctxt *dlm; + struct dlm_migratable_lockres *mres; + int ret = 0; + struct dlm_lock_resource *res; + u8 real_master; + + dlm = item->dlm; + mres = (struct dlm_migratable_lockres *)data; + + res = item->u.ml.lockres; + real_master = item->u.ml.real_master; + + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + /* this case is super-rare. only occurs if + * node death happens during migration. */ +again: + ret = dlm_lockres_master_requery(dlm, res, &real_master); + if (ret < 0) { + mlog(0, "dlm_lockres_master_requery failure: %d\n", + ret); + goto again; + } + if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "lockres %.*s not claimed. 
" + "this node will take it.\n", + res->lockname.len, res->lockname.name); + } else { + mlog(0, "master needs to respond to sender " + "that node %u still owns %.*s\n", + real_master, res->lockname.len, + res->lockname.name); + /* cannot touch this lockres */ + goto leave; + } + } + + ret = dlm_process_recovery_data(dlm, res, mres); + if (ret < 0) + mlog(0, "dlm_process_recovery_data returned %d\n", ret); + else + mlog(0, "dlm_process_recovery_data succeeded\n"); + + if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) == + (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) { + ret = dlm_finish_migration(dlm, res, mres->master); + if (ret < 0) + mlog_errno(ret); + } + +leave: + kfree(data); + mlog_exit(ret); +} + + + +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master) +{ + struct dlm_node_iter iter; + int nodenum; + int ret = 0; + + *real_master = DLM_LOCK_RES_OWNER_UNKNOWN; + + /* we only reach here if one of the two nodes in a + * migration died while the migration was in progress. + * at this point we need to requery the master. we + * know that the new_master got as far as creating + * an mle on at least one node, but we do not know + * if any nodes had actually cleared the mle and set + * the master to the new_master. the old master + * is supposed to set the owner to UNKNOWN in the + * event of a new_master death, so the only possible + * responses that we can get from nodes here are + * that the master is new_master, or that the master + * is UNKNOWN. + * if all nodes come back with UNKNOWN then we know + * the lock needs remastering here. + * if any node comes back with a valid master, check + * to see if that master is the one that we are + * recovering. if so, then the new_master died and + * we need to remaster this lock. if not, then the + * new_master survived and that node will respond to + * other nodes about the owner. 
+ * if there is an owner, this node needs to dump this + * lockres and alert the sender that this lockres + * was rejected. */ + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + /* do not send to self */ + if (nodenum == dlm->node_num) + continue; + ret = dlm_do_master_requery(dlm, res, nodenum, real_master); + if (ret < 0) { + mlog_errno(ret); + BUG(); + /* TODO: need to figure a way to restart this */ + } + if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "lock master is %u\n", *real_master); + break; + } + } + return ret; +} + + +static int dlm_do_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 nodenum, u8 *real_master) +{ + int ret = -EINVAL; + struct dlm_master_requery req; + int status = DLM_LOCK_RES_OWNER_UNKNOWN; + + memset(&req, 0, sizeof(req)); + req.node_idx = dlm->node_num; + req.namelen = res->lockname.len; + memcpy(req.name, res->lockname.name, res->lockname.len); + + ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key, + &req, sizeof(req), nodenum, &status); + /* XXX: negative status not handled properly here. 
*/
+	if (ret < 0)
+		mlog_errno(ret);
+	else {
+		BUG_ON(status < 0);
+		BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
+		*real_master = (u8) (status & 0xff);
+		mlog(0, "node %u responded to master requery with %u\n",
+			  nodenum, *real_master);
+		ret = 0;
+	}
+	return ret;
+}
+
+
+/* this function cannot error, so unless the sending
+ * or receiving of the message failed, the owner can
+ * be trusted
+ *
+ * Answers a master requery: looks up the lockres named in the
+ * request and returns its current owner as the handler status.
+ * DLM_LOCK_RES_OWNER_UNKNOWN is returned when the domain can no
+ * longer be grabbed or when no lockres with that name is hashed.
+ * When this node is the owner, an assert master tagged with
+ * DLM_ASSERT_MASTER_REQUERY is dispatched as well.
+ *
+ * msg:  request; msg->buf is read as a struct dlm_master_requery
+ * len:  unused here
+ * data: the struct dlm_ctxt for the domain
+ *
+ * NOTE(review): __dlm_lookup_lockres presumably returns a referenced
+ * lockres, and no dlm_lockres_put is visible on any path below.
+ * Confirm whether the dispatched assert consumes that reference and
+ * whether it leaks when this node is not the owner. */
+int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
+{
+	struct dlm_ctxt *dlm = data;
+	struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
+	struct dlm_lock_resource *res = NULL;
+	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
+	u32 flags = DLM_ASSERT_MASTER_REQUERY;
+
+	if (!dlm_grab(dlm)) {
+		/* since the domain has gone away on this
+		 * node, the proper response is UNKNOWN */
+		return master;
+	}
+
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
+	if (res) {
+		spin_lock(&res->spinlock);
+		master = res->owner;
+		if (master == dlm->node_num) {
+			int ret = dlm_dispatch_assert_master(dlm, res,
+							     0, 0, flags);
+			if (ret < 0) {
+				/* log the error actually returned, not
+				 * a hard-coded -ENOMEM */
+				mlog_errno(ret);
+				/* retry!? */
+				BUG();
+			}
+		}
+		spin_unlock(&res->spinlock);
+	}
+	spin_unlock(&dlm->spinlock);
+
+	dlm_put(dlm);
+	return master;
+}
+
+/* Translate a queue index (0..2, enforced by the BUG_ONs) into the
+ * matching list head by stepping list_num heads past res->granted.
+ * NOTE(review): this relies on the list heads that follow 'granted'
+ * in struct dlm_lock_resource being laid out back to back -- confirm
+ * against the struct definition. */
+static inline struct list_head *
+dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
+{
+	struct list_head *ret;
+	BUG_ON(list_num < 0);
+	BUG_ON(list_num > 2);
+	ret = &(res->granted);
+	ret += list_num;
+	return ret;
+}
+/* TODO: do ast flush business
+ * TODO: do MIGRATING and RECOVERING spinning
+ */
+
+/*
+* NOTE about in-flight requests during migration:
+*
+* Before attempting the migrate, the master has marked the lockres as
+* MIGRATING and then flushed all of its pending ASTS.
So any in-flight +* requests either got queued before the MIGRATING flag got set, in which +* case the lock data will reflect the change and a return message is on +* the way, or the request failed to get in before MIGRATING got set. In +* this case, the caller will be told to spin and wait for the MIGRATING +* flag to be dropped, then recheck the master. +* This holds true for the convert, cancel and unlock cases, and since lvb +* updates are tied to these same messages, it applies to lvb updates as +* well. For the lock case, there is no way a lock can be on the master +* queue and not be on the secondary queue since the lock is always added +* locally first. This means that the new target node will never be sent +* a lock that he doesn't already have on the list. +* In total, this means that the local lock is correct and should not be +* updated to match the one sent by the master. Any messages sent back +* from the master before the MIGRATING flag will bring the lock properly +* up-to-date, and the change will be ordered properly for the waiter. +* We will *not* attempt to modify the lock underneath the waiter. +*/ + +static int dlm_process_recovery_data(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_migratable_lockres *mres) +{ + struct dlm_migratable_lock *ml; + struct list_head *queue; + struct dlm_lock *newlock = NULL; + struct dlm_lockstatus *lksb = NULL; + int ret = 0; + int i; + struct list_head *iter; + struct dlm_lock *lock = NULL; + + mlog(0, "running %d locks for this lockres\n", mres->num_locks); + for (i=0; i<mres->num_locks; i++) { + ml = &(mres->ml[i]); + BUG_ON(ml->highest_blocked != LKM_IVMODE); + newlock = NULL; + lksb = NULL; + + queue = dlm_list_num_to_pointer(res, ml->list); + + /* if the lock is for the local node it needs to + * be moved to the proper location within the queue. + * do not allocate a new lock structure. */ + if (ml->node == dlm->node_num) { + /* MIGRATION ONLY! 
*/ + BUG_ON(!(mres->flags & DLM_MRES_MIGRATION)); + + spin_lock(&res->spinlock); + list_for_each(iter, queue) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.cookie != ml->cookie) + lock = NULL; + else + break; + } + + /* lock is always created locally first, and + * destroyed locally last. it must be on the list */ + if (!lock) { + mlog(ML_ERROR, "could not find local lock " + "with cookie %"MLFu64"!\n", + ml->cookie); + BUG(); + } + BUG_ON(lock->ml.node != ml->node); + + /* see NOTE above about why we do not update + * to match the master here */ + + /* move the lock to its proper place */ + /* do not alter lock refcount. switching lists. */ + list_del_init(&lock->list); + list_add_tail(&lock->list, queue); + spin_unlock(&res->spinlock); + + mlog(0, "just reordered a local lock!\n"); + continue; + } + + /* lock is for another node. */ + newlock = dlm_new_lock(ml->type, ml->node, + be64_to_cpu(ml->cookie), NULL); + if (!newlock) { + ret = -ENOMEM; + goto leave; + } + lksb = newlock->lksb; + dlm_lock_attach_lockres(newlock, res); + + if (ml->convert_type != LKM_IVMODE) { + BUG_ON(queue != &res->converting); + newlock->ml.convert_type = ml->convert_type; + } + lksb->flags |= (ml->flags & + (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); + + if (mres->lvb[0]) { + if (lksb->flags & DLM_LKSB_PUT_LVB) { + /* other node was trying to update + * lvb when node died. recreate the + * lksb with the updated lvb. */ + memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + } else { + /* otherwise, the node is sending its + * most recent valid lvb info */ + BUG_ON(ml->type != LKM_EXMODE && + ml->type != LKM_PRMODE); + if (res->lvb[0] && (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + mlog(ML_ERROR, "received bad lvb!\n"); + __dlm_print_one_lock_resource(res); + BUG(); + } + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); + } + } + + + /* NOTE: + * wrt lock queue ordering and recovery: + * 1. order of locks on granted queue is + * meaningless. + * 2. 
order of locks on converting queue is + * LOST with the node death. sorry charlie. + * 3. order of locks on the blocked queue is + * also LOST. + * order of locks does not affect integrity, it + * just means that a lock request may get pushed + * back in line as a result of the node death. + * also note that for a given node the lock order + * for its secondary queue locks is preserved + * relative to each other, but clearly *not* + * preserved relative to locks from other nodes. + */ + spin_lock(&res->spinlock); + dlm_lock_get(newlock); + list_add_tail(&newlock->list, queue); + spin_unlock(&res->spinlock); + } + mlog(0, "done running all the locks\n"); + +leave: + if (ret < 0) { + mlog_errno(ret); + if (newlock) + dlm_lock_put(newlock); + } + + mlog_exit(ret); + return ret; +} + +void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + int i; + struct list_head *queue, *iter, *iter2; + struct dlm_lock *lock; + + res->state |= DLM_LOCK_RES_RECOVERING; + if (!list_empty(&res->recovering)) + list_del_init(&res->recovering); + list_add_tail(&res->recovering, &dlm->reco.resources); + + /* find any pending locks and put them back on proper list */ + for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each_safe(iter, iter2, queue) { + lock = list_entry (iter, struct dlm_lock, list); + dlm_lock_get(lock); + if (lock->convert_pending) { + /* move converting lock back to granted */ + BUG_ON(i != DLM_CONVERTING_LIST); + mlog(0, "node died with convert pending " + "on %.*s. move back to granted list.\n", + res->lockname.len, res->lockname.name); + dlm_revert_pending_convert(res, lock); + lock->convert_pending = 0; + } else if (lock->lock_pending) { + /* remove pending lock requests completely */ + BUG_ON(i != DLM_BLOCKED_LIST); + mlog(0, "node died with lock pending " + "on %.*s. 
remove from blocked list and skip.\n", + res->lockname.len, res->lockname.name); + /* lock will be floating until ref in + * dlmlock_remote is freed after the network + * call returns. ok for it to not be on any + * list since no ast can be called + * (the master is dead). */ + dlm_revert_pending_lock(res, lock); + lock->lock_pending = 0; + } else if (lock->unlock_pending) { + /* if an unlock was in progress, treat as + * if this had completed successfully + * before sending this lock state to the + * new master. note that the dlm_unlock + * call is still responsible for calling + * the unlockast. that will happen after + * the network call times out. for now, + * just move lists to prepare the new + * recovery master. */ + BUG_ON(i != DLM_GRANTED_LIST); + mlog(0, "node died with unlock pending " + "on %.*s. remove from blocked list and skip.\n", + res->lockname.len, res->lockname.name); + dlm_commit_pending_unlock(res, lock); + lock->unlock_pending = 0; + } else if (lock->cancel_pending) { + /* if a cancel was in progress, treat as + * if this had completed successfully + * before sending this lock state to the + * new master */ + BUG_ON(i != DLM_CONVERTING_LIST); + mlog(0, "node died with cancel pending " + "on %.*s. move back to granted list.\n", + res->lockname.len, res->lockname.name); + dlm_commit_pending_cancel(res, lock); + lock->cancel_pending = 0; + } + dlm_lock_put(lock); + } + } +} + + + +/* removes all recovered locks from the recovery list. + * sets the res->owner to the new master. + * unsets the RECOVERY flag and wakes waiters. 
*/ +static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, + u8 dead_node, u8 new_master) +{ + int i; + struct list_head *iter, *iter2, *bucket; + struct dlm_lock_resource *res; + + mlog_entry_void(); + + assert_spin_locked(&dlm->spinlock); + + list_for_each_safe(iter, iter2, &dlm->reco.resources) { + res = list_entry (iter, struct dlm_lock_resource, recovering); + if (res->owner == dead_node) { + list_del_init(&res->recovering); + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + } + + /* this will become unnecessary eventually, but + * for now we need to run the whole hash, clear + * the RECOVERING state and set the owner + * if necessary */ + for (i=0; i<DLM_HASH_SIZE; i++) { + bucket = &(dlm->resources[i]); + list_for_each(iter, bucket) { + res = list_entry (iter, struct dlm_lock_resource, list); + if (res->state & DLM_LOCK_RES_RECOVERING) { + if (res->owner == dead_node) { + mlog(0, "(this=%u) res %.*s owner=%u " + "was not on recovering list, but " + "clearing state anyway\n", + dlm->node_num, res->lockname.len, + res->lockname.name, new_master); + } else if (res->owner == dlm->node_num) { + mlog(0, "(this=%u) res %.*s owner=%u " + "was not on recovering list, " + "owner is THIS node, clearing\n", + dlm->node_num, res->lockname.len, + res->lockname.name, new_master); + } else + continue; + + spin_lock(&res->spinlock); + dlm_change_lockres_owner(dlm, res, new_master); + res->state &= ~DLM_LOCK_RES_RECOVERING; + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + } + } + } +} + +static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local) +{ + if (local) { + if (lock->ml.type != LKM_EXMODE && + lock->ml.type != LKM_PRMODE) + return 1; + } else if (lock->ml.type == LKM_EXMODE) + return 1; + return 0; +} + +static void 
dlm_revalidate_lvb(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 dead_node) +{ + struct list_head *iter, *queue; + struct dlm_lock *lock; + int blank_lvb = 0, local = 0; + int i; + u8 search_node; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + if (res->owner == dlm->node_num) + /* if this node owned the lockres, and if the dead node + * had an EX when he died, blank out the lvb */ + search_node = dead_node; + else { + /* if this is a secondary lockres, and we had no EX or PR + * locks granted, we can no longer trust the lvb */ + search_node = dlm->node_num; + local = 1; /* check local state for valid lvb */ + } + + for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { + queue = dlm_list_idx_to_ptr(res, i); + list_for_each(iter, queue) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node == search_node) { + if (dlm_lvb_needs_invalidation(lock, local)) { + /* zero the lksb lvb and lockres lvb */ + blank_lvb = 1; + memset(lock->lksb->lvb, 0, DLM_LVB_LEN); + } + } + } + } + + if (blank_lvb) { + mlog(0, "clearing %.*s lvb, dead node %u had EX\n", + res->lockname.len, res->lockname.name, dead_node); + memset(res->lvb, 0, DLM_LVB_LEN); + } +} + +static void dlm_free_dead_locks(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, u8 dead_node) +{ + struct list_head *iter, *tmpiter; + struct dlm_lock *lock; + + /* this node is the lockres master: + * 1) remove any stale locks for the dead node + * 2) if the dead node had an EX when he died, blank out the lvb + */ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* TODO: check pending_asts, pending_basts here */ + list_for_each_safe(iter, tmpiter, &res->granted) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + } + list_for_each_safe(iter, tmpiter, &res->converting) { + lock = list_entry (iter, struct dlm_lock, list); + if 
(lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + } + list_for_each_safe(iter, tmpiter, &res->blocked) { + lock = list_entry (iter, struct dlm_lock, list); + if (lock->ml.node == dead_node) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + } + + /* do not kick thread yet */ + __dlm_dirty_lockres(dlm, res); +} + +/* if this node is the recovery master, and there are no + * locks for a given lockres owned by this node that are in + * either PR or EX mode, zero out the lvb before requesting. + * + */ + + +static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct list_head *iter; + struct dlm_lock_resource *res; + int i; + struct list_head *bucket; + + + /* purge any stale mles */ + dlm_clean_master_list(dlm, dead_node); + + /* + * now clean up all lock resources. there are two rules: + * + * 1) if the dead node was the master, move the lockres + * to the recovering list. set the RECOVERING flag. + * this lockres needs to be cleaned up before it can + * be used further. + * + * 2) if this node was the master, remove all locks from + * each of the lockres queues that were owned by the + * dead node. once recovery finishes, the dlm thread + * can be kicked again to see if any ASTs or BASTs + * need to be fired as a result. 
+ */ + for (i=0; i<DLM_HASH_SIZE; i++) { + bucket = &(dlm->resources[i]); + list_for_each(iter, bucket) { + res = list_entry (iter, struct dlm_lock_resource, list); + if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) + continue; + + spin_lock(&res->spinlock); + /* zero the lvb if necessary */ + dlm_revalidate_lvb(dlm, res, dead_node); + if (res->owner == dead_node) + dlm_move_lockres_to_recovery_list(dlm, res); + else if (res->owner == dlm->node_num) { + dlm_free_dead_locks(dlm, res, dead_node); + __dlm_lockres_calc_usage(dlm, res); + } + spin_unlock(&res->spinlock); + } + } + +} + +static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) +{ + assert_spin_locked(&dlm->spinlock); + + /* check to see if the node is already considered dead */ + if (!test_bit(idx, dlm->live_nodes_map)) { + mlog(0, "for domain %s, node %d is already dead. " + "another node likely did recovery already.\n", + dlm->name, idx); + return; + } + + /* check to see if we do not care about this node */ + if (!test_bit(idx, dlm->domain_map)) { + /* This also catches the case that we get a node down + * but haven't joined the domain yet. */ + mlog(0, "node %u already removed from domain!\n", idx); + return; + } + + clear_bit(idx, dlm->live_nodes_map); + + /* Clean up join state on node death. */ + if (dlm->joining_node == idx) { + mlog(0, "Clearing join state for node %u\n", idx); + __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + } + + /* make sure local cleanup occurs before the heartbeat events */ + if (!test_bit(idx, dlm->recovery_map)) + dlm_do_local_recovery_cleanup(dlm, idx); + + /* notify anything attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, idx, 0); + + mlog(0, "node %u being removed from domain map!\n", idx); + clear_bit(idx, dlm->domain_map); + /* wake up migration waiters if a node goes down. + * perhaps later we can genericize this for other waiters. 
*/ + wake_up(&dlm->migration_wq); + + if (test_bit(idx, dlm->recovery_map)) + mlog(0, "domain %s, node %u already added " + "to recovery map!\n", dlm->name, idx); + else + set_bit(idx, dlm->recovery_map); +} + +void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) +{ + struct dlm_ctxt *dlm = data; + + if (!dlm_grab(dlm)) + return; + + spin_lock(&dlm->spinlock); + __dlm_hb_node_down(dlm, idx); + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); +} + +void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) +{ + struct dlm_ctxt *dlm = data; + + if (!dlm_grab(dlm)) + return; + + spin_lock(&dlm->spinlock); + + set_bit(idx, dlm->live_nodes_map); + + /* notify any mles attached to the heartbeat events */ + dlm_hb_event_notify_attached(dlm, idx, 1); + + spin_unlock(&dlm->spinlock); + + dlm_put(dlm); +} + +static void dlm_reco_ast(void *astdata) +{ + struct dlm_ctxt *dlm = astdata; + mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n", + dlm->node_num, dlm->name); +} +static void dlm_reco_bast(void *astdata, int blocked_type) +{ + struct dlm_ctxt *dlm = astdata; + mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n", + dlm->node_num, dlm->name); +} +static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) +{ + mlog(0, "unlockast for recovery lock fired!\n"); +} + + +static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) +{ + enum dlm_status ret; + struct dlm_lockstatus lksb; + int status = -EINVAL; + + mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", + dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); +retry: + memset(&lksb, 0, sizeof(lksb)); + + ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, + DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); + + if (ret == DLM_NORMAL) { + mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", + dlm->name, dlm->node_num); + /* I am master, send message to all nodes saying + * that I am beginning a recovery session */ + status = 
dlm_send_begin_reco_message(dlm, + dlm->reco.dead_node); + + /* recovery lock is a special case. ast will not get fired, + * so just go ahead and unlock it. */ + ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); + if (ret != DLM_NORMAL) { + /* this would really suck. this could only happen + * if there was a network error during the unlock + * because of node death. this means the unlock + * is actually "done" and the lock structure is + * even freed. we can continue, but only + * because this specific lock name is special. */ + mlog(0, "dlmunlock returned %d\n", ret); + } + + if (status < 0) { + mlog(0, "failed to send recovery message. " + "must retry with new node map.\n"); + goto retry; + } + } else if (ret == DLM_NOTQUEUED) { + mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", + dlm->name, dlm->node_num); + /* another node is master. wait on + * reco.new_master != O2NM_INVALID_NODE_NUM */ + status = -EEXIST; + } + + return status; +} + +static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) +{ + struct dlm_begin_reco br; + int ret = 0; + struct dlm_node_iter iter; + int nodenum; + int status; + + mlog_entry("%u\n", dead_node); + + mlog(0, "dead node is %u\n", dead_node); + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + clear_bit(dead_node, iter.node_map); + + memset(&br, 0, sizeof(br)); + br.node_idx = dlm->node_num; + br.dead_node = dead_node; + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + ret = 0; + if (nodenum == dead_node) { + mlog(0, "not sending begin reco to dead node " + "%u\n", dead_node); + continue; + } + if (nodenum == dlm->node_num) { + mlog(0, "not sending begin reco to self\n"); + continue; + } + + ret = -EINVAL; + mlog(0, "attempting to send begin reco msg to %d\n", + nodenum); + ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, + &br, sizeof(br), nodenum, &status); + /* negative status is handled ok by caller here */ + if 
(ret >= 0) + ret = status; + if (ret < 0) { + struct dlm_lock_resource *res; + mlog_errno(ret); + mlog(ML_ERROR, "begin reco of dlm %s to node %u " + " returned %d\n", dlm->name, nodenum, ret); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + break; + } + } + + return ret; +} + +int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf; + + /* ok to return 0, domain has gone away */ + if (!dlm_grab(dlm)) + return 0; + + mlog(0, "node %u wants to recover node %u\n", + br->node_idx, br->dead_node); + + dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); + + spin_lock(&dlm->spinlock); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + mlog(0, "new_master already set to %u!\n", + dlm->reco.new_master); + } + if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { + mlog(0, "dead_node already set to %u!\n", + dlm->reco.dead_node); + } + dlm->reco.new_master = br->node_idx; + dlm->reco.dead_node = br->dead_node; + if (!test_bit(br->dead_node, dlm->recovery_map)) { + mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " + "node has not yet. 
marking %u as dead\n", + br->node_idx, br->dead_node, br->dead_node); + __dlm_hb_node_down(dlm, br->dead_node); + } + spin_unlock(&dlm->spinlock); + + dlm_kick_recovery_thread(dlm); + dlm_put(dlm); + return 0; +} + +static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) +{ + int ret = 0; + struct dlm_finalize_reco fr; + struct dlm_node_iter iter; + int nodenum; + int status; + + mlog(0, "finishing recovery for node %s:%u\n", + dlm->name, dlm->reco.dead_node); + + spin_lock(&dlm->spinlock); + dlm_node_iter_init(dlm->domain_map, &iter); + spin_unlock(&dlm->spinlock); + + memset(&fr, 0, sizeof(fr)); + fr.node_idx = dlm->node_num; + fr.dead_node = dlm->reco.dead_node; + + while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { + if (nodenum == dlm->node_num) + continue; + ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, + &fr, sizeof(fr), nodenum, &status); + if (ret >= 0) { + ret = status; + if (dlm_is_host_down(ret)) { + /* this has no effect on this recovery + * session, so set the status to zero to + * finish out the last recovery */ + mlog(ML_ERROR, "node %u went down after this " + "node finished recovery.\n", nodenum); + ret = 0; + } + } + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + + /* ok to return 0, domain has gone away */ + if (!dlm_grab(dlm)) + return 0; + + mlog(0, "node %u finalizing recovery of node %u\n", + fr->node_idx, fr->dead_node); + + spin_lock(&dlm->spinlock); + + if (dlm->reco.new_master != fr->node_idx) { + mlog(ML_ERROR, "node %u sent recovery finalize msg, but node " + "%u is supposed to be the new master, dead=%u\n", + fr->node_idx, dlm->reco.new_master, fr->dead_node); + BUG(); + } + if (dlm->reco.dead_node != fr->dead_node) { + mlog(ML_ERROR, "node %u sent recovery finalize msg for dead " + "node %u, but node %u is 
supposed to be dead\n", + fr->node_idx, fr->dead_node, dlm->reco.dead_node); + BUG(); + } + + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + + spin_unlock(&dlm->spinlock); + + dlm_reset_recovery(dlm); + + dlm_kick_recovery_thread(dlm); + dlm_put(dlm); + return 0; +} diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c new file mode 100644 index 000000000000..5be9d14f12cb --- /dev/null +++ b/fs/ocfs2/dlm/dlmthread.c @@ -0,0 +1,692 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmthread.c + * + * standalone DLM module + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/utsname.h>
+#include <linux/init.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/blkdev.h>
+#include <linux/socket.h>
+#include <linux/inet.h>
+#include <linux/timer.h>
+#include <linux/kthread.h>
+
+
+#include "cluster/heartbeat.h"
+#include "cluster/nodemanager.h"
+#include "cluster/tcp.h"
+
+#include "dlmapi.h"
+#include "dlmcommon.h"
+#include "dlmdomain.h"
+
+#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
+#include "cluster/masklog.h"
+
+static int dlm_thread(void *data);
+
+static void dlm_flush_asts(struct dlm_ctxt *dlm);
+
+#define dlm_lock_is_remote(dlm, lock)	((lock)->ml.node != (dlm)->node_num)
+
+/* will exit holding res->spinlock, but may drop in function */
+/* waits until flags are cleared on res->state */
+/* Sleeps uninterruptibly on res->wq until none of the bits in 'flags'
+ * is set in res->state.  The caller must hold res->spinlock (asserted
+ * below); the lock is released only around schedule(). */
+void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	assert_spin_locked(&res->spinlock);
+
+	add_wait_queue(&res->wq, &wait);
+repeat:
+	/* mark ourselves sleeping before testing the condition so that a
+	 * wake_up on res->wq between the test and schedule() is not lost */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	if (res->state & flags) {
+		spin_unlock(&res->spinlock);
+		schedule();
+		spin_lock(&res->spinlock);
+		goto repeat;
+	}
+	remove_wait_queue(&res->wq, &wait);
+	/* use the task-state helper rather than writing current->state
+	 * directly, matching set_current_state() above */
+	__set_current_state(TASK_RUNNING);
+}
+
+
+/* Returns 1 when the granted, converting, blocked and dirty lists of
+ * the lockres are all empty, 0 otherwise. */
+static int __dlm_lockres_unused(struct dlm_lock_resource *res)
+{
+	if (list_empty(&res->granted) &&
+	    list_empty(&res->converting) &&
+	    list_empty(&res->blocked) &&
+	    list_empty(&res->dirty))
+		return 1;
+	return 0;
+}
+
+
+/* Call whenever you may have added or deleted something from one of
+ * the lockres queues.  This will figure out whether it belongs on the
+ * unused list or not and does the appropriate thing.
*/
+void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			      struct dlm_lock_resource *res)
+{
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+
+	/* both locks must already be held by the caller */
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&res->spinlock);
+
+	if (__dlm_lockres_unused(res)){
+		/* unused and not yet queued: stamp it and append it to
+		 * the purge list */
+		if (list_empty(&res->purge)) {
+			mlog(0, "putting lockres %.*s on purge list\n",
+			     res->lockname.len, res->lockname.name);
+
+			res->last_used = jiffies;
+			list_add_tail(&res->purge, &dlm->purge_list);
+			dlm->purge_count++;
+		}
+	} else if (!list_empty(&res->purge)) {
+		/* in use again while still queued for purge: unqueue it */
+		mlog(0, "removing lockres %.*s from purge list\n",
+		     res->lockname.len, res->lockname.name);
+
+		list_del_init(&res->purge);
+		dlm->purge_count--;
+	}
+}
+
+/* Locked wrapper: takes dlm->spinlock, then res->spinlock, runs
+ * __dlm_lockres_calc_usage and releases both in reverse order. */
+void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
+			    struct dlm_lock_resource *res)
+{
+	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+	spin_lock(&dlm->spinlock);
+	spin_lock(&res->spinlock);
+
+	__dlm_lockres_calc_usage(dlm, res);
+
+	spin_unlock(&res->spinlock);
+	spin_unlock(&dlm->spinlock);
+}
+
+/* TODO: Eventual API: Called with the dlm spinlock held, may drop it
+ * to do migration, but will re-acquire before exit. */
+void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres)
+{
+	int master;
+	int ret;
+
+	spin_lock(&lockres->spinlock);
+	master = lockres->owner == dlm->node_num;
+	spin_unlock(&lockres->spinlock);
+
+	mlog(0, "purging lockres %.*s, master = %d\n", lockres->lockname.len,
+	     lockres->lockname.name, master);
+
+	/* Non master is the easy case -- no migration required, just
+	 * quit. */
+	if (!master)
+		goto finish;
+
+	/* Wheee! Migrate lockres here!
*/
+	spin_unlock(&dlm->spinlock);
+again:
+
+	ret = dlm_migrate_lockres(dlm, lockres, O2NM_MAX_NODES);
+	if (ret == -ENOTEMPTY) {
+		mlog(ML_ERROR, "lockres %.*s still has local locks!\n",
+		     lockres->lockname.len, lockres->lockname.name);
+
+		BUG();
+	} else if (ret < 0) {
+		mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
+		     lockres->lockname.len, lockres->lockname.name);
+		goto again;
+	}
+
+	spin_lock(&dlm->spinlock);
+
+finish:
+	if (!list_empty(&lockres->purge)) {
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+	}
+	__dlm_unhash_lockres(lockres);
+}
+
+/* Walk dlm->purge_list under dlm->spinlock and purge lockres entries
+ * that are still unused.  At most purge_count entries (sampled on
+ * entry) are examined.  A non-zero purge_now skips the
+ * DLM_PURGE_INTERVAL_MS age check. */
+static void dlm_run_purge_list(struct dlm_ctxt *dlm,
+			       int purge_now)
+{
+	unsigned int run_max, unused;
+	unsigned long purge_jiffies;
+	struct dlm_lock_resource *lockres;
+
+	spin_lock(&dlm->spinlock);
+	run_max = dlm->purge_count;
+
+	while(run_max && !list_empty(&dlm->purge_list)) {
+		run_max--;
+
+		lockres = list_entry(dlm->purge_list.next,
+				     struct dlm_lock_resource, purge);
+
+		/* Status of the lockres *might* change so double
+		 * check. If the lockres is unused, holding the dlm
+		 * spinlock will prevent people from getting any more
+		 * refs on it -- there's no need to keep the lockres
+		 * spinlock. */
+		spin_lock(&lockres->spinlock);
+		unused = __dlm_lockres_unused(lockres);
+		spin_unlock(&lockres->spinlock);
+
+		if (!unused) {
+			/* Back in use.  Rotate it to the tail; a bare
+			 * 'continue' would leave it at the head, so every
+			 * remaining iteration would look at this same
+			 * entry and nothing behind it would be purged
+			 * in this pass. */
+			list_move_tail(&lockres->purge, &dlm->purge_list);
+			continue;
+		}
+
+		purge_jiffies = lockres->last_used +
+			msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
+
+		/* Make sure that we want to be processing this guy at
+		 * this time. */
+		if (!purge_now && time_after(purge_jiffies, jiffies)) {
+			/* Since resources are added to the purge list
+			 * in tail order, we can stop at the first
+			 * unpurgable resource -- anyone added after
+			 * him will have a greater last_used value */
+			break;
+		}
+
+		list_del_init(&lockres->purge);
+		dlm->purge_count--;
+
+		/* This may drop and reacquire the dlm spinlock if it
+		 * has to do migration.
*/ + mlog(0, "calling dlm_purge_lockres!\n"); + dlm_purge_lockres(dlm, lockres); + mlog(0, "DONE calling dlm_purge_lockres!\n"); + + /* Avoid adding any scheduling latencies */ + cond_resched_lock(&dlm->spinlock); + } + + spin_unlock(&dlm->spinlock); +} + +static void dlm_shuffle_lists(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res) +{ + struct dlm_lock *lock, *target; + struct list_head *iter; + struct list_head *head; + int can_grant = 1; + + //mlog(0, "res->lockname.len=%d\n", res->lockname.len); + //mlog(0, "res->lockname.name=%p\n", res->lockname.name); + //mlog(0, "shuffle res %.*s\n", res->lockname.len, + // res->lockname.name); + + /* because this function is called with the lockres + * spinlock, and because we know that it is not migrating/ + * recovering/in-progress, it is fine to reserve asts and + * basts right before queueing them all throughout */ + assert_spin_locked(&res->spinlock); + BUG_ON((res->state & (DLM_LOCK_RES_MIGRATING| + DLM_LOCK_RES_RECOVERING| + DLM_LOCK_RES_IN_PROGRESS))); + +converting: + if (list_empty(&res->converting)) + goto blocked; + mlog(0, "res %.*s has locks on a convert queue\n", res->lockname.len, + res->lockname.name); + + target = list_entry(res->converting.next, struct dlm_lock, list); + if (target->ml.convert_type == LKM_IVMODE) { + mlog(ML_ERROR, "%.*s: converting a lock with no " + "convert_type!\n", res->lockname.len, res->lockname.name); + BUG(); + } + head = &res->granted; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, + target->ml.convert_type)) { + can_grant = 0; + /* queue the BAST if not already */ + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + dlm_queue_bast(dlm, lock); + } + /* update the highest_blocked if needed */ + if (lock->ml.highest_blocked < target->ml.convert_type) + lock->ml.highest_blocked = + target->ml.convert_type; + } + } + head = 
&res->converting; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, + target->ml.convert_type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.convert_type) + lock->ml.highest_blocked = + target->ml.convert_type; + } + } + + /* we can convert the lock */ + if (can_grant) { + spin_lock(&target->spinlock); + BUG_ON(target->ml.highest_blocked != LKM_IVMODE); + + mlog(0, "calling ast for converting lock: %.*s, have: %d, " + "granting: %d, node: %u\n", res->lockname.len, + res->lockname.name, target->ml.type, + target->ml.convert_type, target->ml.node); + + target->ml.type = target->ml.convert_type; + target->ml.convert_type = LKM_IVMODE; + list_del_init(&target->list); + list_add_tail(&target->list, &res->granted); + + BUG_ON(!target->lksb); + target->lksb->status = DLM_NORMAL; + + spin_unlock(&target->spinlock); + + __dlm_lockres_reserve_ast(res); + dlm_queue_ast(dlm, target); + /* go back and check for more */ + goto converting; + } + +blocked: + if (list_empty(&res->blocked)) + goto leave; + target = list_entry(res->blocked.next, struct dlm_lock, list); + + head = &res->granted; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { + can_grant = 0; + if (lock->ml.highest_blocked == LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.type) + lock->ml.highest_blocked = target->ml.type; + } + } + + head = &res->converting; + list_for_each(iter, head) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock==target) + continue; + if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { + can_grant = 0; + if (lock->ml.highest_blocked == 
LKM_IVMODE) { + __dlm_lockres_reserve_ast(res); + dlm_queue_bast(dlm, lock); + } + if (lock->ml.highest_blocked < target->ml.type) + lock->ml.highest_blocked = target->ml.type; + } + } + + /* we can grant the blocked lock (only + * possible if converting list empty) */ + if (can_grant) { + spin_lock(&target->spinlock); + BUG_ON(target->ml.highest_blocked != LKM_IVMODE); + + mlog(0, "calling ast for blocked lock: %.*s, granting: %d, " + "node: %u\n", res->lockname.len, res->lockname.name, + target->ml.type, target->ml.node); + + // target->ml.type is already correct + list_del_init(&target->list); + list_add_tail(&target->list, &res->granted); + + BUG_ON(!target->lksb); + target->lksb->status = DLM_NORMAL; + + spin_unlock(&target->spinlock); + + __dlm_lockres_reserve_ast(res); + dlm_queue_ast(dlm, target); + /* go back and check for more */ + goto converting; + } + +leave: + return; +} + +/* must have NO locks when calling this with res !=NULL * */ +void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + mlog_entry("dlm=%p, res=%p\n", dlm, res); + if (res) { + spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); + __dlm_dirty_lockres(dlm, res); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + } + wake_up(&dlm->dlm_thread_wq); +} + +void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) +{ + mlog_entry("dlm=%p, res=%p\n", dlm, res); + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); + + /* don't shuffle secondary queues */ + if ((res->owner == dlm->node_num) && + !(res->state & DLM_LOCK_RES_DIRTY)) { + list_add_tail(&res->dirty, &dlm->dirty_list); + res->state |= DLM_LOCK_RES_DIRTY; + } +} + + +/* Launch the NM thread for the mounted volume */ +int dlm_launch_thread(struct dlm_ctxt *dlm) +{ + mlog(0, "starting dlm thread...\n"); + + dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread"); + if (IS_ERR(dlm->dlm_thread_task)) { + 
mlog_errno(PTR_ERR(dlm->dlm_thread_task)); + dlm->dlm_thread_task = NULL; + return -EINVAL; + } + + return 0; +} + +void dlm_complete_thread(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_thread_task) { + mlog(ML_KTHREAD, "waiting for dlm thread to exit\n"); + kthread_stop(dlm->dlm_thread_task); + dlm->dlm_thread_task = NULL; + } +} + +static int dlm_dirty_list_empty(struct dlm_ctxt *dlm) +{ + int empty; + + spin_lock(&dlm->spinlock); + empty = list_empty(&dlm->dirty_list); + spin_unlock(&dlm->spinlock); + + return empty; +} + +static void dlm_flush_asts(struct dlm_ctxt *dlm) +{ + int ret; + struct dlm_lock *lock; + struct dlm_lock_resource *res; + u8 hi; + + spin_lock(&dlm->ast_lock); + while (!list_empty(&dlm->pending_asts)) { + lock = list_entry(dlm->pending_asts.next, + struct dlm_lock, ast_list); + /* get an extra ref on lock */ + dlm_lock_get(lock); + res = lock->lockres; + mlog(0, "delivering an ast for this lockres\n"); + + BUG_ON(!lock->ast_pending); + + /* remove from list (including ref) */ + list_del_init(&lock->ast_list); + dlm_lock_put(lock); + spin_unlock(&dlm->ast_lock); + + if (lock->ml.node != dlm->node_num) { + ret = dlm_do_remote_ast(dlm, res, lock); + if (ret < 0) + mlog_errno(ret); + } else + dlm_do_local_ast(dlm, res, lock); + + spin_lock(&dlm->ast_lock); + + /* possible that another ast was queued while + * we were delivering the last one */ + if (!list_empty(&lock->ast_list)) { + mlog(0, "aha another ast got queued while " + "we were finishing the last one. will " + "keep the ast_pending flag set.\n"); + } else + lock->ast_pending = 0; + + /* drop the extra ref. + * this may drop it completely. 
*/ + dlm_lock_put(lock); + dlm_lockres_release_ast(dlm, res); + } + + while (!list_empty(&dlm->pending_basts)) { + lock = list_entry(dlm->pending_basts.next, + struct dlm_lock, bast_list); + /* get an extra ref on lock */ + dlm_lock_get(lock); + res = lock->lockres; + + BUG_ON(!lock->bast_pending); + + /* get the highest blocked lock, and reset */ + spin_lock(&lock->spinlock); + BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE); + hi = lock->ml.highest_blocked; + lock->ml.highest_blocked = LKM_IVMODE; + spin_unlock(&lock->spinlock); + + /* remove from list (including ref) */ + list_del_init(&lock->bast_list); + dlm_lock_put(lock); + spin_unlock(&dlm->ast_lock); + + mlog(0, "delivering a bast for this lockres " + "(blocked = %d\n", hi); + + if (lock->ml.node != dlm->node_num) { + ret = dlm_send_proxy_bast(dlm, res, lock, hi); + if (ret < 0) + mlog_errno(ret); + } else + dlm_do_local_bast(dlm, res, lock, hi); + + spin_lock(&dlm->ast_lock); + + /* possible that another bast was queued while + * we were delivering the last one */ + if (!list_empty(&lock->bast_list)) { + mlog(0, "aha another bast got queued while " + "we were finishing the last one. will " + "keep the bast_pending flag set.\n"); + } else + lock->bast_pending = 0; + + /* drop the extra ref. + * this may drop it completely. 
*/ + dlm_lock_put(lock); + dlm_lockres_release_ast(dlm, res); + } + wake_up(&dlm->ast_wq); + spin_unlock(&dlm->ast_lock); +} + + +#define DLM_THREAD_TIMEOUT_MS (4 * 1000) +#define DLM_THREAD_MAX_DIRTY 100 +#define DLM_THREAD_MAX_ASTS 10 + +static int dlm_thread(void *data) +{ + struct dlm_lock_resource *res; + struct dlm_ctxt *dlm = data; + unsigned long timeout = msecs_to_jiffies(DLM_THREAD_TIMEOUT_MS); + + mlog(0, "dlm thread running for %s...\n", dlm->name); + + while (!kthread_should_stop()) { + int n = DLM_THREAD_MAX_DIRTY; + + /* dlm_shutting_down is very point-in-time, but that + * doesn't matter as we'll just loop back around if we + * get false on the leading edge of a state + * transition. */ + dlm_run_purge_list(dlm, dlm_shutting_down(dlm)); + + /* We really don't want to hold dlm->spinlock while + * calling dlm_shuffle_lists on each lockres that + * needs to have its queues adjusted and AST/BASTs + * run. So let's pull each entry off the dirty_list + * and drop dlm->spinlock ASAP. Once off the list, + * res->spinlock needs to be taken again to protect + * the queues while calling dlm_shuffle_lists. */ + spin_lock(&dlm->spinlock); + while (!list_empty(&dlm->dirty_list)) { + int delay = 0; + res = list_entry(dlm->dirty_list.next, + struct dlm_lock_resource, dirty); + + /* peel a lockres off, remove it from the list, + * unset the dirty flag and drop the dlm lock */ + BUG_ON(!res); + dlm_lockres_get(res); + + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_DIRTY; + list_del_init(&res->dirty); + spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); + + /* lockres can be re-dirtied/re-added to the + * dirty_list in this gap, but that is ok */ + + spin_lock(&res->spinlock); + if (res->owner != dlm->node_num) { + __dlm_print_one_lock_resource(res); + mlog(ML_ERROR, "inprog:%s, mig:%s, reco:%s, dirty:%s\n", + res->state & DLM_LOCK_RES_IN_PROGRESS ? "yes" : "no", + res->state & DLM_LOCK_RES_MIGRATING ? 
"yes" : "no", + res->state & DLM_LOCK_RES_RECOVERING ? "yes" : "no", + res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no"); + } + BUG_ON(res->owner != dlm->node_num); + + /* it is now ok to move lockreses in these states + * to the dirty list, assuming that they will only be + * dirty for a short while. */ + if (res->state & (DLM_LOCK_RES_IN_PROGRESS | + DLM_LOCK_RES_MIGRATING | + DLM_LOCK_RES_RECOVERING)) { + /* move it to the tail and keep going */ + spin_unlock(&res->spinlock); + mlog(0, "delaying list shuffling for in-" + "progress lockres %.*s, state=%d\n", + res->lockname.len, res->lockname.name, + res->state); + delay = 1; + goto in_progress; + } + + /* at this point the lockres is not migrating/ + * recovering/in-progress. we have the lockres + * spinlock and do NOT have the dlm lock. + * safe to reserve/queue asts and run the lists. */ + + mlog(0, "calling dlm_shuffle_lists with dlm=%p, " + "res=%p\n", dlm, res); + + /* called while holding lockres lock */ + dlm_shuffle_lists(dlm, res); + spin_unlock(&res->spinlock); + + dlm_lockres_calc_usage(dlm, res); + +in_progress: + + spin_lock(&dlm->spinlock); + /* if the lock was in-progress, stick + * it on the back of the list */ + if (delay) { + spin_lock(&res->spinlock); + list_add_tail(&res->dirty, &dlm->dirty_list); + res->state |= DLM_LOCK_RES_DIRTY; + spin_unlock(&res->spinlock); + } + dlm_lockres_put(res); + + /* unlikely, but we may need to give time to + * other tasks */ + if (!--n) { + mlog(0, "throttling dlm_thread\n"); + break; + } + } + + spin_unlock(&dlm->spinlock); + dlm_flush_asts(dlm); + + /* yield and continue right away if there is more work to do */ + if (!n) { + yield(); + continue; + } + + wait_event_interruptible_timeout(dlm->dlm_thread_wq, + !dlm_dirty_list_empty(dlm) || + kthread_should_stop(), + timeout); + } + + mlog(0, "quitting DLM thread\n"); + return 0; +} diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c new file mode 100644 index 000000000000..cec2ce1cd318 --- /dev/null 
+++ b/fs/ocfs2/dlm/dlmunlock.c @@ -0,0 +1,672 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmunlock.c + * + * underlying calls for unlocking locks + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + */ + + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/sysctl.h> +#include <linux/random.h> +#include <linux/blkdev.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/spinlock.h> +#include <linux/delay.h> + +#include "cluster/heartbeat.h" +#include "cluster/nodemanager.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" +#include "dlmcommon.h" + +#define MLOG_MASK_PREFIX ML_DLM +#include "cluster/masklog.h" + +#define DLM_UNLOCK_FREE_LOCK 0x00000001 +#define DLM_UNLOCK_CALL_AST 0x00000002 +#define DLM_UNLOCK_REMOVE_LOCK 0x00000004 +#define DLM_UNLOCK_REGRANT_LOCK 0x00000008 +#define DLM_UNLOCK_CLEAR_CONVERT_TYPE 0x00000010 + + +static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions); +static enum dlm_status 
dlm_get_unlock_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions); + +static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + u8 owner); + + +/* + * according to the spec: + * http://opendlm.sourceforge.net/cvsmirror/opendlm/docs/dlmbook_final.pdf + * + * flags & LKM_CANCEL != 0: must be converting or blocked + * flags & LKM_CANCEL == 0: must be granted + * + * So to unlock a converting lock, you must first cancel the + * convert (passing LKM_CANCEL in flags), then call the unlock + * again (with no LKM_CANCEL in flags). + */ + + +/* + * locking: + * caller needs: none + * taken: res->spinlock and lock->spinlock taken and dropped + * held on exit: none + * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network + * all callers should have taken an extra ref on lock coming in + */ +static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, int *call_ast, + int master_node) +{ + enum dlm_status status; + int actions = 0; + int in_use; + u8 owner; + + mlog(0, "master_node = %d, valblk = %d\n", master_node, + flags & LKM_VALBLK); + + if (master_node) + BUG_ON(res->owner != dlm->node_num); + else + BUG_ON(res->owner == dlm->node_num); + + spin_lock(&dlm->spinlock); + /* We want to be sure that we're not freeing a lock + * that still has AST's pending... 
*/ + in_use = !list_empty(&lock->ast_list); + spin_unlock(&dlm->spinlock); + if (in_use) { + mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " + "while waiting for an ast!", res->lockname.len, + res->lockname.name); + return DLM_BADPARAM; + } + + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_IN_PROGRESS) { + if (master_node) { + mlog(ML_ERROR, "lockres in progress!\n"); + spin_unlock(&res->spinlock); + return DLM_FORWARD; + } + /* ok for this to sleep if not in a network handler */ + __dlm_wait_on_lockres(res); + res->state |= DLM_LOCK_RES_IN_PROGRESS; + } + spin_lock(&lock->spinlock); + + if (res->state & DLM_LOCK_RES_RECOVERING) { + status = DLM_RECOVERING; + goto leave; + } + + + /* see above for what the spec says about + * LKM_CANCEL and the lock queue state */ + if (flags & LKM_CANCEL) + status = dlm_get_cancel_actions(dlm, res, lock, lksb, &actions); + else + status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); + + if (status != DLM_NORMAL) + goto leave; + + /* By now this has been masked out of cancel requests. */ + if (flags & LKM_VALBLK) { + /* make the final update to the lvb */ + if (master_node) + memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); + else + flags |= LKM_PUT_LVB; /* let the send function + * handle it. 
*/ + } + + if (!master_node) { + owner = res->owner; + /* drop locks and send message */ + if (flags & LKM_CANCEL) + lock->cancel_pending = 1; + else + lock->unlock_pending = 1; + spin_unlock(&lock->spinlock); + spin_unlock(&res->spinlock); + status = dlm_send_remote_unlock_request(dlm, res, lock, lksb, + flags, owner); + spin_lock(&res->spinlock); + spin_lock(&lock->spinlock); + /* if the master told us the lock was already granted, + * let the ast handle all of these actions */ + if (status == DLM_NORMAL && + lksb->status == DLM_CANCELGRANT) { + actions &= ~(DLM_UNLOCK_REMOVE_LOCK| + DLM_UNLOCK_REGRANT_LOCK| + DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } + if (flags & LKM_CANCEL) + lock->cancel_pending = 0; + else + lock->unlock_pending = 0; + + } + + /* get an extra ref on lock. if we are just switching + * lists here, we dont want the lock to go away. */ + dlm_lock_get(lock); + + if (actions & DLM_UNLOCK_REMOVE_LOCK) { + list_del_init(&lock->list); + dlm_lock_put(lock); + } + if (actions & DLM_UNLOCK_REGRANT_LOCK) { + dlm_lock_get(lock); + list_add_tail(&lock->list, &res->granted); + } + if (actions & DLM_UNLOCK_CLEAR_CONVERT_TYPE) { + mlog(0, "clearing convert_type at %smaster node\n", + master_node ? "" : "non-"); + lock->ml.convert_type = LKM_IVMODE; + } + + /* remove the extra ref on lock */ + dlm_lock_put(lock); + +leave: + res->state &= ~DLM_LOCK_RES_IN_PROGRESS; + if (!dlm_lock_on_list(&res->converting, lock)) + BUG_ON(lock->ml.convert_type != LKM_IVMODE); + else + BUG_ON(lock->ml.convert_type == LKM_IVMODE); + spin_unlock(&lock->spinlock); + spin_unlock(&res->spinlock); + wake_up(&res->wq); + + /* let the caller's final dlm_lock_put handle the actual kfree */ + if (actions & DLM_UNLOCK_FREE_LOCK) { + /* this should always be coupled with list removal */ + BUG_ON(!(actions & DLM_UNLOCK_REMOVE_LOCK)); + mlog(0, "lock %"MLFu64" should be gone now! 
refs=%d\n", + lock->ml.cookie, atomic_read(&lock->lock_refs.refcount)-1); + dlm_lock_put(lock); + } + if (actions & DLM_UNLOCK_CALL_AST) + *call_ast = 1; + + /* if cancel or unlock succeeded, lvb work is done */ + if (status == DLM_NORMAL) + lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); + + return status; +} + +void dlm_commit_pending_unlock(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + /* leave DLM_LKSB_PUT_LVB on the lksb so any final + * update of the lvb will be sent to the new master */ + list_del_init(&lock->list); +} + +void dlm_commit_pending_cancel(struct dlm_lock_resource *res, + struct dlm_lock *lock) +{ + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->granted); + lock->ml.convert_type = LKM_IVMODE; +} + + +static inline enum dlm_status dlmunlock_master(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + int *call_ast) +{ + return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 1); +} + +static inline enum dlm_status dlmunlock_remote(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, int *call_ast) +{ + return dlmunlock_common(dlm, res, lock, lksb, flags, call_ast, 0); +} + +/* + * locking: + * caller needs: none + * taken: none + * held on exit: none + * returns: DLM_NORMAL, DLM_NOLOCKMGR, status from network + */ +static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int flags, + u8 owner) +{ + struct dlm_unlock_lock unlock; + int tmpret; + enum dlm_status ret; + int status = 0; + struct kvec vec[2]; + size_t veclen = 1; + + mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + + memset(&unlock, 0, sizeof(unlock)); + unlock.node_idx = dlm->node_num; + unlock.flags = cpu_to_be32(flags); + unlock.cookie = lock->ml.cookie; + unlock.namelen = 
res->lockname.len; + memcpy(unlock.name, res->lockname.name, unlock.namelen); + + vec[0].iov_len = sizeof(struct dlm_unlock_lock); + vec[0].iov_base = &unlock; + + if (flags & LKM_PUT_LVB) { + /* extra data to send if we are updating lvb */ + vec[1].iov_len = DLM_LVB_LEN; + vec[1].iov_base = lock->lksb->lvb; + veclen++; + } + + tmpret = o2net_send_message_vec(DLM_UNLOCK_LOCK_MSG, dlm->key, + vec, veclen, owner, &status); + if (tmpret >= 0) { + // successfully sent and received + if (status == DLM_CANCELGRANT) + ret = DLM_NORMAL; + else if (status == DLM_FORWARD) { + mlog(0, "master was in-progress. retry\n"); + ret = DLM_FORWARD; + } else + ret = status; + lksb->status = status; + } else { + mlog_errno(tmpret); + if (dlm_is_host_down(tmpret)) { + /* NOTE: this seems strange, but it is what we want. + * when the master goes down during a cancel or + * unlock, the recovery code completes the operation + * as if the master had not died, then passes the + * updated state to the recovery master. this thread + * just needs to finish out the operation and call + * the unlockast. */ + ret = DLM_NORMAL; + } else { + /* something bad. this will BUG in ocfs2 */ + ret = dlm_err_to_dlm_status(tmpret); + } + lksb->status = ret; + } + + return ret; +} + +/* + * locking: + * caller needs: none + * taken: takes and drops res->spinlock + * held on exit: none + * returns: DLM_NORMAL, DLM_BADARGS, DLM_IVLOCKID, + * return value from dlmunlock_master + */ +int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data) +{ + struct dlm_ctxt *dlm = data; + struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; + struct dlm_lock_resource *res = NULL; + struct list_head *iter; + struct dlm_lock *lock = NULL; + enum dlm_status status = DLM_NORMAL; + int found = 0, i; + struct dlm_lockstatus *lksb = NULL; + int ignore; + u32 flags; + struct list_head *queue; + + flags = be32_to_cpu(unlock->flags); + + if (flags & LKM_GET_LVB) { + mlog(ML_ERROR, "bad args! 
GET_LVB specified on unlock!\n"); + return DLM_BADARGS; + } + + if ((flags & (LKM_PUT_LVB|LKM_CANCEL)) == (LKM_PUT_LVB|LKM_CANCEL)) { + mlog(ML_ERROR, "bad args! cannot modify lvb on a CANCEL " + "request!\n"); + return DLM_BADARGS; + } + + if (unlock->namelen > DLM_LOCKID_NAME_MAX) { + mlog(ML_ERROR, "Invalid name length in unlock handler!\n"); + return DLM_IVBUFLEN; + } + + if (!dlm_grab(dlm)) + return DLM_REJECTED; + + mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), + "Domain %s not fully joined!\n", dlm->name); + + mlog(0, "lvb: %s\n", flags & LKM_PUT_LVB ? "put lvb" : "none"); + + res = dlm_lookup_lockres(dlm, unlock->name, unlock->namelen); + if (!res) { + /* We assume here that a no lock resource simply means + * it was migrated away and destroyed before the other + * node could detect it. */ + mlog(0, "returning DLM_FORWARD -- res no longer exists\n"); + status = DLM_FORWARD; + goto not_found; + } + + queue=&res->granted; + found = 0; + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_RECOVERING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_RECOVERING\n"); + status = DLM_RECOVERING; + goto leave; + } + + if (res->state & DLM_LOCK_RES_MIGRATING) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_MIGRATING\n"); + status = DLM_MIGRATING; + goto leave; + } + + if (res->owner != dlm->node_num) { + spin_unlock(&res->spinlock); + mlog(0, "returning DLM_FORWARD -- not master\n"); + status = DLM_FORWARD; + goto leave; + } + + for (i=0; i<3; i++) { + list_for_each(iter, queue) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.cookie == unlock->cookie && + lock->ml.node == unlock->node_idx) { + dlm_lock_get(lock); + found = 1; + break; + } + } + if (found) + break; + /* scan granted -> converting -> blocked queues */ + queue++; + } + spin_unlock(&res->spinlock); + if (!found) { + status = DLM_IVLOCKID; + goto not_found; + } + + /* lock was found on queue */ + lksb = lock->lksb; + /* unlockast only called on originating 
node */ + if (flags & LKM_PUT_LVB) { + lksb->flags |= DLM_LKSB_PUT_LVB; + memcpy(&lksb->lvb[0], &unlock->lvb[0], DLM_LVB_LEN); + } + + /* if this is in-progress, propagate the DLM_FORWARD + * all the way back out */ + status = dlmunlock_master(dlm, res, lock, lksb, flags, &ignore); + if (status == DLM_FORWARD) + mlog(0, "lockres is in progress\n"); + + if (flags & LKM_PUT_LVB) + lksb->flags &= ~DLM_LKSB_PUT_LVB; + + dlm_lockres_calc_usage(dlm, res); + dlm_kick_thread(dlm, res); + +not_found: + if (!found) + mlog(ML_ERROR, "failed to find lock to unlock! " + "cookie=%"MLFu64"\n", + unlock->cookie); + else { + /* send the lksb->status back to the other node */ + status = lksb->status; + dlm_lock_put(lock); + } + +leave: + if (res) + dlm_lockres_put(res); + + dlm_put(dlm); + + return status; +} + + +static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions) +{ + enum dlm_status status; + + if (dlm_lock_on_list(&res->blocked, lock)) { + /* cancel this outright */ + lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK); + } else if (dlm_lock_on_list(&res->converting, lock)) { + /* cancel the request, put back on granted */ + lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK | + DLM_UNLOCK_REGRANT_LOCK | + DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (dlm_lock_on_list(&res->granted, lock)) { + /* too late, already granted. 
DLM_CANCELGRANT */ + lksb->status = DLM_CANCELGRANT; + status = DLM_NORMAL; + *actions = DLM_UNLOCK_CALL_AST; + } else { + mlog(ML_ERROR, "lock to cancel is not on any list!\n"); + lksb->status = DLM_IVLOCKID; + status = DLM_IVLOCKID; + *actions = 0; + } + return status; +} + +static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + struct dlm_lock *lock, + struct dlm_lockstatus *lksb, + int *actions) +{ + enum dlm_status status; + + /* unlock request */ + if (!dlm_lock_on_list(&res->granted, lock)) { + lksb->status = DLM_DENIED; + status = DLM_DENIED; + dlm_error(status); + *actions = 0; + } else { + /* unlock granted lock */ + lksb->status = DLM_NORMAL; + status = DLM_NORMAL; + *actions = (DLM_UNLOCK_FREE_LOCK | + DLM_UNLOCK_CALL_AST | + DLM_UNLOCK_REMOVE_LOCK); + } + return status; +} + +/* there seems to be no point in doing this async + * since (even for the remote case) there is really + * no work to queue up... so just do it and fire the + * unlockast by hand when done... 
*/ +enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb, + int flags, dlm_astunlockfunc_t *unlockast, void *data) +{ + enum dlm_status status; + struct dlm_lock_resource *res; + struct dlm_lock *lock = NULL; + int call_ast, is_master; + + mlog_entry_void(); + + if (!lksb) { + dlm_error(DLM_BADARGS); + return DLM_BADARGS; + } + + if (flags & ~(LKM_CANCEL | LKM_VALBLK | LKM_INVVALBLK)) { + dlm_error(DLM_BADPARAM); + return DLM_BADPARAM; + } + + if ((flags & (LKM_VALBLK | LKM_CANCEL)) == (LKM_VALBLK | LKM_CANCEL)) { + mlog(0, "VALBLK given with CANCEL: ignoring VALBLK\n"); + flags &= ~LKM_VALBLK; + } + + if (!lksb->lockid || !lksb->lockid->lockres) { + dlm_error(DLM_BADPARAM); + return DLM_BADPARAM; + } + + lock = lksb->lockid; + BUG_ON(!lock); + dlm_lock_get(lock); + + res = lock->lockres; + BUG_ON(!res); + dlm_lockres_get(res); +retry: + call_ast = 0; + /* need to retry up here because owner may have changed */ + mlog(0, "lock=%p res=%p\n", lock, res); + + spin_lock(&res->spinlock); + is_master = (res->owner == dlm->node_num); + spin_unlock(&res->spinlock); + + if (is_master) { + status = dlmunlock_master(dlm, res, lock, lksb, flags, + &call_ast); + mlog(0, "done calling dlmunlock_master: returned %d, " + "call_ast is %d\n", status, call_ast); + } else { + status = dlmunlock_remote(dlm, res, lock, lksb, flags, + &call_ast); + mlog(0, "done calling dlmunlock_remote: returned %d, " + "call_ast is %d\n", status, call_ast); + } + + if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* We want to go away for a tiny bit to allow recovery + * / migration to complete on this resource. I don't + * know of any wait queue we could sleep on as this + * may be happening on another node. Perhaps the + * proper solution is to queue up requests on the + * other end? */ + + /* do we want to yield(); ?? 
*/ + msleep(50); + + mlog(0, "retrying unlock due to pending recovery/" + "migration/in-progress\n"); + goto retry; + } + + if (call_ast) { + mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status); + if (is_master) { + /* it is possible that there is one last bast + * pending. make sure it is flushed, then + * call the unlockast. + * not an issue if this is a mastered remotely, + * since this lock has been removed from the + * lockres queues and cannot be found. */ + dlm_kick_thread(dlm, NULL); + wait_event(dlm->ast_wq, + dlm_lock_basts_flushed(dlm, lock)); + } + (*unlockast)(data, lksb->status); + } + + if (status == DLM_NORMAL) { + mlog(0, "kicking the thread\n"); + dlm_kick_thread(dlm, res); + } else + dlm_error(status); + + dlm_lockres_calc_usage(dlm, res); + dlm_lockres_put(res); + dlm_lock_put(lock); + + mlog(0, "returning status=%d!\n", status); + return status; +} +EXPORT_SYMBOL_GPL(dlmunlock); + diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c new file mode 100644 index 000000000000..7ef2653f8f41 --- /dev/null +++ b/fs/ocfs2/dlm/dlmver.c @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include "dlmver.h" + +#define DLM_BUILD_VERSION "1.3.3" + +#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION + +void dlm_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(DLM_BUILD_VERSION); diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h new file mode 100644 index 000000000000..f674aee77a16 --- /dev/null +++ b/fs/ocfs2/dlm/dlmver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmfsver.h + * + * Function prototypes + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef DLM_VER_H +#define DLM_VER_H + +void dlm_print_version(void); + +#endif /* DLM_VER_H */ diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c new file mode 100644 index 000000000000..e1fdd288796e --- /dev/null +++ b/fs/ocfs2/dlm/userdlm.c @@ -0,0 +1,658 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.c + * + * Code which implements the kernel side of a minimal userspace + * interface to our DLM. + * + * Many of the functions here are pared down versions of dlmglue.c + * functions. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <asm/signal.h> + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/crc32.h> + + +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" +#include "cluster/tcp.h" + +#include "dlmapi.h" + +#include "userdlm.h" + +#define MLOG_MASK_PREFIX ML_DLMFS +#include "cluster/masklog.h" + +static inline int user_check_wait_flag(struct user_lock_res *lockres, + int flag) +{ + int ret; + + spin_lock(&lockres->l_lock); + ret = lockres->l_flags & flag; + spin_unlock(&lockres->l_lock); + + return ret; +} + +static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BUSY)); +} + +static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); +} + +/* I heart container_of... */ +static inline struct dlm_ctxt * +dlm_ctxt_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return ip->ip_dlm; +} + +static struct inode * +user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) +{ + struct dlmfs_inode_private *ip; + + ip = container_of(lockres, + struct dlmfs_inode_private, + ip_lockres); + return &ip->ip_vfs_inode; +} + +static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) +{ + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); +} + +#define user_log_dlm_error(_func, _stat, _lockres) do { \ + mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ + "resource %s: %s\n", dlm_errname(_stat), _func, \ + _lockres->l_name, dlm_errmsg(_stat)); \ +} while (0) + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. 
*/ +static inline int user_highest_compat_lock_level(int level) +{ + int new_level = LKM_EXMODE; + + if (level == LKM_EXMODE) + new_level = LKM_NLMODE; + else if (level == LKM_PRMODE) + new_level = LKM_PRMODE; + return new_level; +} + +static void user_ast(void *opaque) +{ + struct user_lock_res *lockres = opaque; + struct dlm_lockstatus *lksb; + + mlog(0, "AST fired for lockres %s\n", lockres->l_name); + + spin_lock(&lockres->l_lock); + + lksb = &(lockres->l_lksb); + if (lksb->status != DLM_NORMAL) { + mlog(ML_ERROR, "lksb status value of %u on lockres %s\n", + lksb->status, lockres->l_name); + spin_unlock(&lockres->l_lock); + return; + } + + /* we're downconverting. */ + if (lockres->l_requested < lockres->l_level) { + if (lockres->l_requested <= + user_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = LKM_NLMODE; + lockres->l_flags &= ~USER_LOCK_BLOCKED; + } + } + + lockres->l_level = lockres->l_requested; + lockres->l_requested = LKM_IVMODE; + lockres->l_flags |= USER_LOCK_ATTACHED; + lockres->l_flags &= ~USER_LOCK_BUSY; + + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + if (!igrab(inode)) + BUG(); +} + +static void user_dlm_unblock_lock(void *opaque); + +static void __user_dlm_queue_lockres(struct user_lock_res *lockres) +{ + if (!(lockres->l_flags & USER_LOCK_QUEUED)) { + user_dlm_grab_inode_ref(lockres); + + INIT_WORK(&lockres->l_work, user_dlm_unblock_lock, + lockres); + + queue_work(user_dlm_worker, &lockres->l_work); + lockres->l_flags |= USER_LOCK_QUEUED; + } +} + +static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) +{ + int queue = 0; + + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) + return; + + switch (lockres->l_blocking) { + case LKM_EXMODE: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + queue = 1; + break; + case 
LKM_PRMODE: + if (!lockres->l_ex_holders) + queue = 1; + break; + default: + BUG(); + } + + if (queue) + __user_dlm_queue_lockres(lockres); +} + +static void user_bast(void *opaque, int level) +{ + struct user_lock_res *lockres = opaque; + + mlog(0, "Blocking AST fired for lockres %s. Blocking level %d\n", + lockres->l_name, level); + + spin_lock(&lockres->l_lock); + lockres->l_flags |= USER_LOCK_BLOCKED; + if (level > lockres->l_blocking) + lockres->l_blocking = level; + + __user_dlm_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static void user_unlock_ast(void *opaque, enum dlm_status status) +{ + struct user_lock_res *lockres = opaque; + + mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); + + if (status != DLM_NORMAL) + mlog(ML_ERROR, "Dlm returns status %d\n", status); + + spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) + lockres->l_level = LKM_IVMODE; + else { + lockres->l_requested = LKM_IVMODE; /* cancel an + * upconvert + * request. */ + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + /* we want the unblock thread to look at it again + * now. */ + __user_dlm_queue_lockres(lockres); + } + + lockres->l_flags &= ~USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + wake_up(&lockres->l_event); +} + +static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) +{ + struct inode *inode; + inode = user_dlm_inode_from_user_lockres(lockres); + iput(inode); +} + +static void user_dlm_unblock_lock(void *opaque) +{ + int new_level, status; + struct user_lock_res *lockres = (struct user_lock_res *) opaque; + struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + + mlog(0, "processing lockres %s\n", lockres->l_name); + + spin_lock(&lockres->l_lock); + + BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); + BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED)); + + /* notice that we don't clear USER_LOCK_BLOCKED here. That's + * for user_ast to do. 
*/ + lockres->l_flags &= ~USER_LOCK_QUEUED; + + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(0, "lock is in teardown so we do nothing\n"); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + if (lockres->l_flags & USER_LOCK_BUSY) { + mlog(0, "BUSY flag detected...\n"); + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + + lockres->l_flags |= USER_LOCK_IN_CANCEL; + spin_unlock(&lockres->l_lock); + + status = dlmunlock(dlm, + &lockres->l_lksb, + LKM_CANCEL, + user_unlock_ast, + lockres); + if (status == DLM_CANCELGRANT) { + /* If we got this, then the ast was fired + * before we could cancel. We cleanup our + * state, and restart the function. */ + spin_lock(&lockres->l_lock); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + spin_unlock(&lockres->l_lock); + } else if (status != DLM_NORMAL) + user_log_dlm_error("dlmunlock", status, lockres); + goto drop_ref; + } + + /* If there are still incompat holders, we can exit safely + * without worrying about re-queueing this lock as that will + * happen on the last call to user_cluster_unlock. */ + if ((lockres->l_blocking == LKM_EXMODE) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + spin_unlock(&lockres->l_lock); + mlog(0, "can't downconvert for ex: ro = %u, ex = %u\n", + lockres->l_ro_holders, lockres->l_ex_holders); + goto drop_ref; + } + + if ((lockres->l_blocking == LKM_PRMODE) + && lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + mlog(0, "can't downconvert for pr: ex = %u\n", + lockres->l_ex_holders); + goto drop_ref; + } + + /* yay, we can downconvert now. */ + new_level = user_highest_compat_lock_level(lockres->l_blocking); + lockres->l_requested = new_level; + lockres->l_flags |= USER_LOCK_BUSY; + mlog(0, "Downconvert lock from %d to %d\n", + lockres->l_level, new_level); + spin_unlock(&lockres->l_lock); + + /* need lock downconvert request now... 
*/ + status = dlmlock(dlm, + new_level, + &lockres->l_lksb, + LKM_CONVERT|LKM_VALBLK, + lockres->l_name, + user_ast, + lockres, + user_bast); + if (status != DLM_NORMAL) { + user_log_dlm_error("dlmlock", status, lockres); + user_recover_from_dlm_error(lockres); + } + +drop_ref: + user_dlm_drop_inode_ref(lockres); +} + +static inline void user_dlm_inc_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case LKM_EXMODE: + lockres->l_ex_holders++; + break; + case LKM_PRMODE: + lockres->l_ro_holders++; + break; + default: + BUG(); + } +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. */ +static inline int +user_may_continue_on_blocked_lock(struct user_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); + + return wanted <= user_highest_compat_lock_level(lockres->l_blocking); +} + +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags) +{ + int status, local_flags; + struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + + if (level != LKM_EXMODE && + level != LKM_PRMODE) { + mlog(ML_ERROR, "lockres %s: invalid request!\n", + lockres->l_name); + status = -EINVAL; + goto bail; + } + + mlog(0, "lockres %s: asking for %s lock, passed flags = 0x%x\n", + lockres->l_name, + (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE", + lkm_flags); + +again: + if (signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + spin_lock(&lockres->l_lock); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if ((lockres->l_flags & USER_LOCK_BUSY) && + (level > lockres->l_level)) { + /* is someone sitting in dlm_lock? If so, wait on + * them. 
*/ + spin_unlock(&lockres->l_lock); + + user_wait_on_busy_lock(lockres); + goto again; + } + + if ((lockres->l_flags & USER_LOCK_BLOCKED) && + (!user_may_continue_on_blocked_lock(lockres, level))) { + /* the lock is currently blocked on behalf of + * another node; wait for that to clear and retry */ + spin_unlock(&lockres->l_lock); + + user_wait_on_blocked_lock(lockres); + goto again; + } + + if (level > lockres->l_level) { + local_flags = lkm_flags | LKM_VALBLK; + if (lockres->l_level != LKM_IVMODE) + local_flags |= LKM_CONVERT; + + lockres->l_requested = level; + lockres->l_flags |= USER_LOCK_BUSY; + spin_unlock(&lockres->l_lock); + + BUG_ON(level == LKM_IVMODE); + BUG_ON(level == LKM_NLMODE); + + mlog(0, "lock %s, get lock from %d to level = %d\n", + lockres->l_name, lockres->l_level, level); + + /* call dlmlock() to upgrade the lock now */ + status = dlmlock(dlm, + level, + &lockres->l_lksb, + local_flags, + lockres->l_name, + user_ast, + lockres, + user_bast); + if (status != DLM_NORMAL) { + if ((lkm_flags & LKM_NOQUEUE) && + (status == DLM_NOTQUEUED)) + status = -EAGAIN; + else { + user_log_dlm_error("dlmlock", status, lockres); + status = -EINVAL; + } + user_recover_from_dlm_error(lockres); + goto bail; + } + + mlog(0, "lock %s, successfull return from dlmlock\n", + lockres->l_name); + + user_wait_on_busy_lock(lockres); + goto again; + } + + user_dlm_inc_holders(lockres, level); + spin_unlock(&lockres->l_lock); + + mlog(0, "lockres %s: Got %s lock!\n", lockres->l_name, + (level == LKM_EXMODE) ? 
"LKM_EXMODE" : "LKM_PRMODE"); + + status = 0; +bail: + return status; +} + +static inline void user_dlm_dec_holders(struct user_lock_res *lockres, + int level) +{ + switch(level) { + case LKM_EXMODE: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case LKM_PRMODE: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: + BUG(); + } +} + +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level) +{ + if (level != LKM_EXMODE && + level != LKM_PRMODE) { + mlog(ML_ERROR, "lockres %s: invalid request!\n", lockres->l_name); + return; + } + + mlog(0, "lockres %s: dropping %s lock\n", lockres->l_name, + (level == LKM_EXMODE) ? "LKM_EXMODE" : "LKM_PRMODE"); + + spin_lock(&lockres->l_lock); + user_dlm_dec_holders(lockres, level); + __user_dlm_cond_queue_lockres(lockres); + spin_unlock(&lockres->l_lock); +} + +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb = lockres->l_lksb.lvb; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < LKM_EXMODE); + memcpy(lvb, val, len); + + spin_unlock(&lockres->l_lock); +} + +void user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len) +{ + struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; + char *lvb = lockres->l_lksb.lvb; + + BUG_ON(len > DLM_LVB_LEN); + + spin_lock(&lockres->l_lock); + + BUG_ON(lockres->l_level < LKM_PRMODE); + memcpy(val, lvb, len); + + spin_unlock(&lockres->l_lock); +} + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry) +{ + memset(lockres, 0, sizeof(*lockres)); + + spin_lock_init(&lockres->l_lock); + init_waitqueue_head(&lockres->l_event); + lockres->l_level = LKM_IVMODE; + lockres->l_requested = LKM_IVMODE; + lockres->l_blocking = LKM_IVMODE; + + /* should have been checked before getting here. 
*/ + BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); + + memcpy(lockres->l_name, + dentry->d_name.name, + dentry->d_name.len); +} + +int user_dlm_destroy_lock(struct user_lock_res *lockres) +{ + int status = -EBUSY; + struct dlm_ctxt *dlm = dlm_ctxt_from_user_lockres(lockres); + + mlog(0, "asked to destroy %s\n", lockres->l_name); + + spin_lock(&lockres->l_lock); + while (lockres->l_flags & USER_LOCK_BUSY) { + spin_unlock(&lockres->l_lock); + + mlog(0, "lock %s is busy\n", lockres->l_name); + + user_wait_on_busy_lock(lockres); + + spin_lock(&lockres->l_lock); + } + + if (lockres->l_ro_holders || lockres->l_ex_holders) { + spin_unlock(&lockres->l_lock); + mlog(0, "lock %s has holders\n", lockres->l_name); + goto bail; + } + + status = 0; + if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { + spin_unlock(&lockres->l_lock); + mlog(0, "lock %s is not attached\n", lockres->l_name); + goto bail; + } + + lockres->l_flags &= ~USER_LOCK_ATTACHED; + lockres->l_flags |= USER_LOCK_BUSY; + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + spin_unlock(&lockres->l_lock); + + mlog(0, "unlocking lockres %s\n", lockres->l_name); + status = dlmunlock(dlm, + &lockres->l_lksb, + LKM_VALBLK, + user_unlock_ast, + lockres); + if (status != DLM_NORMAL) { + user_log_dlm_error("dlmunlock", status, lockres); + status = -EINVAL; + goto bail; + } + + user_wait_on_busy_lock(lockres); + + status = 0; +bail: + return status; +} + +struct dlm_ctxt *user_dlm_register_context(struct qstr *name) +{ + struct dlm_ctxt *dlm; + u32 dlm_key; + char *domain; + + domain = kmalloc(name->len + 1, GFP_KERNEL); + if (!domain) { + mlog_errno(-ENOMEM); + return ERR_PTR(-ENOMEM); + } + + dlm_key = crc32_le(0, name->name, name->len); + + snprintf(domain, name->len + 1, "%.*s", name->len, name->name); + + dlm = dlm_register_domain(domain, dlm_key); + if (IS_ERR(dlm)) + mlog_errno(PTR_ERR(dlm)); + + kfree(domain); + return dlm; +} + +void user_dlm_unregister_context(struct dlm_ctxt *dlm) +{ + 
dlm_unregister_domain(dlm); +} diff --git a/fs/ocfs2/dlm/userdlm.h b/fs/ocfs2/dlm/userdlm.h new file mode 100644 index 000000000000..04178bc40b76 --- /dev/null +++ b/fs/ocfs2/dlm/userdlm.h @@ -0,0 +1,111 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * userdlm.h + * + * Userspace dlm defines + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef USERDLM_H +#define USERDLM_H + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +/* user_lock_res->l_flags flags. */ +#define USER_LOCK_ATTACHED (0x00000001) /* have we initialized + * the lvb */ +#define USER_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define USER_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define USER_LOCK_IN_TEARDOWN (0x00000008) /* we're currently + * destroying this + * lock. 
*/ +#define USER_LOCK_QUEUED (0x00000010) /* lock is on the + * workqueue */ +#define USER_LOCK_IN_CANCEL (0x00000020) + +struct user_lock_res { + spinlock_t l_lock; + + int l_flags; + +#define USER_DLM_LOCK_ID_MAX_LEN 32 + char l_name[USER_DLM_LOCK_ID_MAX_LEN]; + int l_level; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + struct dlm_lockstatus l_lksb; + + int l_requested; + int l_blocking; + + wait_queue_head_t l_event; + + struct work_struct l_work; +}; + +extern struct workqueue_struct *user_dlm_worker; + +void user_dlm_lock_res_init(struct user_lock_res *lockres, + struct dentry *dentry); +int user_dlm_destroy_lock(struct user_lock_res *lockres); +int user_dlm_cluster_lock(struct user_lock_res *lockres, + int level, + int lkm_flags); +void user_dlm_cluster_unlock(struct user_lock_res *lockres, + int level); +void user_dlm_write_lvb(struct inode *inode, + const char *val, + unsigned int len); +void user_dlm_read_lvb(struct inode *inode, + char *val, + unsigned int len); +struct dlm_ctxt *user_dlm_register_context(struct qstr *name); +void user_dlm_unregister_context(struct dlm_ctxt *dlm); + +struct dlmfs_inode_private { + struct dlm_ctxt *ip_dlm; + + struct user_lock_res ip_lockres; /* unused for directories. */ + struct inode *ip_parent; + + struct inode ip_vfs_inode; +}; + +static inline struct dlmfs_inode_private * +DLMFS_I(struct inode *inode) +{ + return container_of(inode, + struct dlmfs_inode_private, + ip_vfs_inode); +} + +struct dlmfs_filp_private { + int fp_lock_level; +}; + +#define DLMFS_MAGIC 0x76a9f425 + +#endif /* USERDLM_H */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c new file mode 100644 index 000000000000..e971ec2f8407 --- /dev/null +++ b/fs/ocfs2/dlmglue.c @@ -0,0 +1,2904 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.c + * + * Code which implements an OCFS2 specific interface to our DLM. + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/mm.h> +#include <linux/smp_lock.h> +#include <linux/crc32.h> +#include <linux/kthread.h> +#include <linux/pagemap.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <cluster/heartbeat.h> +#include <cluster/nodemanager.h> +#include <cluster/tcp.h> + +#include <dlm/dlmapi.h> + +#define MLOG_MASK_PREFIX ML_DLM_GLUE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "slot_map.h" +#include "super.h" +#include "uptodate.h" +#include "vote.h" + +#include "buffer_head_io.h" + +struct ocfs2_mask_waiter { + struct list_head mw_item; + int mw_status; + struct completion mw_complete; + unsigned long mw_mask; + unsigned long mw_goal; +}; + +static void ocfs2_inode_ast_func(void *opaque); +static void ocfs2_inode_bast_func(void *opaque, + int level); +static void ocfs2_super_ast_func(void *opaque); +static void ocfs2_super_bast_func(void *opaque, + int level); +static void ocfs2_rename_ast_func(void *opaque); +static void ocfs2_rename_bast_func(void *opaque, + int level); + +/* so far, all 
locks have gotten along with the same unlock ast */ +static void ocfs2_unlock_ast_func(void *opaque, + enum dlm_status status); +static int ocfs2_do_unblock_meta(struct inode *inode, + int *requeue); +static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, + int *requeue); +static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, + int *requeue); +static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, + int *requeue); +static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, + int *requeue); +typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int); +static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int *requeue, + ocfs2_convert_worker_t *worker); + +struct ocfs2_lock_res_ops { + void (*ast)(void *); + void (*bast)(void *, int); + void (*unlock_ast)(void *, enum dlm_status); + int (*unblock)(struct ocfs2_lock_res *, int *); +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { + .ast = ocfs2_inode_ast_func, + .bast = ocfs2_inode_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_inode_lock, +}; + +static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { + .ast = ocfs2_inode_ast_func, + .bast = ocfs2_inode_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_meta, +}; + +static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking); + +static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = { + .ast = ocfs2_inode_ast_func, + .bast = ocfs2_inode_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_data, +}; + +static struct ocfs2_lock_res_ops ocfs2_super_lops = { + .ast = ocfs2_super_ast_func, + .bast = ocfs2_super_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + .unblock = ocfs2_unblock_osb_lock, +}; + +static struct ocfs2_lock_res_ops ocfs2_rename_lops = { + .ast = ocfs2_rename_ast_func, + .bast = ocfs2_rename_bast_func, + .unlock_ast = ocfs2_unlock_ast_func, + 
.unblock = ocfs2_unblock_osb_lock, +}; + +static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_META || + lockres->l_type == OCFS2_LOCK_TYPE_DATA || + lockres->l_type == OCFS2_LOCK_TYPE_RW; +} + +static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_SUPER; +} + +static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres) +{ + return lockres->l_type == OCFS2_LOCK_TYPE_RENAME; +} + +static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!ocfs2_is_super_lock(lockres) + && !ocfs2_is_rename_lock(lockres)); + + return (struct ocfs2_super *) lockres->l_priv; +} + +static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) +{ + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + return (struct inode *) lockres->l_priv; +} + +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + int dlm_flags); +static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted); +static void ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level); +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres); +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres); +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level); +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert); +#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ + mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ + "resource %s: %s\n", dlm_errname(_stat), _func, \ + _lockres->l_name, 
dlm_errmsg(_stat)); \ +} while (0) +static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); +static int ocfs2_meta_lock_update(struct inode *inode, + struct buffer_head **bh); +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); +static inline int ocfs2_highest_compat_lock_level(int level); +static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, + struct ocfs2_lock_res *lockres, + int new_level); +/* Display names for each lock type, indexed by enum ocfs2_lock_type. */ +static char *ocfs2_lock_type_strings[] = { + [OCFS2_LOCK_TYPE_META] = "Meta", + [OCFS2_LOCK_TYPE_DATA] = "Data", + [OCFS2_LOCK_TYPE_SUPER] = "Super", + [OCFS2_LOCK_TYPE_RENAME] = "Rename", + /* Need to differentiate from [R]ename.. serializing writes is the + * important job it does, anyway. */ + [OCFS2_LOCK_TYPE_RW] = "Write/Read", +}; +/* Return the display name for a lock type; type must be below OCFS2_NUM_LOCK_TYPES. */ +static char *ocfs2_lock_type_string(enum ocfs2_lock_type type) +{ + mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); + return ocfs2_lock_type_strings[type]; +} +/* Format the lock name (type char, pad, blkno, generation) into name; the result must be exactly OCFS2_LOCK_ID_MAX_LEN - 1 characters. */ +static void ocfs2_build_lock_name(enum ocfs2_lock_type type, + u64 blkno, + u32 generation, + char *name) +{ + int len; + + mlog_entry_void(); + + BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); + + len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016"MLFx64"%08x", + ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, blkno, + generation); + + BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); + + mlog(0, "built lock resource with name: %s\n", name); + + mlog_exit_void(); +} +/* Protects the l_debug_list linkage used by the two tracking helpers below. */ +static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED; + +static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, + struct ocfs2_dlm_debug *dlm_debug) +{ + mlog(0, "Add tracking for lockres %s\n", res->l_name); + + spin_lock(&ocfs2_dlm_tracking_lock); + list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) +{ + spin_lock(&ocfs2_dlm_tracking_lock); + if (!list_empty(&res->l_debug_list)) + 
list_del_init(&res->l_debug_list); + spin_unlock(&ocfs2_dlm_tracking_lock); +} + +static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, + struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + u64 blkno, + u32 generation, + struct ocfs2_lock_res_ops *ops, + void *priv) +{ + ocfs2_build_lock_name(type, blkno, generation, res->l_name); + + res->l_type = type; + res->l_ops = ops; + res->l_priv = priv; + + res->l_level = LKM_IVMODE; + res->l_requested = LKM_IVMODE; + res->l_blocking = LKM_IVMODE; + res->l_action = OCFS2_AST_INVALID; + res->l_unlock_action = OCFS2_UNLOCK_INVALID; + + res->l_flags = OCFS2_LOCK_INITIALIZED; + + ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); +} + +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) +{ + /* This also clears out the lock status block */ + memset(res, 0, sizeof(struct ocfs2_lock_res)); + spin_lock_init(&res->l_lock); + init_waitqueue_head(&res->l_event); + INIT_LIST_HEAD(&res->l_blocked_list); + INIT_LIST_HEAD(&res->l_mask_waiters); +} + +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + struct inode *inode) +{ + struct ocfs2_lock_res_ops *ops; + + switch(type) { + case OCFS2_LOCK_TYPE_RW: + ops = &ocfs2_inode_rw_lops; + break; + case OCFS2_LOCK_TYPE_META: + ops = &ocfs2_inode_meta_lops; + break; + case OCFS2_LOCK_TYPE_DATA: + ops = &ocfs2_inode_data_lops; + break; + default: + mlog_bug_on_msg(1, "type: %d\n", type); + ops = NULL; /* thanks, gcc */ + break; + }; + + ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, + OCFS2_I(inode)->ip_blkno, + inode->i_generation, ops, inode); +} + +static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Superblock lockres doesn't come from a slab so we call init + * once on it manually. 
*/ + ocfs2_lock_res_init_once(res); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, + OCFS2_SUPER_BLOCK_BLKNO, 0, + &ocfs2_super_lops, osb); +} + +static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, + struct ocfs2_super *osb) +{ + /* Rename lockres doesn't come from a slab so we call init + * once on it manually. */ + ocfs2_lock_res_init_once(res); + ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0, + &ocfs2_rename_lops, osb); +} + +void ocfs2_lock_res_free(struct ocfs2_lock_res *res) +{ + mlog_entry_void(); + + if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) + return; + + ocfs2_remove_lockres_tracking(res); + + mlog_bug_on_msg(!list_empty(&res->l_blocked_list), + "Lockres %s is on the blocked list\n", + res->l_name); + mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), + "Lockres %s has mask waiters pending\n", + res->l_name); + mlog_bug_on_msg(spin_is_locked(&res->l_lock), + "Lockres %s is locked\n", + res->l_name); + mlog_bug_on_msg(res->l_ro_holders, + "Lockres %s has %u ro holders\n", + res->l_name, res->l_ro_holders); + mlog_bug_on_msg(res->l_ex_holders, + "Lockres %s has %u ex holders\n", + res->l_name, res->l_ex_holders); + + /* Need to clear out the lock status block for the dlm */ + memset(&res->l_lksb, 0, sizeof(res->l_lksb)); + + res->l_flags = 0UL; + mlog_exit_void(); +} + +static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, + int level) +{ + mlog_entry_void(); + + BUG_ON(!lockres); + + switch(level) { + case LKM_EXMODE: + lockres->l_ex_holders++; + break; + case LKM_PRMODE: + lockres->l_ro_holders++; + break; + default: + BUG(); + } + + mlog_exit_void(); +} + +static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, + int level) +{ + mlog_entry_void(); + + BUG_ON(!lockres); + + switch(level) { + case LKM_EXMODE: + BUG_ON(!lockres->l_ex_holders); + lockres->l_ex_holders--; + break; + case LKM_PRMODE: + BUG_ON(!lockres->l_ro_holders); + lockres->l_ro_holders--; + break; + default: 
+ BUG(); + } + mlog_exit_void(); +} + +/* WARNING: This function lives in a world where the only three lock + * levels are EX, PR, and NL. It *will* have to be adjusted when more + * lock types are added. */ +static inline int ocfs2_highest_compat_lock_level(int level) +{ + int new_level = LKM_EXMODE; + + if (level == LKM_EXMODE) + new_level = LKM_NLMODE; + else if (level == LKM_PRMODE) + new_level = LKM_PRMODE; + return new_level; +} + +static void lockres_set_flags(struct ocfs2_lock_res *lockres, + unsigned long newflags) +{ + struct list_head *pos, *tmp; + struct ocfs2_mask_waiter *mw; + + assert_spin_locked(&lockres->l_lock); + + lockres->l_flags = newflags; + + list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { + mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item); + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + continue; + + list_del_init(&mw->mw_item); + mw->mw_status = 0; + complete(&mw->mw_complete); + } +} +static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) +{ + lockres_set_flags(lockres, lockres->l_flags | or); +} +static void lockres_clear_flags(struct ocfs2_lock_res *lockres, + unsigned long clear) +{ + lockres_set_flags(lockres, lockres->l_flags & ~clear); +} + +static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + BUG_ON(lockres->l_blocking <= LKM_NLMODE); + + lockres->l_level = lockres->l_requested; + if (lockres->l_level <= + ocfs2_highest_compat_lock_level(lockres->l_blocking)) { + lockres->l_blocking = LKM_NLMODE; + lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); + } + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + + mlog_exit_void(); +} + +static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + BUG_ON(!(lockres->l_flags & 
OCFS2_LOCK_BUSY)); + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); + + /* Convert from RO to EX doesn't really need anything as our + * information is already up to date. Convert from NL to + * *anything* however should mark ourselves as needing an + * update */ + if (lockres->l_level == LKM_NLMODE) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + + mlog_exit_void(); +} + +/* Finish an attach: the lockres must be BUSY and not yet ATTACHED. + * Records the requested level as the current level, sets + * OCFS2_LOCK_ATTACHED and clears OCFS2_LOCK_BUSY. A lock requested + * above NL that is not OCFS2_LOCK_LOCAL is also marked as needing a + * refresh. */ +static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + /* '!' binds tighter than '&', so the flag test has to be + * parenthesized before it is negated. */ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + + if (lockres->l_requested > LKM_NLMODE && + !(lockres->l_flags & OCFS2_LOCK_LOCAL)) + lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + lockres->l_level = lockres->l_requested; + lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + + mlog_exit_void(); +} + +static void ocfs2_inode_ast_func(void *opaque) +{ + struct ocfs2_lock_res *lockres = opaque; + struct inode *inode; + struct dlm_lockstatus *lksb; + unsigned long flags; + + mlog_entry_void(); + + inode = ocfs2_lock_res_inode(lockres); + + mlog(0, "AST fired for inode %"MLFu64", l_action = %u, type = %s\n", + OCFS2_I(inode)->ip_blkno, lockres->l_action, + ocfs2_lock_type_string(lockres->l_type)); + + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + spin_lock_irqsave(&lockres->l_lock, flags); + + lksb = &(lockres->l_lksb); + if (lksb->status != DLM_NORMAL) { + mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u " + "on inode %"MLFu64"\n", lksb->status, + OCFS2_I(inode)->ip_blkno); + spin_unlock_irqrestore(&lockres->l_lock, flags); + mlog_exit_void(); + return; + } + + switch(lockres->l_action) { + case OCFS2_AST_ATTACH: + ocfs2_generic_handle_attach_action(lockres); + lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); + break; + case OCFS2_AST_CONVERT: + 
ocfs2_generic_handle_convert_action(lockres); + break; + case OCFS2_AST_DOWNCONVERT: + ocfs2_generic_handle_downconvert_action(lockres); + break; + default: + mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " + "lockres flags = 0x%lx, unlock action: %u\n", + lockres->l_name, lockres->l_action, lockres->l_flags, + lockres->l_unlock_action); + + BUG(); + } + + /* data and rw locking ignores refresh flag for now. */ + if (lockres->l_type != OCFS2_LOCK_TYPE_META) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + /* set it to something invalid so if we get called again we + * can catch it. */ + lockres->l_action = OCFS2_AST_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + wake_up(&lockres->l_event); + + mlog_exit_void(); +} + +static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, + int level) +{ + int needs_downconvert = 0; + mlog_entry_void(); + + assert_spin_locked(&lockres->l_lock); + + lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); + + if (level > lockres->l_blocking) { + /* only schedule a downconvert if we haven't already scheduled + * one that goes low enough to satisfy the level we're + * blocking. 
this also catches the case where we get + * duplicate BASTs */ + if (ocfs2_highest_compat_lock_level(level) < + ocfs2_highest_compat_lock_level(lockres->l_blocking)) + needs_downconvert = 1; + + lockres->l_blocking = level; + } + + mlog_exit(needs_downconvert); + return needs_downconvert; +} + +static void ocfs2_generic_bast_func(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level) +{ + int needs_downconvert; + unsigned long flags; + + mlog_entry_void(); + + BUG_ON(level <= LKM_NLMODE); + + spin_lock_irqsave(&lockres->l_lock, flags); + needs_downconvert = ocfs2_generic_handle_bast(lockres, level); + if (needs_downconvert) + ocfs2_schedule_blocked_lock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_kick_vote_thread(osb); + + wake_up(&lockres->l_event); + mlog_exit_void(); +} + +static void ocfs2_inode_bast_func(void *opaque, int level) +{ + struct ocfs2_lock_res *lockres = opaque; + struct inode *inode; + struct ocfs2_super *osb; + + mlog_entry_void(); + + BUG_ON(!ocfs2_is_inode_lock(lockres)); + + inode = ocfs2_lock_res_inode(lockres); + osb = OCFS2_SB(inode->i_sb); + + mlog(0, "BAST fired for inode %"MLFu64", blocking = %d, level = %d " + "type = %s\n", OCFS2_I(inode)->ip_blkno, level, + lockres->l_level, + ocfs2_lock_type_string(lockres->l_type)); + + ocfs2_generic_bast_func(osb, lockres, level); + + mlog_exit_void(); +} + +static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, + int ignore_refresh) +{ + struct dlm_lockstatus *lksb = &lockres->l_lksb; + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + + if (lksb->status != DLM_NORMAL) { + mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", + lockres->l_name, lksb->status); + spin_unlock_irqrestore(&lockres->l_lock, flags); + return; + } + + switch(lockres->l_action) { + case OCFS2_AST_ATTACH: + ocfs2_generic_handle_attach_action(lockres); + break; + case OCFS2_AST_CONVERT: + ocfs2_generic_handle_convert_action(lockres); + 
break; + case OCFS2_AST_DOWNCONVERT: + ocfs2_generic_handle_downconvert_action(lockres); + break; + default: + BUG(); + } + + if (ignore_refresh) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + + /* set it to something invalid so if we get called again we + * can catch it. */ + lockres->l_action = OCFS2_AST_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); +} + +static void ocfs2_super_ast_func(void *opaque) +{ + struct ocfs2_lock_res *lockres = opaque; + + mlog_entry_void(); + mlog(0, "Superblock AST fired\n"); + + BUG_ON(!ocfs2_is_super_lock(lockres)); + ocfs2_generic_ast_func(lockres, 0); + + mlog_exit_void(); +} + +static void ocfs2_super_bast_func(void *opaque, + int level) +{ + struct ocfs2_lock_res *lockres = opaque; + struct ocfs2_super *osb; + + mlog_entry_void(); + mlog(0, "Superblock BAST fired\n"); + + BUG_ON(!ocfs2_is_super_lock(lockres)); + osb = ocfs2_lock_res_super(lockres); + ocfs2_generic_bast_func(osb, lockres, level); + + mlog_exit_void(); +} + +static void ocfs2_rename_ast_func(void *opaque) +{ + struct ocfs2_lock_res *lockres = opaque; + + mlog_entry_void(); + + mlog(0, "Rename AST fired\n"); + + BUG_ON(!ocfs2_is_rename_lock(lockres)); + + ocfs2_generic_ast_func(lockres, 1); + + mlog_exit_void(); +} + +static void ocfs2_rename_bast_func(void *opaque, + int level) +{ + struct ocfs2_lock_res *lockres = opaque; + struct ocfs2_super *osb; + + mlog_entry_void(); + + mlog(0, "Rename BAST fired\n"); + + BUG_ON(!ocfs2_is_rename_lock(lockres)); + + osb = ocfs2_lock_res_super(lockres); + ocfs2_generic_bast_func(osb, lockres, level); + + mlog_exit_void(); +} + +static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, + int convert) +{ + unsigned long flags; + + mlog_entry_void(); + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); + if (convert) + lockres->l_action = OCFS2_AST_INVALID; + else + lockres->l_unlock_action = 
OCFS2_UNLOCK_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); + mlog_exit_void(); +} + +/* Create (attach) the dlm lock for a lockres at the given level. + * + * Note: If the lock is already attached, or we detect another process + * working on the lock (i.e., OCFS2_LOCK_BUSY), we'll bail out returning + * 0. It's up to the caller to do the right thing in that case. + * + * Returns -EINVAL if dlmlock() fails; the BUSY flag and pending action + * are reset via ocfs2_recover_from_dlm_error() before returning. + */ +static int ocfs2_lock_create(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + int dlm_flags) +{ + int ret = 0; + enum dlm_status status; + unsigned long flags; + + mlog_entry_void(); + + mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, + dlm_flags); + + spin_lock_irqsave(&lockres->l_lock, flags); + if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || + (lockres->l_flags & OCFS2_LOCK_BUSY)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + lockres->l_action = OCFS2_AST_ATTACH; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = dlmlock(osb->dlm, + level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + lockres->l_ops->ast, + lockres, + lockres->l_ops->bast); + if (status != DLM_NORMAL) { + ocfs2_log_dlm_error("dlmlock", status, lockres); + ret = -EINVAL; + ocfs2_recover_from_dlm_error(lockres, 1); + /* don't report a successful return for a failed call */ + goto bail; + } + + mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); + +bail: + mlog_exit(ret); + return ret; +} + +static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, + int flag) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&lockres->l_lock, flags); + ret = lockres->l_flags & flag; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; +} + +static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); +} + +static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) + +{ + wait_event(lockres->l_event, + 
!ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); +} + +/* predict what lock level we'll be dropping down to on behalf + * of another node, and return true if the currently wanted + * level will be compatible with it. */ +static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, + int wanted) +{ + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + + return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); +} + +static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) +{ + INIT_LIST_HEAD(&mw->mw_item); + init_completion(&mw->mw_complete); +} + +static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) +{ + wait_for_completion(&mw->mw_complete); + /* Re-arm the completion in case we want to wait on it again */ + INIT_COMPLETION(mw->mw_complete); + return mw->mw_status; +} + +static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw, + unsigned long mask, + unsigned long goal) +{ + BUG_ON(!list_empty(&mw->mw_item)); + + assert_spin_locked(&lockres->l_lock); + + list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); + mw->mw_mask = mask; + mw->mw_goal = goal; +} + +/* returns 0 if the mw that was removed was already satisfied, -EBUSY + * if the mask still hadn't reached its goal */ +static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, + struct ocfs2_mask_waiter *mw) +{ + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&lockres->l_lock, flags); + if (!list_empty(&mw->mw_item)) { + if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) + ret = -EBUSY; + + list_del_init(&mw->mw_item); + init_completion(&mw->mw_complete); + } + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ret; + +} + +static int ocfs2_cluster_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level, + int lkm_flags, + int arg_flags) +{ + struct ocfs2_mask_waiter mw; + enum dlm_status status; + int wait, catch_signals = !(osb->s_mount_opt & 
OCFS2_MOUNT_NOINTR); + int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ + unsigned long flags; + + mlog_entry_void(); + + ocfs2_init_mask_waiter(&mw); + +again: + wait = 0; + + if (catch_signals && signal_pending(current)) { + ret = -ERESTARTSYS; + goto out; + } + + spin_lock_irqsave(&lockres->l_lock, flags); + + mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, + "Cluster lock called on freeing lockres %s! flags " + "0x%lx\n", lockres->l_name, lockres->l_flags); + + /* We only compare against the currently granted level + * here. If the lock is blocked waiting on a downconvert, + * we'll get caught below. */ + if (lockres->l_flags & OCFS2_LOCK_BUSY && + level > lockres->l_level) { + /* is someone sitting in dlm_lock? If so, wait on + * them. */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); + wait = 1; + goto unlock; + } + + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + /* lock has not been created yet. */ + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + goto again; + } + + if (lockres->l_flags & OCFS2_LOCK_BLOCKED && + !ocfs2_may_continue_on_blocked_lock(lockres, level)) { + /* the lock is currently blocked on behalf of + * another node and the level we want is not + * compatible with it; wait for OCFS2_LOCK_BLOCKED + * to clear */ + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); + wait = 1; + goto unlock; + } + + if (level > lockres->l_level) { + if (lockres->l_action != OCFS2_AST_INVALID) + mlog(ML_ERROR, "lockres %s has action %u pending\n", + lockres->l_name, lockres->l_action); + + lockres->l_action = OCFS2_AST_CONVERT; + lockres->l_requested = level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + BUG_ON(level == LKM_IVMODE); + BUG_ON(level == LKM_NLMODE); + + mlog(0, "lock %s, convert from %d to level = %d\n", + lockres->l_name, lockres->l_level, level); + + /* call dlm_lock to upgrade lock now */ + status = 
dlmlock(osb->dlm, + level, + &lockres->l_lksb, + lkm_flags|LKM_CONVERT|LKM_VALBLK, + lockres->l_name, + lockres->l_ops->ast, + lockres, + lockres->l_ops->bast); + if (status != DLM_NORMAL) { + if ((lkm_flags & LKM_NOQUEUE) && + (status == DLM_NOTQUEUED)) + ret = -EAGAIN; + else { + ocfs2_log_dlm_error("dlmlock", status, + lockres); + ret = -EINVAL; + } + ocfs2_recover_from_dlm_error(lockres, 1); + goto out; + } + + mlog(0, "lock %s, successfull return from dlmlock\n", + lockres->l_name); + + /* At this point we've gone inside the dlm and need to + * complete our work regardless. */ + catch_signals = 0; + + /* wait for busy to clear and carry on */ + goto again; + } + + /* Ok, if we get here then we're good to go. */ + ocfs2_inc_holders(lockres, level); + + ret = 0; +unlock: + spin_unlock_irqrestore(&lockres->l_lock, flags); +out: + /* + * This is helping work around a lock inversion between the page lock + * and dlm locks. One path holds the page lock while calling aops + * which block acquiring dlm locks. The voting thread holds dlm + * locks while acquiring page locks while down converting data locks. + * This block is helping an aop path notice the inversion and back + * off to unlock its page lock before trying the dlm lock again. 
+ */ + if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && + mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { + wait = 0; + if (lockres_remove_mask_waiter(lockres, &mw)) + ret = -EAGAIN; + else + goto again; + } + if (wait) { + ret = ocfs2_wait_for_mask(&mw); + if (ret == 0) + goto again; + mlog_errno(ret); + } + + mlog_exit(ret); + return ret; +} + +static void ocfs2_cluster_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int level) +{ + unsigned long flags; + + mlog_entry_void(); + spin_lock_irqsave(&lockres->l_lock, flags); + ocfs2_dec_holders(lockres, level); + ocfs2_vote_on_unlock(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + mlog_exit_void(); +} + +static int ocfs2_create_new_inode_lock(struct inode *inode, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned long flags; + + spin_lock_irqsave(&lockres->l_lock, flags); + BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); + lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL); +} + +/* Grants us an EX lock on the data and metadata resources, skipping + * the normal cluster directory lookup. Use this ONLY on newly created + * inodes which other nodes can't possibly see, and which haven't been + * hashed in the inode hash yet. This can give us a good performance + * increase as it'll skip the network broadcast normally associated + * with creating a new lock resource. */ +int ocfs2_create_new_inode_locks(struct inode *inode) +{ + int ret; + + BUG_ON(!inode); + BUG_ON(!ocfs2_inode_is_new(inode)); + + mlog_entry_void(); + + mlog(0, "Inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + /* NOTE: That we don't increment any of the holder counts, nor + * do we add anything to a journal handle. Since this is + * supposed to be a new inode which the cluster doesn't know + * about yet, there is no need to. 
As far as the LVB handling + * is concerned, this is basically like acquiring an EX lock + * on a resource which has an invalid one -- we'll set it + * valid when we release the EX. */ + + ret = ocfs2_create_new_inode_lock(inode, + &OCFS2_I(inode)->ip_rw_lockres); + if (ret) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_create_new_inode_lock(inode, + &OCFS2_I(inode)->ip_meta_lockres); + if (ret) { + mlog_errno(ret); + goto bail; + } + + ret = ocfs2_create_new_inode_lock(inode, + &OCFS2_I(inode)->ip_data_lockres); + if (ret) { + mlog_errno(ret); + goto bail; + } + +bail: + mlog_exit(ret); + return ret; +} + +int ocfs2_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" take %s RW lock\n", + OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? LKM_EXMODE : LKM_PRMODE; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, + 0); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); + return status; +} + +void ocfs2_rw_unlock(struct inode *inode, int write) +{ + int level = write ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" drop %s RW lock\n", + OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + + mlog_exit_void(); +} + +int ocfs2_data_lock_full(struct inode *inode, + int write, + int arg_flags) +{ + int status = 0, level; + struct ocfs2_lock_res *lockres; + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" take %s DATA lock\n", + OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + /* We'll allow faking a readonly data lock for + * rodevices. 
*/ + if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) { + if (write) { + status = -EROFS; + mlog_errno(status); + } + goto out; + } + + lockres = &OCFS2_I(inode)->ip_data_lockres; + + level = write ? LKM_EXMODE : LKM_PRMODE; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, + 0, arg_flags); + if (status < 0 && status != -EAGAIN) + mlog_errno(status); + +out: + mlog_exit(status); + return status; +} + +/* see ocfs2_meta_lock_with_page() */ +int ocfs2_data_lock_with_page(struct inode *inode, + int write, + struct page *page) +{ + int ret; + + ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK); + if (ret == -EAGAIN) { + unlock_page(page); + if (ocfs2_data_lock(inode, write) == 0) + ocfs2_data_unlock(inode, write); + ret = AOP_TRUNCATED_PAGE; + } + + return ret; +} + +static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int kick = 0; + + mlog_entry_void(); + + /* If we know that another node is waiting on our lock, kick + * the vote thread pre-emptively when we reach a release + * condition: no holders at all when we block an EX request, + * no EX holders when we block a PR request. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { + switch(lockres->l_blocking) { + case LKM_EXMODE: + if (!lockres->l_ex_holders && !lockres->l_ro_holders) + kick = 1; + break; + case LKM_PRMODE: + if (!lockres->l_ex_holders) + kick = 1; + break; + default: + BUG(); + } + } + + if (kick) + ocfs2_kick_vote_thread(osb); + + mlog_exit_void(); +} + +void ocfs2_data_unlock(struct inode *inode, + int write) +{ + int level = write ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres; + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" drop %s DATA lock\n", + OCFS2_I(inode)->ip_blkno, + write ? 
"EXMODE" : "PRMODE"); + + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + + mlog_exit_void(); +} + +#define OCFS2_SEC_BITS 34 +#define OCFS2_SEC_SHIFT (64 - 34) +#define OCFS2_NSEC_MASK ((1ULL << OCFS2_SEC_SHIFT) - 1) + +/* LVB only has room for 64 bits of time here so we pack it for + * now. */ +static u64 ocfs2_pack_timespec(struct timespec *spec) +{ + u64 res; + u64 sec = spec->tv_sec; + u32 nsec = spec->tv_nsec; + + res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK); + + return res; +} + +/* Call this with the lockres locked. I am reasonably sure we don't + * need ip_lock in this function as anyone who would be changing those + * values is supposed to be blocked in ocfs2_meta_lock right now. */ +static void __ocfs2_stuff_meta_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; + struct ocfs2_meta_lvb *lvb; + + mlog_entry_void(); + + lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION); + lvb->lvb_isize = cpu_to_be64(i_size_read(inode)); + lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters); + lvb->lvb_iuid = cpu_to_be32(inode->i_uid); + lvb->lvb_igid = cpu_to_be32(inode->i_gid); + lvb->lvb_imode = cpu_to_be16(inode->i_mode); + lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); + lvb->lvb_iatime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); + lvb->lvb_ictime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); + lvb->lvb_imtime_packed = + cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + + mlog_meta_lvb(0, lockres); + + mlog_exit_void(); +} + +static void ocfs2_unpack_timespec(struct timespec *spec, + u64 packed_time) +{ + spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT; + spec->tv_nsec = packed_time & OCFS2_NSEC_MASK; +} + +static void ocfs2_refresh_inode_from_lvb(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); 
+ struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; + struct ocfs2_meta_lvb *lvb; + + mlog_entry_void(); + + mlog_meta_lvb(0, lockres); + + lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + /* We're safe here without the lockres lock... */ + spin_lock(&oi->ip_lock); + oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters); + i_size_write(inode, be64_to_cpu(lvb->lvb_isize)); + + /* fast-symlinks are a special case */ + if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = + ocfs2_align_bytes_to_sectors(i_size_read(inode)); + + inode->i_uid = be32_to_cpu(lvb->lvb_iuid); + inode->i_gid = be32_to_cpu(lvb->lvb_igid); + inode->i_mode = be16_to_cpu(lvb->lvb_imode); + inode->i_nlink = be16_to_cpu(lvb->lvb_inlink); + ocfs2_unpack_timespec(&inode->i_atime, + be64_to_cpu(lvb->lvb_iatime_packed)); + ocfs2_unpack_timespec(&inode->i_mtime, + be64_to_cpu(lvb->lvb_imtime_packed)); + ocfs2_unpack_timespec(&inode->i_ctime, + be64_to_cpu(lvb->lvb_ictime_packed)); + spin_unlock(&oi->ip_lock); + + mlog_exit_void(); +} + +static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION) + return 1; + return 0; +} + +/* Determine whether a lock resource needs to be refreshed, and + * arbitrate who gets to refresh it. + * + * 0 means no refresh needed. + * + * > 0 means you need to refresh this and you MUST call + * ocfs2_complete_lock_res_refresh afterwards. 
*/ +static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres) +{ + unsigned long flags; + int status = 0; + + mlog_entry_void(); + +refresh_check: + spin_lock_irqsave(&lockres->l_lock, flags); + if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto bail; + } + + if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + + ocfs2_wait_on_refreshing_lock(lockres); + goto refresh_check; + } + + /* Ok, I'll be the one to refresh this lock. */ + lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = 1; +bail: + mlog_exit(status); + return status; +} + +/* If status is non-zero, I'll mark it as not being in refresh + * anymore, but I won't clear the needs refresh flag. Waiters on + * l_event are woken either way. */ +static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres, + int status) +{ + unsigned long flags; + mlog_entry_void(); + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); + if (!status) + lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); + + mlog_exit_void(); +} + +/* may or may not return a bh if it went to disk. */ +static int ocfs2_meta_lock_update(struct inode *inode, + struct buffer_head **bh) +{ + int status = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_lock_res *lockres; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + spin_lock(&oi->ip_lock); + if (oi->ip_flags & OCFS2_INODE_DELETED) { + mlog(0, "Orphaned inode %"MLFu64" was deleted while we " + "were waiting on a lock. 
ip_flags = 0x%x\n", + oi->ip_blkno, oi->ip_flags); + spin_unlock(&oi->ip_lock); + status = -ENOENT; + goto bail; + } + spin_unlock(&oi->ip_lock); + + lockres = &oi->ip_meta_lockres; + + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + + /* This will discard any caching information we might have had + * for the inode metadata. */ + ocfs2_metadata_cache_purge(inode); + + /* will do nothing for inode types that don't use the extent + * map (directories, bitmap files, etc) */ + ocfs2_extent_map_trunc(inode, 0); + + if (ocfs2_meta_lvb_is_trustable(lockres)) { + mlog(0, "Trusting LVB on inode %"MLFu64"\n", + oi->ip_blkno); + ocfs2_refresh_inode_from_lvb(inode); + } else { + /* Boo, we have to go to disk. */ + /* read bh, cast, ocfs2_refresh_inode */ + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, + bh, OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto bail_refresh; + } + fe = (struct ocfs2_dinode *) (*bh)->b_data; + + /* This is a good chance to make sure we're not + * locking an invalid object. + * + * We bug on a stale inode here because we checked + * above whether it was wiped from disk. The wiping + * node provides a guarantee that we receive that + * message and can mark the inode before dropping any + * locks associated with it. 
*/ + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto bail_refresh; + } + mlog_bug_on_msg(inode->i_generation != + le32_to_cpu(fe->i_generation), + "Invalid dinode %"MLFu64" disk generation: %u " + "inode->i_generation: %u\n", + oi->ip_blkno, le32_to_cpu(fe->i_generation), + inode->i_generation); + mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) || + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)), + "Stale dinode %"MLFu64" dtime: %"MLFu64" " + "flags: 0x%x\n", oi->ip_blkno, + le64_to_cpu(fe->i_dtime), + le32_to_cpu(fe->i_flags)); + + ocfs2_refresh_inode(inode, fe); + } + + status = 0; +bail_refresh: + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_assign_bh(struct inode *inode, + struct buffer_head **ret_bh, + struct buffer_head *passed_bh) +{ + int status; + + if (passed_bh) { + /* Ok, the update went to disk for us, use the + * returned bh. */ + *ret_bh = passed_bh; + get_bh(*ret_bh); + + return 0; + } + + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, + ret_bh, + OCFS2_BH_CACHED, + inode); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* + * returns < 0 error if the callback will never be called, otherwise + * the result of the lock will be communicated via the callback. + */ +int ocfs2_meta_lock_full(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head **ret_bh, + int ex, + int arg_flags) +{ + int status, level, dlm_flags, acquired; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *local_bh = NULL; + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64", take %s META lock\n", + OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + status = 0; + acquired = 0; + /* We'll allow faking a readonly metadata lock for + * rodevices. 
*/ + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + status = -EROFS; + goto bail; + } + + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) + wait_event(osb->recovery_event, + ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + + acquired = 0; + lockres = &OCFS2_I(inode)->ip_meta_lockres; + level = ex ? LKM_EXMODE : LKM_PRMODE; + dlm_flags = 0; + if (arg_flags & OCFS2_META_LOCK_NOQUEUE) + dlm_flags |= LKM_NOQUEUE; + + status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); + if (status < 0) { + if (status != -EAGAIN && status != -EIOCBRETRY) + mlog_errno(status); + goto bail; + } + + /* Notify the error cleanup path to drop the cluster lock. */ + acquired = 1; + + /* We wait twice because a node may have died while we were in + * the lower dlm layers. The second time though, we've + * committed to owning this lock so we don't allow signals to + * abort the operation. */ + if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) + wait_event(osb->recovery_event, + ocfs2_node_map_is_empty(osb, &osb->recovery_map)); + + /* This is fun. The caller may want a bh back, or it may + * not. ocfs2_meta_lock_update definitely wants one in, but + * may or may not read one, depending on what's in the + * LVB. The result of all of this is that we've *only* gone to + * disk if we have to, so the complexity is worthwhile. 
*/ + status = ocfs2_meta_lock_update(inode, &local_bh); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + if (ret_bh) { + status = ocfs2_assign_bh(inode, ret_bh, local_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + if (handle) { + status = ocfs2_handle_add_lock(handle, inode); + if (status < 0) + mlog_errno(status); + } + +bail: + if (status < 0) { + if (ret_bh && (*ret_bh)) { + brelse(*ret_bh); + *ret_bh = NULL; + } + if (acquired) + ocfs2_meta_unlock(inode, ex); + } + + if (local_bh) + brelse(local_bh); + + mlog_exit(status); + return status; +} + +/* + * This is working around a lock inversion between tasks acquiring DLM locks + * while holding a page lock and the vote thread which blocks dlm lock + * acquisition while acquiring page locks. + * + * ** These _with_page variants are only intended to be called from aop + * methods that hold page locks and return a very specific *positive* error + * code that aop methods pass up to the VFS -- test for errors with != 0. ** + * + * The DLM is called such that it returns -EAGAIN if it would have blocked + * waiting for the vote thread. In that case we unlock our page so the vote + * thread can make progress. Once we've done this we have to return + * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up + * into the VFS who will then immediately retry the aop call. + * + * We do a blocking lock and immediate unlock before returning, though, so that + * the lock has a great chance of being cached on this node by the time the VFS + * calls back to retry the aop. This has a potential to livelock as nodes + * ping locks back and forth, but that's a risk we're willing to take to avoid + * the lock inversion simply. 
+ */ +int ocfs2_meta_lock_with_page(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head **ret_bh, + int ex, + struct page *page) +{ + int ret; + + ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex, + OCFS2_LOCK_NONBLOCK); + if (ret == -EAGAIN) { + unlock_page(page); + if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0) + ocfs2_meta_unlock(inode, ex); + ret = AOP_TRUNCATED_PAGE; + } + + return ret; +} + +void ocfs2_meta_unlock(struct inode *inode, + int ex) +{ + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; + + mlog_entry_void(); + + mlog(0, "inode %"MLFu64" drop %s META lock\n", + OCFS2_I(inode)->ip_blkno, + ex ? "EXMODE" : "PRMODE"); + + if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); + + mlog_exit_void(); +} + +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex) +{ + int status; + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + struct buffer_head *bh; + struct ocfs2_slot_info *si = osb->slot_info; + + mlog_entry_void(); + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* The super block lock path is really in the best position to + * know when resources covered by the lock need to be + * refreshed, so we do it here. 
Of course, making sense of + * everything is up to the caller :) */ + status = ocfs2_should_refresh_lock_res(lockres); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (status) { + bh = si->si_bh; + status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, + si->si_inode); + if (status == 0) + ocfs2_update_slot_info(si); + + ocfs2_complete_lock_res_refresh(lockres, status); + + if (status < 0) + mlog_errno(status); + } +bail: + mlog_exit(status); + return status; +} +/* Drop the superblock cluster lock at EX level when @ex is non-zero, PR otherwise. */ +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex) +{ + int level = ex ? LKM_EXMODE : LKM_PRMODE; + struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; + + ocfs2_cluster_unlock(osb, lockres, level); +} +/* Take the rename cluster lock at EX level. Returns -EROFS on a hard read-only mount, otherwise the ocfs2_cluster_lock() status. */ +int ocfs2_rename_lock(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); + if (status < 0) + mlog_errno(status); + + return status; +} +/* Drop the EX rename cluster lock. */ +void ocfs2_rename_unlock(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; + + ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); +} + +/* Reference counting of the dlm debug structure. We want this because + * open references on the debug inodes can live on after a mount, so + * we can't rely on the ocfs2_super to always exist. 
*/ +static void ocfs2_dlm_debug_free(struct kref *kref) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt); + + kfree(dlm_debug); +} +/* Drop one reference; a NULL @dlm_debug is ignored. ocfs2_dlm_debug_free() is the kref release callback. */ +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug) +{ + if (dlm_debug) + kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free); +} +/* Take an additional reference on @debug. */ +static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug) +{ + kref_get(&debug->d_refcnt); +} +/* Allocate a dlm debug structure with its refcount set up by kref_init(), an empty lockres tracking list and no debugfs file. Returns NULL on allocation failure. */ +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) +{ + struct ocfs2_dlm_debug *dlm_debug; + + dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL); + if (!dlm_debug) { + mlog_errno(-ENOMEM); + goto out; + } + + kref_init(&dlm_debug->d_refcnt); + INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); + dlm_debug->d_locking_state = NULL; +out: + return dlm_debug; +} + +/* Access to this is arbitrated for us via seq_file->sem. */ +struct ocfs2_dlm_seq_priv { + struct ocfs2_dlm_debug *p_dlm_debug; + struct ocfs2_lock_res p_iter_res; + struct ocfs2_lock_res p_tmp_res; +}; +/* Walk the tracking list from the entry after @start and return the first lockres with a non-NULL l_ops, or NULL if the list head is reached first. Caller holds ocfs2_dlm_tracking_lock. */ +static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start, + struct ocfs2_dlm_seq_priv *priv) +{ + struct ocfs2_lock_res *iter, *ret = NULL; + struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug; + + assert_spin_locked(&ocfs2_dlm_tracking_lock); + + list_for_each_entry(iter, &start->l_debug_list, l_debug_list) { + /* discover the head of the list */ + if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) { + mlog(0, "End of list found, %p\n", ret); + break; + } + + /* We track our "dummy" iteration lockres' by a NULL + * l_ops field. 
*/ + if (iter->l_ops != NULL) { + ret = iter; + break; + } + } + + return ret; +} +/* seq_file start: find the first real lockres after our dummy iterator entry and return a private copy of it, or NULL if there is none. */ +static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv); + if (iter) { + /* Since lockres' have the lifetime of their container + * (which can be inodes, ocfs2_supers, etc) we want to + * copy this out to a temporary lockres while still + * under the spinlock. Obviously after this we can't + * trust any pointers on the copy returned, but that's + * ok as the information we want isn't typically held + * in them. */ + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} +/* seq_file stop: nothing to undo, start and next release the tracking lock before returning. */ +static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v) +{ +} +/* seq_file next: advance from @v, move the dummy iterator entry to sit after the lockres found, and return a private copy of that lockres (NULL at the end of the list). */ +static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ocfs2_dlm_seq_priv *priv = m->private; + struct ocfs2_lock_res *iter = v; + struct ocfs2_lock_res *dummy = &priv->p_iter_res; + + spin_lock(&ocfs2_dlm_tracking_lock); + iter = ocfs2_dlm_next_res(iter, priv); + list_del_init(&dummy->l_debug_list); + if (iter) { + list_add(&dummy->l_debug_list, &iter->l_debug_list); + priv->p_tmp_res = *iter; + iter = &priv->p_tmp_res; + } + spin_unlock(&ocfs2_dlm_tracking_lock); + + return iter; +} + +/* So that debugfs.ocfs2 can determine which format is being used */ +#define OCFS2_DLM_DEBUG_STR_VERSION 1 +static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) +{ + int i; + char *lvb; + struct ocfs2_lock_res *lockres = v; + + if (!lockres) + return -EINVAL; + + seq_printf(m, "0x%x\t" + "%.*s\t" + "%d\t" + "0x%lx\t" + "0x%x\t" + "0x%x\t" + "%u\t" + "%u\t" + "%d\t" + "%d\t", + OCFS2_DLM_DEBUG_STR_VERSION, + OCFS2_LOCK_ID_MAX_LEN, lockres->l_name, + lockres->l_level, + lockres->l_flags, + lockres->l_action, + lockres->l_unlock_action, + lockres->l_ro_holders, + 
lockres->l_ex_holders, + lockres->l_requested, + lockres->l_blocking); + + /* Dump the raw LVB */ + lvb = lockres->l_lksb.lvb; + for(i = 0; i < DLM_LVB_LEN; i++) + seq_printf(m, "0x%x\t", lvb[i]); + + /* End the line */ + seq_printf(m, "\n"); + return 0; +} + +static struct seq_operations ocfs2_dlm_seq_ops = { + .start = ocfs2_dlm_seq_start, + .stop = ocfs2_dlm_seq_stop, + .next = ocfs2_dlm_seq_next, + .show = ocfs2_dlm_seq_show, +}; + +static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = (struct seq_file *) file->private_data; + struct ocfs2_dlm_seq_priv *priv = seq->private; + struct ocfs2_lock_res *res = &priv->p_iter_res; + + ocfs2_remove_lockres_tracking(res); + ocfs2_put_dlm_debug(priv->p_dlm_debug); + return seq_release_private(inode, file); +} + +static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) +{ + int ret; + struct ocfs2_dlm_seq_priv *priv; + struct seq_file *seq; + struct ocfs2_super *osb; + + priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); + if (!priv) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + osb = (struct ocfs2_super *) inode->u.generic_ip; + ocfs2_get_dlm_debug(osb->osb_dlm_debug); + priv->p_dlm_debug = osb->osb_dlm_debug; + INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); + + ret = seq_open(file, &ocfs2_dlm_seq_ops); + if (ret) { + kfree(priv); + mlog_errno(ret); + goto out; + } + + seq = (struct seq_file *) file->private_data; + seq->private = priv; + + ocfs2_add_lockres_tracking(&priv->p_iter_res, + priv->p_dlm_debug); + +out: + return ret; +} + +static struct file_operations ocfs2_dlm_debug_fops = { + .open = ocfs2_dlm_debug_open, + .release = ocfs2_dlm_debug_release, + .read = seq_read, + .llseek = seq_lseek, +}; + +static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) +{ + int ret = 0; + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + dlm_debug->d_locking_state = debugfs_create_file("locking_state", + S_IFREG|S_IRUSR, + 
osb->osb_debug_root, + osb, + &ocfs2_dlm_debug_fops); + if (!dlm_debug->d_locking_state) { + ret = -EINVAL; + mlog(ML_ERROR, + "Unable to create locking state debugfs file.\n"); + goto out; + } + + ocfs2_get_dlm_debug(dlm_debug); +out: + return ret; +} + +static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) +{ + struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; + + if (dlm_debug) { + debugfs_remove(dlm_debug->d_locking_state); + ocfs2_put_dlm_debug(dlm_debug); + } +} + +int ocfs2_dlm_init(struct ocfs2_super *osb) +{ + int status; + u32 dlm_key; + struct dlm_ctxt *dlm; + + mlog_entry_void(); + + status = ocfs2_dlm_init_debug(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* launch vote thread */ + osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d", + osb->osb_id); + if (IS_ERR(osb->vote_task)) { + status = PTR_ERR(osb->vote_task); + osb->vote_task = NULL; + mlog_errno(status); + goto bail; + } + + /* used by the dlm code to make message headers unique, each + * node in this domain must agree on this. 
*/ + dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str)); + + /* for now, uuid == domain */ + dlm = dlm_register_domain(osb->uuid_str, dlm_key); + if (IS_ERR(dlm)) { + status = PTR_ERR(dlm); + mlog_errno(status); + goto bail; + } + + ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); + ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); + + dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); + + osb->dlm = dlm; + + status = 0; +bail: + if (status < 0) { + ocfs2_dlm_shutdown_debug(osb); + if (osb->vote_task) { + kthread_stop(osb->vote_task); + /* as in ocfs2_dlm_shutdown(): never leave a pointer to a stopped thread behind */ + osb->vote_task = NULL; + } + } + + mlog_exit(status); + return status; +} +/* Tear down what ocfs2_dlm_init() set up: eviction callback, osb lock resources, vote thread, DLM domain and debugfs state. */ +void ocfs2_dlm_shutdown(struct ocfs2_super *osb) +{ + mlog_entry_void(); + + dlm_unregister_eviction_cb(&osb->osb_eviction_cb); + + ocfs2_drop_osb_locks(osb); + + if (osb->vote_task) { + kthread_stop(osb->vote_task); + osb->vote_task = NULL; + } + + ocfs2_lock_res_free(&osb->osb_super_lockres); + ocfs2_lock_res_free(&osb->osb_rename_lockres); + + dlm_unregister_domain(osb->dlm); + osb->dlm = NULL; + + ocfs2_dlm_shutdown_debug(osb); + + mlog_exit_void(); +} +/* Unlock AST passed to dlmunlock(): @opaque is the lockres. Completes a cancel-convert or drop-lock request and wakes waiters on l_event. */ +static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status) +{ + struct ocfs2_lock_res *lockres = opaque; + unsigned long flags; + + mlog_entry_void(); + + mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name, + lockres->l_unlock_action); + + spin_lock_irqsave(&lockres->l_lock, flags); + /* We tried to cancel a convert request, but it was already + * granted. All we want to do here is clear our unlock + * state. The wake_up call done at the bottom is redundant + * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't + * hurt anything anyway */ + if (status == DLM_CANCELGRANT && + lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { + mlog(0, "Got cancelgrant for %s\n", lockres->l_name); + + /* We don't clear the busy flag in this case as it + * should have been cleared by the ast which the dlm + * has called. 
*/ + goto complete_unlock; + } + + if (status != DLM_NORMAL) { + mlog(ML_ERROR, "Dlm passes status %d for lock %s, " + "unlock_action %d\n", status, lockres->l_name, + lockres->l_unlock_action); + spin_unlock_irqrestore(&lockres->l_lock, flags); + return; + } + + switch(lockres->l_unlock_action) { + case OCFS2_UNLOCK_CANCEL_CONVERT: + mlog(0, "Cancel convert success for %s\n", lockres->l_name); + lockres->l_action = OCFS2_AST_INVALID; + break; + case OCFS2_UNLOCK_DROP_LOCK: + lockres->l_level = LKM_IVMODE; + break; + default: + BUG(); + } + + lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); +complete_unlock: + lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + wake_up(&lockres->l_event); + + mlog_exit_void(); +} +/* Hook that ocfs2_drop_lock() runs under l_lock just before a lock is dropped */ +typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *); +/* a pre-drop hook together with the opaque data handed to it */ +struct drop_lock_cb { + ocfs2_pre_drop_cb_t *drop_func; + void *drop_data; +}; +/* Wait out any in-flight DLM operation on @lockres, run the optional pre-drop hook in @dcb and, if the lock is attached, release it with dlmunlock() and wait for the unlock AST. Always returns 0; a dlmunlock() failure is a BUG(). */ +static int ocfs2_drop_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + struct drop_lock_cb *dcb) +{ + enum dlm_status status; + unsigned long flags; + + /* We didn't get anywhere near actually using this lockres. */ + if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) + goto out; + + spin_lock_irqsave(&lockres->l_lock, flags); + + mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING), + "lockres %s, flags 0x%lx\n", + lockres->l_name, lockres->l_flags); + + while (lockres->l_flags & OCFS2_LOCK_BUSY) { + mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = " + "%u, unlock_action = %u\n", + lockres->l_name, lockres->l_flags, lockres->l_action, + lockres->l_unlock_action); + + spin_unlock_irqrestore(&lockres->l_lock, flags); + + /* XXX: Today we just wait on any busy + * locks... Perhaps we need to cancel converts in the + * future? 
*/ + ocfs2_wait_on_busy_lock(lockres); + + spin_lock_irqsave(&lockres->l_lock, flags); + } +/* l_lock is held here: let the owner run its pre-drop hook */ + if (dcb) + dcb->drop_func(lockres, dcb->drop_data); + + if (lockres->l_flags & OCFS2_LOCK_BUSY) + mlog(ML_ERROR, "destroying busy lock: \"%s\"\n", + lockres->l_name); + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name); +/* a lockres that is not attached has nothing to release in the DLM */ + if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + goto out; + } + + lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED); + + /* make sure we never get here while waiting for an ast to + * fire. */ + BUG_ON(lockres->l_action != OCFS2_AST_INVALID); + + /* is this necessary? */ + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); + lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "lock %s\n", lockres->l_name); + + status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK, + lockres->l_ops->unlock_ast, lockres); + if (status != DLM_NORMAL) { + ocfs2_log_dlm_error("dlmunlock", status, lockres); + mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); + dlm_print_one_lock(lockres->l_lksb.lockid); + BUG(); + } + mlog(0, "lock %s, successfull return from dlmunlock\n", + lockres->l_name); +/* OCFS2_LOCK_BUSY was set above; wait until it is cleared again */ + ocfs2_wait_on_busy_lock(lockres); +out: + mlog_exit(0); + return 0; +} + +/* Mark the lockres as being dropped. It will no longer be + * queued if blocking, but we still may have to wait on it + * being dequeued from the vote thread before we can consider + * it safe to drop. + * + * You can *not* attempt to call cluster_lock on this lockres anymore. 
*/ +void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +{ + int status; + struct ocfs2_mask_waiter mw; + unsigned long flags; + + ocfs2_init_mask_waiter(&mw); + + spin_lock_irqsave(&lockres->l_lock, flags); + lockres->l_flags |= OCFS2_LOCK_FREEING; + while (lockres->l_flags & OCFS2_LOCK_QUEUED) { + lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog(0, "Waiting on lockres %s\n", lockres->l_name); + + status = ocfs2_wait_for_mask(&mw); + if (status) + mlog_errno(status); + + spin_lock_irqsave(&lockres->l_lock, flags); + } + spin_unlock_irqrestore(&lockres->l_lock, flags); +} +/* Mark the super and rename lock resources of @osb as freeing and drop them. */ +static void ocfs2_drop_osb_locks(struct ocfs2_super *osb) +{ + int status; + + mlog_entry_void(); + + ocfs2_mark_lockres_freeing(&osb->osb_super_lockres); + + status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL); + if (status < 0) + mlog_errno(status); + + ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres); + + status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); +} +/* Pre-drop hook for the meta lock; @data is the inode. */ +static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data) +{ + struct inode *inode = data; + + /* the metadata lock requires a bit more work as we have an + * LVB to worry about. */ + if (lockres->l_flags & OCFS2_LOCK_ATTACHED && + lockres->l_level == LKM_EXMODE && + !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) + __ocfs2_stuff_meta_lvb(inode); +} +/* Drop the data, meta and rw cluster locks of @inode. All three drops are attempted; the first negative status seen is returned. */ +int ocfs2_drop_inode_locks(struct inode *inode) +{ + int status, err; + struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, }; + + mlog_entry_void(); + + /* No need to call ocfs2_mark_lockres_freeing here - + * ocfs2_clear_inode has done it for us. 
*/ + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_data_lockres, + NULL); + if (err < 0) + mlog_errno(err); + + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_meta_lockres, + &meta_dcb); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_rw_lockres, + NULL); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + mlog_exit(status); + return status; +} +/* Record a pending downconvert to @new_level and mark the lockres busy. Caller holds l_lock; BUGs unless the lockres is blocking something above NL and currently sits above @new_level. */ +static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, + int new_level) +{ + assert_spin_locked(&lockres->l_lock); + + BUG_ON(lockres->l_blocking <= LKM_NLMODE); + + if (lockres->l_level <= new_level) { + mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n", + lockres->l_level, new_level); + BUG(); + } + + mlog(0, "lock %s, new_level = %d, l_blocking = %d\n", + lockres->l_name, new_level, lockres->l_blocking); + + lockres->l_action = OCFS2_AST_DOWNCONVERT; + lockres->l_requested = new_level; + lockres_or_flags(lockres, OCFS2_LOCK_BUSY); +} +/* Ask the DLM to convert @lockres down to @new_level, adding LKM_VALBLK when @lvb is set. Returns 0, or -EINVAL after ocfs2_recover_from_dlm_error() when dlmlock() fails. */ +static int ocfs2_downconvert_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int new_level, + int lvb) +{ + int ret, dlm_flags = LKM_CONVERT; + enum dlm_status status; + + mlog_entry_void(); + + if (lvb) + dlm_flags |= LKM_VALBLK; + + status = dlmlock(osb->dlm, + new_level, + &lockres->l_lksb, + dlm_flags, + lockres->l_name, + lockres->l_ops->ast, + lockres, + lockres->l_ops->bast); + if (status != DLM_NORMAL) { + ocfs2_log_dlm_error("dlmlock", status, lockres); + ret = -EINVAL; + ocfs2_recover_from_dlm_error(lockres, 1); + goto bail; + } + + ret = 0; +bail: + mlog_exit(ret); + return ret; +} + +/* returns 1 when the caller should unlock and call dlmunlock */ +static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + assert_spin_locked(&lockres->l_lock); + + mlog_entry_void(); + mlog(0, "lock %s\n", 
lockres->l_name); + + if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { + /* If we're already trying to cancel a lock conversion + * then just drop the spinlock and allow the caller to + * requeue this lock. */ + + mlog(0, "Lockres %s, skip convert\n", lockres->l_name); + return 0; + } + + /* were we in a convert when we got the bast fire? */ + BUG_ON(lockres->l_action != OCFS2_AST_CONVERT && + lockres->l_action != OCFS2_AST_DOWNCONVERT); + /* set things up for the unlockast to know to just + * clear out the ast_action and unset busy, etc. */ + lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT; + + mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY), + "lock %s, invalid flags: 0x%lx\n", + lockres->l_name, lockres->l_flags); + + return 1; +} +/* Issue dlmunlock() with LKM_CANCEL for an in-flight convert on @lockres. Returns 0, or -EINVAL after ocfs2_recover_from_dlm_error() when the DLM call fails. */ +static int ocfs2_cancel_convert(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int ret; + enum dlm_status status; + + mlog_entry_void(); + mlog(0, "lock %s\n", lockres->l_name); + + ret = 0; + status = dlmunlock(osb->dlm, + &lockres->l_lksb, + LKM_CANCEL, + lockres->l_ops->unlock_ast, + lockres); + if (status != DLM_NORMAL) { + ocfs2_log_dlm_error("dlmunlock", status, lockres); + ret = -EINVAL; + ocfs2_recover_from_dlm_error(lockres, 0); + } + + mlog(0, "lock %s return from dlmunlock\n", lockres->l_name); + + mlog_exit(ret); + return ret; +} +/* Non-zero when the meta lock may be downconverted to @new_level (PR or NL only): no refresh in progress, no conflicting holders, and the inode fully checkpointed. */ +static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, + struct ocfs2_lock_res *lockres, + int new_level) +{ + int ret; + + mlog_entry_void(); + + BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); + + if (lockres->l_flags & OCFS2_LOCK_REFRESHING) { + ret = 0; + mlog(0, "lockres %s currently being refreshed -- backing " + "off!\n", lockres->l_name); + } else if (new_level == LKM_PRMODE) + ret = !lockres->l_ex_holders && + ocfs2_inode_fully_checkpointed(inode); + else /* Must be NLMODE we're converting to. 
*/ + ret = !lockres->l_ro_holders && !lockres->l_ex_holders && + ocfs2_inode_fully_checkpointed(inode); + + mlog_exit(ret); + return ret; +} +/* Try to downconvert the blocked meta lock of @inode. *@requeue is set to 1 when the lock is busy or cannot be downconverted yet; it is left untouched when a downconvert is issued. */ +static int ocfs2_do_unblock_meta(struct inode *inode, + int *requeue) +{ + int new_level; + int set_lvb = 0; + int ret = 0; + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres; + unsigned long flags; + + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry_void(); + + spin_lock_irqsave(&lockres->l_lock, flags); + + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + + mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level, + lockres->l_blocking); + + BUG_ON(lockres->l_level != LKM_EXMODE && + lockres->l_level != LKM_PRMODE); + + if (lockres->l_flags & OCFS2_LOCK_BUSY) { + *requeue = 1; + ret = ocfs2_prepare_cancel_convert(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (ret) { + ret = ocfs2_cancel_convert(osb, lockres); + if (ret < 0) + mlog_errno(ret); + } + goto leave; + } + + new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); + + mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n", + lockres->l_level, lockres->l_blocking, new_level); + + if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) { + if (lockres->l_level == LKM_EXMODE) + set_lvb = 1; + + /* If the lock hasn't been refreshed yet (rare), then + * our memory inode values are old and we skip + * stuffing the lvb. There's no need to actually clear + * out the lvb here as its value is still valid. 
*/ + if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { + if (set_lvb) + __ocfs2_stuff_meta_lvb(inode); + } else + mlog(0, "lockres %s: downconverting stale lock!\n", + lockres->l_name); + + mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, " + "l_blocking=%d, new_level=%d\n", + lockres->l_level, lockres->l_blocking, new_level); + + ocfs2_prepare_downconvert(lockres, new_level); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); + goto leave; + } + if (!ocfs2_inode_fully_checkpointed(inode)) + ocfs2_start_checkpoint(osb); + + *requeue = 1; + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = 0; +leave: + mlog_exit(ret); + return ret; +} +/* Downconvert a blocked lockres once no incompatible holders remain. @worker, if non-NULL, is called with l_lock dropped before the downconvert. *@requeue is set to 1 when the attempt has to be retried later and to 0 once the downconvert is issued. */ +static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres, + int *requeue, + ocfs2_convert_worker_t *worker) +{ + unsigned long flags; + int blocking; + int new_level; + int ret = 0; + + mlog_entry_void(); + + spin_lock_irqsave(&lockres->l_lock, flags); + + BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); + +recheck: + if (lockres->l_flags & OCFS2_LOCK_BUSY) { + *requeue = 1; + ret = ocfs2_prepare_cancel_convert(osb, lockres); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (ret) { + ret = ocfs2_cancel_convert(osb, lockres); + if (ret < 0) + mlog_errno(ret); + } + goto leave; + } + + /* if we're blocking an exclusive and we have *any* holders, + * then requeue. 
*/ + if ((lockres->l_blocking == LKM_EXMODE) + && (lockres->l_ex_holders || lockres->l_ro_holders)) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + *requeue = 1; + ret = 0; + goto leave; + } + + /* If it's a PR we're blocking, then only + * requeue if we've got any EX holders */ + if (lockres->l_blocking == LKM_PRMODE && + lockres->l_ex_holders) { + spin_unlock_irqrestore(&lockres->l_lock, flags); + *requeue = 1; + ret = 0; + goto leave; + } + + /* If we get here, then we know that there are no more + * incompatible holders (and anyone asking for an incompatible + * lock is blocked). We can now downconvert the lock. */ + if (!worker) + goto downconvert; + + /* Some lockres types want to do a bit of work before + * downconverting a lock. Allow that here. The worker function + * may sleep, so we save off a copy of what we're blocking as + * it may change while we're not holding the spin lock. */ + blocking = lockres->l_blocking; + spin_unlock_irqrestore(&lockres->l_lock, flags); +/* l_lock is not held across the worker call */ + worker(lockres, blocking); + + spin_lock_irqsave(&lockres->l_lock, flags); + if (blocking != lockres->l_blocking) { + /* If this changed underneath us, then we can't drop + * it just yet. 
*/ + goto recheck; + } + +downconvert: + *requeue = 0; + new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking); + + ocfs2_prepare_downconvert(lockres, new_level); + spin_unlock_irqrestore(&lockres->l_lock, flags); + ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0); +leave: + mlog_exit(ret); + return ret; +} +/* Convert worker for data locks: start writeback of the dirty pages of the inode behind @lockres and sync its mapping buffers. When an EX request is being blocked (@blocking == LKM_EXMODE) the page cache and mappings are dropped as well; otherwise the pages are kept and we only wait for the I/O. A writeback failure is logged and otherwise ignored. */ +static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, + int blocking) +{ + struct inode *inode; + struct address_space *mapping; + + mlog_entry_void(); + + inode = ocfs2_lock_res_inode(lockres); + mapping = inode->i_mapping; + + if (filemap_fdatawrite(mapping)) { + mlog(ML_ERROR, "Could not sync inode %"MLFu64" for downconvert!\n", + OCFS2_I(inode)->ip_blkno); + } + sync_mapping_buffers(mapping); + if (blocking == LKM_EXMODE) { + truncate_inode_pages(mapping, 0); + unmap_mapping_range(mapping, 0, 0, 0); + } else { + /* We only need to wait on the I/O if we're not also + * truncating pages because truncate_inode_pages waits + * for us above. We don't truncate pages if we're + * blocking anything < EXMODE because we want to keep + * them around in that case. 
*/ + filemap_fdatawait(mapping); + } + + mlog_exit_void(); +} +/* Unblock handler for data locks: generic downconvert with ocfs2_data_convert_worker() as the convert worker. */ +int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, + int *requeue) +{ + int status; + struct inode *inode; + struct ocfs2_super *osb; + + mlog_entry_void(); + + inode = ocfs2_lock_res_inode(lockres); + osb = OCFS2_SB(inode->i_sb); + + mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + status = ocfs2_generic_unblock_lock(osb, + lockres, + requeue, + ocfs2_data_convert_worker); + if (status < 0) + mlog_errno(status); + + mlog(0, "inode %"MLFu64", requeue = %d\n", + OCFS2_I(inode)->ip_blkno, *requeue); + + mlog_exit(status); + return status; +} +/* Unblock handler for an inode-backed lockres that needs no convert worker. */ +static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, + int *requeue) +{ + int status; + struct inode *inode; + + mlog_entry_void(); + + mlog(0, "Unblock lockres %s\n", lockres->l_name); + + inode = ocfs2_lock_res_inode(lockres); + + status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb), + lockres, + requeue, + NULL); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); + return status; +} + +/* Unblock handler for meta locks; the work is done by ocfs2_do_unblock_meta(). */ +int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, + int *requeue) +{ + int status; + struct inode *inode; + + mlog_entry_void(); + + inode = ocfs2_lock_res_inode(lockres); + + mlog(0, "unblock inode %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + status = ocfs2_do_unblock_meta(inode, requeue); + if (status < 0) + mlog_errno(status); + + mlog(0, "inode %"MLFu64", requeue = %d\n", + OCFS2_I(inode)->ip_blkno, *requeue); + + mlog_exit(status); + return status; +} + +/* Generic unblock function for any lockres whose private data is an + * ocfs2_super pointer. 
*/ +static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, + int *requeue) +{ + int status; + struct ocfs2_super *osb; + + mlog_entry_void(); + + mlog(0, "Unblock lockres %s\n", lockres->l_name); + + osb = ocfs2_lock_res_super(lockres); + + status = ocfs2_generic_unblock_lock(osb, + lockres, + requeue, + NULL); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); + return status; +} +/* Handle one queued, blocked lockres: run its l_ops->unblock handler, then either schedule it again or clear OCFS2_LOCK_QUEUED. A lockres already marked OCFS2_LOCK_FREEING is only unqueued. */ +void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + int status; + int requeue = 0; + unsigned long flags; + + /* Our reference to the lockres in this function can be + * considered valid until we remove the OCFS2_LOCK_QUEUED + * flag. */ + + mlog_entry_void(); + + BUG_ON(!lockres); + BUG_ON(!lockres->l_ops); + BUG_ON(!lockres->l_ops->unblock); + + mlog(0, "lockres %s blocked.\n", lockres->l_name); + + /* Detect whether a lock has been marked as going away while + * the vote thread was processing other things. A lock can + * still be marked with OCFS2_LOCK_FREEING after this check, + * but short circuiting here will still save us some + * performance. */ + spin_lock_irqsave(&lockres->l_lock, flags); + if (lockres->l_flags & OCFS2_LOCK_FREEING) + goto unqueue; + spin_unlock_irqrestore(&lockres->l_lock, flags); + + status = lockres->l_ops->unblock(lockres, &requeue); + if (status < 0) + mlog_errno(status); + + spin_lock_irqsave(&lockres->l_lock, flags); +unqueue: + if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) { + lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); + } else + ocfs2_schedule_blocked_lock(osb, lockres); + + mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name, + requeue ? 
"yes" : "no"); + spin_unlock_irqrestore(&lockres->l_lock, flags); + + mlog_exit_void(); +} +/* Flag @lockres as queued and append it to the osb's blocked lock list unless it is already on a list. Caller holds l_lock. Nothing is queued for a lockres marked OCFS2_LOCK_FREEING. */ +static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) +{ + mlog_entry_void(); + + assert_spin_locked(&lockres->l_lock); + + if (lockres->l_flags & OCFS2_LOCK_FREEING) { + /* Do not schedule a lock for downconvert when it's on + * the way to destruction - any nodes wanting access + * to the resource will get it soon. */ + mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n", + lockres->l_name, lockres->l_flags); + return; + } + + lockres_or_flags(lockres, OCFS2_LOCK_QUEUED); + + spin_lock(&osb->vote_task_lock); + if (list_empty(&lockres->l_blocked_list)) { + list_add_tail(&lockres->l_blocked_list, + &osb->blocked_lock_list); + osb->blocked_lock_count++; + } + spin_unlock(&osb->vote_task_lock); + + mlog_exit_void(); +} + +/* This aids in debugging situations where a bad LVB might be involved. The LVB of @lockres is read as a struct ocfs2_meta_lvb and every field is logged at mask @level in CPU byte order; @function and @line identify the call site. */ +void ocfs2_dump_meta_lvb_info(u64 level, + const char *function, + unsigned int line, + struct ocfs2_lock_res *lockres) +{ + struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; + + mlog(level, "LVB information for %s (called from %s:%u):\n", + lockres->l_name, function, line); + mlog(level, "version: %u, clusters: %u\n", + be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters)); + mlog(level, "size: %"MLFu64", uid %u, gid %u, mode 0x%x\n", + be64_to_cpu(lvb->lvb_isize), be32_to_cpu(lvb->lvb_iuid), + be32_to_cpu(lvb->lvb_igid), be16_to_cpu(lvb->lvb_imode)); + mlog(level, "nlink %u, atime_packed 0x%"MLFx64", " + "ctime_packed 0x%"MLFx64", mtime_packed 0x%"MLFx64"\n", + be16_to_cpu(lvb->lvb_inlink), be64_to_cpu(lvb->lvb_iatime_packed), + be64_to_cpu(lvb->lvb_ictime_packed), + be64_to_cpu(lvb->lvb_imtime_packed)); +} diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h new file mode 100644 index 000000000000..8f2d1db2d9ea --- /dev/null +++ b/fs/ocfs2/dlmglue.h @@ -0,0 +1,111 @@ +/* -*- mode: c; 
c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * dlmglue.h + * + * Declarations for the OCFS2 DLM glue: cluster lock and unlock helpers, + * lock resource setup and teardown, and the metadata LVB layout. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef DLMGLUE_H +#define DLMGLUE_H + +#define OCFS2_LVB_VERSION 2 +/* Layout of the meta lock value block; every field is stored big-endian. */ +struct ocfs2_meta_lvb { + __be32 lvb_version; + __be32 lvb_iclusters; + __be32 lvb_iuid; + __be32 lvb_igid; + __be64 lvb_iatime_packed; + __be64 lvb_ictime_packed; + __be64 lvb_imtime_packed; + __be64 lvb_isize; + __be16 lvb_imode; + __be16 lvb_inlink; + __be32 lvb_reserved[3]; +}; + +/* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */ +/* don't wait on recovery. */ +#define OCFS2_META_LOCK_RECOVERY (0x01) +/* Instruct the dlm not to queue ourselves on the other node. 
*/ +#define OCFS2_META_LOCK_NOQUEUE (0x02) +/* don't block waiting for the vote thread, instead return -EAGAIN */ +#define OCFS2_LOCK_NONBLOCK (0x04) +/* DLM glue setup and teardown, lock resource lifetime, and the per-inode data, rw and meta locks */ +int ocfs2_dlm_init(struct ocfs2_super *osb); +void ocfs2_dlm_shutdown(struct ocfs2_super *osb); +void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); +void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, + enum ocfs2_lock_type type, + struct inode *inode); +void ocfs2_lock_res_free(struct ocfs2_lock_res *res); +int ocfs2_create_new_inode_locks(struct inode *inode); +int ocfs2_drop_inode_locks(struct inode *inode); +int ocfs2_data_lock_full(struct inode *inode, + int write, + int arg_flags); +#define ocfs2_data_lock(inode, write) ocfs2_data_lock_full(inode, write, 0) +int ocfs2_data_lock_with_page(struct inode *inode, + int write, + struct page *page); +void ocfs2_data_unlock(struct inode *inode, + int write); +int ocfs2_rw_lock(struct inode *inode, int write); +void ocfs2_rw_unlock(struct inode *inode, int write); +int ocfs2_meta_lock_full(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head **ret_bh, + int ex, + int arg_flags); +int ocfs2_meta_lock_with_page(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head **ret_bh, + int ex, + struct page *page); +/* 99% of the time we don't want to supply any additional flags -- + * those are for very specific cases only. 
*/ +#define ocfs2_meta_lock(i, h, b, e) ocfs2_meta_lock_full(i, h, b, e, 0) +void ocfs2_meta_unlock(struct inode *inode, + int ex); +int ocfs2_super_lock(struct ocfs2_super *osb, + int ex); +void ocfs2_super_unlock(struct ocfs2_super *osb, + int ex); +int ocfs2_rename_lock(struct ocfs2_super *osb); +void ocfs2_rename_unlock(struct ocfs2_super *osb); +void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); + +/* for the vote thread */ +void ocfs2_process_blocked_lock(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); + +struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); +void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); + +/* aids in debugging and tracking lvbs */ +void ocfs2_dump_meta_lvb_info(u64 level, + const char *function, + unsigned int line, + struct ocfs2_lock_res *lockres); +#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) + +#endif /* DLMGLUE_H */ diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h new file mode 100644 index 000000000000..f226b2207628 --- /dev/null +++ b/fs/ocfs2/endian.h @@ -0,0 +1,45 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * Copyright (C) 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_ENDIAN_H +#define OCFS2_ENDIAN_H + /* Add val (native CPU order) to the little-endian 16-bit value at *var; the sum is converted back to little-endian before it is stored. */ +static inline void le16_add_cpu(__le16 *var, u16 val) +{ + *var = cpu_to_le16(le16_to_cpu(*var) + val); +} + /* Add val (native CPU order) to the little-endian 32-bit value at *var, in place. */ +static inline void le32_add_cpu(__le32 *var, u32 val) +{ + *var = cpu_to_le32(le32_to_cpu(*var) + val); +} + /* Bitwise-AND the little-endian 32-bit value at *var with val (native CPU order), in place. */ +static inline void le32_and_cpu(__le32 *var, u32 val) +{ + *var = cpu_to_le32(le32_to_cpu(*var) & val); +} + /* Add val (native CPU order) to the big-endian 32-bit value at *var, in place. */ +static inline void be32_add_cpu(__be32 *var, u32 val) +{ + *var = cpu_to_be32(be32_to_cpu(*var) + val); +} + +#endif /* OCFS2_ENDIAN_H */ diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c new file mode 100644 index 000000000000..5810160d92a8 --- /dev/null +++ b/fs/ocfs2/export.c @@ -0,0 +1,248 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * export.c + * + * Functions to facilitate NFS exporting + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/types.h> + +#define MLOG_MASK_PREFIX ML_EXPORT +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "dir.h" +#include "dlmglue.h" +#include "export.h" +#include "inode.h" + +#include "buffer_head_io.h" + +struct ocfs2_inode_handle +{ + u64 ih_blkno; + u32 ih_generation; +}; + +static struct dentry *ocfs2_get_dentry(struct super_block *sb, void *vobjp) +{ + struct ocfs2_inode_handle *handle = vobjp; + struct inode *inode; + struct dentry *result; + + mlog_entry("(0x%p, 0x%p)\n", sb, handle); + + if (handle->ih_blkno == 0) { + mlog_errno(-ESTALE); + return ERR_PTR(-ESTALE); + } + + inode = ocfs2_iget(OCFS2_SB(sb), handle->ih_blkno); + + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + return (void *)inode; + } + + if (handle->ih_generation != inode->i_generation) { + iput(inode); + mlog_errno(-ESTALE); + return ERR_PTR(-ESTALE); + } + + result = d_alloc_anon(inode); + + if (!result) { + iput(inode); + mlog_errno(-ENOMEM); + return ERR_PTR(-ENOMEM); + } + + mlog_exit_ptr(result); + return result; +} + +static struct dentry *ocfs2_get_parent(struct dentry *child) +{ + int status; + u64 blkno; + struct dentry *parent; + struct inode *inode; + struct inode *dir = child->d_inode; + struct buffer_head *dirent_bh = NULL; + struct ocfs2_dir_entry *dirent; + + mlog_entry("(0x%p, '%.*s')\n", child, + child->d_name.len, child->d_name.name); + + mlog(0, "find parent of directory %"MLFu64"\n", + OCFS2_I(dir)->ip_blkno); + + status = ocfs2_meta_lock(dir, NULL, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + parent = ERR_PTR(status); + goto bail; + } + + status = ocfs2_find_files_on_disk("..", 2, &blkno, dir, &dirent_bh, + &dirent); + if (status < 0) { + parent = ERR_PTR(-ENOENT); + goto bail_unlock; + } + + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); + if (IS_ERR(inode)) { + mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno); + parent = ERR_PTR(-EACCES); + goto bail_unlock; + 
} + + parent = d_alloc_anon(inode); + if (!parent) { + iput(inode); + parent = ERR_PTR(-ENOMEM); + } + +bail_unlock: + ocfs2_meta_unlock(dir, 0); + + if (dirent_bh) + brelse(dirent_bh); + +bail: + mlog_exit_ptr(parent); + + return parent; +} + +static int ocfs2_encode_fh(struct dentry *dentry, __be32 *fh, int *max_len, + int connectable) +{ + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type = 1; + u64 blkno; + u32 generation; + + mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry, + dentry->d_name.len, dentry->d_name.name, + fh, len, connectable); + + if (len < 3 || (connectable && len < 6)) { + mlog(ML_ERROR, "fh buffer is too small for encoding\n"); + type = 255; + goto bail; + } + + blkno = OCFS2_I(inode)->ip_blkno; + generation = inode->i_generation; + + mlog(0, "Encoding fh: blkno: %"MLFu64", generation: %u\n", + blkno, generation); + + len = 3; + fh[0] = cpu_to_le32((u32)(blkno >> 32)); + fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff)); + fh[2] = cpu_to_le32(generation); + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + blkno = OCFS2_I(parent)->ip_blkno; + generation = parent->i_generation; + + fh[3] = cpu_to_le32((u32)(blkno >> 32)); + fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff)); + fh[5] = cpu_to_le32(generation); + + spin_unlock(&dentry->d_lock); + + len = 6; + type = 2; + + mlog(0, "Encoding parent: blkno: %"MLFu64", generation: %u\n", + blkno, generation); + } + + *max_len = len; + +bail: + mlog_exit(type); + return type; +} + +static struct dentry *ocfs2_decode_fh(struct super_block *sb, __be32 *fh, + int fh_len, int fileid_type, + int (*acceptable)(void *context, + struct dentry *de), + void *context) +{ + struct ocfs2_inode_handle handle, parent; + struct dentry *ret = NULL; + + mlog_entry("(0x%p, 0x%p, %d, %d, 0x%p, 0x%p)\n", + sb, fh, fh_len, fileid_type, acceptable, context); + + if (fh_len < 3 || fileid_type > 2) + goto bail; 
+ + if (fileid_type == 2) { + if (fh_len < 6) + goto bail; + + parent.ih_blkno = (u64)le32_to_cpu(fh[3]) << 32; + parent.ih_blkno |= (u64)le32_to_cpu(fh[4]); + parent.ih_generation = le32_to_cpu(fh[5]); + + mlog(0, "Decoding parent: blkno: %"MLFu64", generation: %u\n", + parent.ih_blkno, parent.ih_generation); + } + + handle.ih_blkno = (u64)le32_to_cpu(fh[0]) << 32; + handle.ih_blkno |= (u64)le32_to_cpu(fh[1]); + handle.ih_generation = le32_to_cpu(fh[2]); + + mlog(0, "Decoding fh: blkno: %"MLFu64", generation: %u\n", + handle.ih_blkno, handle.ih_generation); + + /* + * parent is only filled in above for fileid_type 2. For any other + * type hand find_exported_dentry() a NULL parent instead of a + * pointer to uninitialized stack data. + */ + ret = ocfs2_export_ops.find_exported_dentry(sb, &handle, + fileid_type == 2 ? &parent : NULL, + acceptable, context); + +bail: + mlog_exit_ptr(ret); + return ret; +} + +struct export_operations ocfs2_export_ops = { + .decode_fh = ocfs2_decode_fh, + .encode_fh = ocfs2_encode_fh, + + .get_parent = ocfs2_get_parent, + .get_dentry = ocfs2_get_dentry, +}; diff --git a/fs/ocfs2/export.h b/fs/ocfs2/export.h new file mode 100644 index 000000000000..5b77ee7866ef --- /dev/null +++ b/fs/ocfs2/export.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * export.h + * + * Function prototypes + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_EXPORT_H +#define OCFS2_EXPORT_H + +extern struct export_operations ocfs2_export_ops; + +#endif /* OCFS2_EXPORT_H */ diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c new file mode 100644 index 000000000000..f2fb40cd296a --- /dev/null +++ b/fs/ocfs2/extent_map.c @@ -0,0 +1,994 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.c + * + * In-memory extent map for OCFS2. Man, this code was prettier in + * the library. + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/rbtree.h> + +#define MLOG_MASK_PREFIX ML_EXTENT_MAP +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "extent_map.h" +#include "inode.h" +#include "super.h" + +#include "buffer_head_io.h" + + +/* + * SUCK SUCK SUCK + * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h + */ + +struct ocfs2_extent_map_entry { + struct rb_node e_node; + int e_tree_depth; + struct ocfs2_extent_rec e_rec; +}; + +struct ocfs2_em_insert_context { + int need_left; + int need_right; + struct ocfs2_extent_map_entry *new_ent; + struct ocfs2_extent_map_entry *old_ent; + struct ocfs2_extent_map_entry *left_ent; + struct ocfs2_extent_map_entry *right_ent; +}; + +static kmem_cache_t *ocfs2_em_ent_cachep = NULL; + + +static struct ocfs2_extent_map_entry * +ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + u32 cpos, u32 clusters, + struct rb_node ***ret_p, + struct rb_node **ret_parent); +static int ocfs2_extent_map_insert(struct inode *inode, + struct ocfs2_extent_rec *rec, + int tree_depth); +static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, + struct ocfs2_extent_map_entry *ent); +static int ocfs2_extent_map_find_leaf(struct inode *inode, + u32 cpos, u32 clusters, + struct ocfs2_extent_list *el); +static int ocfs2_extent_map_lookup_read(struct inode *inode, + u32 cpos, u32 clusters, + struct ocfs2_extent_map_entry **ret_ent); +static int ocfs2_extent_map_try_insert(struct inode *inode, + struct ocfs2_extent_rec *rec, + int tree_depth, + struct ocfs2_em_insert_context *ctxt); + +/* returns 1 only if the rec contains all the given clusters -- that is that + * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + + * clusters) is >= the argument's endpoint */ +static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, + u32 cpos, u32 clusters) +{ + if (le32_to_cpu(rec->e_cpos) > cpos) + return 
0; + if (cpos + clusters > le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)) + return 0; + return 1; +} + + +/* + * Find an entry in the tree that intersects the region passed in. + * Note that this will find straddled intervals, it is up to the + * callers to enforce any boundary conditions. + * + * Callers must hold ip_lock. This lookup is not guaranteed to return + * a tree_depth 0 match, and as such can race inserts if the lock + * were not held. + * + * The rb_node garbage lets insertion share the search. Trivial + * callers pass NULL. + */ +static struct ocfs2_extent_map_entry * +ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + u32 cpos, u32 clusters, + struct rb_node ***ret_p, + struct rb_node **ret_parent) +{ + struct rb_node **p = &em->em_extents.rb_node; + struct rb_node *parent = NULL; + struct ocfs2_extent_map_entry *ent = NULL; + + while (*p) + { + parent = *p; + ent = rb_entry(parent, struct ocfs2_extent_map_entry, + e_node); + if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { + p = &(*p)->rb_left; + ent = NULL; + } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + + le32_to_cpu(ent->e_rec.e_clusters))) { + p = &(*p)->rb_right; + ent = NULL; + } else + break; + } + + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; + return ent; +} + +/* + * Find the leaf containing the interval we want. While we're on our + * way down the tree, fill in every record we see at any depth, because + * we might want it later. + * + * Note that this code is run without ip_lock. That's because it + * sleeps while reading. If someone is also filling the extent list at + * the same time we are, we might have to restart. 
+ */ +static int ocfs2_extent_map_find_leaf(struct inode *inode, + u32 cpos, u32 clusters, + struct ocfs2_extent_list *el) +{ + int i, ret; + struct buffer_head *eb_bh = NULL; + u64 blkno; + u32 rec_end; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec; + + /* + * The bh data containing the el cannot change here, because + * we hold alloc_sem. So we can do this without other + * locks. + */ + while (el->l_tree_depth) + { + blkno = 0; + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + rec_end = (le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)); + /* Reject any record that ends beyond the inode's ip_clusters. */ + ret = -EBADR; + if (rec_end > OCFS2_I(inode)->ip_clusters) { + mlog_errno(ret); + goto out_free; + } + /* Record ends at or before cpos: insert it into the map at this depth (-EEXIST is tolerated) and keep scanning. */ + if (rec_end <= cpos) { + ret = ocfs2_extent_map_insert(inode, rec, + le16_to_cpu(el->l_tree_depth)); + if (ret && (ret != -EEXIST)) { + mlog_errno(ret); + goto out_free; + } + continue; + } + /* Record starts at or after the end of the requested interval: handled the same way. */ if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { + ret = ocfs2_extent_map_insert(inode, rec, + le16_to_cpu(el->l_tree_depth)); + if (ret && (ret != -EEXIST)) { + mlog_errno(ret); + goto out_free; + } + continue; + } + + /* + * We've found a record that matches our + * interval. We don't insert it because we're + * about to traverse it. + */ + + /* Check to see if we're straddling; a record that only partly covers the interval fails with -ESRCH */ + ret = -ESRCH; + if (!ocfs2_extent_rec_contains_clusters(rec, + cpos, + clusters)) { + mlog_errno(ret); + goto out_free; + } + + /* + * If we've already found a record, the el has + * two records covering the same interval. + * EEEK! 
+ */ + ret = -EBADR; + if (blkno) { + mlog_errno(ret); + goto out_free; + } + + blkno = le64_to_cpu(rec->e_blkno); + } + + /* + * We don't support holes, and we're still up + * in the branches, so we'd better have found someone + */ + ret = -EBADR; + if (!blkno) { + mlog_errno(ret); + goto out_free; + } + + if (eb_bh) { + brelse(eb_bh); + eb_bh = NULL; + } + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + blkno, &eb_bh, OCFS2_BH_CACHED, + inode); + if (ret) { + mlog_errno(ret); + goto out_free; + } + eb = (struct ocfs2_extent_block *)eb_bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EIO; + goto out_free; + } + el = &eb->h_list; + } + + if (el->l_tree_depth) + BUG(); + + for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; + ret = ocfs2_extent_map_insert(inode, rec, + le16_to_cpu(el->l_tree_depth)); + if (ret) { + mlog_errno(ret); + goto out_free; + } + } + + ret = 0; + +out_free: + if (eb_bh) + brelse(eb_bh); + + return ret; +} + +/* + * This lookup actually will read from disk. It has one invariant: + * It will never re-traverse blocks. This means that all inserts should + * be new regions or more granular regions (both allowed by insert). 
+ */ +/* + * On success, returns 0 and stores in *ret_ent a tree_depth 0 entry + * intersecting [cpos, cpos + clusters). Otherwise returns a negative + * errno: whatever ocfs2_read_block() or ocfs2_extent_map_find_leaf() + * reported, -EIO for an invalid extent block or dinode, or -ESRCH if + * the map still has no matching entry after the read. + * + * ip_lock is taken and dropped internally around each map lookup; it + * is never held across the block read. + */ +static int ocfs2_extent_map_lookup_read(struct inode *inode, + u32 cpos, + u32 clusters, + struct ocfs2_extent_map_entry **ret_ent) +{ + int ret; + u64 blkno; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + struct buffer_head *bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_dinode *di; + struct ocfs2_extent_list *el; + + spin_lock(&OCFS2_I(inode)->ip_lock); + ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); + if (ent) { + /* A leaf entry is already cached: nothing to read. */ + if (!ent->e_tree_depth) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + *ret_ent = ent; + return 0; + } + /* Interior entry: start the descent at the extent block it points to. */ + blkno = le64_to_cpu(ent->e_rec.e_blkno); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, + OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + if (bh) + brelse(bh); + return ret; + } + eb = (struct ocfs2_extent_block *)bh->b_data; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + brelse(bh); + return -EIO; + } + el = &eb->h_list; + } else { + /* Nothing cached for this range: start from the inode's own extent list. */ + spin_unlock(&OCFS2_I(inode)->ip_lock); + + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, &bh, + OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + if (bh) + brelse(bh); + return ret; + } + di = (struct ocfs2_dinode *)bh->b_data; + if (!OCFS2_IS_VALID_DINODE(di)) { + /* + * Report before brelse(): di points into bh's data, so + * it must not be used after the buffer is released. + * This matches the extent block branch above. + */ + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); + brelse(bh); + return -EIO; + } + el = &di->id2.i_list; + } + + ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); + brelse(bh); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* ocfs2_extent_map_lookup() must be called with ip_lock held. */ + spin_lock(&OCFS2_I(inode)->ip_lock); + ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); + spin_unlock(&OCFS2_I(inode)->ip_lock); + if (!ent) { + ret = -ESRCH; + mlog_errno(ret); + return ret; + } + + if (ent->e_tree_depth) + BUG(); /* FIXME: Make sure this isn't a corruption */ + + *ret_ent = ent; + + return 0; +} + +/* + * Callers must hold ip_lock. This can insert pieces of the tree, + * thus racing lookup if the lock weren't held. 
+ */ +static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, + struct ocfs2_extent_map_entry *ent) +{ + struct rb_node **p, *parent; + struct ocfs2_extent_map_entry *old_ent; + + old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), + le32_to_cpu(ent->e_rec.e_clusters), + &p, &parent); + if (old_ent) + return -EEXIST; + + rb_link_node(&ent->e_node, parent, p); + rb_insert_color(&ent->e_node, &em->em_extents); + + return 0; +} + + +/* + * Simple rule: on any return code other than -EAGAIN, anything left + * in the insert_context will be freed. + */ +static int ocfs2_extent_map_try_insert(struct inode *inode, + struct ocfs2_extent_rec *rec, + int tree_depth, + struct ocfs2_em_insert_context *ctxt) +{ + int ret; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *old_ent; + + ctxt->need_left = 0; + ctxt->need_right = 0; + ctxt->old_ent = NULL; + + spin_lock(&OCFS2_I(inode)->ip_lock); + ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); + if (!ret) { + ctxt->new_ent = NULL; + goto out_unlock; + } + + old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), + le32_to_cpu(rec->e_clusters), NULL, + NULL); + + if (!old_ent) + BUG(); + + ret = -EEXIST; + if (old_ent->e_tree_depth < tree_depth) + goto out_unlock; + + if (old_ent->e_tree_depth == tree_depth) { + if (!memcmp(rec, &old_ent->e_rec, + sizeof(struct ocfs2_extent_rec))) + ret = 0; + + /* FIXME: Should this be ESRCH/EBADR??? */ + goto out_unlock; + } + + /* + * We do it in this order specifically so that no actual tree + * changes occur until we have all the pieces we need. We + * don't want malloc failures to leave an inconsistent tree. + * Whenever we drop the lock, another process could be + * inserting. Also note that, if another process just beat us + * to an insert, we might not need the same pieces we needed + * the first go round. In the end, the pieces we need will + * be used, and the pieces we don't will be freed. 
+ */ + ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > + le32_to_cpu(old_ent->e_rec.e_cpos)); + ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + + le32_to_cpu(old_ent->e_rec.e_clusters)) > + (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); + ret = -EAGAIN; + if (ctxt->need_left) { + if (!ctxt->left_ent) + goto out_unlock; + *(ctxt->left_ent) = *old_ent; + ctxt->left_ent->e_rec.e_clusters = + cpu_to_le32(le32_to_cpu(rec->e_cpos) - + le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); + } + if (ctxt->need_right) { + if (!ctxt->right_ent) + goto out_unlock; + *(ctxt->right_ent) = *old_ent; + ctxt->right_ent->e_rec.e_cpos = + cpu_to_le32(le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)); + ctxt->right_ent->e_rec.e_clusters = + cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + + le32_to_cpu(old_ent->e_rec.e_clusters)) - + le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); + } + + rb_erase(&old_ent->e_node, &em->em_extents); + /* Now that he's erased, set him up for deletion */ + ctxt->old_ent = old_ent; + + if (ctxt->need_left) { + ret = ocfs2_extent_map_insert_entry(em, + ctxt->left_ent); + if (ret) + goto out_unlock; + ctxt->left_ent = NULL; + } + + if (ctxt->need_right) { + ret = ocfs2_extent_map_insert_entry(em, + ctxt->right_ent); + if (ret) + goto out_unlock; + ctxt->right_ent = NULL; + } + + ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); + + if (!ret) + ctxt->new_ent = NULL; + +out_unlock: + spin_unlock(&OCFS2_I(inode)->ip_lock); + + return ret; +} + + +static int ocfs2_extent_map_insert(struct inode *inode, + struct ocfs2_extent_rec *rec, + int tree_depth) +{ + int ret; + struct ocfs2_em_insert_context ctxt = {0, }; + + if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > + OCFS2_I(inode)->ip_map.em_clusters) { + ret = -EBADR; + mlog_errno(ret); + return ret; + } + + /* Zero e_clusters means a truncated tail record. 
It better be EOF */ + if (!rec->e_clusters) { + if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != + OCFS2_I(inode)->ip_map.em_clusters) { + ret = -EBADR; + mlog_errno(ret); + return ret; + } + + /* Ignore the truncated tail */ + return 0; + } + + ret = -ENOMEM; + ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, + GFP_KERNEL); + if (!ctxt.new_ent) { + mlog_errno(ret); + return ret; + } + + ctxt.new_ent->e_rec = *rec; + ctxt.new_ent->e_tree_depth = tree_depth; + + do { + ret = -ENOMEM; + if (ctxt.need_left && !ctxt.left_ent) { + ctxt.left_ent = + kmem_cache_alloc(ocfs2_em_ent_cachep, + GFP_KERNEL); + if (!ctxt.left_ent) + break; + } + if (ctxt.need_right && !ctxt.right_ent) { + ctxt.right_ent = + kmem_cache_alloc(ocfs2_em_ent_cachep, + GFP_KERNEL); + if (!ctxt.right_ent) + break; + } + + ret = ocfs2_extent_map_try_insert(inode, rec, + tree_depth, &ctxt); + } while (ret == -EAGAIN); + + if (ret < 0) + mlog_errno(ret); + + if (ctxt.left_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); + if (ctxt.right_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); + if (ctxt.old_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); + if (ctxt.new_ent) + kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); + + return ret; +} + +/* + * Append this record to the tail of the extent map. It must be + * tree_depth 0. The record might be an extension of an existing + * record, and as such that needs to be handled. eg: + * + * Existing record in the extent map: + * + * cpos = 10, len = 10 + * |---------| + * + * New Record: + * + * cpos = 10, len = 20 + * |------------------| + * + * The passed record is the new on-disk record. The new_clusters value + * is how many clusters were added to the file. If the append is a + * contiguous append, the new_clusters has been added to + * rec->e_clusters. If the append is an entirely new extent, then + * rec->e_clusters is == new_clusters. 
+ */ +int ocfs2_extent_map_append(struct inode *inode, + struct ocfs2_extent_rec *rec, + u32 new_clusters) +{ + int ret; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + struct ocfs2_extent_rec *old; + + BUG_ON(!new_clusters); + BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); + + if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { + /* + * Size changed underneath us on disk. Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters; + } + + mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)) != + (em->em_clusters + new_clusters), + "Inode %"MLFu64":\n" + "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" + "em->em_clusters = %u + new_clusters = %u = %u\n", + OCFS2_I(inode)->ip_blkno, + le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), + le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), + em->em_clusters, new_clusters, + em->em_clusters + new_clusters); + + em->em_clusters += new_clusters; + + ret = -ENOENT; + if (le32_to_cpu(rec->e_clusters) > new_clusters) { + /* This is a contiguous append */ + ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, + NULL, NULL); + if (ent) { + old = &ent->e_rec; + BUG_ON((le32_to_cpu(rec->e_cpos) + + le32_to_cpu(rec->e_clusters)) != + (le32_to_cpu(old->e_cpos) + + le32_to_cpu(old->e_clusters) + + new_clusters)); + if (ent->e_tree_depth == 0) { + BUG_ON(le32_to_cpu(old->e_cpos) != + le32_to_cpu(rec->e_cpos)); + BUG_ON(le64_to_cpu(old->e_blkno) != + le64_to_cpu(rec->e_blkno)); + ret = 0; + } + /* + * Let non-leafs fall through as -ENOENT to + * force insertion of the new leaf. 
+ */ + le32_add_cpu(&old->e_clusters, new_clusters); + } + } + + if (ret == -ENOENT) + ret = ocfs2_extent_map_insert(inode, rec, 0); + if (ret < 0) + mlog_errno(ret); + return ret; +} + +#if 0 +/* Code here is included but defined out as it completes the extent + * map api and may be used in the future. */ + +/* + * Look up the record containing this cluster offset. This record is + * part of the extent map. Do not free it. Any changes you make to + * it will reflect in the extent map. So, if your last extent + * is (cpos = 10, clusters = 10) and you truncate the file by 5 + * clusters, you can do: + * + * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); + * rec->e_clusters -= 5; + * + * The lookup does not read from disk. If the map isn't filled in for + * an entry, you won't find it. + * + * Also note that the returned record is valid until alloc_sem is + * dropped. After that, truncate and extend can happen. Caveat Emptor. + */ +int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, + struct ocfs2_extent_rec **rec, + int *tree_depth) +{ + int ret = -ENOENT; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + + *rec = NULL; + + if (cpos >= OCFS2_I(inode)->ip_clusters) + return -EINVAL; + + if (cpos >= em->em_clusters) { + /* + * Size changed underneath us on disk. 
Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters ; + } + + ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, + NULL, NULL); + + if (ent) { + *rec = &ent->e_rec; + if (tree_depth) + *tree_depth = ent->e_tree_depth; + ret = 0; + } + + return ret; +} + +int ocfs2_extent_map_get_clusters(struct inode *inode, + u32 v_cpos, int count, + u32 *p_cpos, int *ret_count) +{ + int ret; + u32 coff, ccount; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent = NULL; + + *p_cpos = ccount = 0; + + if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) + return -EINVAL; + + if ((v_cpos + count) > em->em_clusters) { + /* + * Size changed underneath us on disk. Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters; + } + + + ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); + if (ret) + return ret; + + if (ent) { + /* We should never find ourselves straddling an interval */ + if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, + v_cpos, + count)) + return -ESRCH; + + coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); + *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(ent->e_rec.e_blkno)) + + coff; + + if (ret_count) + *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; + + return 0; + } + + + return -ENOENT; +} + +#endif /* 0 */ + +int ocfs2_extent_map_get_blocks(struct inode *inode, + u64 v_blkno, int count, + u64 *p_blkno, int *ret_count) +{ + int ret; + u64 boff; + u32 cpos, clusters; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + struct ocfs2_extent_map_entry *ent = NULL; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_rec *rec; + + *p_blkno = 0; + + cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); + clusters = 
ocfs2_blocks_to_clusters(inode->i_sb, + (u64)count + bpc - 1); + if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { + ret = -EINVAL; + mlog_errno(ret); + return ret; + } + + if ((cpos + clusters) > em->em_clusters) { + /* + * Size changed underneath us on disk. Drop any + * straddling records and update our idea of + * i_clusters + */ + ocfs2_extent_map_drop(inode, em->em_clusters - 1); + em->em_clusters = OCFS2_I(inode)->ip_clusters; + } + + ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); + if (ret) { + mlog_errno(ret); + return ret; + } + + if (ent) + { + rec = &ent->e_rec; + + /* We should never find ourselves straddling an interval */ + if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { + ret = -ESRCH; + mlog_errno(ret); + return ret; + } + + boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - + le32_to_cpu(rec->e_cpos)); + boff += (v_blkno & (u64)(bpc - 1)); + *p_blkno = le64_to_cpu(rec->e_blkno) + boff; + + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(rec->e_clusters)) - boff; + } + + return 0; + } + + return -ENOENT; +} + +int ocfs2_extent_map_init(struct inode *inode) +{ + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + + em->em_extents = RB_ROOT; + em->em_clusters = 0; + + return 0; +} + +/* Needs the lock */ +static void __ocfs2_extent_map_drop(struct inode *inode, + u32 new_clusters, + struct rb_node **free_head, + struct ocfs2_extent_map_entry **tail_ent) +{ + struct rb_node *node, *next; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + + *free_head = NULL; + + ent = NULL; + node = rb_last(&em->em_extents); + while (node) + { + next = rb_prev(node); + + ent = rb_entry(node, struct ocfs2_extent_map_entry, + e_node); + if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) + break; + + rb_erase(&ent->e_node, &em->em_extents); + + node->rb_right = *free_head; + *free_head = node; + + ent = NULL; + node = next; + } + + /* Do we have 
an entry straddling new_clusters? */ + if (tail_ent) { + if (ent && + ((le32_to_cpu(ent->e_rec.e_cpos) + + le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) + *tail_ent = ent; + else + *tail_ent = NULL; + } +} + +static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) +{ + struct rb_node *node; + struct ocfs2_extent_map_entry *ent; + + while (free_head) { + node = free_head; + free_head = node->rb_right; + + ent = rb_entry(node, struct ocfs2_extent_map_entry, + e_node); + kmem_cache_free(ocfs2_em_ent_cachep, ent); + } +} + +/* + * Remove all entries past new_clusters, inclusive of an entry that + * contains new_clusters. This is effectively a cache forget. + * + * If you want to also clip the last extent by some number of clusters, + * you need to call ocfs2_extent_map_trunc(). + * This code does not check or modify ip_clusters. + */ +int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) +{ + struct rb_node *free_head = NULL; + struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + struct ocfs2_extent_map_entry *ent; + + spin_lock(&OCFS2_I(inode)->ip_lock); + + __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + + if (ent) { + rb_erase(&ent->e_node, &em->em_extents); + ent->e_node.rb_right = free_head; + free_head = &ent->e_node; + } + + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (free_head) + __ocfs2_extent_map_drop_cleanup(free_head); + + return 0; +} + +/* + * Remove all entries past new_clusters and also clip any extent + * straddling new_clusters, if there is one. 
This does not check + * or modify ip_clusters + */ +int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) +{ + struct rb_node *free_head = NULL; + struct ocfs2_extent_map_entry *ent = NULL; + + spin_lock(&OCFS2_I(inode)->ip_lock); + + __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + + if (ent) + ent->e_rec.e_clusters = cpu_to_le32(new_clusters - + le32_to_cpu(ent->e_rec.e_cpos)); + + OCFS2_I(inode)->ip_map.em_clusters = new_clusters; + + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (free_head) + __ocfs2_extent_map_drop_cleanup(free_head); + + return 0; +} + +int __init init_ocfs2_extent_maps(void) +{ + ocfs2_em_ent_cachep = + kmem_cache_create("ocfs2_em_ent", + sizeof(struct ocfs2_extent_map_entry), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ocfs2_em_ent_cachep) + return -ENOMEM; + + return 0; +} + +void __exit exit_ocfs2_extent_maps(void) +{ + kmem_cache_destroy(ocfs2_em_ent_cachep); +} diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h new file mode 100644 index 000000000000..fa3745efa886 --- /dev/null +++ b/fs/ocfs2/extent_map.h @@ -0,0 +1,46 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * extent_map.h + * + * In-memory file extent mappings for OCFS2. + * + * Copyright (C) 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef _EXTENT_MAP_H +#define _EXTENT_MAP_H + +int init_ocfs2_extent_maps(void); +void exit_ocfs2_extent_maps(void); + +/* + * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem + * to be held. The allocation cannot change at all while the map is + * in the process of being updated. + */ +int ocfs2_extent_map_init(struct inode *inode); +int ocfs2_extent_map_append(struct inode *inode, + struct ocfs2_extent_rec *rec, + u32 new_clusters); +int ocfs2_extent_map_get_blocks(struct inode *inode, + u64 v_blkno, int count, + u64 *p_blkno, int *ret_count); +int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); +int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); + +#endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c new file mode 100644 index 000000000000..eaf33caa0a1f --- /dev/null +++ b/fs/ocfs2/file.c @@ -0,0 +1,1238 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.c + * + * File open, close, extend, truncate + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uio.h> + +#define MLOG_MASK_PREFIX ML_INODE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "aops.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "sysfile.h" +#include "inode.h" +#include "journal.h" +#include "mmap.h" +#include "suballoc.h" +#include "super.h" + +#include "buffer_head_io.h" + +static int ocfs2_sync_inode(struct inode *inode) +{ + filemap_fdatawrite(inode->i_mapping); + return sync_mapping_buffers(inode->i_mapping); +} + +static int ocfs2_file_open(struct inode *inode, struct file *file) +{ + int status; + int mode = file->f_flags; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, + file->f_dentry->d_name.len, file->f_dentry->d_name.name); + + spin_lock(&oi->ip_lock); + + /* Check that the inode hasn't been wiped from disk by another + * node. If it hasn't then we're safe as long as we hold the + * spin lock until our increment of open count. 
*/ + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&oi->ip_lock); + + status = -ENOENT; + goto leave; + } + + if (mode & O_DIRECT) + oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; + + oi->ip_open_count++; + spin_unlock(&oi->ip_lock); + status = 0; +leave: + mlog_exit(status); + return status; +} + +static int ocfs2_file_release(struct inode *inode, struct file *file) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, + file->f_dentry->d_name.len, + file->f_dentry->d_name.name); + + spin_lock(&oi->ip_lock); + if (!--oi->ip_open_count) + oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; + spin_unlock(&oi->ip_lock); + + mlog_exit(0); + + return 0; +} + +static int ocfs2_sync_file(struct file *file, + struct dentry *dentry, + int datasync) +{ + int err = 0; + journal_t *journal; + struct inode *inode = dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, + dentry->d_name.len, dentry->d_name.name); + + err = ocfs2_sync_inode(dentry->d_inode); + if (err) + goto bail; + + journal = osb->journal->j_journal; + err = journal_force_commit(journal); + +bail: + mlog_exit(err); + + return (err < 0) ? 
-EIO : 0; +} + +int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + + mlog_entry_void(); + i_size_write(inode, new_i_size); + inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_journal_handle *handle = NULL; + + handle = ocfs2_start_trans(osb, NULL, + OCFS2_INODE_UPDATE_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_set_inode_size(handle, inode, di_bh, + new_i_size); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(handle); +out: + return ret; +} + +static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size) +{ + int status; + struct ocfs2_journal_handle *handle; + + mlog_entry_void(); + + /* TODO: This needs to actually orphan the inode in this + * transaction. 
*/ + + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); + if (status < 0) + mlog_errno(status); + + ocfs2_commit_trans(handle); +out: + mlog_exit(status); + return status; +} + +static int ocfs2_truncate_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int status = 0; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_truncate_context *tc = NULL; + + mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n", + OCFS2_I(inode)->ip_blkno, new_i_size); + + truncate_inode_pages(inode->i_mapping, new_i_size); + + fe = (struct ocfs2_dinode *) di_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto bail; + } + + mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), + "Inode %"MLFu64", inode i_size = %lld != di " + "i_size = %"MLFu64", i_flags = 0x%x\n", + OCFS2_I(inode)->ip_blkno, + i_size_read(inode), + le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags)); + + if (new_i_size > le64_to_cpu(fe->i_size)) { + mlog(0, "asked to truncate file with size (%"MLFu64") " + "to size (%"MLFu64")!\n", + le64_to_cpu(fe->i_size), new_i_size); + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n", + le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size); + + /* lets handle the simple truncate cases before doing any more + * cluster locking. */ + if (new_i_size == le64_to_cpu(fe->i_size)) + goto bail; + + if (le32_to_cpu(fe->i_clusters) == + ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { + mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", + fe->i_clusters); + /* No allocation change is required, so lets fast path + * this truncate. 
*/ + status = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (status < 0) + mlog_errno(status); + goto bail; + } + + /* This forces other nodes to sync and drop their pages */ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + + /* alright, we're going to need to do a full blown alloc size + * change. Orphan the inode so that recovery can complete the + * truncate if necessary. This does the task of marking + * i_size. */ + status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_commit_truncate(osb, inode, di_bh, tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* TODO: orphan dir cleanup here. */ +bail: + + mlog_exit(status); + return status; +} + +/* + * extend allocation only here. + * we'll update all the disk stuff, and oip->alloc_size + * + * expect stuff to be locked, a transaction started and enough data / + * metadata reservations in the contexts. + * + * Will return -EAGAIN, and a reason if a restart is needed. + * If passed in, *reason will always be set, even in error. 
+ */ +int ocfs2_do_extend_allocation(struct ocfs2_super *osb, + struct inode *inode, + u32 clusters_to_add, + struct buffer_head *fe_bh, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0; + int free_extents; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + + BUG_ON(!clusters_to_add); + + free_extents = ocfs2_num_free_extents(osb, inode, fe); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. */ + if (!free_extents && !meta_ac) { + mlog(0, "we haven't reserved any metadata!\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(fe))) { + mlog(0, "filesystem is really fragmented...\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = ocfs2_claim_clusters(osb, handle, data_ac, 1, + &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + /* reserve our write early -- insert_extent may update the inode */ + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n", + num_bits, bit_off, OCFS2_I(inode)->ip_blkno); + status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, + num_bits, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } 
+ + le32_add_cpu(&fe->i_clusters, num_bits); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= num_bits; + + if (clusters_to_add) { + mlog(0, "need to alloc once more, clusters = %u, wanted = " + "%u\n", fe->i_clusters, clusters_to_add); + status = -EAGAIN; + reason = RESTART_TRANS; + } + +leave: + mlog_exit(status); + if (reason_ret) + *reason_ret = reason; + return status; +} + +static int ocfs2_extend_allocation(struct inode *inode, + u32 clusters_to_add) +{ + int status = 0; + int restart_func = 0; + int drop_alloc_sem = 0; + int credits, num_free_extents; + u32 prev_clusters; + struct buffer_head *bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + enum ocfs2_alloc_restarted why; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, + OCFS2_BH_CACHED, inode); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); + status = -EIO; + goto leave; + } + +restart_all: + BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); + + mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, " + "clusters_to_add = %u\n", + OCFS2_I(inode)->ip_blkno, i_size_read(inode), + fe->i_clusters, clusters_to_add); + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + num_free_extents = ocfs2_num_free_extents(osb, + inode, + fe); + if (num_free_extents < 0) { + status = num_free_extents; 
+ mlog_errno(status); + goto leave; + } + /* ocfs2_num_free_extents() reported none, so reserve metadata as well */ + if (!num_free_extents) { + status = ocfs2_reserve_new_metadata(osb, + handle, + fe, + &meta_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + } + + status = ocfs2_reserve_clusters(osb, + handle, + clusters_to_add, + &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + /* blocks people in read/write from reading our allocation + * until we're done changing it. We depend on i_mutex to block + * other extend/truncate calls while we're here. Ordering wrt + * start_trans is important here -- always do it before! */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + drop_alloc_sem = 1; + + credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; /* so the leave: path does not commit an error pointer */ + mlog_errno(status); + goto leave; + } + +restarted_transaction: + /* reserve a write to the file entry early on - that way if we + * run out of credits in the allocation path, we can still + * update i_size.
*/ + status = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + /* snapshot ip_clusters so the clusters added by this pass can be measured */ + prev_clusters = OCFS2_I(inode)->ip_clusters; + + status = ocfs2_do_extend_allocation(osb, + inode, + clusters_to_add, + bh, + handle, + data_ac, + meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + /* account for what ocfs2_do_extend_allocation() actually added */ + spin_lock(&OCFS2_I(inode)->ip_lock); + clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + if (why != RESTART_NONE && clusters_to_add) { + if (why == RESTART_META) { + mlog(0, "restarting function.\n"); + restart_func = 1; + } else { + BUG_ON(why != RESTART_TRANS); + + mlog(0, "restarting transaction.\n"); + /* TODO: This can be more intelligent. */ + credits = ocfs2_calc_extend_credits(osb->sb, + fe, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + /* handle still has to be committed at + * this point.
*/ + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + goto restarted_transaction; + } + } + + mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n", + fe->i_clusters, fe->i_size); + mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", + OCFS2_I(inode)->ip_clusters, i_size_read(inode)); + +leave: + if (drop_alloc_sem) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); + drop_alloc_sem = 0; + } + if (handle) { + ocfs2_commit_trans(handle); + handle = NULL; + } + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if ((!status) && restart_func) { + restart_func = 0; + goto restart_all; + } + if (bh) { + brelse(bh); + bh = NULL; + } + + mlog_exit(status); + return status; +} + +/* Some parts of this taken from generic_cont_expand, which turned out + * to be too fragile to do exactly what we need without us having to + * worry about recursive locking in ->commit_write(). */ +static int ocfs2_write_zero_page(struct inode *inode, + u64 size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long index; + unsigned int offset; + struct ocfs2_journal_handle *handle = NULL; + int ret; + + offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ + /* ugh. in prepare/commit_write, if from==to==start of block, we + ** skip the prepare. 
make sure we never send an offset for the start + ** of a block + */ + if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { + offset++; + } + index = size >> PAGE_CACHE_SHIFT; + + page = grab_cache_page(mapping, index); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_prepare_write(NULL, page, offset, offset); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + if (ocfs2_should_order_data(inode)) { + handle = ocfs2_start_walk_page_trans(inode, page, offset, + offset); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out_unlock; + } + } + + /* must not update i_size! */ + ret = block_commit_write(page, offset, offset); + if (ret < 0) + mlog_errno(ret); + else + ret = 0; + + if (handle) + ocfs2_commit_trans(handle); +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +static int ocfs2_zero_extend(struct inode *inode, + u64 zero_to_size) +{ + int ret = 0; + u64 start_off; + struct super_block *sb = inode->i_sb; + + start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); + while (start_off < zero_to_size) { + ret = ocfs2_write_zero_page(inode, start_off); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + start_off += sb->s_blocksize; + } + +out: + return ret; +} + +static int ocfs2_extend_file(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) +{ + int ret = 0; + u32 clusters_to_add; + + /* setattr sometimes calls us like this. 
*/ + if (new_i_size == 0) + goto out; + + if (i_size_read(inode) == new_i_size) + goto out; + BUG_ON(new_i_size < i_size_read(inode)); + + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - + OCFS2_I(inode)->ip_clusters; + + if (clusters_to_add) { + ret = ocfs2_extend_allocation(inode, clusters_to_add); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_zero_extend(inode, new_i_size); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + /* No allocation required, we just use this helper to + * do a trivial update of i_size. */ + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + return ret; +} + +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + int status = 0, size_change; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct buffer_head *bh = NULL; + struct ocfs2_journal_handle *handle = NULL; + + mlog_entry("(0x%p, '%.*s')\n", dentry, + dentry->d_name.len, dentry->d_name.name); + + if (attr->ia_valid & ATTR_MODE) + mlog(0, "mode change: %d\n", attr->ia_mode); + if (attr->ia_valid & ATTR_UID) + mlog(0, "uid change: %d\n", attr->ia_uid); + if (attr->ia_valid & ATTR_GID) + mlog(0, "gid change: %d\n", attr->ia_gid); + if (attr->ia_valid & ATTR_SIZE) + mlog(0, "size change...\n"); + if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) + mlog(0, "time change...\n"); + +#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ + | ATTR_GID | ATTR_UID | ATTR_MODE) + if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { + mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); + return 0; + } + + status = inode_change_ok(inode, attr); + if (status) + return status; + + size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; + if (size_change) { + status = ocfs2_rw_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + 
} + + status = ocfs2_meta_lock(inode, NULL, &bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail_unlock_rw; + } + + if (size_change && attr->ia_size != i_size_read(inode)) { + if (i_size_read(inode) > attr->ia_size) + status = ocfs2_truncate_file(inode, bh, attr->ia_size); + else + status = ocfs2_extend_file(inode, bh, attr->ia_size); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + status = -ENOSPC; + goto bail_unlock; + } + } + + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + status = inode_setattr(inode, attr); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + status = ocfs2_mark_inode_dirty(handle, inode, bh); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(handle); +bail_unlock: + ocfs2_meta_unlock(inode, 1); +bail_unlock_rw: + if (size_change) + ocfs2_rw_unlock(inode, 1); +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +int ocfs2_getattr(struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct super_block *sb = dentry->d_inode->i_sb; + struct ocfs2_super *osb = sb->s_fs_info; + int err; + + mlog_entry_void(); + + err = ocfs2_inode_revalidate(dentry); + if (err) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + generic_fillattr(inode, stat); + + /* We set the blksize from the cluster size for performance */ + stat->blksize = osb->s_clustersize; + +bail: + mlog_exit(err); + + return err; +} + +static int ocfs2_write_remove_suid(struct inode *inode) +{ + int ret; + struct buffer_head *bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_journal_handle *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; + + mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno, + inode->i_mode); 
+ + handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); + if (ret < 0) { + mlog_errno(ret); + goto out_trans; + } + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_bh; + } + + inode->i_mode &= ~S_ISUID; + if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) + inode->i_mode &= ~S_ISGID; + + di = (struct ocfs2_dinode *) bh->b_data; + di->i_mode = cpu_to_le16(inode->i_mode); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret < 0) + mlog_errno(ret); +out_bh: + brelse(bh); +out_trans: + ocfs2_commit_trans(handle); +out: + mlog_exit(ret); + return ret; +} + +static inline int ocfs2_write_should_remove_suid(struct inode *inode) +{ + mode_t mode = inode->i_mode; + + if (!capable(CAP_FSETID)) { + if (unlikely(mode & S_ISUID)) + return 1; + + if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) + return 1; + } + return 0; +} + +static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, + const char __user *buf, + size_t count, + loff_t pos) +{ + struct iovec local_iov = { .iov_base = (void __user *)buf, + .iov_len = count }; + int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; + u32 clusters; + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_dentry->d_inode; + loff_t newsize, saved_pos; +#ifdef OCFS2_ORACORE_WORKAROUNDS + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); +#endif + + mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, + (unsigned int)count, + filp->f_dentry->d_name.len, + filp->f_dentry->d_name.name); + + /* happy write of zero bytes */ + if (count == 0) + return 0; + + if (!inode) { + mlog(0, "bad inode\n"); + return -EIO; + } + +#ifdef OCFS2_ORACORE_WORKAROUNDS + /* ugh, work around some applications which open everything O_DIRECT + + * O_APPEND and really don't mean to use O_DIRECT. 
*/ + if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && + (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) + filp->f_flags &= ~O_DIRECT; +#endif + + mutex_lock(&inode->i_mutex); + /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ + if (filp->f_flags & O_DIRECT) { + have_alloc_sem = 1; + down_read(&inode->i_alloc_sem); + } + + /* concurrent O_DIRECT writes are allowed, so O_DIRECT takes the rw + * lock at level 0 and everything else takes it at level 1 */ + rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; + ret = ocfs2_rw_lock(inode, rw_level); + if (ret < 0) { + rw_level = -1; + mlog_errno(ret); + goto out; + } + + /* + * We sample i_size under a read level meta lock to see if our write + * is extending the file, if it is we back off and get a write level + * meta lock. + */ + meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; + for(;;) { + ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); + if (ret < 0) { + meta_level = -1; + mlog_errno(ret); + goto out; + } + + /* Clear suid / sgid if necessary. We do this here + * instead of later in the write path because + * remove_suid() calls ->setattr without any hint that + * we may have already done our cluster locking. Since + * ocfs2_setattr() *must* take cluster locks to + * proceed, this will lead us to recursively lock the + * inode. There's also the dinode i_size state which + * can be lost via setattr during extending writes (we + * set inode->i_size at the end of a write). */ + if (ocfs2_write_should_remove_suid(inode)) { + if (meta_level == 0) { + ocfs2_meta_unlock(inode, meta_level); + meta_level = 1; + continue; + } + /* only reached with the meta lock held at level 1 */ + ret = ocfs2_write_remove_suid(inode); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + /* work on a copy of ppos until we're sure that we won't have + * to recalculate it due to relocking.
*/ + if (filp->f_flags & O_APPEND) { + saved_pos = i_size_read(inode); + mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); + } else { + saved_pos = iocb->ki_pos; + } + newsize = count + saved_pos; + + mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", + saved_pos, newsize, i_size_read(inode)); + + /* No need for a higher level metadata lock if we're + * never going past i_size. */ + if (newsize <= i_size_read(inode)) + break; + + if (meta_level == 0) { + ocfs2_meta_unlock(inode, meta_level); + meta_level = 1; + continue; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - + OCFS2_I(inode)->ip_clusters; + spin_unlock(&OCFS2_I(inode)->ip_lock); + + mlog(0, "Writing at EOF, may need more allocation: " + "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", + i_size_read(inode), newsize, clusters); + + /* We only want to continue the rest of this loop if + * our extend will actually require more + * allocation. */ + if (!clusters) + break; + + ret = ocfs2_extend_allocation(inode, clusters); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + /* Fill any holes which would've been created by this + * write. If we're O_APPEND, this will wind up + * (correctly) being a noop. 
*/ + ret = ocfs2_zero_extend(inode, (u64) newsize - count); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + break; + } + + /* ok, we're done with i_size and alloc work */ + iocb->ki_pos = saved_pos; + ocfs2_meta_unlock(inode, meta_level); + meta_level = -1; + + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb); + +#ifdef OCFS2_ORACORE_WORKAROUNDS + if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && + filp->f_flags & O_DIRECT) { + unsigned int saved_flags = filp->f_flags; + int sector_size = 1 << osb->s_sectsize_bits; + + if ((saved_pos & (sector_size - 1)) || + (count & (sector_size - 1)) || + ((unsigned long)buf & (sector_size - 1))) { + filp->f_flags |= O_SYNC; + filp->f_flags &= ~O_DIRECT; + } + + ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, + &iocb->ki_pos); + + filp->f_flags = saved_flags; + } else +#endif + ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, + &iocb->ki_pos); + + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + + /* + * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io + * function pointer which is called when o_direct io completes so that + * it can unlock our rw lock. (it's the clustered equivalent of + * i_alloc_sem; protects truncate from racing with pending ios). + * Unfortunately there are error cases which call end_io and others + * that don't. so we don't have to unlock the rw_lock if either an + * async dio is going to do it in the future or an end_io after an + * error has already done it. 
+ */ + if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { + rw_level = -1; + have_alloc_sem = 0; + } + +out: + if (meta_level != -1) + ocfs2_meta_unlock(inode, meta_level); + if (have_alloc_sem) + up_read(&inode->i_alloc_sem); + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + mutex_unlock(&inode->i_mutex); + + mlog_exit(ret); + return ret; +} + +static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, + char __user *buf, + size_t count, + loff_t pos) +{ + int ret = 0, rw_level = -1, have_alloc_sem = 0; + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_dentry->d_inode; +#ifdef OCFS2_ORACORE_WORKAROUNDS + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); +#endif + + mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, + (unsigned int)count, + filp->f_dentry->d_name.len, + filp->f_dentry->d_name.name); + + if (!inode) { + ret = -EINVAL; + mlog_errno(ret); + goto bail; + } + +#ifdef OCFS2_ORACORE_WORKAROUNDS + if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) { + if (filp->f_flags & O_DIRECT) { + int sector_size = 1 << osb->s_sectsize_bits; + + if ((pos & (sector_size - 1)) || + (count & (sector_size - 1)) || + ((unsigned long)buf & (sector_size - 1)) || + (i_size_read(inode) & (sector_size -1))) { + filp->f_flags &= ~O_DIRECT; + } + } + } +#endif + + /* + * buffered reads protect themselves in ->readpage(). O_DIRECT reads + * need locks to protect pending reads from racing with truncate. 
+ */ + if (filp->f_flags & O_DIRECT) { + down_read(&inode->i_alloc_sem); + have_alloc_sem = 1; + + ret = ocfs2_rw_lock(inode, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + rw_level = 0; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb); + } + + ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); + if (ret == -EINVAL) + mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); + + /* buffered aio wouldn't have proper lock coverage today */ + BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + + /* see ocfs2_file_aio_write */ + if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { + rw_level = -1; + have_alloc_sem = 0; + } + +bail: + if (have_alloc_sem) + up_read(&inode->i_alloc_sem); + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + mlog_exit(ret); + + return ret; +} + +struct inode_operations ocfs2_file_iops = { + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, +}; + +struct inode_operations ocfs2_special_file_iops = { + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, +}; + +struct file_operations ocfs2_fops = { + .read = do_sync_read, + .write = do_sync_write, + .sendfile = generic_file_sendfile, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, +}; + +struct file_operations ocfs2_dops = { + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, +}; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h new file mode 100644 index 000000000000..a5ea33b24060 --- /dev/null +++ b/fs/ocfs2/file.h @@ -0,0 +1,57 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * file.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_FILE_H +#define OCFS2_FILE_H + +extern struct file_operations ocfs2_fops; +extern struct file_operations ocfs2_dops; +extern struct inode_operations ocfs2_file_iops; +extern struct inode_operations ocfs2_special_file_iops; +struct ocfs2_alloc_context; + +enum ocfs2_alloc_restarted { + RESTART_NONE = 0, + RESTART_TRANS, + RESTART_META +}; +int ocfs2_do_extend_allocation(struct ocfs2_super *osb, + struct inode *inode, + u32 clusters_to_add, + struct buffer_head *fe_bh, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason); +int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); +int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u64 new_i_size); + +#endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c new file mode 100644 index 000000000000..0bbd22f46c80 --- /dev/null +++ b/fs/ocfs2/heartbeat.c @@ -0,0 +1,378 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.c + * + * Register ourselves with 
the heartbeat service, keep our node maps + * up to date, and fire off recovery when needed. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/kmod.h> + +#include <cluster/heartbeat.h> +#include <cluster/nodemanager.h> + +#include <dlm/dlmapi.h> + +#define MLOG_MASK_PREFIX ML_SUPER +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "vote.h" + +#include "buffer_head_io.h" + +#define OCFS2_HB_NODE_DOWN_PRI (0x0000002) +#define OCFS2_HB_NODE_UP_PRI OCFS2_HB_NODE_DOWN_PRI + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit); +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit); +static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map); +static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, + struct ocfs2_node_map *from); +static void __ocfs2_node_map_set(struct ocfs2_node_map *target, + struct ocfs2_node_map *from); + +void ocfs2_init_node_maps(struct ocfs2_super *osb) +{ + spin_lock_init(&osb->node_map_lock); + 
ocfs2_node_map_init(&osb->mounted_map); + ocfs2_node_map_init(&osb->recovery_map); + ocfs2_node_map_init(&osb->umount_map); +} + +static void ocfs2_do_node_down(int node_num, + struct ocfs2_super *osb) +{ + BUG_ON(osb->node_num == node_num); + + mlog(0, "ocfs2: node down event for %d\n", node_num); + + if (!osb->dlm) { + /* + * No DLM means we're not even ready to participate yet. + * We check the slots after the DLM comes up, so we will + * notice the node death then. We can safely ignore it + * here. + */ + return; + } + + if (ocfs2_node_map_test_bit(osb, &osb->umount_map, node_num)) { + /* If a node is in the umount map, then we've been + * expecting him to go down and we know ahead of time + * that recovery is not necessary. */ + ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); + return; + } + + ocfs2_recovery_thread(osb, node_num); + + ocfs2_remove_node_from_vote_queues(osb, node_num); +} + +static void ocfs2_hb_node_down_cb(struct o2nm_node *node, + int node_num, + void *data) +{ + ocfs2_do_node_down(node_num, (struct ocfs2_super *) data); +} + +/* Called from the dlm when it's about to evict a node. We may also + * get a heartbeat callback later. 
*/ +static void ocfs2_dlm_eviction_cb(int node_num, + void *data) +{ + struct ocfs2_super *osb = (struct ocfs2_super *) data; + struct super_block *sb = osb->sb; + + mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n", + MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num); + + ocfs2_do_node_down(node_num, osb); +} + +static void ocfs2_hb_node_up_cb(struct o2nm_node *node, + int node_num, + void *data) +{ + struct ocfs2_super *osb = data; + + BUG_ON(osb->node_num == node_num); + + mlog(0, "node up event for %d\n", node_num); + ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); +} + +void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) +{ + o2hb_setup_callback(&osb->osb_hb_down, O2HB_NODE_DOWN_CB, + ocfs2_hb_node_down_cb, osb, + OCFS2_HB_NODE_DOWN_PRI); + + o2hb_setup_callback(&osb->osb_hb_up, O2HB_NODE_UP_CB, + ocfs2_hb_node_up_cb, osb, OCFS2_HB_NODE_UP_PRI); + + /* Not exactly a heartbeat callback, but leads to essentially + * the same path so we set it up here. */ + dlm_setup_eviction_cb(&osb->osb_eviction_cb, + ocfs2_dlm_eviction_cb, + osb); +} + +/* Most functions here are just stubs for now... */ +int ocfs2_register_hb_callbacks(struct ocfs2_super *osb) +{ + int status; + + status = o2hb_register_callback(&osb->osb_hb_down); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = o2hb_register_callback(&osb->osb_hb_up); + if (status < 0) + mlog_errno(status); + +bail: + return status; +} + +void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb) +{ + int status; + + status = o2hb_unregister_callback(&osb->osb_hb_down); + if (status < 0) + mlog_errno(status); + + status = o2hb_unregister_callback(&osb->osb_hb_up); + if (status < 0) + mlog_errno(status); +} + +void ocfs2_stop_heartbeat(struct ocfs2_super *osb) +{ + int ret; + char *argv[5], *envp[3]; + + if (!osb->uuid_str) { + /* This can happen if we don't get far enough in mount... 
*/ + mlog(0, "No UUID with which to stop heartbeat!\n\n"); + return; + } + + argv[0] = (char *)o2nm_get_hb_ctl_path(); + argv[1] = "-K"; + argv[2] = "-u"; + argv[3] = osb->uuid_str; + argv[4] = NULL; + + mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); + + /* minimal command environment taken from cpu_run_sbin_hotplug */ + envp[0] = "HOME=/"; + envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[2] = NULL; + + ret = call_usermodehelper(argv[0], argv, envp, 1); + if (ret < 0) + mlog_errno(ret); +} + +/* special case -1 for now + * TODO: should *really* make sure the calling func never passes -1!! */ +void ocfs2_node_map_init(struct ocfs2_node_map *map) +{ + map->num_nodes = OCFS2_NODE_MAP_MAX_NODES; + memset(map->map, 0, BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES) * + sizeof(unsigned long)); +} + +static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, + int bit) +{ + set_bit(bit, map->map); +} + +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_set_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, + int bit) +{ + clear_bit(bit, map->map); +} + +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + if (bit==-1) + return; + BUG_ON(bit >= map->num_nodes); + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_clear_bit(map, bit); + spin_unlock(&osb->node_map_lock); +} + +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit) +{ + int ret; + if (bit >= map->num_nodes) { + mlog(ML_ERROR, "bit=%d map->num_nodes=%d\n", bit, map->num_nodes); + BUG(); + } + spin_lock(&osb->node_map_lock); + ret = test_bit(bit, map->map); + spin_unlock(&osb->node_map_lock); + return ret; +} + +static inline int 
__ocfs2_node_map_is_empty(struct ocfs2_node_map *map) +{ + int bit; + bit = find_next_bit(map->map, map->num_nodes, 0); + if (bit < map->num_nodes) + return 0; + return 1; +} + +int ocfs2_node_map_is_empty(struct ocfs2_super *osb, + struct ocfs2_node_map *map) +{ + int ret; + BUG_ON(map->num_nodes == 0); + spin_lock(&osb->node_map_lock); + ret = __ocfs2_node_map_is_empty(map); + spin_unlock(&osb->node_map_lock); + return ret; +} + +static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, + struct ocfs2_node_map *from) +{ + BUG_ON(from->num_nodes == 0); + ocfs2_node_map_init(target); + __ocfs2_node_map_set(target, from); +} + +/* returns 1 if bit is the only bit set in target, 0 otherwise */ +int ocfs2_node_map_is_only(struct ocfs2_super *osb, + struct ocfs2_node_map *target, + int bit) +{ + struct ocfs2_node_map temp; + int ret; + + spin_lock(&osb->node_map_lock); + __ocfs2_node_map_dup(&temp, target); + __ocfs2_node_map_clear_bit(&temp, bit); + ret = __ocfs2_node_map_is_empty(&temp); + spin_unlock(&osb->node_map_lock); + + return ret; +} + +static void __ocfs2_node_map_set(struct ocfs2_node_map *target, + struct ocfs2_node_map *from) +{ + int num_longs, i; + + BUG_ON(target->num_nodes != from->num_nodes); + BUG_ON(target->num_nodes == 0); + + num_longs = BITS_TO_LONGS(target->num_nodes); + for (i = 0; i < num_longs; i++) + target->map[i] = from->map[i]; +} + +/* Returns whether the recovery bit was actually set - it may not be + * if a node is still marked as needing recovery */ +int ocfs2_recovery_map_set(struct ocfs2_super *osb, + int num) +{ + int set = 0; + + spin_lock(&osb->node_map_lock); + + __ocfs2_node_map_clear_bit(&osb->mounted_map, num); + + if (!test_bit(num, osb->recovery_map.map)) { + __ocfs2_node_map_set_bit(&osb->recovery_map, num); + set = 1; + } + + spin_unlock(&osb->node_map_lock); + + return set; +} + +void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + int num) +{ + ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num); +} + +int 
ocfs2_node_map_iterate(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int idx) +{ + int i = idx; + + idx = O2NM_INVALID_NODE_NUM; + spin_lock(&osb->node_map_lock); + if ((i != O2NM_INVALID_NODE_NUM) && + (i >= 0) && + (i < map->num_nodes)) { + while(i < map->num_nodes) { + if (test_bit(i, map->map)) { + idx = i; + break; + } + i++; + } + } + spin_unlock(&osb->node_map_lock); + return idx; +} diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h new file mode 100644 index 000000000000..e8fb079122e4 --- /dev/null +++ b/fs/ocfs2/heartbeat.h @@ -0,0 +1,67 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * heartbeat.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_HEARTBEAT_H +#define OCFS2_HEARTBEAT_H + +void ocfs2_init_node_maps(struct ocfs2_super *osb); + +void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); +int ocfs2_register_hb_callbacks(struct ocfs2_super *osb); +void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb); +void ocfs2_stop_heartbeat(struct ocfs2_super *osb); + +/* node map functions - used to keep track of mounted and in-recovery + * nodes. 
*/ +void ocfs2_node_map_init(struct ocfs2_node_map *map); +int ocfs2_node_map_is_empty(struct ocfs2_super *osb, + struct ocfs2_node_map *map); +void ocfs2_node_map_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +int ocfs2_node_map_test_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int bit); +int ocfs2_node_map_iterate(struct ocfs2_super *osb, + struct ocfs2_node_map *map, + int idx); +static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb, + struct ocfs2_node_map *map) +{ + return ocfs2_node_map_iterate(osb, map, 0); +} +int ocfs2_recovery_map_set(struct ocfs2_super *osb, + int num); +void ocfs2_recovery_map_clear(struct ocfs2_super *osb, + int num); +/* returns 1 if bit is the only bit set in target, 0 otherwise */ +int ocfs2_node_map_is_only(struct ocfs2_super *osb, + struct ocfs2_node_map *target, + int bit); + +#endif /* OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c new file mode 100644 index 000000000000..d4ecc0627716 --- /dev/null +++ b/fs/ocfs2/inode.c @@ -0,0 +1,1140 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.c + * + * vfs' aops, fops, dops and iops + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/smp_lock.h> + +#include <asm/byteorder.h> + +#define MLOG_MASK_PREFIX ML_INODE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "super.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "vote.h" + +#include "buffer_head_io.h" + +#define OCFS2_FI_FLAG_NOWAIT 0x1 +#define OCFS2_FI_FLAG_DELETE 0x2 +struct ocfs2_find_inode_args +{ + u64 fi_blkno; + unsigned long fi_ino; + unsigned int fi_flags; +}; + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args); +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); +static int ocfs2_find_actor(struct inode *inode, void *opaque); +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh); + +struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, + u64 blkno, + int delete_vote) +{ + struct ocfs2_find_inode_args args; + + /* ocfs2_ilookup_for_vote should *only* be called from the + * vote thread */ + BUG_ON(current != osb->vote_task); + + args.fi_blkno = blkno; + args.fi_flags = OCFS2_FI_FLAG_NOWAIT; + if (delete_vote) + args.fi_flags |= OCFS2_FI_FLAG_DELETE; + args.fi_ino = ino_from_blkno(osb->sb, blkno); + return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); +} + +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) +{ + struct inode *inode = NULL; + struct super_block *sb = osb->sb; + struct 
ocfs2_find_inode_args args; + + mlog_entry("(blkno = %"MLFu64")\n", blkno); + + /* Ok. By now we've either got the offsets passed to us by the + * caller, or we just pulled them off the bh. Lets do some + * sanity checks to make sure they're OK. */ + if (blkno == 0) { + inode = ERR_PTR(-EINVAL); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + + args.fi_blkno = blkno; + args.fi_flags = 0; + args.fi_ino = ino_from_blkno(sb, blkno); + + inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, + ocfs2_init_locked_inode, &args); + /* inode was *not* in the inode cache. 2.6.x requires + * us to do our own read_inode call and unlock it + * afterwards. */ + if (inode && inode->i_state & I_NEW) { + mlog(0, "Inode was not in inode cache, reading it.\n"); + ocfs2_read_locked_inode(inode, &args); + unlock_new_inode(inode); + } + if (inode == NULL) { + inode = ERR_PTR(-ENOMEM); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + if (is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); + mlog_errno(PTR_ERR(inode)); + goto bail; + } + +bail: + if (!IS_ERR(inode)) { + mlog(0, "returning inode with number %"MLFu64"\n", + OCFS2_I(inode)->ip_blkno); + mlog_exit_ptr(inode); + } else + mlog_errno(PTR_ERR(inode)); + + return inode; +} + + +/* + * here's how inodes get read from disk: + * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR + * found? : return the in-memory inode + * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE + */ + +static int ocfs2_find_actor(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque); + + args = opaque; + + mlog_bug_on_msg(!inode, "No inode in find actor!\n"); + + if (oi->ip_blkno != args->fi_blkno) + goto bail; + + /* OCFS2_FI_FLAG_NOWAIT is *only* set from + * ocfs2_ilookup_for_vote which won't create an inode for one + * that isn't found. 
The vote thread doesn't want to get + * an inode which is in the process of going away - otherwise + * the call to __wait_on_freeing_inode in find_inode_fast will + * cause it to deadlock on an inode which may be waiting on a + * vote (or lock release) in delete_inode */ + if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) && + (inode->i_state & (I_FREEING|I_CLEAR))) { + /* As stated above, we're not going to return an + * inode. In the case of a delete vote, the voting + * code is going to signal the other node to go + * ahead. Mark that state here, so this freeing inode + * has the state when it gets to delete_inode. */ + if (args->fi_flags & OCFS2_FI_FLAG_DELETE) { + spin_lock(&oi->ip_lock); + ocfs2_mark_inode_remotely_deleted(inode); + spin_unlock(&oi->ip_lock); + } + goto bail; + } + + ret = 1; +bail: + mlog_exit(ret); + return ret; +} + +/* + * initialize the new inode, but don't do anything that would cause + * us to sleep. + * return 0 on success, 1 on failure + */ +static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) +{ + struct ocfs2_find_inode_args *args = opaque; + + mlog_entry("inode = %p, opaque = %p\n", inode, opaque); + + inode->i_ino = args->fi_ino; + OCFS2_I(inode)->ip_blkno = args->fi_blkno; + + mlog_exit(0); + return 0; +} + +int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino) +{ + struct super_block *sb; + struct ocfs2_super *osb; + int status = -EINVAL; + + mlog_entry("(0x%p, size:%"MLFu64")\n", inode, fe->i_size); + + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + /* this means that read_inode cannot create a superblock inode + * today. change if needed. 
*/ + if (!OCFS2_IS_VALID_DINODE(fe) || + !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%"MLFu64", " + "signature = %.*s, flags = 0x%x\n", + inode->i_ino, le64_to_cpu(fe->i_blkno), 7, + fe->i_signature, le32_to_cpu(fe->i_flags)); + goto bail; + } + + if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) { + mlog(ML_ERROR, "file entry generation does not match " + "superblock! osb->fs_generation=%x, " + "fe->i_fs_generation=%x\n", + osb->fs_generation, le32_to_cpu(fe->i_fs_generation)); + goto bail; + } + + inode->i_version = 1; + inode->i_generation = le32_to_cpu(fe->i_generation); + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + inode->i_mode = le16_to_cpu(fe->i_mode); + inode->i_uid = le32_to_cpu(fe->i_uid); + inode->i_gid = le32_to_cpu(fe->i_gid); + inode->i_blksize = (u32)osb->s_clustersize; + + /* Fast symlinks will have i_size but no allocated clusters. */ + if (S_ISLNK(inode->i_mode) && !fe->i_clusters) + inode->i_blocks = 0; + else + inode->i_blocks = + ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size)); + inode->i_mapping->a_ops = &ocfs2_aops; + inode->i_flags |= S_NOATIME; + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) + mlog(ML_ERROR, + "ip_blkno %"MLFu64" != i_blkno %"MLFu64"!\n", + OCFS2_I(inode)->ip_blkno, fe->i_blkno); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; + + if (create_ino) + inode->i_ino = ino_from_blkno(inode->i_sb, + le64_to_cpu(fe->i_blkno)); + + mlog(0, "blkno = %"MLFu64", ino = %lu, create_ino = %s\n", + fe->i_blkno, inode->i_ino, 
create_ino ? "true" : "false"); + + inode->i_nlink = le16_to_cpu(fe->i_links_count); + + if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); + } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { + mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); + /* we can't actually hit this as read_inode can't + * handle superblocks today ;-) */ + BUG(); + } + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_fop = &ocfs2_fops; + inode->i_op = &ocfs2_file_iops; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + case S_IFDIR: + inode->i_op = &ocfs2_dir_iops; + inode->i_fop = &ocfs2_dops; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + case S_IFLNK: + if (ocfs2_inode_is_fast_symlink(inode)) + inode->i_op = &ocfs2_fast_symlink_inode_operations; + else + inode->i_op = &ocfs2_symlink_inode_operations; + i_size_write(inode, le64_to_cpu(fe->i_size)); + break; + default: + inode->i_op = &ocfs2_special_file_iops; + init_special_inode(inode, inode->i_mode, + inode->i_rdev); + break; + } + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, + OCFS2_LOCK_TYPE_RW, inode); + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, + OCFS2_LOCK_TYPE_META, inode); + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, + OCFS2_LOCK_TYPE_DATA, inode); + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_read_locked_inode(struct inode *inode, + struct ocfs2_find_inode_args *args) +{ + struct super_block *sb; + struct ocfs2_super *osb; + struct ocfs2_dinode *fe; + struct buffer_head *bh = NULL; + int status; + int sysfile = 0; + + mlog_entry("(0x%p, 0x%p)\n", inode, args); + + status = -EINVAL; + if (inode == NULL || inode->i_sb == NULL) { + mlog(ML_ERROR, "bad inode\n"); + goto bail; 
+ } + sb = inode->i_sb; + osb = OCFS2_SB(sb); + + if (!args) { + mlog(ML_ERROR, "bad inode args\n"); + make_bad_inode(inode); + goto bail; + } + + /* Read the FE off disk. This is safe because the kernel only + * does one read_inode2 for a new inode, and if it doesn't + * exist yet then nobody can be working on it! */ + status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL); + if (status < 0) { + mlog_errno(status); + make_bad_inode(inode); + goto bail; + } + + fe = (struct ocfs2_dinode *) bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + mlog(ML_ERROR, "Invalid dinode #%"MLFu64": signature = %.*s\n", + fe->i_blkno, 7, fe->i_signature); + make_bad_inode(inode); + goto bail; + } + + if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) + sysfile = 1; + + if (S_ISCHR(le16_to_cpu(fe->i_mode)) || + S_ISBLK(le16_to_cpu(fe->i_mode))) + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + + status = -EINVAL; + if (ocfs2_populate_inode(inode, fe, 0) < 0) { + mlog(ML_ERROR, "populate inode failed! 
i_blkno=%"MLFu64", " + "i_ino=%lu\n", fe->i_blkno, inode->i_ino); + make_bad_inode(inode); + goto bail; + } + + BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); + + if (sysfile) + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; + + status = 0; + +bail: + if (args && bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +void ocfs2_sync_blockdev(struct super_block *sb) +{ + sync_blockdev(sb->s_bdev); +} + +static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, + struct inode *inode, + struct buffer_head *fe_bh) +{ + int status = 0; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_truncate_context *tc = NULL; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + /* zero allocation, zero truncate :) */ + if (!fe->i_clusters) + goto bail; + + handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_commit_trans(handle); + handle = NULL; + + status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); + if (status < 0) { + mlog_errno(status); + goto bail; + } +bail: + if (handle) + ocfs2_commit_trans(handle); + + mlog_exit(status); + return status; +} + +static int ocfs2_remove_inode(struct inode *inode, + struct buffer_head *di_bh, + struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh) +{ + int status; + struct inode *inode_alloc_inode = NULL; + struct buffer_head *inode_alloc_bh = NULL; + struct ocfs2_journal_handle *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + + inode_alloc_inode = + ocfs2_get_system_file_inode(osb, 
INODE_ALLOC_SYSTEM_INODE, + le16_to_cpu(di->i_suballoc_slot)); + if (!inode_alloc_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + mutex_lock(&inode_alloc_inode->i_mutex); + status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1); + if (status < 0) { + mutex_unlock(&inode_alloc_inode->i_mutex); + + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + /* set the inodes dtime */ + status = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); + + status = ocfs2_journal_dirty(handle, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } + + ocfs2_remove_from_cache(inode, di_bh); + + status = ocfs2_free_dinode(handle, inode_alloc_inode, + inode_alloc_bh, di); + if (status < 0) + mlog_errno(status); + +bail_commit: + ocfs2_commit_trans(handle); +bail_unlock: + ocfs2_meta_unlock(inode_alloc_inode, 1); + mutex_unlock(&inode_alloc_inode->i_mutex); + brelse(inode_alloc_bh); +bail: + iput(inode_alloc_inode); + + return status; +} + +static int ocfs2_wipe_inode(struct inode *inode, + struct buffer_head *di_bh) +{ + int status, orphaned_slot; + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* We've already voted on this so it should be readonly - no + * spinlock needed. 
*/ + orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + orphaned_slot); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + /* Lock the orphan dir. The lock will be held for the entire + * delete_inode operation. We do this now to avoid races with + * recovery completion on other nodes. */ + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + + mlog_errno(status); + goto bail; + } + + /* we do this while holding the orphan dir lock because we + * don't want recovery being run from another node to vote for + * an inode delete on us -- this will result in two nodes + * truncating the same file! */ + status = ocfs2_truncate_for_delete(osb, inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, + orphan_dir_bh); + if (status < 0) + mlog_errno(status); + +bail_unlock_dir: + ocfs2_meta_unlock(orphan_dir_inode, 1); + mutex_unlock(&orphan_dir_inode->i_mutex); + brelse(orphan_dir_bh); +bail: + iput(orphan_dir_inode); + + return status; +} + +/* There is a series of simple checks that should be done before a + * vote is even considered. Encapsulate those in this function. */ +static int ocfs2_inode_is_valid_to_delete(struct inode *inode) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + /* We shouldn't be getting here for the root directory + * inode.. */ + if (inode == osb->root_inode) { + mlog(ML_ERROR, "Skipping delete of root inode.\n"); + goto bail; + } + + /* If we're coming from process_vote we can't go into our own + * voting [hello, deadlock city!], so unforuntately we just + * have to skip deleting this guy. 
That's OK though because + * the node who's doing the actual deleting should handle it + * anyway. */ + if (current == osb->vote_task) { + mlog(0, "Skipping delete of %lu because we're currently " + "in process_vote\n", inode->i_ino); + goto bail; + } + + spin_lock(&oi->ip_lock); + /* OCFS2 *never* deletes system files. This should technically + * never get here as system file inodes should always have a + * positive link count. */ + if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + mlog(ML_ERROR, "Skipping delete of system file %"MLFu64".\n", + oi->ip_blkno); + goto bail_unlock; + } + + /* If we have voted "yes" on the wipe of this inode for + * another node, it will be marked here so we can safely skip + * it. Recovery will cleanup any inodes we might inadvertantly + * skip here. */ + if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { + mlog(0, "Skipping delete of %lu because another node " + "has done this for us.\n", inode->i_ino); + goto bail_unlock; + } + + ret = 1; +bail_unlock: + spin_unlock(&oi->ip_lock); +bail: + return ret; +} + +/* Query the cluster to determine whether we should wipe an inode from + * disk or not. + * + * Requires the inode to have the cluster lock. */ +static int ocfs2_query_inode_wipe(struct inode *inode, + struct buffer_head *di_bh, + int *wipe) +{ + int status = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di; + + *wipe = 0; + + /* While we were waiting for the cluster lock in + * ocfs2_delete_inode, another node might have asked to delete + * the inode. Recheck our flags to catch this. */ + if (!ocfs2_inode_is_valid_to_delete(inode)) { + mlog(0, "Skipping delete of %"MLFu64" because flags changed\n", + oi->ip_blkno); + goto bail; + } + + /* Now that we have an up to date inode, we can double check + * the link count. */ + if (inode->i_nlink) { + mlog(0, "Skipping delete of %"MLFu64" because nlink = %u\n", + oi->ip_blkno, inode->i_nlink); + goto bail; + } + + /* Do some basic inode verification... 
*/ + di = (struct ocfs2_dinode *) di_bh->b_data; + if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { + /* for lack of a better error? */ + status = -EEXIST; + mlog(ML_ERROR, + "Inode %"MLFu64" (on-disk %"MLFu64") not orphaned! " + "Disk flags 0x%x, inode flags 0x%x\n", + oi->ip_blkno, di->i_blkno, di->i_flags, oi->ip_flags); + goto bail; + } + + /* has someone already deleted us?! baaad... */ + if (di->i_dtime) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } + + status = ocfs2_request_delete_vote(inode); + /* -EBUSY means that other nodes are still using the + * inode. We're done here though, so avoid doing anything on + * disk and let them worry about deleting it. */ + if (status == -EBUSY) { + status = 0; + mlog(0, "Skipping delete of %"MLFu64" because it is in use on" + "other nodes\n", oi->ip_blkno); + goto bail; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + + spin_lock(&oi->ip_lock); + if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { + /* Nobody knew which slot this inode was orphaned + * into. This may happen during node death and + * recovery knows how to clean it up so we can safely + * ignore this inode for now on. */ + mlog(0, "Nobody knew where inode %"MLFu64" was orphaned!\n", + oi->ip_blkno); + } else { + *wipe = 1; + + mlog(0, "Inode %"MLFu64" is ok to wipe from orphan dir %d\n", + oi->ip_blkno, oi->ip_orphaned_slot); + } + spin_unlock(&oi->ip_lock); + +bail: + return status; +} + +/* Support function for ocfs2_delete_inode. Will help us keep the + * inode data in a consistent state for clear_inode. Always truncates + * pages, optionally sync's them first. 
*/ +static void ocfs2_cleanup_delete_inode(struct inode *inode, + int sync_data) +{ + mlog(0, "Cleanup inode %"MLFu64", sync = %d\n", + OCFS2_I(inode)->ip_blkno, sync_data); + if (sync_data) + write_inode_now(inode, 1); + truncate_inode_pages(&inode->i_data, 0); +} + +void ocfs2_delete_inode(struct inode *inode) +{ + int wipe, status; + sigset_t blocked, oldset; + struct buffer_head *di_bh = NULL; + + mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); + + if (is_bad_inode(inode)) { + mlog(0, "Skipping delete of bad inode\n"); + goto bail; + } + + if (!ocfs2_inode_is_valid_to_delete(inode)) { + /* It's probably not necessary to truncate_inode_pages + * here but we do it for safety anyway (it will most + * likely be a no-op anyway) */ + ocfs2_cleanup_delete_inode(inode, 0); + goto bail; + } + + /* We want to block signals in delete_inode as the lock and + * messaging paths may return us -ERESTARTSYS. Which would + * cause us to exit early, resulting in inodes being orphaned + * forever. */ + sigfillset(&blocked); + status = sigprocmask(SIG_BLOCK, &blocked, &oldset); + if (status < 0) { + mlog_errno(status); + ocfs2_cleanup_delete_inode(inode, 1); + goto bail; + } + + /* Lock down the inode. This gives us an up to date view of + * it's metadata (for verification), and allows us to + * serialize delete_inode votes. + * + * Even though we might be doing a truncate, we don't take the + * allocation lock here as it won't be needed - nobody will + * have the file open. + */ + status = ocfs2_meta_lock(inode, NULL, &di_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unblock; + } + + /* Query the cluster. This will be the final decision made + * before we go ahead and wipe the inode. */ + status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); + if (!wipe || status < 0) { + /* Error and inode busy vote both mean we won't be + * removing the inode, so they take almost the same + * path. 
*/ + if (status < 0) + mlog_errno(status); + + /* Someone in the cluster has voted to not wipe this + * inode, or it was never completely orphaned. Write + * out the pages and exit now. */ + ocfs2_cleanup_delete_inode(inode, 1); + goto bail_unlock_inode; + } + + ocfs2_cleanup_delete_inode(inode, 0); + + status = ocfs2_wipe_inode(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } + + /* Mark the inode as successfully deleted. This is important + * for ocfs2_clear_inode as it will check this flag and skip + * any checkpointing work */ + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; + +bail_unlock_inode: + ocfs2_meta_unlock(inode, 1); + brelse(di_bh); +bail_unblock: + status = sigprocmask(SIG_SETMASK, &oldset, NULL); + if (status < 0) + mlog_errno(status); +bail: + clear_inode(inode); + mlog_exit_void(); +} + +void ocfs2_clear_inode(struct inode *inode) +{ + int status; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry_void(); + + if (!inode) + goto bail; + + mlog(0, "Clearing inode: %"MLFu64", nlink = %u\n", + OCFS2_I(inode)->ip_blkno, inode->i_nlink); + + mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, + "Inode=%lu\n", inode->i_ino); + + /* Do these before all the other work so that we don't bounce + * the vote thread while waiting to destroy the locks. */ + ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); + + /* We very well may get a clear_inode before all an inodes + * metadata has hit disk. Of course, we can't drop any cluster + * locks until the journal has finished with it. The only + * exception here are successfully wiped inodes - their + * metadata can now be considered to be part of the system + * inodes from which it came. 
*/ + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) + ocfs2_checkpoint_inode(inode); + + mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), + "Clear inode of %"MLFu64", inode has io markers\n", + oi->ip_blkno); + + ocfs2_extent_map_drop(inode, 0); + ocfs2_extent_map_init(inode); + + status = ocfs2_drop_inode_locks(inode); + if (status < 0) + mlog_errno(status); + + ocfs2_lock_res_free(&oi->ip_rw_lockres); + ocfs2_lock_res_free(&oi->ip_meta_lockres); + ocfs2_lock_res_free(&oi->ip_data_lockres); + + ocfs2_metadata_cache_purge(inode); + + mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, + "Clear inode of %"MLFu64", inode has %u cache items\n", + oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); + + mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), + "Clear inode of %"MLFu64", inode has a bad flag\n", + oi->ip_blkno); + + mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), + "Clear inode of %"MLFu64", inode is locked\n", + oi->ip_blkno); + + mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), + "Clear inode of %"MLFu64", io_sem is locked\n", + oi->ip_blkno); + up(&oi->ip_io_sem); + + /* + * down_trylock() returns 0, down_write_trylock() returns 1 + * kernel 1, world 0 + */ + mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), + "Clear inode of %"MLFu64", alloc_sem is locked\n", + oi->ip_blkno); + up_write(&oi->ip_alloc_sem); + + mlog_bug_on_msg(oi->ip_open_count, + "Clear inode of %"MLFu64" has open count %d\n", + oi->ip_blkno, oi->ip_open_count); + mlog_bug_on_msg(!list_empty(&oi->ip_handle_list), + "Clear inode of %"MLFu64" has non empty handle list\n", + oi->ip_blkno); + mlog_bug_on_msg(oi->ip_handle, + "Clear inode of %"MLFu64" has non empty handle pointer\n", + oi->ip_blkno); + + /* Clear all other flags. 
*/ + oi->ip_flags = OCFS2_INODE_CACHE_INLINE; + oi->ip_created_trans = 0; + oi->ip_last_trans = 0; + oi->ip_dir_start_lookup = 0; + oi->ip_blkno = 0ULL; + +bail: + mlog_exit_void(); +} + +/* Called under inode_lock, with no more references on the + * struct inode, so it's safe here to check the flags field + * and to manipulate i_nlink without any other locks. */ +void ocfs2_drop_inode(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog_entry_void(); + + mlog(0, "Drop inode %"MLFu64", nlink = %u, ip_flags = 0x%x\n", + oi->ip_blkno, inode->i_nlink, oi->ip_flags); + + /* Testing ip_orphaned_slot here wouldn't work because we may + * not have gotten a delete_inode vote from any other nodes + * yet. */ + if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) { + mlog(0, "Inode was orphaned on another node, clearing nlink.\n"); + inode->i_nlink = 0; + } + + generic_drop_inode(inode); + + mlog_exit_void(); +} + +/* + * TODO: this should probably be merged into ocfs2_get_block + * + * However, you now need to pay attention to the cont_prepare_write() + * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much + * expects never to extend). 
+ */ +struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada) +{ + struct buffer_head *bh = NULL; + int tmperr; + u64 p_blkno; + int readflags = OCFS2_BH_CACHED; + +#if 0 + /* only turn this on if we know we can deal with read_block + * returning nothing */ + if (reada) + readflags |= OCFS2_BH_READAHEAD; +#endif + + if (((u64)block << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!reada); + return NULL; + } + + tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, + &p_blkno, NULL); + if (tmperr < 0) { + mlog_errno(tmperr); + goto fail; + } + + tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh, + readflags, inode); + if (tmperr < 0) + goto fail; + + tmperr = 0; + + *err = 0; + return bh; + +fail: + if (bh) { + brelse(bh); + bh = NULL; + } + *err = -EIO; + return NULL; +} + +/* + * This is called from our getattr. + */ +int ocfs2_inode_revalidate(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int status = 0; + + mlog_entry("(inode = 0x%p, ino = %"MLFu64")\n", inode, + inode ? OCFS2_I(inode)->ip_blkno : 0ULL); + + if (!inode) { + mlog(0, "eep, no inode!\n"); + status = -ENOENT; + goto bail; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { + spin_unlock(&OCFS2_I(inode)->ip_lock); + mlog(0, "inode deleted!\n"); + status = -ENOENT; + goto bail; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* Let ocfs2_meta_lock do the work of updating our struct + * inode for us. */ + status = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + ocfs2_meta_unlock(inode, 0); +bail: + mlog_exit(status); + + return status; +} + +/* + * Updates a disk inode from a + * struct inode. + * Only takes ip_lock. 
+ */ +int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *bh) +{ + int status; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + + mlog_entry("(inode %"MLFu64")\n", OCFS2_I(inode)->ip_blkno); + + status = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + spin_lock(&OCFS2_I(inode)->ip_lock); + fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + fe->i_size = cpu_to_le64(i_size_read(inode)); + fe->i_links_count = cpu_to_le16(inode->i_nlink); + fe->i_uid = cpu_to_le32(inode->i_uid); + fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_mode = cpu_to_le16(inode->i_mode); + fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); + fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); + fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) + mlog_errno(status); + + status = 0; +leave: + + mlog_exit(status); + return status; +} + +/* + * + * Updates a struct inode from a disk inode. + * does no i/o, only takes ip_lock. 
+ */ +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + spin_lock(&OCFS2_I(inode)->ip_lock); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + i_size_write(inode, le64_to_cpu(fe->i_size)); + inode->i_nlink = le16_to_cpu(fe->i_links_count); + inode->i_uid = le32_to_cpu(fe->i_uid); + inode->i_gid = le32_to_cpu(fe->i_gid); + inode->i_mode = le16_to_cpu(fe->i_mode); + inode->i_blksize = (u32) osb->s_clustersize; + if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) + inode->i_blocks = 0; + else + inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); + inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); + inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); + inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); + inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + + spin_unlock(&OCFS2_I(inode)->ip_lock); +} diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h new file mode 100644 index 000000000000..9b0177433653 --- /dev/null +++ b/fs/ocfs2/inode.h @@ -0,0 +1,145 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * inode.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_INODE_H +#define OCFS2_INODE_H + +/* OCFS2 Inode Private Data */ +struct ocfs2_inode_info +{ + u64 ip_blkno; + + struct ocfs2_lock_res ip_rw_lockres; + struct ocfs2_lock_res ip_meta_lockres; + struct ocfs2_lock_res ip_data_lockres; + + /* protects allocation changes on this inode. */ + struct rw_semaphore ip_alloc_sem; + + /* These fields are protected by ip_lock */ + spinlock_t ip_lock; + u32 ip_open_count; + u32 ip_clusters; + struct ocfs2_extent_map ip_map; + struct list_head ip_io_markers; + int ip_orphaned_slot; + + struct semaphore ip_io_sem; + + /* Used by the journalling code to attach an inode to a + * handle. These are protected by ip_io_sem in order to lock + * out other I/O to the inode until we either commit or + * abort. */ + struct list_head ip_handle_list; + struct ocfs2_journal_handle *ip_handle; + + u32 ip_flags; /* see below */ + + /* protected by recovery_lock. */ + struct inode *ip_next_orphan; + + u32 ip_dir_start_lookup; + + /* next two are protected by trans_inc_lock */ + /* which transaction were we created on? Zero if none. */ + unsigned long ip_created_trans; + /* last transaction we were a part of. */ + unsigned long ip_last_trans; + + struct ocfs2_caching_info ip_metadata_cache; + + struct inode vfs_inode; +}; + +/* + * Flags for the ip_flags field + */ +/* System file inodes */ +#define OCFS2_INODE_SYSTEM_FILE 0x00000001 +#define OCFS2_INODE_JOURNAL 0x00000002 +#define OCFS2_INODE_BITMAP 0x00000004 +/* This inode has been wiped from disk */ +#define OCFS2_INODE_DELETED 0x00000008 +/* Another node is deleting, so our delete is a nop */ +#define OCFS2_INODE_SKIP_DELETE 0x00000010 +/* Has the inode been orphaned on another node? 
+ * + * This hints to ocfs2_drop_inode that it should clear i_nlink before + * continuing. + * + * We *only* set this on unlink vote from another node. If the inode + * was locally orphaned, then we're sure of the state and don't need + * to twiddle i_nlink later - it's either zero or not depending on + * whether our unlink succeeded. Otherwise we got this from a node + * whose intention was to orphan the inode, however he may have + * crashed, failed etc, so we let ocfs2_drop_inode zero the value and + * rely on ocfs2_delete_inode to sort things out under the proper + * cluster locks. + */ +#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 +/* Does someone have the file open O_DIRECT */ +#define OCFS2_INODE_OPEN_DIRECT 0x00000040 +/* Indicates that the metadata cache should be used as an array. */ +#define OCFS2_INODE_CACHE_INLINE 0x00000080 + +static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) +{ + return container_of(inode, struct ocfs2_inode_info, vfs_inode); +} + +#define INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags & OCFS2_INODE_JOURNAL) +#define SET_INODE_JOURNAL(i) (OCFS2_I(i)->ip_flags |= OCFS2_INODE_JOURNAL) + +extern kmem_cache_t *ocfs2_inode_cache; + +extern struct address_space_operations ocfs2_aops; + +struct buffer_head *ocfs2_bread(struct inode *inode, int block, + int *err, int reada); +void ocfs2_clear_inode(struct inode *inode); +void ocfs2_delete_inode(struct inode *inode); +void ocfs2_drop_inode(struct inode *inode); +struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff); +struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, + u64 blkno, + int delete_vote); +int ocfs2_inode_init_private(struct inode *inode); +int ocfs2_inode_revalidate(struct dentry *dentry); +int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino); +void ocfs2_read_inode(struct inode *inode); +void ocfs2_read_inode2(struct inode *inode, void *opaque); +ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, + size_t 
size, loff_t *offp); +void ocfs2_sync_blockdev(struct super_block *sb); +void ocfs2_refresh_inode(struct inode *inode, + struct ocfs2_dinode *fe); +int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *bh); +int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); +int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); + +#endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c new file mode 100644 index 000000000000..303c8d96457f --- /dev/null +++ b/fs/ocfs2/journal.c @@ -0,0 +1,1652 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.c + * + * Defines functions of journalling api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/kthread.h> + +#define MLOG_MASK_PREFIX ML_JOURNAL +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "namei.h" +#include "slot_map.h" +#include "super.h" +#include "vote.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED; + +static int ocfs2_force_read_journal(struct inode *inode); +static int ocfs2_recover_node(struct ocfs2_super *osb, + int node_num); +static int __ocfs2_recovery_thread(void *arg); +static int ocfs2_commit_cache(struct ocfs2_super *osb); +static int ocfs2_wait_on_mount(struct ocfs2_super *osb); +static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal, + struct ocfs2_journal_handle *handle); +static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle); +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty); +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num); +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot); +static int ocfs2_commit_thread(void *arg); + +static int ocfs2_commit_cache(struct ocfs2_super *osb) +{ + int status = 0; + unsigned int flushed; + unsigned long old_id; + struct ocfs2_journal *journal = NULL; + + mlog_entry_void(); + + journal = osb->journal; + + /* Flush all pending commits and checkpoint the journal. 
*/ + down_write(&journal->j_trans_barrier); + + if (atomic_read(&journal->j_num_trans) == 0) { + up_write(&journal->j_trans_barrier); + mlog(0, "No transactions for me to flush!\n"); + goto finally; + } + + journal_lock_updates(journal->j_journal); + status = journal_flush(journal->j_journal); + journal_unlock_updates(journal->j_journal); + if (status < 0) { + up_write(&journal->j_trans_barrier); + mlog_errno(status); + goto finally; + } + + old_id = ocfs2_inc_trans_id(journal); + + flushed = atomic_read(&journal->j_num_trans); + atomic_set(&journal->j_num_trans, 0); + up_write(&journal->j_trans_barrier); + + mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", + journal->j_trans_id, flushed); + + ocfs2_kick_vote_thread(osb); + wake_up(&journal->j_checkpointed); +finally: + mlog_exit(status); + return status; +} + +struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb) +{ + struct ocfs2_journal_handle *retval = NULL; + + retval = kcalloc(1, sizeof(*retval), GFP_KERNEL); + if (!retval) { + mlog(ML_ERROR, "Failed to allocate memory for journal " + "handle!\n"); + return NULL; + } + + retval->max_buffs = 0; + retval->num_locks = 0; + retval->k_handle = NULL; + + INIT_LIST_HEAD(&retval->locks); + INIT_LIST_HEAD(&retval->inode_list); + retval->journal = osb->journal; + + return retval; +} + +/* pass it NULL and it will allocate a new handle object for you. If + * you pass it a handle however, it may still return error, in which + * case it has free'd the passed handle for you. 
*/ +struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + int max_buffs) +{ + int ret; + journal_t *journal = osb->journal->j_journal; + + mlog_entry("(max_buffs = %d)\n", max_buffs); + + if (!osb || !osb->journal->j_journal) + BUG(); + + if (ocfs2_is_hard_readonly(osb)) { + ret = -EROFS; + goto done_free; + } + + BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); + BUG_ON(max_buffs <= 0); + + /* JBD might support this, but our journalling code doesn't yet. */ + if (journal_current_handle()) { + mlog(ML_ERROR, "Recursive transaction attempted!\n"); + BUG(); + } + + if (!handle) + handle = ocfs2_alloc_handle(osb); + if (!handle) { + ret = -ENOMEM; + mlog(ML_ERROR, "Failed to allocate memory for journal " + "handle!\n"); + goto done_free; + } + + handle->max_buffs = max_buffs; + + down_read(&osb->journal->j_trans_barrier); + + /* actually start the transaction now */ + handle->k_handle = journal_start(journal, max_buffs); + if (IS_ERR(handle->k_handle)) { + up_read(&osb->journal->j_trans_barrier); + + ret = PTR_ERR(handle->k_handle); + handle->k_handle = NULL; + mlog_errno(ret); + + if (is_journal_aborted(journal)) { + ocfs2_abort(osb->sb, "Detected aborted journal"); + ret = -EROFS; + } + goto done_free; + } + + atomic_inc(&(osb->journal->j_num_trans)); + handle->flags |= OCFS2_HANDLE_STARTED; + + mlog_exit_ptr(handle); + return handle; + +done_free: + if (handle) + ocfs2_commit_unstarted_handle(handle); /* will kfree handle */ + + mlog_exit(ret); + return ERR_PTR(ret); +} + +void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, + struct inode *inode) +{ + BUG_ON(!handle); + BUG_ON(!inode); + + atomic_inc(&inode->i_count); + + /* we're obviously changing it... 
*/ + mutex_lock(&inode->i_mutex); + + /* sanity check */ + BUG_ON(OCFS2_I(inode)->ip_handle); + BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list)); + + OCFS2_I(inode)->ip_handle = handle; + list_del(&(OCFS2_I(inode)->ip_handle_list)); + list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); +} + +static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) +{ + struct list_head *p, *n; + struct inode *inode; + struct ocfs2_inode_info *oi; + + list_for_each_safe(p, n, &handle->inode_list) { + oi = list_entry(p, struct ocfs2_inode_info, + ip_handle_list); + inode = &oi->vfs_inode; + + OCFS2_I(inode)->ip_handle = NULL; + list_del_init(&OCFS2_I(inode)->ip_handle_list); + + mutex_unlock(&inode->i_mutex); + iput(inode); + } +} + +/* This is trivial so we do it out of the main commit + * paths. Beware, it can be called from start_trans too! */ +static void ocfs2_commit_unstarted_handle(struct ocfs2_journal_handle *handle) +{ + mlog_entry_void(); + + BUG_ON(handle->flags & OCFS2_HANDLE_STARTED); + + ocfs2_handle_unlock_inodes(handle); + /* You are allowed to add journal locks before the transaction + * has started. */ + ocfs2_handle_cleanup_locks(handle->journal, handle); + + kfree(handle); + + mlog_exit_void(); +} + +void ocfs2_commit_trans(struct ocfs2_journal_handle *handle) +{ + handle_t *jbd_handle; + int retval; + struct ocfs2_journal *journal = handle->journal; + + mlog_entry_void(); + + BUG_ON(!handle); + + if (!(handle->flags & OCFS2_HANDLE_STARTED)) { + ocfs2_commit_unstarted_handle(handle); + mlog_exit_void(); + return; + } + + /* release inode semaphores we took during this transaction */ + ocfs2_handle_unlock_inodes(handle); + + /* ocfs2_extend_trans may have had to call journal_restart + * which will always commit the transaction, but may return + * error for any number of reasons. If this is the case, we + * clear k_handle as it's not valid any more. 
*/ + if (handle->k_handle) { + jbd_handle = handle->k_handle; + + if (handle->flags & OCFS2_HANDLE_SYNC) + jbd_handle->h_sync = 1; + else + jbd_handle->h_sync = 0; + + /* actually stop the transaction. if we've set h_sync, + * it'll have been committed when we return */ + retval = journal_stop(jbd_handle); + if (retval < 0) { + mlog_errno(retval); + mlog(ML_ERROR, "Could not commit transaction\n"); + BUG(); + } + + handle->k_handle = NULL; /* it's been free'd in journal_stop */ + } + + ocfs2_handle_cleanup_locks(journal, handle); + + up_read(&journal->j_trans_barrier); + + kfree(handle); + mlog_exit_void(); +} + +/* + * 'nblocks' is what you want to add to the current + * transaction. extend_trans will either extend the current handle by + * nblocks, or commit it and start a new one with nblocks credits. + * + * WARNING: This will not release any semaphores or disk locks taken + * during the transaction, so make sure they were taken *before* + * start_trans or we'll have ordering deadlocks. + * + * WARNING2: Note that we do *not* drop j_trans_barrier here. This is + * good because transaction ids haven't yet been recorded on the + * cluster locks associated with this handle. 
+ */ +int ocfs2_extend_trans(struct ocfs2_journal_handle *handle, + int nblocks) +{ + int status; + + BUG_ON(!handle); + BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); + BUG_ON(!nblocks); + + mlog_entry_void(); + + mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); + + status = journal_extend(handle->k_handle, nblocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* A positive return means the handle could not be extended in place, so it is restarted with nblocks credits; max_buffs is then reset to nblocks instead of being grown. */ if (status > 0) { + mlog(0, "journal_extend failed, trying journal_restart\n"); + status = journal_restart(handle->k_handle, nblocks); + if (status < 0) { + handle->k_handle = NULL; + mlog_errno(status); + goto bail; + } + handle->max_buffs = nblocks; + } else + handle->max_buffs += nblocks; + + status = 0; +bail: + + mlog_exit(status); + return status; +} + /* Prepare bh for modification under this handle: stamp the journal's current transaction id on the inode, then, while holding ip_io_sem, take JBD write access (CREATE and WRITE types) or undo access (UNDO type). Returns the JBD result, or -EINVAL for an unrecognised type. A buffer that is not uptodate is a BUG(). */ +int ocfs2_journal_access(struct ocfs2_journal_handle *handle, + struct inode *inode, + struct buffer_head *bh, + int type) +{ + int status; + + BUG_ON(!inode); + BUG_ON(!handle); + BUG_ON(!bh); + BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); + + mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %hu\n", + (unsigned long long)bh->b_blocknr, type, + (type == OCFS2_JOURNAL_ACCESS_CREATE) ? + "OCFS2_JOURNAL_ACCESS_CREATE" : + "OCFS2_JOURNAL_ACCESS_WRITE", + bh->b_size); + + /* we can safely remove this assertion after testing. */ + if (!buffer_uptodate(bh)) { + mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); + mlog(ML_ERROR, "b_blocknr=%llu\n", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + /* Set the current transaction information on the inode so + * that the locking code knows whether it can drop its locks + * on this inode or not. We're protected from the commit + * thread updating the current transaction id until + * ocfs2_commit_trans() because ocfs2_start_trans() took + * j_trans_barrier for us. 
*/ + ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); + + down(&OCFS2_I(inode)->ip_io_sem); + switch (type) { + case OCFS2_JOURNAL_ACCESS_CREATE: + case OCFS2_JOURNAL_ACCESS_WRITE: + status = journal_get_write_access(handle->k_handle, bh); + break; + + case OCFS2_JOURNAL_ACCESS_UNDO: + status = journal_get_undo_access(handle->k_handle, bh); + break; + + default: + status = -EINVAL; + mlog(ML_ERROR, "Uknown access type!\n"); + } + up(&OCFS2_I(inode)->ip_io_sem); + + if (status < 0) + mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", + status, type); + + mlog_exit(status); + return status; +} + /* Mark a metadata buffer dirty in the handle's transaction via journal_dirty_metadata(). A failure is logged and returned to the caller. */ +int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle, + struct buffer_head *bh) +{ + int status; + + BUG_ON(!(handle->flags & OCFS2_HANDLE_STARTED)); + + mlog_entry("(bh->b_blocknr=%llu)\n", + (unsigned long long)bh->b_blocknr); + + status = journal_dirty_metadata(handle->k_handle, bh); + if (status < 0) + mlog(ML_ERROR, "Could not dirty metadata buffer. " + "(bh->b_blocknr=%llu)\n", + (unsigned long long)bh->b_blocknr); + + mlog_exit(status); + return status; +} + /* Thin wrapper around journal_dirty_data() that logs any error before returning it. Note that it takes the raw JBD handle_t, not an ocfs2_journal_handle. */ +int ocfs2_journal_dirty_data(handle_t *handle, + struct buffer_head *bh) +{ + int err = journal_dirty_data(handle, bh); + if (err) + mlog_errno(err); + /* TODO: When we can handle it, abort the handle and go RO on + * error here. 
*/ + + return err; +} + +/* We always assume you're adding a metadata lock at level 'ex' */ +int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle, + struct inode *inode) +{ + int status; + struct ocfs2_journal_lock *lock; + + BUG_ON(!inode); + + lock = kmem_cache_alloc(ocfs2_lock_cache, GFP_NOFS); + if (!lock) { + status = -ENOMEM; + mlog_errno(-ENOMEM); + goto bail; + } + + if (!igrab(inode)) + BUG(); + lock->jl_inode = inode; + + list_add_tail(&(lock->jl_lock_list), &(handle->locks)); + handle->num_locks++; + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static void ocfs2_handle_cleanup_locks(struct ocfs2_journal *journal, + struct ocfs2_journal_handle *handle) +{ + struct list_head *p, *n; + struct ocfs2_journal_lock *lock; + struct inode *inode; + + list_for_each_safe(p, n, &(handle->locks)) { + lock = list_entry(p, struct ocfs2_journal_lock, + jl_lock_list); + list_del(&lock->jl_lock_list); + handle->num_locks--; + + inode = lock->jl_inode; + ocfs2_meta_unlock(inode, 1); + if (atomic_read(&inode->i_count) == 1) + mlog(ML_ERROR, + "Inode %"MLFu64", I'm doing a last iput for!", + OCFS2_I(inode)->ip_blkno); + iput(inode); + kmem_cache_free(ocfs2_lock_cache, lock); + } +} + +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) + +void ocfs2_set_journal_params(struct ocfs2_super *osb) +{ + journal_t *journal = osb->journal->j_journal; + + spin_lock(&journal->j_state_lock); + journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); +} + +int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +{ + int status = -1; + struct inode *inode = NULL; /* the journal inode */ + journal_t *j_journal = NULL; + struct ocfs2_dinode *di = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_super *osb; + int meta_lock = 0; + + mlog_entry_void(); + + BUG_ON(!journal); + + osb = 
journal->j_osb; + + /* already have the inode for our journal */ + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + osb->slot_num); + if (inode == NULL) { + status = -EACCES; + mlog_errno(status); + goto done; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto done; + } + + SET_INODE_JOURNAL(inode); + OCFS2_I(inode)->ip_open_count++; + + status = ocfs2_meta_lock(inode, NULL, &bh, 1); + if (status < 0) { + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not get lock on journal!\n"); + goto done; + } + + meta_lock = 1; + di = (struct ocfs2_dinode *)bh->b_data; + + if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { + mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", + inode->i_size); + status = -EINVAL; + goto done; + } + + mlog(0, "inode->i_size = %lld\n", inode->i_size); + mlog(0, "inode->i_blocks = %lu\n", inode->i_blocks); + mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); + + /* call the kernels journal init function now */ + j_journal = journal_init_inode(inode); + if (j_journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EINVAL; + goto done; + } + + mlog(0, "Returned from journal_init_inode\n"); + mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); + + *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL); + + journal->j_journal = j_journal; + journal->j_inode = inode; + journal->j_bh = bh; + + ocfs2_set_journal_params(osb); + + journal->j_state = OCFS2_JOURNAL_LOADED; + + status = 0; +done: + if (status < 0) { + if (meta_lock) + ocfs2_meta_unlock(inode, 1); + if (bh != NULL) + brelse(bh); + if (inode) { + OCFS2_I(inode)->ip_open_count--; + iput(inode); + } + } + + mlog_exit(status); + return status; +} + +static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, + int dirty) +{ + int status; + unsigned int flags; + struct ocfs2_journal *journal = osb->journal; + struct 
buffer_head *bh = journal->j_bh; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + fe = (struct ocfs2_dinode *)bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + /* This is called from startup/shutdown which will + * handle the errors in a specific manner, so no need + * to call ocfs2_error() here. */ + mlog(ML_ERROR, "Journal dinode %"MLFu64" has invalid " + "signature: %.*s", fe->i_blkno, 7, fe->i_signature); + status = -EIO; + goto out; + } + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + if (dirty) + flags |= OCFS2_JOURNAL_DIRTY_FL; + else + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + status = ocfs2_write_block(osb, bh, journal->j_inode); + if (status < 0) + mlog_errno(status); + +out: + mlog_exit(status); + return status; +} + +/* + * If the journal has been kmalloc'd it needs to be freed after this + * call. + */ +void ocfs2_journal_shutdown(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = NULL; + int status = 0; + struct inode *inode = NULL; + int num_running_trans = 0; + + mlog_entry_void(); + + if (!osb) + BUG(); + + journal = osb->journal; + if (!journal) + goto done; + + inode = journal->j_inode; + + if (journal->j_state != OCFS2_JOURNAL_LOADED) + goto done; + + /* need to inc inode use count as journal_destroy will iput. */ + if (!igrab(inode)) + BUG(); + + num_running_trans = atomic_read(&(osb->journal->j_num_trans)); + if (num_running_trans > 0) + mlog(0, "Shutting down journal: must wait on %d " + "running transactions!\n", + num_running_trans); + + /* Do a commit_cache here. It will flush our journal, *and* + * release any locks that are still held. + * set the SHUTDOWN flag and release the trans lock. + * the commit thread will take the trans lock for us below. */ + journal->j_state = OCFS2_JOURNAL_IN_SHUTDOWN; + + /* The OCFS2_JOURNAL_IN_SHUTDOWN will signal to commit_cache to not + * drop the trans_lock (which we want to hold until we + * completely destroy the journal. 
*/ + if (osb->commit_task) { + /* Wait for the commit thread */ + mlog(0, "Waiting for ocfs2commit to exit....\n"); + kthread_stop(osb->commit_task); + osb->commit_task = NULL; + } + + BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); + + status = ocfs2_journal_toggle_dirty(osb, 0); + if (status < 0) + mlog_errno(status); + + /* Shutdown the kernel journal system */ + journal_destroy(journal->j_journal); + + OCFS2_I(inode)->ip_open_count--; + + /* unlock our journal */ + ocfs2_meta_unlock(inode, 1); + + brelse(journal->j_bh); + journal->j_bh = NULL; + + journal->j_state = OCFS2_JOURNAL_FREE; + +// up_write(&journal->j_trans_barrier); +done: + if (inode) + iput(inode); + mlog_exit_void(); +} + +static void ocfs2_clear_journal_error(struct super_block *sb, + journal_t *journal, + int slot) +{ + int olderr; + + olderr = journal_errno(journal); + if (olderr) { + mlog(ML_ERROR, "File system error %d recorded in " + "journal %u.\n", olderr, slot); + mlog(ML_ERROR, "File system on device %s needs checking.\n", + sb->s_id); + + journal_ack_err(journal); + journal_clear_err(journal); + } +} + +int ocfs2_journal_load(struct ocfs2_journal *journal) +{ + int status = 0; + struct ocfs2_super *osb; + + mlog_entry_void(); + + if (!journal) + BUG(); + + osb = journal->j_osb; + + status = journal_load(journal->j_journal); + if (status < 0) { + mlog(ML_ERROR, "Failed to load journal!\n"); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num); + + status = ocfs2_journal_toggle_dirty(osb, 1); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* Launch the commit thread */ + osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d", + osb->osb_id); + if (IS_ERR(osb->commit_task)) { + status = PTR_ERR(osb->commit_task); + osb->commit_task = NULL; + mlog(ML_ERROR, "unable to launch ocfs2commit thread, error=%d", + status); + goto done; + } + +done: + mlog_exit(status); + return status; +} + + +/* 'full' flag tells us 
whether we clear out all blocks or if we just + * mark the journal clean */ +int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) +{ + int status; + + mlog_entry_void(); + + if (!journal) + BUG(); + + status = journal_wipe(journal->j_journal, full); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_toggle_dirty(journal->j_osb, 0); + if (status < 0) + mlog_errno(status); + +bail: + mlog_exit(status); + return status; +} + +/* + * JBD Might read a cached version of another nodes journal file. We + * don't want this as this file changes often and we get no + * notification on those changes. The only way to be sure that we've + * got the most up to date version of those blocks then is to force + * read them off disk. Just searching through the buffer cache won't + * work as there may be pages backing this file which are still marked + * up to date. We know things can't change on this file underneath us + * as we have the lock by now :) + */ +static int ocfs2_force_read_journal(struct inode *inode) +{ + int status = 0; + int i, p_blocks; + u64 v_blkno, p_blkno; +#define CONCURRENT_JOURNAL_FILL 32 + struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; + + mlog_entry_void(); + + BUG_ON(inode->i_blocks != + ocfs2_align_bytes_to_sectors(i_size_read(inode))); + + memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); + + mlog(0, "Force reading %lu blocks\n", + (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))); + + v_blkno = 0; + while (v_blkno < + (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { + + status = ocfs2_extent_map_get_blocks(inode, v_blkno, + 1, &p_blkno, + &p_blocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (p_blocks > CONCURRENT_JOURNAL_FILL) + p_blocks = CONCURRENT_JOURNAL_FILL; + + status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), + p_blkno, p_blocks, bhs, 0, + inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + for(i = 0; i < p_blocks; 
i++) { + brelse(bhs[i]); + bhs[i] = NULL; + } + + v_blkno += p_blocks; + } + +bail: + for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) + if (bhs[i]) + brelse(bhs[i]); + mlog_exit(status); + return status; +} + +struct ocfs2_la_recovery_item { + struct list_head lri_list; + int lri_slot; + struct ocfs2_dinode *lri_la_dinode; + struct ocfs2_dinode *lri_tl_dinode; +}; + +/* Does the second half of the recovery process. By this point, the + * node is marked clean and can actually be considered recovered, + * hence it's no longer in the recovery map, but there's still some + * cleanup we can do which shouldn't happen within the recovery thread + * as locking in that context becomes very difficult if we are to take + * recovering nodes into account. + * + * NOTE: This function can and will sleep on recovery of other nodes + * during cluster locking, just like any other ocfs2 process. + */ +void ocfs2_complete_recovery(void *data) +{ + int ret; + struct ocfs2_super *osb = data; + struct ocfs2_journal *journal = osb->journal; + struct ocfs2_dinode *la_dinode, *tl_dinode; + struct ocfs2_la_recovery_item *item; + struct list_head *p, *n; + LIST_HEAD(tmp_la_list); + + mlog_entry_void(); + + mlog(0, "completing recovery from keventd\n"); + + spin_lock(&journal->j_lock); + list_splice_init(&journal->j_la_cleanups, &tmp_la_list); + spin_unlock(&journal->j_lock); + + list_for_each_safe(p, n, &tmp_la_list) { + item = list_entry(p, struct ocfs2_la_recovery_item, lri_list); + list_del_init(&item->lri_list); + + mlog(0, "Complete recovery for slot %d\n", item->lri_slot); + + la_dinode = item->lri_la_dinode; + if (la_dinode) { + mlog(0, "Clean up local alloc %"MLFu64"\n", + la_dinode->i_blkno); + + ret = ocfs2_complete_local_alloc_recovery(osb, + la_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(la_dinode); + } + + tl_dinode = item->lri_tl_dinode; + if (tl_dinode) { + mlog(0, "Clean up truncate log %"MLFu64"\n", + tl_dinode->i_blkno); + + ret = 
ocfs2_complete_truncate_log_recovery(osb, + tl_dinode); + if (ret < 0) + mlog_errno(ret); + + kfree(tl_dinode); + } + + ret = ocfs2_recover_orphans(osb, item->lri_slot); + if (ret < 0) + mlog_errno(ret); + + kfree(item); + } + + mlog(0, "Recovery completion\n"); + mlog_exit_void(); +} + +/* NOTE: This function always eats your references to la_dinode and + * tl_dinode, either manually on error, or by passing them to + * ocfs2_complete_recovery. + * + * It allocates a recovery item for slot_num, appends it to + * journal->j_la_cleanups under j_lock and queues j_recovery_work on + * ocfs2_wq. If the item cannot be allocated, both dinode copies are + * freed, -ENOMEM is logged and nothing is queued. */ +static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, + int slot_num, + struct ocfs2_dinode *la_dinode, + struct ocfs2_dinode *tl_dinode) +{ + struct ocfs2_la_recovery_item *item; + + item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL); + if (!item) { + /* Though we wish to avoid it, we are in fact safe in + * skipping local alloc cleanup as fsck.ocfs2 is more + * than capable of reclaiming unused space. */ + if (la_dinode) + kfree(la_dinode); + + if (tl_dinode) + kfree(tl_dinode); + + mlog_errno(-ENOMEM); + return; + } + + INIT_LIST_HEAD(&item->lri_list); + item->lri_la_dinode = la_dinode; + item->lri_slot = slot_num; + item->lri_tl_dinode = tl_dinode; + + spin_lock(&journal->j_lock); + list_add_tail(&item->lri_list, &journal->j_la_cleanups); + queue_work(ocfs2_wq, &journal->j_recovery_work); + spin_unlock(&journal->j_lock); +} + +/* Called by the mount code to queue the last part of recovery for + * its own slot. Only acts when osb->dirty is set, and clears + * osb->dirty and osb->local_alloc_copy afterwards. */ +void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) +{ + struct ocfs2_journal *journal = osb->journal; + + if (osb->dirty) { + /* No need to queue up our truncate_log as regular + * cleanup will catch that. 
*/ + ocfs2_queue_recovery_completion(journal, + osb->slot_num, + osb->local_alloc_copy, + NULL); + ocfs2_schedule_truncate_log_flush(osb, 0); + + osb->local_alloc_copy = NULL; + osb->dirty = 0; + } +} + /* Body of the recovery kthread. Waits for mount to finish, then, holding the super block cluster lock, recovers each node set in osb->recovery_map and queues orphan dir recovery for our own slot. Before exiting it clears osb->recovery_thread_task and wakes osb->recovery_event. */ +static int __ocfs2_recovery_thread(void *arg) +{ + int status, node_num; + struct ocfs2_super *osb = arg; + + mlog_entry_void(); + + status = ocfs2_wait_on_mount(osb); + if (status < 0) { + goto bail; + } + +restart: + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { + node_num = ocfs2_node_map_first_set_bit(osb, + &osb->recovery_map); + if (node_num == O2NM_INVALID_NODE_NUM) { + mlog(0, "Out of nodes to recover.\n"); + break; + } + + status = ocfs2_recover_node(osb, node_num); + if (status < 0) { + mlog(ML_ERROR, + "Error %d recovering node %d on device (%u,%u)!\n", + status, node_num, + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + mlog(ML_ERROR, "Volume requires unmount.\n"); + /* NOTE(review): the node's bit stays set in recovery_map on failure, so the loop picks the same node again -- confirm this retry is intended. */ continue; + } + + ocfs2_recovery_map_clear(osb, node_num); + } + ocfs2_super_unlock(osb, 1); + + /* We always run recovery on our own orphan dir - the dead + * node(s) may have voted "no" on an inode delete earlier. A + * revote is therefore required. */ + ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, + NULL); + +bail: + /* Re-check the map under recovery_lock: if the last pass succeeded and nodes are still set, run another pass. */ down(&osb->recovery_lock); + if (!status && + !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { + up(&osb->recovery_lock); + goto restart; + } + + osb->recovery_thread_task = NULL; + mb(); /* sync with ocfs2_recovery_thread_running */ + wake_up(&osb->recovery_event); + + up(&osb->recovery_lock); + + mlog_exit(status); + /* no one is calling kthread_stop() for us so the kthread() api + * requires that we call do_exit(). And it isn't exported, but + * complete_and_exit() seems to be a minimal wrapper around it. 
*/ + complete_and_exit(NULL, status); + return status; +} + /* Request recovery of node_num: record it in the recovery map and start the recovery kthread unless one is already running. When osb->disable_recovery is set, only the final wake_up of recovery_event is performed. */ +void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) +{ + mlog_entry("(node_num=%d, osb->node_num = %d)\n", + node_num, osb->node_num); + + down(&osb->recovery_lock); + if (osb->disable_recovery) + goto out; + + /* People waiting on recovery will wait on + * the recovery map to empty. */ + if (!ocfs2_recovery_map_set(osb, node_num)) + mlog(0, "node %d already be in recovery.\n", node_num); + + mlog(0, "starting recovery thread...\n"); + + if (osb->recovery_thread_task) + goto out; + + osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb, + "ocfs2rec-%d", osb->osb_id); + if (IS_ERR(osb->recovery_thread_task)) { + mlog_errno((int)PTR_ERR(osb->recovery_thread_task)); + osb->recovery_thread_task = NULL; + } + +out: + up(&osb->recovery_lock); + wake_up(&osb->recovery_event); + + mlog_exit_void(); +} + +/* Does the actual journal replay and marks the journal inode as + * clean. Will only replay if the journal inode is marked dirty. 
*/ +static int ocfs2_replay_journal(struct ocfs2_super *osb, + int node_num, + int slot_num) +{ + int status; + int got_lock = 0; + unsigned int flags; + struct inode *inode = NULL; + struct ocfs2_dinode *fe; + journal_t *journal = NULL; + struct buffer_head *bh = NULL; + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (inode == NULL) { + status = -EACCES; + mlog_errno(status); + goto done; + } + if (is_bad_inode(inode)) { + status = -EACCES; + iput(inode); + inode = NULL; + mlog_errno(status); + goto done; + } + SET_INODE_JOURNAL(inode); + + status = ocfs2_meta_lock_full(inode, NULL, &bh, 1, + OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + mlog(0, "status returned from ocfs2_meta_lock=%d\n", status); + if (status != -ERESTARTSYS) + mlog(ML_ERROR, "Could not lock journal!\n"); + goto done; + } + got_lock = 1; + + fe = (struct ocfs2_dinode *) bh->b_data; + + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + + if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { + mlog(0, "No recovery required for node %d\n", node_num); + goto done; + } + + mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", + node_num, slot_num, + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + + status = ocfs2_force_read_journal(inode); + if (status < 0) { + mlog_errno(status); + goto done; + } + + mlog(0, "calling journal_init_inode\n"); + journal = journal_init_inode(inode); + if (journal == NULL) { + mlog(ML_ERROR, "Linux journal layer error\n"); + status = -EIO; + goto done; + } + + status = journal_load(journal); + if (status < 0) { + mlog_errno(status); + if (!igrab(inode)) + BUG(); + journal_destroy(journal); + goto done; + } + + ocfs2_clear_journal_error(osb->sb, journal, slot_num); + + /* wipe the journal */ + mlog(0, "flushing the journal.\n"); + journal_lock_updates(journal); + status = journal_flush(journal); + journal_unlock_updates(journal); + if (status < 0) + 
mlog_errno(status); + + /* This will mark the node clean. Note that an error from + * journal_flush() above has only been logged: status is + * overwritten by the result of the write below. */ + flags = le32_to_cpu(fe->id1.journal1.ij_flags); + flags &= ~OCFS2_JOURNAL_DIRTY_FL; + fe->id1.journal1.ij_flags = cpu_to_le32(flags); + + status = ocfs2_write_block(osb, bh, inode); + if (status < 0) + mlog_errno(status); + + /* take an extra inode reference before journal_destroy(), which will iput */ if (!igrab(inode)) + BUG(); + + journal_destroy(journal); + +done: + /* drop the lock on this node's journal */ + if (got_lock) + ocfs2_meta_unlock(inode, 1); + + if (inode) + iput(inode); + + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +/* + * Do the most important parts of node recovery: + * - Replay its journal + * - Stamp a clean local allocator file + * - Stamp a clean truncate log + * - Mark the node clean + * + * If this function completes without error, a node in OCFS2 can be + * said to have been safely recovered. As a result, failure during the + * second part of a node's recovery process (local alloc recovery) is + * far less concerning. + */ +static int ocfs2_recover_node(struct ocfs2_super *osb, + int node_num) +{ + int status = 0; + int slot_num; + struct ocfs2_slot_info *si = osb->slot_info; + struct ocfs2_dinode *la_copy = NULL; + struct ocfs2_dinode *tl_copy = NULL; + + mlog_entry("(node_num=%d, osb->node_num = %d)\n", + node_num, osb->node_num); + + mlog(0, "checking node %d\n", node_num); + + /* Should not ever be called to recover ourselves -- in that + * case we should've called ocfs2_journal_load instead. */ + if (osb->node_num == node_num) + BUG(); + + slot_num = ocfs2_node_num_to_slot(si, node_num); + if (slot_num == OCFS2_INVALID_SLOT) { + status = 0; + mlog(0, "no slot for this node, so no recovery required.\n"); + goto done; + } + + mlog(0, "node %d was using slot %d\n", node_num, slot_num); + + status = ocfs2_replay_journal(osb, node_num, slot_num); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* Stamp a clean local alloc file AFTER recovering the journal... 
*/ + status = ocfs2_begin_local_alloc_recovery(osb, slot_num, &la_copy); + if (status < 0) { + mlog_errno(status); + goto done; + } + + /* An error from begin_truncate_log_recovery is not + * serious enough to warrant halting the rest of + * recovery. */ + status = ocfs2_begin_truncate_log_recovery(osb, slot_num, &tl_copy); + if (status < 0) + mlog_errno(status); + + /* Likewise, this would be a strange but ultimately not so + * harmful place to get an error... */ + ocfs2_clear_slot(si, slot_num); + status = ocfs2_update_disk_slots(osb, si); + if (status < 0) + mlog_errno(status); + + /* This will kfree the memory pointed to by la_copy and tl_copy */ + ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, + tl_copy); + + status = 0; +done: + + mlog_exit(status); + return status; +} + +/* Test node liveness by trylocking his journal. If we get the lock, + * we drop it here. Return 0 if we got the lock, -EAGAIN if node is + * still alive (we couldn't get the lock) and < 0 on error. */ +static int ocfs2_trylock_journal(struct ocfs2_super *osb, + int slot_num) +{ + int status, flags; + struct inode *inode = NULL; + + inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, + slot_num); + if (inode == NULL) { + mlog(ML_ERROR, "access error\n"); + status = -EACCES; + goto bail; + } + if (is_bad_inode(inode)) { + mlog(ML_ERROR, "access error (bad inode)\n"); + iput(inode); + inode = NULL; + status = -EACCES; + goto bail; + } + SET_INODE_JOURNAL(inode); + + flags = OCFS2_META_LOCK_RECOVERY | OCFS2_META_LOCK_NOQUEUE; + status = ocfs2_meta_lock_full(inode, NULL, NULL, 1, flags); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + goto bail; + } + + ocfs2_meta_unlock(inode, 1); +bail: + if (inode) + iput(inode); + + return status; +} + +/* Call this underneath ocfs2_super_lock. It also assumes that the + * slot info struct has been updated from disk. 
*/ +int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) +{ + int status, i, node_num; + struct ocfs2_slot_info *si = osb->slot_info; + + /* This is called with the super block cluster lock, so we + * know that the slot map can't change underneath us. */ + + spin_lock(&si->si_lock); + for(i = 0; i < si->si_num_slots; i++) { + if (i == osb->slot_num) + continue; + if (ocfs2_is_empty_slot(si, i)) + continue; + + node_num = si->si_global_node_nums[i]; + if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) + continue; + spin_unlock(&si->si_lock); + + /* Ok, we have a slot occupied by another node which + * is not in the recovery map. We trylock his journal + * file here to test if he's alive. */ + status = ocfs2_trylock_journal(osb, i); + if (!status) { + /* Since we're called from mount, we know that + * the recovery thread can't race us on + * setting / checking the recovery bits. */ + ocfs2_recovery_thread(osb, node_num); + } else if ((status < 0) && (status != -EAGAIN)) { + mlog_errno(status); + goto bail; + } + + spin_lock(&si->si_lock); + } + spin_unlock(&si->si_lock); + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot) +{ + int status = 0; + int have_disk_lock = 0; + struct inode *inode = NULL; + struct inode *iter; + struct inode *orphan_dir_inode = NULL; + unsigned long offset, blk, local; + struct buffer_head *bh = NULL; + struct ocfs2_dir_entry *de; + struct super_block *sb = osb->sb; + struct ocfs2_inode_info *oi; + + mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + slot); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto out; + } + + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + mlog_errno(status); + goto out; + } + 
have_disk_lock = 1; + + offset = 0; + iter = NULL; + while(offset < i_size_read(orphan_dir_inode)) { + blk = offset >> sb->s_blocksize_bits; + + bh = ocfs2_bread(orphan_dir_inode, blk, &status, 0); + if (!bh) + status = -EINVAL; + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + if (bh) + brelse(bh); + mlog_errno(status); + goto out; + } + + local = 0; + while(offset < i_size_read(orphan_dir_inode) + && local < sb->s_blocksize) { + de = (struct ocfs2_dir_entry *) (bh->b_data + local); + + if (!ocfs2_check_dir_entry(orphan_dir_inode, + de, bh, local)) { + mutex_unlock(&orphan_dir_inode->i_mutex); + status = -EINVAL; + mlog_errno(status); + brelse(bh); + goto out; + } + + local += le16_to_cpu(de->rec_len); + offset += le16_to_cpu(de->rec_len); + + /* I guess we silently fail on no inode? */ + if (!le64_to_cpu(de->inode)) + continue; + if (de->file_type > OCFS2_FT_MAX) { + mlog(ML_ERROR, + "block %llu contains invalid de: " + "inode = %"MLFu64", rec_len = %u, " + "name_len = %u, file_type = %u, " + "name='%.*s'\n", + (unsigned long long)bh->b_blocknr, + le64_to_cpu(de->inode), + le16_to_cpu(de->rec_len), + de->name_len, + de->file_type, + de->name_len, + de->name); + continue; + } + if (de->name_len == 1 && !strncmp(".", de->name, 1)) + continue; + if (de->name_len == 2 && !strncmp("..", de->name, 2)) + continue; + + iter = ocfs2_iget(osb, le64_to_cpu(de->inode)); + if (IS_ERR(iter)) + continue; + + mlog(0, "queue orphan %"MLFu64"\n", + OCFS2_I(iter)->ip_blkno); + OCFS2_I(iter)->ip_next_orphan = inode; + inode = iter; + } + brelse(bh); + } + mutex_unlock(&orphan_dir_inode->i_mutex); + + ocfs2_meta_unlock(orphan_dir_inode, 0); + have_disk_lock = 0; + + iput(orphan_dir_inode); + orphan_dir_inode = NULL; + + while (inode) { + oi = OCFS2_I(inode); + mlog(0, "iput orphan %"MLFu64"\n", oi->ip_blkno); + + iter = oi->ip_next_orphan; + + spin_lock(&oi->ip_lock); + /* Delete voting may have set these on the assumption + * that the other node would wipe them 
successfully. + * If they are still in the node's orphan dir, we need + * to reset that state. */ + oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); + + /* Set the proper information to get us going into + * ocfs2_delete_inode. */ + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + oi->ip_orphaned_slot = slot; + spin_unlock(&oi->ip_lock); + + iput(inode); + + inode = iter; + } + +out: + if (have_disk_lock) + ocfs2_meta_unlock(orphan_dir_inode, 0); + + if (orphan_dir_inode) + iput(orphan_dir_inode); + + return status; +} + +static int ocfs2_wait_on_mount(struct ocfs2_super *osb) +{ + /* This check is good because ocfs2 will wait on our recovery + * thread before changing it to something other than MOUNTED + * or DISABLED. */ + wait_event(osb->osb_mount_event, + atomic_read(&osb->vol_state) == VOLUME_MOUNTED || + atomic_read(&osb->vol_state) == VOLUME_DISABLED); + + /* If there's an error on mount, then we may never get to the + * MOUNTED flag, but this is set right before + * dismount_volume() so we can trust it. */ + if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { + mlog(0, "mount error, exiting!\n"); + return -EBUSY; + } + + return 0; +} + +static int ocfs2_commit_thread(void *arg) +{ + int status; + struct ocfs2_super *osb = arg; + struct ocfs2_journal *journal = osb->journal; + + /* we can trust j_num_trans here because _should_stop() is only set in + * shutdown and nobody other than ourselves should be able to start + * transactions. 
committing on shutdown might take a few iterations + * as final transactions put deleted inodes on the list */ + while (!(kthread_should_stop() && + atomic_read(&journal->j_num_trans) == 0)) { + + wait_event_interruptible_timeout(osb->checkpoint_event, + atomic_read(&journal->j_num_trans) + || kthread_should_stop(), + OCFS2_CHECKPOINT_INTERVAL); + + status = ocfs2_commit_cache(osb); + if (status < 0) + mlog_errno(status); + + if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ + mlog(ML_KTHREAD, + "commit_thread: %u transactions pending on " + "shutdown\n", + atomic_read(&journal->j_num_trans)); + } + } + + return 0; +} + +/* Look for a dirty journal without taking any cluster locks. Used for + * hard readonly access to determine whether the file system journals + * require recovery. */ +int ocfs2_check_journals_nolocks(struct ocfs2_super *osb) +{ + int ret = 0; + unsigned int slot; + struct buffer_head *di_bh; + struct ocfs2_dinode *di; + struct inode *journal = NULL; + + for(slot = 0; slot < osb->max_slots; slot++) { + journal = ocfs2_get_system_file_inode(osb, + JOURNAL_SYSTEM_INODE, + slot); + if (!journal || is_bad_inode(journal)) { + ret = -EACCES; + mlog_errno(ret); + goto out; + } + + di_bh = NULL; + ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh, + 0, journal); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + di = (struct ocfs2_dinode *) di_bh->b_data; + + if (le32_to_cpu(di->id1.journal1.ij_flags) & + OCFS2_JOURNAL_DIRTY_FL) + ret = -EROFS; + + brelse(di_bh); + if (ret) + break; + } + +out: + if (journal) + iput(journal); + + return ret; +} diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h new file mode 100644 index 000000000000..7d0a816184fa --- /dev/null +++ b/fs/ocfs2/journal.h @@ -0,0 +1,457 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * journal.h + * + * Defines journalling api and structures. + * + * Copyright (C) 2003, 2005 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_JOURNAL_H +#define OCFS2_JOURNAL_H + +#include <linux/fs.h> +#include <linux/jbd.h> + +#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ) + +enum ocfs2_journal_state { + OCFS2_JOURNAL_FREE = 0, + OCFS2_JOURNAL_LOADED, + OCFS2_JOURNAL_IN_SHUTDOWN, +}; + +struct ocfs2_super; +struct ocfs2_dinode; +struct ocfs2_journal_handle; + +struct ocfs2_journal { + enum ocfs2_journal_state j_state; /* Journals current state */ + + journal_t *j_journal; /* The kernels journal type */ + struct inode *j_inode; /* Kernel inode pointing to + * this journal */ + struct ocfs2_super *j_osb; /* pointer to the super + * block for the node + * we're currently + * running on -- not + * necessarily the super + * block from the node + * which we usually run + * from (recovery, + * etc) */ + struct buffer_head *j_bh; /* Journal disk inode block */ + atomic_t j_num_trans; /* Number of transactions + * currently in the system. */ + unsigned long j_trans_id; + struct rw_semaphore j_trans_barrier; + wait_queue_head_t j_checkpointed; + + spinlock_t j_lock; + struct list_head j_la_cleanups; + struct work_struct j_recovery_work; +}; + +extern spinlock_t trans_inc_lock; + +/* wrap j_trans_id so we never have it equal to zero. 
*/
+static inline unsigned long ocfs2_inc_trans_id(struct ocfs2_journal *j)
+{
+	unsigned long old_id;
+	spin_lock(&trans_inc_lock);
+	old_id = j->j_trans_id++;
+	if (unlikely(!j->j_trans_id))
+		j->j_trans_id = 1;
+	spin_unlock(&trans_inc_lock);
+	return old_id;
+}
+
+static inline void ocfs2_set_inode_lock_trans(struct ocfs2_journal *journal,
+					      struct inode *inode)
+{
+	spin_lock(&trans_inc_lock);
+	OCFS2_I(inode)->ip_last_trans = journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+/* Used to figure out whether it's safe to drop a metadata lock on an
+ * inode. Returns true if all the inode's changes have been
+ * checkpointed to disk. You should be holding the spinlock on the
+ * metadata lock while calling this to be sure that nobody can take
+ * the lock and put it on another transaction. */
+static inline int ocfs2_inode_fully_checkpointed(struct inode *inode)
+{
+	int ret;
+	struct ocfs2_journal *journal = OCFS2_SB(inode->i_sb)->journal;
+
+	spin_lock(&trans_inc_lock);
+	ret = time_after(journal->j_trans_id, OCFS2_I(inode)->ip_last_trans);
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+/* Convenience function to check if an inode is still new (has never
+ * hit disk). Will do you a favor and set ip_created_trans = 0 when you've
+ * been checkpointed. Returns 1 if the inode is still new, 0 otherwise. */
+static inline int ocfs2_inode_is_new(struct inode *inode)
+{
+	int ret;
+
+	/* System files are never "new" as they're written out by
+	 * mkfs. This helps us early during mount, before we have the
+	 * journal open and j_trans_id could be junk. */
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
+		return 0;
+	spin_lock(&trans_inc_lock);
+	ret = !(time_after(OCFS2_SB(inode->i_sb)->journal->j_trans_id,
+			   OCFS2_I(inode)->ip_created_trans));
+	if (!ret)
+		OCFS2_I(inode)->ip_created_trans = 0;
+	spin_unlock(&trans_inc_lock);
+	return ret;
+}
+
+static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
+				       struct inode *inode)
+{
+	spin_lock(&trans_inc_lock);
+	OCFS2_I(inode)->ip_created_trans = osb->journal->j_trans_id;
+	spin_unlock(&trans_inc_lock);
+}
+
+extern kmem_cache_t *ocfs2_lock_cache;
+
+struct ocfs2_journal_lock {
+	struct inode *jl_inode;
+	struct list_head jl_lock_list;
+};
+
+struct ocfs2_journal_handle {
+	handle_t *k_handle; /* kernel handle. */
+	struct ocfs2_journal *journal;
+	u32 flags; /* OCFS2_HANDLE_* bits, see below. */
+	int max_buffs; /* Buffs reserved by this handle */
+
+	/* The following two fields are for ocfs2_handle_add_lock */
+	int num_locks;
+	struct list_head locks; /* A bunch of locks to
+				 * release on commit. This
+				 * should be a list_head */
+
+	struct list_head inode_list;
+};
+
+#define OCFS2_HANDLE_STARTED 1
+/* should we sync-commit this handle? */
+#define OCFS2_HANDLE_SYNC 2
+static inline int ocfs2_handle_started(struct ocfs2_journal_handle *handle)
+{
+	return handle->flags & OCFS2_HANDLE_STARTED;
+}
+
+static inline void ocfs2_handle_set_sync(struct ocfs2_journal_handle *handle, int sync)
+{
+	if (sync)
+		handle->flags |= OCFS2_HANDLE_SYNC;
+	else
+		handle->flags &= ~OCFS2_HANDLE_SYNC;
+}
+
+/* Exported only for the journal struct init code in super.c. Do not call. */
+void ocfs2_complete_recovery(void *data);
+
+/*
+ * Journal Control:
+ * Initialize, Load, Shutdown, Wipe a journal.
+ *
+ * ocfs2_journal_init - Initialize journal structures in the OSB.
+ * ocfs2_journal_load - Load the given journal off disk. Replay it if
+ *                      there are transactions still in there.
+ * ocfs2_journal_shutdown - Shutdown a journal, this will flush all
+ *                          uncommitted, uncheckpointed transactions.
+ * ocfs2_journal_wipe - Wipe transactions from a journal. Optionally
+ *                      zero out each block.
+ * ocfs2_recovery_thread - Perform recovery on a node. osb is our own osb.
+ * ocfs2_mark_dead_nodes - Start recovery on nodes we won't get a heartbeat
+ *                         event on.
+ * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint.
+ */
+void ocfs2_set_journal_params(struct ocfs2_super *osb);
+int ocfs2_journal_init(struct ocfs2_journal *journal,
+		       int *dirty);
+void ocfs2_journal_shutdown(struct ocfs2_super *osb);
+int ocfs2_journal_wipe(struct ocfs2_journal *journal,
+		       int full);
+int ocfs2_journal_load(struct ocfs2_journal *journal);
+int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
+void ocfs2_recovery_thread(struct ocfs2_super *osb,
+			   int node_num);
+int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
+void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+
+static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
+{
+	atomic_set(&osb->needs_checkpoint, 1);
+	wake_up(&osb->checkpoint_event);
+}
+
+static inline void ocfs2_checkpoint_inode(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	if (!ocfs2_inode_fully_checkpointed(inode)) {
+		/* WARNING: This only kicks off a single
+		 * checkpoint. If someone races you and adds more
+		 * metadata to the journal, you won't know, and will
+		 * wind up waiting *a lot* longer than necessary. Right
+		 * now we only use this in clear_inode so that's
+		 * OK. */
+		ocfs2_start_checkpoint(osb);
+
+		wait_event(osb->journal->j_checkpointed,
+			   ocfs2_inode_fully_checkpointed(inode));
+	}
+}
+
+/*
+ * Transaction Handling:
+ * Manage the lifetime of a transaction handle.
+ *
+ * ocfs2_alloc_handle - Only allocate a handle so we can start putting
+ *                      cluster locks on it. To actually change blocks,
+ *                      call ocfs2_start_trans with the handle returned
+ *                      from this function. You may call ocfs2_commit_trans
+ *                      at any time in the lifetime of a handle.
+ * ocfs2_start_trans - Begin a transaction. Give it an upper estimate of
+ *                     the number of blocks that will be changed during
+ *                     this handle.
+ * ocfs2_commit_trans - Complete a handle.
+ * ocfs2_extend_trans - Extend a handle by nblocks credits. This may
+ *                      commit the handle to disk in the process, but will
+ *                      not release any locks taken during the transaction.
+ * ocfs2_journal_access - Notify the handle that we want to journal this
+ *                        buffer. Will have to call ocfs2_journal_dirty once
+ *                        we've actually dirtied it. Type is one of OCFS2_JOURNAL_ACCESS_CREATE, _WRITE or _UNDO.
+ * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
+ * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
+ *                            the current handle commits.
+ * ocfs2_handle_add_lock - Sometimes we need to delay lock release
+ *                         until after a transaction has been completed. Use
+ *                         ocfs2_handle_add_lock to indicate that a lock needs
+ *                         to be released at the end of that handle. Locks
+ *                         will be released in the order that they are added.
+ * ocfs2_handle_add_inode - Add a locked inode to a transaction.
+ */
+
+/* You must always start_trans with a number of buffs > 0, but it's
+ * perfectly legal to go through an entire transaction without having
+ * dirtied any buffers. */
+struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb);
+struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
+					       struct ocfs2_journal_handle *handle,
+					       int max_buffs);
+void ocfs2_commit_trans(struct ocfs2_journal_handle *handle);
+int ocfs2_extend_trans(struct ocfs2_journal_handle *handle,
+		       int nblocks);
+
+/*
+ * Create access is for when we get a newly created buffer and we're
+ * not going to read it off disk, but rather fill it ourselves.
Right
+ * now, we don't do anything special with this (it turns into a write
+ * request), but this is a good placeholder in case we do...
+ *
+ * Write access is for when we read a block off disk and are going to
+ * modify it. This way the journalling layer knows it may need to make
+ * a copy of that block (if it's part of another, uncommitted
+ * transaction) before we do so.
+ */
+#define OCFS2_JOURNAL_ACCESS_CREATE 0
+#define OCFS2_JOURNAL_ACCESS_WRITE 1
+#define OCFS2_JOURNAL_ACCESS_UNDO 2
+
+int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
+			 struct inode *inode,
+			 struct buffer_head *bh,
+			 int type);
+/*
+ * A word about the journal_access/journal_dirty "dance". It is
+ * entirely legal to journal_access a buffer more than once (as long
+ * as the access type is the same -- I'm not sure what will happen if
+ * access type is different but this should never happen anyway). It is
+ * also legal to journal_dirty a buffer more than once. In fact, you
+ * can even journal_access a buffer after you've done a
+ * journal_access/journal_dirty pair. The only thing you cannot do,
+ * however, is journal_dirty a buffer which you haven't yet passed to
+ * journal_access at least once.
+ *
+ * That said, 99% of the time this doesn't matter and this is what the
+ * path looks like:
+ *
+ * <read a bh>
+ * ocfs2_journal_access(handle, inode, bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ * <modify the bh>
+ * ocfs2_journal_dirty(handle, bh);
+ */
+int ocfs2_journal_dirty(struct ocfs2_journal_handle *handle,
+			struct buffer_head *bh);
+int ocfs2_journal_dirty_data(handle_t *handle,
+			     struct buffer_head *bh);
+int ocfs2_handle_add_lock(struct ocfs2_journal_handle *handle,
+			  struct inode *inode);
+/*
+ * Use this to protect from other processes reading buffer state while
+ * it's in flight.
+ */
+void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
+			    struct inode *inode);
+
+/*
+ * Credit Macros:
+ * Convenience macros to calculate number of credits needed.
+ *
+ * For convenience's sake, I have a set of macros here which calculate
+ * the *maximum* number of sectors which will be changed for various
+ * metadata updates.
+ */
+
+/* simple file updates like chmod, etc. */
+#define OCFS2_INODE_UPDATE_CREDITS 1
+
+/* get one bit out of a suballocator: dinode + group descriptor +
+ * prev. group desc. if we relink. */
+#define OCFS2_SUBALLOC_ALLOC (3)
+
+/* dinode + group descriptor update. We don't relink on free yet. */
+#define OCFS2_SUBALLOC_FREE (2)
+
+#define OCFS2_TRUNCATE_LOG_UPDATE OCFS2_INODE_UPDATE_CREDITS
+#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
+					 + OCFS2_TRUNCATE_LOG_UPDATE)
+
+/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
+ * bitmap block for the new bit) */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
+
+/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
+ * group descriptor + mkdir/symlink blocks */
+#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \
+			    + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+
+/* local alloc metadata change + main bitmap updates */
+#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
+				  + OCFS2_SUBALLOC_ALLOC + OCFS2_SUBALLOC_FREE)
+
+/* used when we don't need an allocation change for a dir extend. One
+ * for the dinode, one for the new block.
*/
+#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
+
+/* file update (nlink, etc) + dir entry block */
+#define OCFS2_LINK_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
+ * dir inode link */
+#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \
+			     + OCFS2_LINK_CREDITS)
+
+/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
+ * inode alloc group descriptor */
+#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 1 + 1)
+
+/* dinode update, old dir dinode update, new dir dinode update, old
+ * dir dir entry, new dir dir entry, dir entry update for renaming
+ * directory + target unlink */
+#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
+			     + OCFS2_UNLINK_CREDITS)
+
+static inline int ocfs2_calc_extend_credits(struct super_block *sb,
+					    struct ocfs2_dinode *fe,
+					    u32 bits_wanted)
+{
+	int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+
+	/* bitmap dinode, group desc. + relinked group. */
+	bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
+
+	/* we might need to shift tree depth so let's assume an
+	 * absolute worst case of complete fragmentation. Even with
+	 * that, we only need one update for the dinode, and then
+	 * however many metadata chunks needed * a remaining suballoc
+	 * alloc. */
+	sysfile_bitmap_blocks = 1 +
+		(OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+
+	/* this does not include *new* metadata blocks, which are
+	 * accounted for in sysfile_bitmap_blocks. fe +
+	 * prev. last_eb_blk + blocks along edge of tree.
+	 * calc_symlink_credits passes because we just need 1
+	 * credit for the dinode there. */
+	dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+
+	return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+}
+
+static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
+{
+	int blocks = OCFS2_MKNOD_CREDITS;
+
+	/* links can be longer than one block so we may update many
+	 * within our single allocated extent. */
+	blocks += ocfs2_clusters_to_blocks(sb, 1);
+
+	return blocks;
+}
+
+static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
+						 unsigned int cpg)
+{
+	int blocks;
+	int bitmap_blocks = OCFS2_SUBALLOC_ALLOC + 1;
+	/* parent inode update + new block group header + bitmap inode update
+	   + bitmap blocks affected (sb and cpg are not used here) */
+	blocks = 1 + 1 + 1 + bitmap_blocks;
+	return blocks;
+}
+
+static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
+						unsigned int clusters_to_del,
+						struct ocfs2_dinode *fe,
+						struct ocfs2_extent_list *last_el)
+{
+	/* for dinode + all headers in this pass + update to next leaf */
+	u16 next_free = le16_to_cpu(last_el->l_next_free_rec);
+	u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+	int credits = 1 + tree_depth + 1;
+	int i;
+
+	i = next_free - 1;
+	BUG_ON(i < 0);
+
+	/* We may be deleting metadata blocks, so metadata alloc dinode +
+	   one desc. block for each possible delete. */
+	if (tree_depth && next_free == 1 &&
+	    le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+		credits += 1 + tree_depth;
+
+	/* update to the truncate log. */
+	credits += OCFS2_TRUNCATE_LOG_UPDATE;
+
+	return credits;
+}
+
+#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
new file mode 100644
index 000000000000..149b35181666
--- /dev/null
+++ b/fs/ocfs2/localalloc.c
@@ -0,0 +1,983 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * localalloc.c
+ *
+ * Node local data allocation
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/bitops.h> + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) + +static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb); + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 numbits); + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc); + +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh); + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh); + +static int 
ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac); + +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode); + +/* + * Determine how large our local alloc window should be, in bits. + * + * These values (and the behavior in ocfs2_alloc_should_use_local) have + * been chosen so that most allocations, including new block groups go + * through local alloc. + */ +static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) +{ + BUG_ON(osb->s_clustersize_bits < 12); + + return 2048 >> (osb->s_clustersize_bits - 12); +} + +/* + * Tell us whether a given allocation should use the local alloc + * file. Otherwise, it has to go to the main bitmap. + */ +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) +{ + int la_bits = ocfs2_local_alloc_window_bits(osb); + + if (osb->local_alloc_state != OCFS2_LA_ENABLED) + return 0; + + /* la_bits should be at least twice the size (in clusters) of + * a new block group. We want to be sure block group + * allocations go through the local alloc, so allow an + * allocation to take up to half the bitmap. 
*/ + if (bits > (la_bits / 2)) + return 0; + + return 1; +} + +int ocfs2_load_local_alloc(struct ocfs2_super *osb) +{ + int status = 0; + struct ocfs2_dinode *alloc = NULL; + struct buffer_head *alloc_bh = NULL; + u32 num_used; + struct inode *inode = NULL; + struct ocfs2_local_alloc *la; + + mlog_entry_void(); + + /* read the alloc off disk */ + inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, + &alloc_bh, 0, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + if (!(le32_to_cpu(alloc->i_flags) & + (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) { + mlog(ML_ERROR, "Invalid local alloc inode, %"MLFu64"\n", + OCFS2_I(inode)->ip_blkno); + status = -EINVAL; + goto bail; + } + + if ((la->la_size == 0) || + (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) { + mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n", + le16_to_cpu(la->la_size)); + status = -EINVAL; + goto bail; + } + + /* do a little verification. */ + num_used = ocfs2_local_alloc_count_bits(alloc); + + /* hopefully the local alloc has always been recovered before + * we load it. */ + if (num_used + || alloc->id1.bitmap1.i_used + || alloc->id1.bitmap1.i_total + || la->la_bm_off) + mlog(ML_ERROR, "Local alloc hasn't been recovered!\n" + "found = %u, set = %u, taken = %u, off = %u\n", + num_used, le32_to_cpu(alloc->id1.bitmap1.i_used), + le32_to_cpu(alloc->id1.bitmap1.i_total), + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + + osb->local_alloc_bh = alloc_bh; + osb->local_alloc_state = OCFS2_LA_ENABLED; + +bail: + if (status < 0) + if (alloc_bh) + brelse(alloc_bh); + if (inode) + iput(inode); + + mlog_exit(status); + return status; +} + +/* + * return any unused bits to the bitmap and write out a clean + * local_alloc. 
+ * + * local_alloc_bh is optional. If not passed, we will simply use the + * one off osb. If you do pass it however, be warned that it *will* be + * returned brelse'd and NULL'd out.*/ +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_journal_handle *handle = NULL; + struct inode *local_alloc_inode = NULL; + struct buffer_head *bh = NULL; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_dinode *alloc = NULL; + + mlog_entry_void(); + + if (osb->local_alloc_state == OCFS2_LA_UNUSED) + goto bail; + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + + osb->local_alloc_state = OCFS2_LA_DISABLED; + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + ocfs2_handle_add_inode(handle, main_bm_inode); + status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* WINDOW_MOVE_CREDITS is a bit heavy... 
*/ + handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + mlog_errno(PTR_ERR(handle)); + handle = NULL; + goto bail; + } + + bh = osb->local_alloc_bh; + alloc = (struct ocfs2_dinode *) bh->b_data; + + alloc_copy = kmalloc(bh->b_size, GFP_KERNEL); + if (!alloc_copy) { + status = -ENOMEM; + goto bail; + } + memcpy(alloc_copy, alloc, bh->b_size); + + status = ocfs2_journal_access(handle, local_alloc_inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_clear_local_alloc(alloc); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + brelse(bh); + osb->local_alloc_bh = NULL; + osb->local_alloc_state = OCFS2_LA_UNUSED; + + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (main_bm_bh) + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + if (local_alloc_inode) + iput(local_alloc_inode); + + if (alloc_copy) + kfree(alloc_copy); + + mlog_exit_void(); +} + +/* + * We want to free the bitmap bits outside of any recovery context as + * we'll need a cluster lock to do so, but we must clear the local + * alloc before giving up the recovered nodes journal. 
To solve this, + * we kmalloc a copy of the local alloc before it's change for the + * caller to process with ocfs2_complete_local_alloc_recovery + */ +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int slot_num, + struct ocfs2_dinode **alloc_copy) +{ + int status = 0; + struct buffer_head *alloc_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_dinode *alloc; + + mlog_entry("(slot_num = %d)\n", slot_num); + + *alloc_copy = NULL; + + inode = ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + slot_num); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + mutex_lock(&inode->i_mutex); + + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, + &alloc_bh, 0, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + *alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL); + if (!(*alloc_copy)) { + status = -ENOMEM; + goto bail; + } + memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size); + + alloc = (struct ocfs2_dinode *) alloc_bh->b_data; + ocfs2_clear_local_alloc(alloc); + + status = ocfs2_write_block(osb, alloc_bh, inode); + if (status < 0) + mlog_errno(status); + +bail: + if ((status < 0) && (*alloc_copy)) { + kfree(*alloc_copy); + *alloc_copy = NULL; + } + + if (alloc_bh) + brelse(alloc_bh); + + if (inode) { + mutex_unlock(&inode->i_mutex); + iput(inode); + } + + mlog_exit(status); + return status; +} + +/* + * Step 2: By now, we've completed the journal recovery, we've stamped + * a clean local alloc on disk and dropped the node out of the + * recovery map. Dlm locks will no longer stall, so lets clear out the + * main bitmap. 
+ */ +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc) +{ + int status; + struct ocfs2_journal_handle *handle = NULL; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + + mlog_entry_void(); + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + main_bm_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!main_bm_inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + ocfs2_handle_add_inode(handle, main_bm_inode); + status = ocfs2_meta_lock(main_bm_inode, handle, &main_bm_bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + /* we want the bitmap change to be recorded on disk asap */ + ocfs2_handle_set_sync(handle, 1); + + status = ocfs2_sync_local_to_main(osb, handle, alloc, + main_bm_inode, main_bm_bh); + if (status < 0) + mlog_errno(status); + +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (main_bm_bh) + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + mlog_exit(status); + return status; +} + +/* + * make sure we've got at least bitswanted contiguous bits in the + * local alloc. You lose them when you drop i_mutex. + * + * We will add ourselves to the transaction passed in, but may start + * our own in order to shift windows. 
+ */ +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *passed_handle, + u32 bits_wanted, + struct ocfs2_alloc_context *ac) +{ + int status; + struct ocfs2_dinode *alloc; + struct inode *local_alloc_inode; + unsigned int free_bits; + + mlog_entry_void(); + + BUG_ON(!passed_handle); + BUG_ON(!ac); + BUG_ON(passed_handle->flags & OCFS2_HANDLE_STARTED); + + local_alloc_inode = + ocfs2_get_system_file_inode(osb, + LOCAL_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!local_alloc_inode) { + status = -ENOENT; + mlog_errno(status); + goto bail; + } + ocfs2_handle_add_inode(passed_handle, local_alloc_inode); + + if (osb->local_alloc_state != OCFS2_LA_ENABLED) { + status = -ENOSPC; + goto bail; + } + + if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { + mlog(0, "Asking for more than my max window size!\n"); + status = -ENOSPC; + goto bail; + } + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + + if (le32_to_cpu(alloc->id1.bitmap1.i_used) != + ocfs2_local_alloc_count_bits(alloc)) { + ocfs2_error(osb->sb, "local alloc inode %"MLFu64" says it has " + "%u free bits, but a count shows %u", + le64_to_cpu(alloc->i_blkno), + le32_to_cpu(alloc->id1.bitmap1.i_used), + ocfs2_local_alloc_count_bits(alloc)); + status = -EIO; + goto bail; + } + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) { + /* uhoh, window change time. 
*/ + status = + ocfs2_local_alloc_slide_window(osb, local_alloc_inode); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + ac->ac_inode = igrab(local_alloc_inode); + get_bh(osb->local_alloc_bh); + ac->ac_bh = osb->local_alloc_bh; + ac->ac_which = OCFS2_AC_USE_LOCAL; + status = 0; +bail: + if (local_alloc_inode) + iput(local_alloc_inode); + + mlog_exit(status); + return status; +} + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_bits, + u32 *bit_off, + u32 *num_bits) +{ + int status, start; + struct inode *local_alloc_inode; + u32 bits_wanted; + void *bitmap; + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + + mlog_entry_void(); + BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); + + bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; + local_alloc_inode = ac->ac_inode; + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + if (start == -1) { + /* TODO: Shouldn't we just BUG here? */ + status = -ENOSPC; + mlog_errno(status); + goto bail; + } + + bitmap = la->la_bitmap; + *bit_off = le32_to_cpu(la->la_bm_off) + start; + /* local alloc is always contiguous by nature -- we never + * delete bits from it! 
*/ + *num_bits = bits_wanted; + + status = ocfs2_journal_access(handle, local_alloc_inode, + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + while(bits_wanted--) + ocfs2_set_bit(start++, bitmap); + + alloc->id1.bitmap1.i_used = cpu_to_le32(*num_bits + + le32_to_cpu(alloc->id1.bitmap1.i_used)); + + status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc) +{ + int i; + u8 *buffer; + u32 count = 0; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + mlog_entry_void(); + + buffer = la->la_bitmap; + for (i = 0; i < le16_to_cpu(la->la_size); i++) + count += hweight8(buffer[i]); + + mlog_exit(count); + return count; +} + +static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc, + u32 numbits) +{ + int numfound, bitoff, left, startoff, lastzero; + void *bitmap = NULL; + + mlog_entry("(numbits wanted = %u)\n", numbits); + + if (!alloc->id1.bitmap1.i_total) { + mlog(0, "No bits in my window!\n"); + bitoff = -1; + goto bail; + } + + bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap; + + numfound = bitoff = startoff = 0; + lastzero = -1; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) { + if (bitoff == left) { + /* mlog(0, "bitoff (%d) == left", bitoff); */ + break; + } + /* mlog(0, "Found a zero: bitoff = %d, startoff = %d, " + "numfound = %d\n", bitoff, startoff, numfound);*/ + + /* Ok, we found a zero bit... is it contig. 
or do we + * start over?*/ + if (bitoff == startoff) { + /* we found a zero */ + numfound++; + startoff++; + } else { + /* got a zero after some ones */ + numfound = 1; + startoff = bitoff+1; + } + /* we got everything we needed */ + if (numfound == numbits) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, + numfound); + + if (numfound == numbits) + bitoff = startoff - numfound; + else + bitoff = -1; + +bail: + mlog_exit(bitoff); + return bitoff; +} + +static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc) +{ + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + int i; + mlog_entry_void(); + + alloc->id1.bitmap1.i_total = 0; + alloc->id1.bitmap1.i_used = 0; + la->la_bm_off = 0; + for(i = 0; i < le16_to_cpu(la->la_size); i++) + la->la_bitmap[i] = 0; + + mlog_exit_void(); +} + +#if 0 +/* turn this on and uncomment below to aid debugging window shifts. */ +static void ocfs2_verify_zero_bits(unsigned long *bitmap, + unsigned int start, + unsigned int count) +{ + unsigned int tmp = count; + while(tmp--) { + if (ocfs2_test_bit(start + tmp, bitmap)) { + printk("ocfs2_verify_zero_bits: start = %u, count = " + "%u\n", start, count); + printk("ocfs2_verify_zero_bits: bit %u is set!", + start + tmp); + BUG(); + } + } +} +#endif + +/* + * sync the local alloc to main bitmap. + * + * assumes you've already locked the main bitmap -- the bitmap inode + * passed is used for caching. 
+ */ +static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *alloc, + struct inode *main_bm_inode, + struct buffer_head *main_bm_bh) +{ + int status = 0; + int bit_off, left, count, start; + u64 la_start_blk; + u64 blkno; + void *bitmap; + struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); + + mlog_entry("total = %u, COUNT = %u, used = %u\n", + le32_to_cpu(alloc->id1.bitmap1.i_total), + ocfs2_local_alloc_count_bits(alloc), + le32_to_cpu(alloc->id1.bitmap1.i_used)); + + if (!alloc->id1.bitmap1.i_total) { + mlog(0, "nothing to sync!\n"); + goto bail; + } + + if (le32_to_cpu(alloc->id1.bitmap1.i_used) == + le32_to_cpu(alloc->id1.bitmap1.i_total)) { + mlog(0, "all bits were taken!\n"); + goto bail; + } + + la_start_blk = ocfs2_clusters_to_blocks(osb->sb, + le32_to_cpu(la->la_bm_off)); + bitmap = la->la_bitmap; + start = count = bit_off = 0; + left = le32_to_cpu(alloc->id1.bitmap1.i_total); + + while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start)) + != -1) { + if ((bit_off < left) && (bit_off == start)) { + count++; + start++; + continue; + } + if (count) { + blkno = la_start_blk + + ocfs2_clusters_to_blocks(osb->sb, + start - count); + + mlog(0, "freeing %u bits starting at local " + "alloc bit %u (la_start_blk = %"MLFu64", " + "blkno = %"MLFu64")\n", count, start - count, + la_start_blk, blkno); + + status = ocfs2_free_clusters(handle, main_bm_inode, + main_bm_bh, blkno, count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + if (bit_off >= left) + break; + count = 1; + start = bit_off + 1; + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac, + struct inode **bitmap_inode, + struct buffer_head **bitmap_bh) +{ + int status; + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = 
-ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_handle = handle; + (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); + + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + *bitmap_inode = (*ac)->ac_inode; + igrab(*bitmap_inode); + *bitmap_bh = (*ac)->ac_bh; + get_bh(*bitmap_bh); + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + mlog_exit(status); + return status; +} + +/* + * pass it the bitmap lock in lock_bh if you have it. + */ +static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac) +{ + int status = 0; + u32 cluster_off, cluster_count; + struct ocfs2_dinode *alloc = NULL; + struct ocfs2_local_alloc *la; + + mlog_entry_void(); + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + if (alloc->id1.bitmap1.i_total) + mlog(0, "asking me to alloc a new window over a non-empty " + "one\n"); + + mlog(0, "Allocating %u clusters for a new window.\n", + ocfs2_local_alloc_window_bits(osb)); + /* we used the generic suballoc reserve function, but we set + * everything up nicely, so there's no reason why we can't use + * the more specific cluster api to claim bits. */ + status = ocfs2_claim_clusters(osb, handle, ac, + ocfs2_local_alloc_window_bits(osb), + &cluster_off, &cluster_count); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + la->la_bm_off = cpu_to_le32(cluster_off); + alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); + /* just in case... In the future when we find space ourselves, + * we don't have to get all contiguous -- but we'll have to + * set all previously used bits in bitmap and update + * la_bits_set before setting the bits in the main bitmap. 
*/ + alloc->id1.bitmap1.i_used = 0; + memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0, + le16_to_cpu(la->la_size)); + + mlog(0, "New window allocated:\n"); + mlog(0, "window la_bm_off = %u\n", + OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); + mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total)); + +bail: + mlog_exit(status); + return status; +} + +/* Note that we do *NOT* lock the local alloc inode here as + * it's been locked already for us. */ +static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, + struct inode *local_alloc_inode) +{ + int status = 0; + struct buffer_head *main_bm_bh = NULL; + struct inode *main_bm_inode = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_dinode *alloc; + struct ocfs2_dinode *alloc_copy = NULL; + struct ocfs2_alloc_context *ac = NULL; + + mlog_entry_void(); + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* This will lock the main bitmap for us. */ + status = ocfs2_local_alloc_reserve_for_window(osb, + handle, + &ac, + &main_bm_inode, + &main_bm_bh); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_WINDOW_MOVE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + + /* We want to clear the local alloc before doing anything + * else, so that if we error later during this operation, + * local alloc shutdown won't try to double free main bitmap + * bits. Make a copy so the sync function knows which bits to + * free. 
*/ + alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL); + if (!alloc_copy) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); + + status = ocfs2_journal_access(handle, local_alloc_inode, + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_clear_local_alloc(alloc); + + status = ocfs2_journal_dirty(handle, osb->local_alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_sync_local_to_main(osb, handle, alloc_copy, + main_bm_inode, main_bm_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_local_alloc_new_window(osb, handle, ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + atomic_inc(&osb->alloc_stats.moves); + + status = 0; +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (main_bm_bh) + brelse(main_bm_bh); + + if (main_bm_inode) + iput(main_bm_inode); + + if (alloc_copy) + kfree(alloc_copy); + + if (ac) + ocfs2_free_alloc_context(ac); + + mlog_exit(status); + return status; +} + diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h new file mode 100644 index 000000000000..30f88ce14e46 --- /dev/null +++ b/fs/ocfs2/localalloc.h @@ -0,0 +1,56 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * localalloc.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_LOCALALLOC_H +#define OCFS2_LOCALALLOC_H + +int ocfs2_load_local_alloc(struct ocfs2_super *osb); + +void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); + +int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, + int node_num, + struct ocfs2_dinode **alloc_copy); + +int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, + struct ocfs2_dinode *alloc); + +int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, + u64 bits); + +struct ocfs2_alloc_context; +int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *passed_handle, + u32 bits_wanted, + struct ocfs2_alloc_context *ac); + +int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_bits, + u32 *bit_off, + u32 *num_bits); + +#endif /* OCFS2_LOCALALLOC_H */ diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c new file mode 100644 index 000000000000..843cf9ddefe8 --- /dev/null +++ b/fs/ocfs2/mmap.c @@ -0,0 +1,98 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * mmap.c + * + * Code to deal with the mess that is clustered mmap. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/signal.h> +#include <linux/rbtree.h> + +#define MLOG_MASK_PREFIX ML_FILE_IO +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "file.h" +#include "inode.h" +#include "mmap.h" + +static struct page *ocfs2_nopage(struct vm_area_struct * area, + unsigned long address, + int *type) +{ + struct inode *inode = area->vm_file->f_dentry->d_inode; + struct page *page = NOPAGE_SIGBUS; + sigset_t blocked, oldset; + int ret; + + mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address); + + /* The best way to deal with signals in this path is + * to block them upfront, rather than allowing the + * locking paths to return -ERESTARTSYS. */ + sigfillset(&blocked); + + /* We should technically never get a bad ret return + * from sigprocmask */ + ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + page = filemap_nopage(area, address, type); + + ret = sigprocmask(SIG_SETMASK, &oldset, NULL); + if (ret < 0) + mlog_errno(ret); +out: + mlog_exit_ptr(page); + return page; +} + +static struct vm_operations_struct ocfs2_file_vm_ops = { + .nopage = ocfs2_nopage, +}; + +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + /* We don't want to support shared writable mappings yet. 
*/ + if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) + && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { + mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); + /* This is -EINVAL because generic_file_readonly_mmap + * returns it in a similar situation. */ + return -EINVAL; + } + + file_accessed(file); + vma->vm_ops = &ocfs2_file_vm_ops; + return 0; +} + diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h new file mode 100644 index 000000000000..1274ee0f1fe2 --- /dev/null +++ b/fs/ocfs2/mmap.h @@ -0,0 +1,6 @@ +#ifndef OCFS2_MMAP_H +#define OCFS2_MMAP_H + +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma); + +#endif /* OCFS2_MMAP_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c new file mode 100644 index 000000000000..f6b77ff1d2bf --- /dev/null +++ b/fs/ocfs2/namei.c @@ -0,0 +1,2264 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.c + * + * Create and rename file, directory, symlinks + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * Portions of this code from linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linux Torvalds + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_NAMEI +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dcache.h" +#include "dir.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "namei.h" +#include "suballoc.h" +#include "symlink.h" +#include "sysfile.h" +#include "uptodate.h" +#include "vote.h" + +#include "buffer_head_io.h" + +#define NAMEI_RA_CHUNKS 2 +#define NAMEI_RA_BLOCKS 4 +#define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) +#define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + +static int inline ocfs2_search_dirblock(struct buffer_head *bh, + struct inode *dir, + const char *name, int namelen, + unsigned long offset, + struct ocfs2_dir_entry **res_dir); + +static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh); + +static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct buffer_head *insert_bh); + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct dentry *dentry, int mode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + struct ocfs2_journal_handle *handle, + struct inode **ret_inode, + struct ocfs2_alloc_context *inode_ac); + +static int ocfs2_fill_new_dir(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context 
*data_ac); + +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2); + +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + char *name, + struct buffer_head **de_bh); + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct ocfs2_dinode *fe, + char *name, + struct buffer_head *de_bh); + +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + const char *symname); + +static inline int ocfs2_add_entry(struct ocfs2_journal_handle *handle, + struct dentry *dentry, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct buffer_head *insert_bh) +{ + return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, + dentry->d_name.name, dentry->d_name.len, + inode, blkno, parent_fe_bh, insert_bh); +} + +/* An orphan dir name is an 8 byte value, printed as a hex string */ +#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) + +static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + int status; + u64 blkno; + struct buffer_head *dirent_bh = NULL; + struct inode *inode = NULL; + struct dentry *ret; + struct ocfs2_dir_entry *dirent; + struct ocfs2_inode_info *oi; + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, + dentry->d_name.len, dentry->d_name.name); + + if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { + ret = ERR_PTR(-ENAMETOOLONG); + goto bail; + } + + mlog(0, "find name %.*s in directory %"MLFu64"\n", dentry->d_name.len, + dentry->d_name.name, OCFS2_I(dir)->ip_blkno); + + status = ocfs2_meta_lock(dir, NULL, NULL, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + ret = ERR_PTR(status); + goto bail; + } + + status = 
ocfs2_find_files_on_disk(dentry->d_name.name, + dentry->d_name.len, &blkno, + dir, &dirent_bh, &dirent); + if (status < 0) + goto bail_add; + + inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno); + if (IS_ERR(inode)) { + mlog(ML_ERROR, "Unable to create inode %"MLFu64"\n", blkno); + ret = ERR_PTR(-EACCES); + goto bail_unlock; + } + + oi = OCFS2_I(inode); + /* Clear any orphaned state... If we were able to look up the + * inode from a directory, it certainly can't be orphaned. We + * might have the bad state from a node which intended to + * orphan this inode but crashed before it could commit the + * unlink. */ + spin_lock(&oi->ip_lock); + oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; + oi->ip_orphaned_slot = OCFS2_INVALID_SLOT; + spin_unlock(&oi->ip_lock); + +bail_add: + + dentry->d_op = &ocfs2_dentry_ops; + ret = d_splice_alias(inode, dentry); + +bail_unlock: + /* Don't drop the cluster lock until *after* the d_add -- + * unlink on another node will message us to remove that + * dentry under this lock so otherwise we can race this with + * the vote thread and have a stale dentry. 
*/ + ocfs2_meta_unlock(dir, 0); + +bail: + if (dirent_bh) + brelse(dirent_bh); + + mlog_exit_ptr(ret); + + return ret; +} + +static int ocfs2_fill_new_dir(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *parent, + struct inode *inode, + struct buffer_head *fe_bh, + struct ocfs2_alloc_context *data_ac) +{ + int status; + struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry *de = NULL; + + mlog_entry_void(); + + status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, + data_ac, NULL, &new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + status = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + memset(new_bh->b_data, 0, osb->sb->s_blocksize); + + de = (struct ocfs2_dir_entry *) new_bh->b_data; + de->inode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + de->name_len = 1; + de->rec_len = + cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + strcpy(de->name, "."); + ocfs2_set_de_type(de, S_IFDIR); + de = (struct ocfs2_dir_entry *) ((char *)de + le16_to_cpu(de->rec_len)); + de->inode = cpu_to_le64(OCFS2_I(parent)->ip_blkno); + de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize - + OCFS2_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy(de->name, ".."); + ocfs2_set_de_type(de, S_IFDIR); + + status = ocfs2_journal_dirty(handle, new_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + i_size_write(inode, inode->i_sb->s_blocksize); + inode->i_nlink = 2; + inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if (new_bh) + brelse(new_bh); + + mlog_exit(status); + return status; +} + +static int ocfs2_mknod(struct inode *dir, + struct dentry *dentry, + int mode, + dev_t dev) +{ + int status = 0; + struct buffer_head 
*parent_fe_bh = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_super *osb; + struct ocfs2_dinode *dirfe; + struct buffer_head *new_fe_bh = NULL; + struct buffer_head *de_bh = NULL; + struct inode *inode = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, + (unsigned long)dev, dentry->d_name.len, + dentry->d_name.name); + + /* get our super block */ + osb = OCFS2_SB(dir->i_sb); + + if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) { + mlog(ML_ERROR, "inode %"MLFu64" has i_nlink of %u\n", + OCFS2_I(dir)->ip_blkno, dir->i_nlink); + status = -EMLINK; + goto leave; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!dirfe->i_links_count) { + /* can't make a file in a deleted directory. */ + status = -ENOENT; + goto leave; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto leave; + + /* get a spot inside the dir. */ + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* reserve an inode spot */ + status = ocfs2_reserve_new_inode(osb, handle, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + /* are we making a directory? If so, reserve a cluster for his + * 1st extent. 
*/ + if (S_ISDIR(mode)) { + status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_MKNOD_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + /* do the real work now. */ + status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, + &new_fe_bh, parent_fe_bh, handle, + &inode, inode_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(mode)) { + status = ocfs2_fill_new_dir(osb, handle, dir, inode, + new_fe_bh, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle, dir, parent_fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + le16_add_cpu(&dirfe->i_links_count, 1); + status = ocfs2_journal_dirty(handle, parent_fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + dir->i_nlink++; + } + + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_fe_bh, + de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + insert_inode_hash(inode); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); + status = 0; +leave: + if (handle) + ocfs2_commit_trans(handle); + + if (status == -ENOSPC) + mlog(0, "Disk is full\n"); + + if (new_fe_bh) + brelse(new_fe_bh); + + if (de_bh) + brelse(de_bh); + + if (parent_fe_bh) + brelse(parent_fe_bh); + + if ((status < 0) && inode) + iput(inode); + + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + mlog_exit(status); + + return status; +} + +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct dentry *dentry, int mode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + struct ocfs2_journal_handle *handle, + struct 
inode **ret_inode, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_extent_list *fel; + u64 fe_blkno = 0; + u16 suballoc_bit; + struct inode *inode = NULL; + + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, + (unsigned long)dev, dentry->d_name.len, + dentry->d_name.name); + + *new_fe_bh = NULL; + *ret_inode = NULL; + + status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, + &fe_blkno); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + inode = new_inode(dir->i_sb); + if (IS_ERR(inode)) { + status = PTR_ERR(inode); + mlog(ML_ERROR, "new_inode failed!\n"); + goto leave; + } + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. */ + inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); + OCFS2_I(inode)->ip_blkno = fe_blkno; + if (S_ISDIR(mode)) + inode->i_nlink = 2; + else + inode->i_nlink = 1; + inode->i_mode = mode; + spin_lock(&osb->osb_lock); + inode->i_generation = osb->s_next_generation++; + spin_unlock(&osb->osb_lock); + + *new_fe_bh = sb_getblk(osb->sb, fe_blkno); + if (!*new_fe_bh) { + status = -EIO; + mlog_errno(status); + goto leave; + } + ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); + + status = ocfs2_journal_access(handle, inode, *new_fe_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) (*new_fe_bh)->b_data; + memset(fe, 0, osb->sb->s_blocksize); + + fe->i_generation = cpu_to_le32(inode->i_generation); + fe->i_fs_generation = cpu_to_le32(osb->fs_generation); + fe->i_blkno = cpu_to_le64(fe_blkno); + fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); + fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); + fe->i_uid = cpu_to_le32(current->fsuid); + if (dir->i_mode & S_ISGID) { + fe->i_gid = cpu_to_le32(dir->i_gid); + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + fe->i_gid = 
cpu_to_le32(current->fsgid); + fe->i_mode = cpu_to_le16(mode); + if (S_ISCHR(mode) || S_ISBLK(mode)) + fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); + + fe->i_links_count = cpu_to_le16(inode->i_nlink); + + fe->i_last_eb_blk = 0; + strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); + le32_add_cpu(&fe->i_flags, OCFS2_VALID_FL); + fe->i_atime = fe->i_ctime = fe->i_mtime = + cpu_to_le64(CURRENT_TIME.tv_sec); + fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = + cpu_to_le32(CURRENT_TIME.tv_nsec); + fe->i_dtime = 0; + + fel = &fe->id2.i_list; + fel->l_tree_depth = 0; + fel->l_next_free_rec = 0; + fel->l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(osb->sb)); + + status = ocfs2_journal_dirty(handle, *new_fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (ocfs2_populate_inode(inode, fe, 1) < 0) { + mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, " + "i_blkno=%"MLFu64", i_ino=%lu\n", + (unsigned long long) (*new_fe_bh)->b_blocknr, + fe->i_blkno, inode->i_ino); + BUG(); + } + + ocfs2_inode_set_new(osb, inode); + status = ocfs2_create_new_inode_locks(inode); + if (status < 0) + mlog_errno(status); + + status = 0; /* error in ocfs2_create_new_inode_locks is not + * critical */ + + *ret_inode = inode; +leave: + if (status < 0) { + if (*new_fe_bh) { + brelse(*new_fe_bh); + *new_fe_bh = NULL; + } + if (inode) + iput(inode); + } + + mlog_exit(status); + return status; +} + +static int ocfs2_mkdir(struct inode *dir, + struct dentry *dentry, + int mode) +{ + int ret; + + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, + dentry->d_name.len, dentry->d_name.name); + ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); + mlog_exit(ret); + + return ret; +} + +static int ocfs2_create(struct inode *dir, + struct dentry *dentry, + int mode, + struct nameidata *nd) +{ + int ret; + + mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, + dentry->d_name.len, dentry->d_name.name); + ret = ocfs2_mknod(dir, dentry, mode | 
S_IFREG, 0); + mlog_exit(ret); + + return ret; +} + +static int ocfs2_link(struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + struct ocfs2_journal_handle *handle = NULL; + struct inode *inode = old_dentry->d_inode; + int err; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct buffer_head *de_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + + mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, + old_dentry->d_name.len, old_dentry->d_name.name, + dentry->d_name.len, dentry->d_name.name); + + if (S_ISDIR(inode->i_mode)) { + err = -EPERM; + goto bail; + } + + if (inode->i_nlink >= OCFS2_LINK_MAX) { + err = -EMLINK; + goto bail; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + err = -ENOMEM; + goto bail; + } + + err = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (err) + goto bail; + + err = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &de_bh); + if (err < 0) { + mlog_errno(err); + goto bail; + } + + err = ocfs2_meta_lock(inode, handle, &fe_bh, 1); + if (err < 0) { + if (err != -ENOENT) + mlog_errno(err); + goto bail; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + if (le16_to_cpu(fe->i_links_count) >= OCFS2_LINK_MAX) { + err = -EMLINK; + goto bail; + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_LINK_CREDITS); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + mlog_errno(err); + goto bail; + } + + err = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (err < 0) { + mlog_errno(err); + goto bail; + } + + inode->i_nlink++; + inode->i_ctime = CURRENT_TIME; + fe->i_links_count = cpu_to_le16(inode->i_nlink); + fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + 
fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + err = ocfs2_journal_dirty(handle, fe_bh); + if (err < 0) { + le16_add_cpu(&fe->i_links_count, -1); + inode->i_nlink--; + mlog_errno(err); + goto bail; + } + + err = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, + parent_fe_bh, de_bh); + if (err) { + le16_add_cpu(&fe->i_links_count, -1); + inode->i_nlink--; + mlog_errno(err); + goto bail; + } + + atomic_inc(&inode->i_count); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); +bail: + if (handle) + ocfs2_commit_trans(handle); + if (de_bh) + brelse(de_bh); + if (fe_bh) + brelse(fe_bh); + if (parent_fe_bh) + brelse(parent_fe_bh); + + mlog_exit(err); + + return err; +} + +static int ocfs2_unlink(struct inode *dir, + struct dentry *dentry) +{ + int status; + unsigned int saved_nlink = 0; + struct inode *inode = dentry->d_inode; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + u64 blkno; + struct ocfs2_dinode *fe = NULL; + struct buffer_head *fe_bh = NULL; + struct buffer_head *parent_node_bh = NULL; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_dir_entry *dirent = NULL; + struct buffer_head *dirent_bh = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *orphan_entry_bh = NULL; + + mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, + dentry->d_name.len, dentry->d_name.name); + + BUG_ON(dentry->d_parent->d_inode != dir); + + mlog(0, "ino = %"MLFu64"\n", OCFS2_I(inode)->ip_blkno); + + if (inode == osb->root_inode) { + mlog(0, "Cannot delete the root directory\n"); + status = -EPERM; + goto leave; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + status = ocfs2_meta_lock(dir, handle, &parent_node_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + status = ocfs2_find_files_on_disk(dentry->d_name.name, + dentry->d_name.len, &blkno, + dir, &dirent_bh, &dirent); + if 
(status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + if (OCFS2_I(inode)->ip_blkno != blkno) { + status = -ENOENT; + + mlog(0, "ip_blkno (%"MLFu64") != dirent blkno (%"MLFu64") " + "ip_flags = %x\n", OCFS2_I(inode)->ip_blkno, blkno, + OCFS2_I(inode)->ip_flags); + goto leave; + } + + status = ocfs2_meta_lock(inode, handle, &fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(inode->i_mode)) { + if (!ocfs2_empty_dir(inode)) { + status = -ENOTEMPTY; + goto leave; + } else if (inode->i_nlink != 2) { + status = -ENOTEMPTY; + goto leave; + } + } + + /* There are still a few steps left until we can consider the + * unlink to have succeeded. Save off nlink here before + * modification so we can set it back in case we hit an issue + * before commit. */ + saved_nlink = inode->i_nlink; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 0; + else + inode->i_nlink--; + + status = ocfs2_request_unlink_vote(inode, dentry, + (unsigned int) inode->i_nlink); + if (status < 0) { + /* This vote should succeed under all normal + * circumstances. 
*/ + mlog_errno(status); + goto leave; + } + + if (!inode->i_nlink) { + status = ocfs2_prepare_orphan_dir(osb, handle, inode, + orphan_name, + &orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_UNLINK_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + fe = (struct ocfs2_dinode *) fe_bh->b_data; + + if (!inode->i_nlink) { + status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, + orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + + /* delete the name from the parent dir */ + status = ocfs2_delete_entry(handle, dir, dirent, dirent_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* We can set nlink on the dinode now. clear the saved version + * so that it doesn't get set later. */ + fe->i_links_count = cpu_to_le16(inode->i_nlink); + saved_nlink = 0; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (S_ISDIR(inode->i_mode)) { + dir->i_nlink--; + status = ocfs2_mark_inode_dirty(handle, dir, + parent_node_bh); + if (status < 0) { + mlog_errno(status); + dir->i_nlink++; + } + } + +leave: + if (status < 0 && saved_nlink) + inode->i_nlink = saved_nlink; + + if (handle) + ocfs2_commit_trans(handle); + + if (fe_bh) + brelse(fe_bh); + + if (dirent_bh) + brelse(dirent_bh); + + if (parent_node_bh) + brelse(parent_node_bh); + + if (orphan_entry_bh) + brelse(orphan_entry_bh); + + mlog_exit(status); + + return status; +} + +/* + * The only place this should be used is rename! + * if they have the same id, then the 1st one is the only one locked. 
+ */ +static int ocfs2_double_lock(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct buffer_head **bh1, + struct inode *inode1, + struct buffer_head **bh2, + struct inode *inode2) +{ + int status; + struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); + struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); + struct buffer_head **tmpbh; + struct inode *tmpinode; + + mlog_entry("(inode1 = %"MLFu64", inode2 = %"MLFu64")\n", + oi1->ip_blkno, oi2->ip_blkno); + + BUG_ON(!handle); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* we always want to lock the one with the lower lockid first. */ + if (oi1->ip_blkno != oi2->ip_blkno) { + if (oi1->ip_blkno < oi2->ip_blkno) { + /* switch id1 and id2 around */ + mlog(0, "switching them around...\n"); + tmpbh = bh2; + bh2 = bh1; + bh1 = tmpbh; + + tmpinode = inode2; + inode2 = inode1; + inode1 = tmpinode; + } + /* lock id2 */ + status = ocfs2_meta_lock(inode2, handle, bh2, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + } + /* lock id1 */ + status = ocfs2_meta_lock(inode1, handle, bh1, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } +bail: + mlog_exit(status); + return status; +} + +#define PARENT_INO(buffer) \ + ((struct ocfs2_dir_entry *) \ + ((char *)buffer + \ + le16_to_cpu(((struct ocfs2_dir_entry *)buffer)->rec_len)))->inode + +static int ocfs2_rename(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + int status = 0, rename_lock = 0; + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct ocfs2_dinode *newfe = NULL; + char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; + struct buffer_head *orphan_entry_bh = NULL; + struct buffer_head *newfe_bh = NULL; + struct buffer_head *insert_entry_bh = NULL; + struct ocfs2_super *osb = NULL; + u64 newfe_blkno; + struct ocfs2_journal_handle *handle = NULL; + struct buffer_head 
*old_dir_bh = NULL; + struct buffer_head *new_dir_bh = NULL; + struct ocfs2_dir_entry *old_de = NULL, *new_de = NULL; // dirent for old_dentry + // and new_dentry + struct buffer_head *new_de_bh = NULL, *old_de_bh = NULL; // bhs for above + struct buffer_head *old_inode_de_bh = NULL; // if old_dentry is a dir, + // this is the 1st dirent bh + nlink_t old_dir_nlink = old_dir->i_nlink, new_dir_nlink = new_dir->i_nlink; + unsigned int links_count; + + /* At some point it might be nice to break this function up a + * bit. */ + + mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n", + old_dir, old_dentry, new_dir, new_dentry, + old_dentry->d_name.len, old_dentry->d_name.name, + new_dentry->d_name.len, new_dentry->d_name.name); + + osb = OCFS2_SB(old_dir->i_sb); + + if (new_inode) { + if (!igrab(new_inode)) + BUG(); + } + + if (atomic_read(&old_dentry->d_count) > 2) { + shrink_dcache_parent(old_dentry); + if (atomic_read(&old_dentry->d_count) > 2) { + status = -EBUSY; + goto bail; + } + } + + /* Assume a directory hierarchy thusly: + * a/b/c + * a/d + * a,b,c, and d are all directories. + * + * from cwd of 'a' on both nodes: + * node1: mv b/c d + * node2: mv d b/c + * + * And that's why, just like the VFS, we need a file system + * rename lock. */ + if (old_dentry != new_dentry) { + status = ocfs2_rename_lock(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + rename_lock = 1; + } + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* if old and new are the same, this'll just do one lock.
*/ + status = ocfs2_double_lock(osb, handle, + &old_dir_bh, old_dir, + &new_dir_bh, new_dir); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* make sure both dirs have bhs + * get an extra ref on old_dir_bh if old==new */ + if (!new_dir_bh) { + if (old_dir_bh) { + new_dir_bh = old_dir_bh; + get_bh(new_dir_bh); + } else { + mlog(ML_ERROR, "no old_dir_bh!\n"); + status = -EIO; + goto bail; + } + } + + if (S_ISDIR(old_inode->i_mode)) { + /* Directories actually require metadata updates to + * the directory info so we can't get away with not + * doing node locking on it. */ + status = ocfs2_meta_lock(old_inode, handle, NULL, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + status = ocfs2_request_rename_vote(old_inode, old_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = -EIO; + old_inode_de_bh = ocfs2_bread(old_inode, 0, &status, 0); + if (!old_inode_de_bh) + goto bail; + + status = -EIO; + if (le64_to_cpu(PARENT_INO(old_inode_de_bh->b_data)) != + OCFS2_I(old_dir)->ip_blkno) + goto bail; + status = -EMLINK; + if (!new_inode && new_dir!=old_dir && + new_dir->i_nlink >= OCFS2_LINK_MAX) + goto bail; + } else { + /* Ah, the simple case - we're a file so just send a + * message. */ + status = ocfs2_request_rename_vote(old_inode, old_dentry); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = -ENOENT; + old_de_bh = ocfs2_find_entry(old_dentry->d_name.name, + old_dentry->d_name.len, + old_dir, &old_de); + if (!old_de_bh) + goto bail; + + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. 
Goodbye sticky bit ;-< + */ + if (le64_to_cpu(old_de->inode) != OCFS2_I(old_inode)->ip_blkno) + goto bail; + + /* check if the target already exists (in which case we need + * to delete it */ + status = ocfs2_find_files_on_disk(new_dentry->d_name.name, + new_dentry->d_name.len, + &newfe_blkno, new_dir, &new_de_bh, + &new_de); + /* The only error we allow here is -ENOENT because the new + * file not existing is perfectly valid. */ + if ((status < 0) && (status != -ENOENT)) { + /* If we cannot find the file specified we should just */ + /* return the error... */ + mlog_errno(status); + goto bail; + } + + if (!new_de && new_inode) + mlog(ML_ERROR, "inode %lu does not exist in it's parent " + "directory!", new_inode->i_ino); + + /* In case we need to overwrite an existing file, we blow it + * away first */ + if (new_de) { + /* VFS didn't think there existed an inode here, but + * someone else in the cluster must have raced our + * rename to create one. Today we error cleanly, in + * the future we should consider calling iget to build + * a new struct inode for this entry. */ + if (!new_inode) { + status = -EACCES; + + mlog(0, "We found an inode for name %.*s but VFS " + "didn't give us one.\n", new_dentry->d_name.len, + new_dentry->d_name.name); + goto bail; + } + + if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { + status = -EACCES; + + mlog(0, "Inode blkno (%"MLFu64") and dir (%"MLFu64") " + "disagree. 
ip_flags = %x\n", + OCFS2_I(new_inode)->ip_blkno, newfe_blkno, + OCFS2_I(new_inode)->ip_flags); + goto bail; + } + + status = ocfs2_meta_lock(new_inode, handle, &newfe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode)) + links_count = 0; + else + links_count = (unsigned int) (new_inode->i_nlink - 1); + + status = ocfs2_request_unlink_vote(new_inode, new_dentry, + links_count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + newfe = (struct ocfs2_dinode *) newfe_bh->b_data; + + mlog(0, "aha rename over existing... new_de=%p " + "new_blkno=%"MLFu64" newfebh=%p bhblocknr=%llu\n", + new_de, newfe_blkno, newfe_bh, newfe_bh ? + (unsigned long long)newfe_bh->b_blocknr : 0ULL); + + if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { + status = ocfs2_prepare_orphan_dir(osb, handle, + new_inode, + orphan_name, + &orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + } else { + BUG_ON(new_dentry->d_parent->d_inode != new_dir); + + status = ocfs2_check_dir_for_entry(new_dir, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, new_dir, new_dir_bh, + new_dentry->d_name.name, + new_dentry->d_name.len, + &insert_entry_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + handle = ocfs2_start_trans(osb, handle, OCFS2_RENAME_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + if (new_de) { + if (S_ISDIR(new_inode->i_mode)) { + if (!ocfs2_empty_dir(new_inode) || + new_inode->i_nlink != 2) { + status = -ENOTEMPTY; + goto bail; + } + } + status = ocfs2_journal_access(handle, new_inode, newfe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode) || + (newfe->i_links_count == cpu_to_le16(1))){ + status = 
ocfs2_orphan_add(osb, handle, new_inode, + newfe, orphan_name, + orphan_entry_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* change the dirent to point to the correct inode */ + status = ocfs2_journal_access(handle, new_dir, new_de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + new_de->inode = cpu_to_le64(OCFS2_I(old_inode)->ip_blkno); + new_de->file_type = old_de->file_type; + new_dir->i_version++; + status = ocfs2_journal_dirty(handle, new_de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (S_ISDIR(new_inode->i_mode)) + newfe->i_links_count = 0; + else + le16_add_cpu(&newfe->i_links_count, -1); + + status = ocfs2_journal_dirty(handle, newfe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } else { + /* if the name was not found in new_dir, add it now */ + status = ocfs2_add_entry(handle, new_dentry, old_inode, + OCFS2_I(old_inode)->ip_blkno, + new_dir_bh, insert_entry_bh); + } + + old_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_inode); + + /* now that the name has been added to new_dir, remove the old name */ + status = ocfs2_delete_entry(handle, old_dir, old_de, old_de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (new_inode) { + new_inode->i_nlink--; + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + if (old_inode_de_bh) { + status = ocfs2_journal_access(handle, old_inode, + old_inode_de_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + PARENT_INO(old_inode_de_bh->b_data) = + cpu_to_le64(OCFS2_I(new_dir)->ip_blkno); + status = ocfs2_journal_dirty(handle, old_inode_de_bh); + old_dir->i_nlink--; + if (new_inode) { + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; + mark_inode_dirty(new_dir); + } + } + mark_inode_dirty(old_dir); + if (new_inode) + mark_inode_dirty(new_inode); + + if (old_dir != new_dir) + if (new_dir_nlink != new_dir->i_nlink) { + if (!new_dir_bh) { + 
mlog(ML_ERROR, "need to change nlink for new " + "dir %"MLFu64" from %d to %d but bh is " + "NULL\n", OCFS2_I(new_dir)->ip_blkno, + (int)new_dir_nlink, new_dir->i_nlink); + } else { + struct ocfs2_dinode *fe; + status = ocfs2_journal_access(handle, + new_dir, + new_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + fe = (struct ocfs2_dinode *) new_dir_bh->b_data; + fe->i_links_count = cpu_to_le16(new_dir->i_nlink); + status = ocfs2_journal_dirty(handle, new_dir_bh); + } + } + + if (old_dir_nlink != old_dir->i_nlink) { + if (!old_dir_bh) { + mlog(ML_ERROR, "need to change nlink for old dir " + "%"MLFu64" from %d to %d but bh is NULL!\n", + OCFS2_I(old_dir)->ip_blkno, + (int)old_dir_nlink, + old_dir->i_nlink); + } else { + struct ocfs2_dinode *fe; + status = ocfs2_journal_access(handle, old_dir, + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + fe = (struct ocfs2_dinode *) old_dir_bh->b_data; + fe->i_links_count = cpu_to_le16(old_dir->i_nlink); + status = ocfs2_journal_dirty(handle, old_dir_bh); + } + } + + status = 0; +bail: + if (rename_lock) + ocfs2_rename_unlock(osb); + + if (handle) + ocfs2_commit_trans(handle); + + if (new_inode) + sync_mapping_buffers(old_inode->i_mapping); + + if (new_inode) + iput(new_inode); + if (newfe_bh) + brelse(newfe_bh); + if (old_dir_bh) + brelse(old_dir_bh); + if (new_dir_bh) + brelse(new_dir_bh); + if (new_de_bh) + brelse(new_de_bh); + if (old_de_bh) + brelse(old_de_bh); + if (old_inode_de_bh) + brelse(old_inode_de_bh); + if (orphan_entry_bh) + brelse(orphan_entry_bh); + if (insert_entry_bh) + brelse(insert_entry_bh); + + mlog_exit(status); + + return status; +} + +/* + * we expect i_size = strlen(symname). Copy symname into the file + * data, including the null terminator. 
+ */ +static int ocfs2_create_symlink_data(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + const char *symname) +{ + struct buffer_head **bhs = NULL; + const char *c; + struct super_block *sb = osb->sb; + u64 p_blkno; + int p_blocks; + int virtual, blocks, status, i, bytes_left; + + bytes_left = i_size_read(inode) + 1; + /* we can't trust i_blocks because we're actually going to + * write i_size + 1 bytes. */ + blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; + + mlog_entry("i_blocks = %lu, i_size = %llu, blocks = %d\n", + inode->i_blocks, i_size_read(inode), blocks); + + /* Sanity check -- make sure we're going to fit. */ + if (bytes_left > + ocfs2_clusters_to_bytes(sb, OCFS2_I(inode)->ip_clusters)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + bhs = kcalloc(blocks, sizeof(struct buffer_head *), GFP_KERNEL); + if (!bhs) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, + &p_blocks); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* links can never be larger than one cluster so we know this + * is all going to be contiguous, but do a sanity check + * anyway. */ + if ((p_blocks << sb->s_blocksize_bits) < bytes_left) { + status = -EIO; + mlog_errno(status); + goto bail; + } + + virtual = 0; + while(bytes_left > 0) { + c = &symname[virtual * sb->s_blocksize]; + + bhs[virtual] = sb_getblk(sb, p_blkno); + if (!bhs[virtual]) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[virtual]); + + status = ocfs2_journal_access(handle, inode, bhs[virtual], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bhs[virtual]->b_data, 0, sb->s_blocksize); + + memcpy(bhs[virtual]->b_data, c, + (bytes_left > sb->s_blocksize) ? 
sb->s_blocksize : + bytes_left); + + status = ocfs2_journal_dirty(handle, bhs[virtual]); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + virtual++; + p_blkno++; + bytes_left -= sb->s_blocksize; + } + + status = 0; +bail: + + if (bhs) { + for(i = 0; i < blocks; i++) + if (bhs[i]) + brelse(bhs[i]); + kfree(bhs); + } + + mlog_exit(status); + return status; +} + +static int ocfs2_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + int status, l, credits; + u64 newsize; + struct ocfs2_super *osb = NULL; + struct inode *inode = NULL; + struct super_block *sb; + struct buffer_head *new_fe_bh = NULL; + struct buffer_head *de_bh = NULL; + struct buffer_head *parent_fe_bh = NULL; + struct ocfs2_dinode *fe = NULL; + struct ocfs2_dinode *dirfe; + struct ocfs2_journal_handle *handle = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + + mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, + dentry, symname, dentry->d_name.len, dentry->d_name.name); + + sb = dir->i_sb; + osb = OCFS2_SB(sb); + + l = strlen(symname) + 1; + + credits = ocfs2_calc_symlink_credits(sb); + + handle = ocfs2_alloc_handle(osb); + if (handle == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* lock the parent directory */ + status = ocfs2_meta_lock(dir, handle, &parent_fe_bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto bail; + } + + dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + if (!dirfe->i_links_count) { + /* can't make a file in a deleted directory. 
*/ + status = -ENOENT; + goto bail; + } + + status = ocfs2_check_dir_for_entry(dir, dentry->d_name.name, + dentry->d_name.len); + if (status) + goto bail; + + status = ocfs2_prepare_dir_for_insert(osb, dir, parent_fe_bh, + dentry->d_name.name, + dentry->d_name.len, &de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_reserve_new_inode(osb, handle, &inode_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + /* don't reserve bitmap space for fast symlinks. */ + if (l > ocfs2_fast_symlink_chars(sb)) { + status = ocfs2_reserve_clusters(osb, handle, 1, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_mknod_locked(osb, dir, dentry, + S_IFLNK | S_IRWXUGO, 0, + &new_fe_bh, parent_fe_bh, handle, + &inode, inode_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) new_fe_bh->b_data; + inode->i_rdev = 0; + newsize = l - 1; + if (l > ocfs2_fast_symlink_chars(sb)) { + inode->i_op = &ocfs2_symlink_inode_operations; + status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, + handle, data_ac, NULL, + NULL); + if (status < 0) { + if (status != -ENOSPC && status != -EINTR) { + mlog(ML_ERROR, "Failed to extend file to " + "%"MLFu64"\n", + newsize); + mlog_errno(status); + status = -ENOSPC; + } + goto bail; + } + i_size_write(inode, newsize); + inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); + } else { + inode->i_op = &ocfs2_fast_symlink_inode_operations; + memcpy((char *) fe->id2.i_symlink, symname, l); + i_size_write(inode, newsize); + inode->i_blocks = 0; + } + + status = ocfs2_mark_inode_dirty(handle, inode, new_fe_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (!ocfs2_inode_is_fast_symlink(inode)) { 
+ status = ocfs2_create_symlink_data(osb, handle, inode, + symname); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + status = ocfs2_add_entry(handle, dentry, inode, + le64_to_cpu(fe->i_blkno), parent_fe_bh, + de_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + insert_inode_hash(inode); + dentry->d_op = &ocfs2_dentry_ops; + d_instantiate(dentry, inode); +bail: + if (handle) + ocfs2_commit_trans(handle); + if (new_fe_bh) + brelse(new_fe_bh); + if (parent_fe_bh) + brelse(parent_fe_bh); + if (de_bh) + brelse(de_bh); + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if ((status < 0) && inode) + iput(inode); + + mlog_exit(status); + + return status; +} + +int ocfs2_check_dir_entry(struct inode * dir, + struct ocfs2_dir_entry * de, + struct buffer_head * bh, + unsigned long offset) +{ + const char *error_msg = NULL; + const int rlen = le16_to_cpu(de->rec_len); + + if (rlen < OCFS2_DIR_REC_LEN(1)) + error_msg = "rec_len is smaller than minimal"; + else if (rlen % 4 != 0) + error_msg = "rec_len % 4 != 0"; + else if (rlen < OCFS2_DIR_REC_LEN(de->name_len)) + error_msg = "rec_len is too small for name_len"; + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + error_msg = "directory entry across blocks"; + + if (error_msg != NULL) + mlog(ML_ERROR, "bad entry in directory #%"MLFu64": %s - " + "offset=%lu, inode=%"MLFu64", rec_len=%d, name_len=%d\n", + OCFS2_I(dir)->ip_blkno, error_msg, offset, + le64_to_cpu(de->inode), rlen, de->name_len); + return error_msg == NULL ? 1 : 0; +} + +/* we don't always have a dentry for what we want to add, so people + * like orphan dir can call this instead. + * + * If you pass me insert_bh, I'll skip the search of the other dir + * blocks and put the record in there. 
+ */ +static int __ocfs2_add_entry(struct ocfs2_journal_handle *handle, + struct inode *dir, + const char *name, int namelen, + struct inode *inode, u64 blkno, + struct buffer_head *parent_fe_bh, + struct buffer_head *insert_bh) +{ + unsigned long offset; + unsigned short rec_len; + struct ocfs2_dir_entry *de, *de1; + struct super_block *sb; + int retval, status; + + mlog_entry_void(); + + sb = dir->i_sb; + + if (!namelen) + return -EINVAL; + + rec_len = OCFS2_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ocfs2_dir_entry *) insert_bh->b_data; + while (1) { + BUG_ON((char *)de >= sb->s_blocksize + insert_bh->b_data); + /* These checks should've already been passed by the + * prepare function, but I guess we can leave them + * here anyway. */ + if (!ocfs2_check_dir_entry(dir, de, insert_bh, offset)) { + retval = -ENOENT; + goto bail; + } + if (ocfs2_match(namelen, name, de)) { + retval = -EEXIST; + goto bail; + } + if (((le64_to_cpu(de->inode) == 0) && + (le16_to_cpu(de->rec_len) >= rec_len)) || + (le16_to_cpu(de->rec_len) >= + (OCFS2_DIR_REC_LEN(de->name_len) + rec_len))) { + status = ocfs2_journal_access(handle, dir, insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + /* By now the buffer is marked for journaling */ + offset += le16_to_cpu(de->rec_len); + if (le64_to_cpu(de->inode)) { + de1 = (struct ocfs2_dir_entry *)((char *) de + + OCFS2_DIR_REC_LEN(de->name_len)); + de1->rec_len = + cpu_to_le16(le16_to_cpu(de->rec_len) - + OCFS2_DIR_REC_LEN(de->name_len)); + de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); + de = de1; + } + de->file_type = OCFS2_FT_UNKNOWN; + if (blkno) { + de->inode = cpu_to_le64(blkno); + ocfs2_set_de_type(de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy(de->name, name, namelen); + + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_version++; + status = ocfs2_journal_dirty(handle, insert_bh); + retval = 0; + goto bail; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ocfs2_dir_entry *) 
((char *) de + le16_to_cpu(de->rec_len)); + } + + /* when you think about it, the assert above should prevent us + * from ever getting here. */ + retval = -ENOSPC; +bail: + + mlog_exit(retval); + return retval; +} + + +/* + * ocfs2_delete_entry deletes a directory entry by merging it with the + * previous entry + */ +static int ocfs2_delete_entry(struct ocfs2_journal_handle *handle, + struct inode *dir, + struct ocfs2_dir_entry *de_del, + struct buffer_head *bh) +{ + struct ocfs2_dir_entry *de, *pde; + int i, status = -ENOENT; + + mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); + + i = 0; + pde = NULL; + de = (struct ocfs2_dir_entry *) bh->b_data; + while (i < bh->b_size) { + if (!ocfs2_check_dir_entry(dir, de, bh, i)) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (de == de_del) { + status = ocfs2_journal_access(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + status = -EIO; + mlog_errno(status); + goto bail; + } + if (pde) + pde->rec_len = + cpu_to_le16(le16_to_cpu(pde->rec_len) + + le16_to_cpu(de->rec_len)); + else + de->inode = 0; + dir->i_version++; + status = ocfs2_journal_dirty(handle, bh); + goto bail; + } + i += le16_to_cpu(de->rec_len); + pde = de; + de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); + } +bail: + mlog_exit(status); + return status; +} + +/* + * Returns 0 if not found, -1 on failure, and 1 on success + */ +static int inline ocfs2_search_dirblock(struct buffer_head *bh, + struct inode *dir, + const char *name, int namelen, + unsigned long offset, + struct ocfs2_dir_entry **res_dir) +{ + struct ocfs2_dir_entry *de; + char *dlimit, *de_buf; + int de_len; + int ret = 0; + + mlog_entry_void(); + + de_buf = bh->b_data; + dlimit = de_buf + dir->i_sb->s_blocksize; + + while (de_buf < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + de = (struct ocfs2_dir_entry *) de_buf; + + if (de_buf + namelen <= dlimit && + 
ocfs2_match(namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ocfs2_check_dir_entry(dir, de, bh, offset)) { + ret = -1; + goto bail; + } + *res_dir = de; + ret = 1; + goto bail; + } + + /* prevent looping on a bad block */ + de_len = le16_to_cpu(de->rec_len); + if (de_len <= 0) { + ret = -1; + goto bail; + } + + de_buf += de_len; + offset += de_len; + } + +bail: + mlog_exit(ret); + return ret; +} + +struct buffer_head *ocfs2_find_entry(const char *name, int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir) +{ + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; + unsigned long start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + + mlog_entry_void(); + + *res_dir = NULL; + sb = dir->i_sb; + + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + start = OCFS2_I(dir)->ip_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; + +restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. + */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + + /* XXX: questionable readahead stuff here */ + bh = ocfs2_bread(dir, b++, &err, 1); + bh_use[ra_max] = bh; +#if 0 // ??? 
+ if (bh) + ll_rw_block(READ, 1, &bh); +#endif + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + brelse(bh); + goto next; + } + i = ocfs2_search_dirblock(bh, dir, name, namelen, + block << sb->s_blocksize_bits, + res_dir); + if (i == 1) { + OCFS2_I(dir)->ip_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = i_size_read(dir) >> sb->s_blocksize_bits; + if (block < nblocks) { + start = 0; + goto restart; + } + +cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse(bh_use[ra_ptr]); + + mlog_exit_ptr(ret); + return ret; +} + +static int ocfs2_blkno_stringify(u64 blkno, char *name) +{ + int status, namelen; + + mlog_entry_void(); + + namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016"MLFx64, + blkno); + if (namelen <= 0) { + if (namelen) + status = namelen; + else + status = -EINVAL; + mlog_errno(status); + goto bail; + } + if (namelen != OCFS2_ORPHAN_NAMELEN) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name, + namelen); + + status = 0; +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + char *name, + struct buffer_head **de_bh) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int status = 0; + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + orphan_dir_inode = 
ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto leave; + } + + ocfs2_handle_add_inode(handle, orphan_dir_inode); + status = ocfs2_meta_lock(orphan_dir_inode, handle, &orphan_dir_bh, 1); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + OCFS2_ORPHAN_NAMELEN, de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + +leave: + if (orphan_dir_inode) + iput(orphan_dir_inode); + + if (orphan_dir_bh) + brelse(orphan_dir_bh); + + mlog_exit(status); + return status; +} + +static int ocfs2_orphan_add(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *inode, + struct ocfs2_dinode *fe, + char *name, + struct buffer_head *de_bh) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int status = 0; + struct ocfs2_dinode *orphan_fe; + + mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); + + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + osb->slot_num); + if (!orphan_dir_inode) { + status = -ENOENT; + mlog_errno(status); + goto leave; + } + + status = ocfs2_read_block(osb, + OCFS2_I(orphan_dir_inode)->ip_blkno, + &orphan_dir_bh, OCFS2_BH_CACHED, + orphan_dir_inode); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* we're a cluster, and nlink can change on disk from + * underneath us... 
*/ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + le16_add_cpu(&orphan_fe->i_links_count, 1); + orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); + + status = ocfs2_journal_dirty(handle, orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = __ocfs2_add_entry(handle, orphan_dir_inode, name, + OCFS2_ORPHAN_NAMELEN, inode, + OCFS2_I(inode)->ip_blkno, + orphan_dir_bh, de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); + + /* Record which orphan dir our inode now resides + * in. delete_inode will use this to determine which orphan + * dir to lock. */ + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num; + spin_unlock(&OCFS2_I(inode)->ip_lock); + + mlog(0, "Inode %"MLFu64" orphaned in slot %d\n", + OCFS2_I(inode)->ip_blkno, osb->slot_num); + +leave: + if (orphan_dir_inode) + iput(orphan_dir_inode); + + if (orphan_dir_bh) + brelse(orphan_dir_bh); + + mlog_exit(status); + return status; +} + +/* unlike orphan_add, we expect the orphan dir to already be locked here. 
*/ +int ocfs2_orphan_del(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh) +{ + char name[OCFS2_ORPHAN_NAMELEN + 1]; + struct ocfs2_dinode *orphan_fe; + int status = 0; + struct buffer_head *target_de_bh = NULL; + struct ocfs2_dir_entry *target_de = NULL; + + mlog_entry_void(); + + status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + mlog(0, "removing '%s' from orphan dir %"MLFu64" (namelen=%d)\n", + name, OCFS2_I(orphan_dir_inode)->ip_blkno, OCFS2_ORPHAN_NAMELEN); + + /* find its spot in the orphan directory */ + target_de_bh = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, + orphan_dir_inode, &target_de); + if (!target_de_bh) { + status = -ENOENT; + mlog_errno(status); + goto leave; + } + + /* remove it from the orphan directory */ + status = ocfs2_delete_entry(handle, orphan_dir_inode, target_de, + target_de_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* do the i_nlink dance!
:) */ + orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; + if (S_ISDIR(inode->i_mode)) + le16_add_cpu(&orphan_fe->i_links_count, -1); + orphan_dir_inode->i_nlink = le16_to_cpu(orphan_fe->i_links_count); + + status = ocfs2_journal_dirty(handle, orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + +leave: + if (target_de_bh) + brelse(target_de_bh); + + mlog_exit(status); + return status; +} + +struct inode_operations ocfs2_dir_iops = { + .create = ocfs2_create, + .lookup = ocfs2_lookup, + .link = ocfs2_link, + .unlink = ocfs2_unlink, + .rmdir = ocfs2_unlink, + .symlink = ocfs2_symlink, + .mkdir = ocfs2_mkdir, + .mknod = ocfs2_mknod, + .rename = ocfs2_rename, + .setattr = ocfs2_setattr, + .getattr = ocfs2_getattr, +}; diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h new file mode 100644 index 000000000000..deaaa97dbf0b --- /dev/null +++ b/fs/ocfs2/namei.h @@ -0,0 +1,58 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * namei.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_NAMEI_H +#define OCFS2_NAMEI_H + +extern struct inode_operations ocfs2_dir_iops; + +struct dentry *ocfs2_get_parent(struct dentry *child); + +int ocfs2_check_dir_entry (struct inode *dir, + struct ocfs2_dir_entry *de, + struct buffer_head *bh, + unsigned long offset); +struct buffer_head *ocfs2_find_entry(const char *name, + int namelen, + struct inode *dir, + struct ocfs2_dir_entry **res_dir); +int ocfs2_orphan_del(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct inode *orphan_dir_inode, + struct inode *inode, + struct buffer_head *orphan_dir_bh); + +static inline int ocfs2_match(int len, + const char * const name, + struct ocfs2_dir_entry *de) +{ + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); +} + +#endif /* OCFS2_NAMEI_H */ diff --git a/fs/ocfs2/ocfs1_fs_compat.h b/fs/ocfs2/ocfs1_fs_compat.h new file mode 100644 index 000000000000..0b499bccec5a --- /dev/null +++ b/fs/ocfs2/ocfs1_fs_compat.h @@ -0,0 +1,109 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs1_fs_compat.h + * + * OCFS1 volume header definitions. OCFS2 creates valid but unmountable + * OCFS1 volume headers on the first two sectors of an OCFS2 volume. + * This allows an OCFS1 volume to see the partition and cleanly fail to + * mount it. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS1_FS_COMPAT_H +#define _OCFS1_FS_COMPAT_H + +#define OCFS1_MAX_VOL_SIGNATURE_LEN 128 +#define OCFS1_MAX_MOUNT_POINT_LEN 128 +#define OCFS1_MAX_VOL_ID_LENGTH 16 +#define OCFS1_MAX_VOL_LABEL_LEN 64 +#define OCFS1_MAX_CLUSTER_NAME_LEN 64 + +#define OCFS1_MAJOR_VERSION (2) +#define OCFS1_MINOR_VERSION (0) +#define OCFS1_VOLUME_SIGNATURE "OracleCFS" + +/* + * OCFS1 superblock. Lives at sector 0. + */ +struct ocfs1_vol_disk_hdr +{ +/*00*/ __u32 minor_version; + __u32 major_version; +/*08*/ __u8 signature[OCFS1_MAX_VOL_SIGNATURE_LEN]; +/*88*/ __u8 mount_point[OCFS1_MAX_MOUNT_POINT_LEN]; +/*108*/ __u64 serial_num; +/*110*/ __u64 device_size; + __u64 start_off; +/*120*/ __u64 bitmap_off; + __u64 publ_off; +/*130*/ __u64 vote_off; + __u64 root_bitmap_off; +/*140*/ __u64 data_start_off; + __u64 root_bitmap_size; +/*150*/ __u64 root_off; + __u64 root_size; +/*160*/ __u64 cluster_size; + __u64 num_nodes; +/*170*/ __u64 num_clusters; + __u64 dir_node_size; +/*180*/ __u64 file_node_size; + __u64 internal_off; +/*190*/ __u64 node_cfg_off; + __u64 node_cfg_size; +/*1A0*/ __u64 new_cfg_off; + __u32 prot_bits; + __s32 excl_mount; +/*1B0*/ +}; + + +struct ocfs1_disk_lock +{ +/*00*/ __u32 curr_master; + __u8 file_lock; + __u8 compat_pad[3]; /* Not in orignal definition. Used to + make the already existing alignment + explicit */ + __u64 last_write_time; +/*10*/ __u64 last_read_time; + __u32 writer_node_num; + __u32 reader_node_num; +/*20*/ __u64 oin_node_map; + __u64 dlock_seq_num; +/*30*/ +}; + +/* + * OCFS1 volume label. Lives at sector 1. 
+ */ +struct ocfs1_vol_label +{ +/*00*/ struct ocfs1_disk_lock disk_lock; +/*30*/ __u8 label[OCFS1_MAX_VOL_LABEL_LEN]; +/*70*/ __u16 label_len; +/*72*/ __u8 vol_id[OCFS1_MAX_VOL_ID_LENGTH]; +/*82*/ __u16 vol_id_len; +/*84*/ __u8 cluster_name[OCFS1_MAX_CLUSTER_NAME_LEN]; +/*A4*/ __u16 cluster_name_len; +/*A6*/ +}; + + +#endif /* _OCFS1_FS_COMPAT_H */ + diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h new file mode 100644 index 000000000000..f468c600cf92 --- /dev/null +++ b/fs/ocfs2/ocfs2.h @@ -0,0 +1,464 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2.h + * + * Defines macros and structures used in OCFS2 + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_H +#define OCFS2_H + +#include <linux/spinlock.h> +#include <linux/sched.h> +#include <linux/wait.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/workqueue.h> +#include <linux/kref.h> + +#include "cluster/nodemanager.h" +#include "cluster/heartbeat.h" +#include "cluster/tcp.h" + +#include "dlm/dlmapi.h" + +#include "ocfs2_fs.h" +#include "endian.h" +#include "ocfs2_lockid.h" + +struct ocfs2_extent_map { + u32 em_clusters; + struct rb_root em_extents; +}; + +/* Most user visible OCFS2 inodes will have very few pieces of + * metadata, but larger files (including bitmaps, etc) must be taken + * into account when designing an access scheme. We allow a small + * amount of inlined blocks to be stored on an array and grow the + * structure into a rb tree when necessary. */ +#define OCFS2_INODE_MAX_CACHE_ARRAY 2 + +struct ocfs2_caching_info { + unsigned int ci_num_cached; + union { + sector_t ci_array[OCFS2_INODE_MAX_CACHE_ARRAY]; + struct rb_root ci_tree; + } ci_cache; +}; + +/* this limits us to 256 nodes + * if we need more, we can do a kmalloc for the map */ +#define OCFS2_NODE_MAP_MAX_NODES 256 +struct ocfs2_node_map { + u16 num_nodes; + unsigned long map[BITS_TO_LONGS(OCFS2_NODE_MAP_MAX_NODES)]; +}; + +enum ocfs2_ast_action { + OCFS2_AST_INVALID = 0, + OCFS2_AST_ATTACH, + OCFS2_AST_CONVERT, + OCFS2_AST_DOWNCONVERT, +}; + +/* actions for an unlockast function to take. */ +enum ocfs2_unlock_action { + OCFS2_UNLOCK_INVALID = 0, + OCFS2_UNLOCK_CANCEL_CONVERT, + OCFS2_UNLOCK_DROP_LOCK, +}; + +/* ocfs2_lock_res->l_flags flags. 
*/ +#define OCFS2_LOCK_ATTACHED (0x00000001) /* have we initialized + * the lvb */ +#define OCFS2_LOCK_BUSY (0x00000002) /* we are currently in + * dlm_lock */ +#define OCFS2_LOCK_BLOCKED (0x00000004) /* blocked waiting to + * downconvert*/ +#define OCFS2_LOCK_LOCAL (0x00000008) /* newly created inode */ +#define OCFS2_LOCK_NEEDS_REFRESH (0x00000010) +#define OCFS2_LOCK_REFRESHING (0x00000020) +#define OCFS2_LOCK_INITIALIZED (0x00000040) /* track initialization + * for shutdown paths */ +#define OCFS2_LOCK_FREEING (0x00000080) /* help dlmglue track + * when to skip queueing + * a lock because it's + * about to be + * dropped. */ +#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ + +struct ocfs2_lock_res_ops; + +typedef void (*ocfs2_lock_callback)(int status, unsigned long data); + +struct ocfs2_lock_res { + void *l_priv; + struct ocfs2_lock_res_ops *l_ops; + spinlock_t l_lock; + + struct list_head l_blocked_list; + struct list_head l_mask_waiters; + + enum ocfs2_lock_type l_type; + unsigned long l_flags; + char l_name[OCFS2_LOCK_ID_MAX_LEN]; + int l_level; + unsigned int l_ro_holders; + unsigned int l_ex_holders; + struct dlm_lockstatus l_lksb; + + /* used from AST/BAST funcs. 
*/ + enum ocfs2_ast_action l_action; + enum ocfs2_unlock_action l_unlock_action; + int l_requested; + int l_blocking; + + wait_queue_head_t l_event; + + struct list_head l_debug_list; +}; + +struct ocfs2_dlm_debug { + struct kref d_refcnt; + struct dentry *d_locking_state; + struct list_head d_lockres_tracking; +}; + +enum ocfs2_vol_state +{ + VOLUME_INIT = 0, + VOLUME_MOUNTED, + VOLUME_DISMOUNTED, + VOLUME_DISABLED +}; + +struct ocfs2_alloc_stats +{ + atomic_t moves; + atomic_t local_data; + atomic_t bitmap_data; + atomic_t bg_allocs; + atomic_t bg_extends; +}; + +enum ocfs2_local_alloc_state +{ + OCFS2_LA_UNUSED = 0, + OCFS2_LA_ENABLED, + OCFS2_LA_DISABLED +}; + +enum ocfs2_mount_options +{ + OCFS2_MOUNT_HB_LOCAL = 1 << 0, /* Heartbeat started in local mode */ + OCFS2_MOUNT_BARRIER = 1 << 1, /* Use block barriers */ + OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ + OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ + OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ +#ifdef OCFS2_ORACORE_WORKAROUNDS + OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */ +#endif +}; + +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 + +struct ocfs2_journal; +struct ocfs2_journal_handle; +struct ocfs2_super +{ + u32 osb_id; /* id used by the proc interface */ + struct task_struct *commit_task; + struct super_block *sb; + struct inode *root_inode; + struct inode *sys_root_inode; + struct inode *system_inodes[NUM_SYSTEM_INODES]; + + struct ocfs2_slot_info *slot_info; + + spinlock_t node_map_lock; + struct ocfs2_node_map mounted_map; + struct ocfs2_node_map recovery_map; + struct ocfs2_node_map umount_map; + + u32 num_clusters; + u64 root_blkno; + u64 system_dir_blkno; + u64 bitmap_blkno; + u32 bitmap_cpg; + u8 *uuid; + char *uuid_str; + u8 *vol_label; + u64 first_cluster_group_blkno; + u32 fs_generation; + + u32 s_feature_compat; + u32 s_feature_incompat; + u32 s_feature_ro_compat; + + /* 
Protects s_next_generaion, osb_flags. Could protect more on + * osb as it's very short lived. */ + spinlock_t osb_lock; + u32 s_next_generation; + unsigned long osb_flags; + + unsigned long s_mount_opt; + + u16 max_slots; + u16 num_nodes; + s16 node_num; + s16 slot_num; + int s_sectsize_bits; + int s_clustersize; + int s_clustersize_bits; + struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */ + + atomic_t vol_state; + struct semaphore recovery_lock; + struct task_struct *recovery_thread_task; + int disable_recovery; + wait_queue_head_t checkpoint_event; + atomic_t needs_checkpoint; + struct ocfs2_journal *journal; + + enum ocfs2_local_alloc_state local_alloc_state; + struct buffer_head *local_alloc_bh; + + /* Next two fields are for local node slot recovery during + * mount. */ + int dirty; + struct ocfs2_dinode *local_alloc_copy; + + struct ocfs2_alloc_stats alloc_stats; + char dev_str[20]; /* "major,minor" of the device */ + + struct dlm_ctxt *dlm; + struct ocfs2_lock_res osb_super_lockres; + struct ocfs2_lock_res osb_rename_lockres; + struct dlm_eviction_cb osb_eviction_cb; + struct ocfs2_dlm_debug *osb_dlm_debug; + + struct dentry *osb_debug_root; + + wait_queue_head_t recovery_event; + + spinlock_t vote_task_lock; + struct task_struct *vote_task; + wait_queue_head_t vote_event; + unsigned long vote_wake_sequence; + unsigned long vote_work_sequence; + + struct list_head blocked_lock_list; + unsigned long blocked_lock_count; + + struct list_head vote_list; + int vote_count; + + u32 net_key; + spinlock_t net_response_lock; + unsigned int net_response_ids; + struct list_head net_response_list; + + struct o2hb_callback_func osb_hb_up; + struct o2hb_callback_func osb_hb_down; + + struct list_head osb_net_handlers; + + wait_queue_head_t osb_mount_event; + + /* Truncate log info */ + struct inode *osb_tl_inode; + struct buffer_head *osb_tl_bh; + struct work_struct osb_truncate_log_wq; +}; + +#define OCFS2_SB(sb) ((struct ocfs2_super 
*)(sb)->s_fs_info) +#define OCFS2_MAX_OSB_ID 65536 + +static inline int ocfs2_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) + return 0; + return 1; +} + +/* set / clear functions because cluster events can make these happen + * in parallel so we want the transitions to be atomic. this also + * means that any future flags osb_flags must be protected by spinlock + * too! */ +static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags |= flag; + spin_unlock(&osb->osb_lock); +} + +static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, + int hard) +{ + spin_lock(&osb->osb_lock); + osb->osb_flags &= ~(OCFS2_OSB_SOFT_RO|OCFS2_OSB_HARD_RO); + if (hard) + osb->osb_flags |= OCFS2_OSB_HARD_RO; + else + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); +} + +static inline int ocfs2_is_hard_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_HARD_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + + return ret; +} + +#define OCFS2_IS_VALID_DINODE(ptr) \ + (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) + +#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \ + typeof(__di) ____di = (__di); \ + ocfs2_error((__sb), \ + "Dinode # %"MLFu64" has bad signature %.*s", \ + (____di)->i_blkno, 7, \ + (____di)->i_signature); \ +} while (0); + +#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ + (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) + +#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \ + typeof(__eb) ____eb = (__eb); \ + ocfs2_error((__sb), \ + "Extent Block # %"MLFu64" has bad signature %.*s", \ + 
(____eb)->h_blkno, 7, \ + (____eb)->h_signature); \ +} while (0); + +#define OCFS2_IS_VALID_GROUP_DESC(ptr) \ + (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) + +#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \ + typeof(__gd) ____gd = (__gd); \ + ocfs2_error((__sb), \ + "Group Descriptor # %"MLFu64" has bad signature %.*s", \ + (____gd)->bg_blkno, 7, \ + (____gd)->bg_signature); \ +} while (0); + +static inline unsigned long ino_from_blkno(struct super_block *sb, + u64 blkno) +{ + return (unsigned long)(blkno & (u64)ULONG_MAX); +} + +static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, + u32 clusters) +{ + int c_to_b_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u64)clusters << c_to_b_bits; +} + +static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, + u64 blocks) +{ + int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + return (u32)(blocks >> b_to_c_bits); +} + +static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + bytes += OCFS2_SB(sb)->s_clustersize - 1; + /* OCFS2 just cannot have enough clusters to overflow this */ + clusters = (unsigned int)(bytes >> cl_bits); + + return clusters; +} + +static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, + u64 bytes) +{ + bytes += sb->s_blocksize - 1; + return bytes >> sb->s_blocksize_bits; +} + +static inline u64 ocfs2_clusters_to_bytes(struct super_block *sb, + u32 clusters) +{ + return (u64)clusters << OCFS2_SB(sb)->s_clustersize_bits; +} + +static inline u64 ocfs2_align_bytes_to_clusters(struct super_block *sb, + u64 bytes) +{ + int cl_bits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int clusters; + + clusters = ocfs2_clusters_for_bytes(sb, bytes); + return (u64)clusters << cl_bits; +} + +static inline u64 ocfs2_align_bytes_to_blocks(struct super_block *sb, + u64 bytes) +{ + u64 blocks; 
+ + blocks = ocfs2_blocks_for_bytes(sb, bytes); + return blocks << sb->s_blocksize_bits; +} + +static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) +{ + return (unsigned long)((bytes + 511) >> 9); +} + +#define ocfs2_set_bit ext2_set_bit +#define ocfs2_clear_bit ext2_clear_bit +#define ocfs2_test_bit ext2_test_bit +#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit +#endif /* OCFS2_H */ + diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h new file mode 100644 index 000000000000..dfb8a5bedfc8 --- /dev/null +++ b/fs/ocfs2/ocfs2_fs.h @@ -0,0 +1,638 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_fs.h + * + * On-disk structures for OCFS2. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _OCFS2_FS_H +#define _OCFS2_FS_H + +/* Version */ +#define OCFS2_MAJOR_REV_LEVEL 0 +#define OCFS2_MINOR_REV_LEVEL 90 + +/* + * An OCFS2 volume starts this way: + * Sector 0: Valid ocfs1_vol_disk_hdr that cleanly fails to mount OCFS. + * Sector 1: Valid ocfs1_vol_label that cleanly fails to mount OCFS. + * Block OCFS2_SUPER_BLOCK_BLKNO: OCFS2 superblock. + * + * All other structures are found from the superblock information. + * + * OCFS2_SUPER_BLOCK_BLKNO is in blocks, not sectors. 
eg, for a + * blocksize of 2K, it is 4096 bytes into disk. + */ +#define OCFS2_SUPER_BLOCK_BLKNO 2 + +/* + * Cluster size limits. The maximum is kept arbitrarily at 1 MB, and could + * grow if needed. + */ +#define OCFS2_MIN_CLUSTERSIZE 4096 +#define OCFS2_MAX_CLUSTERSIZE 1048576 + +/* + * Blocks cannot be bigger than clusters, so the maximum blocksize is the + * minimum cluster size. + */ +#define OCFS2_MIN_BLOCKSIZE 512 +#define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE + +/* Filesystem magic number */ +#define OCFS2_SUPER_MAGIC 0x7461636f + +/* Object signatures */ +#define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" +#define OCFS2_INODE_SIGNATURE "INODE01" +#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" +#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" + +/* Compatibility flags */ +#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_compat & (mask) ) +#define OCFS2_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_ro_compat & (mask) ) +#define OCFS2_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( OCFS2_SB(sb)->s_feature_incompat & (mask) ) +#define OCFS2_SET_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_compat |= (mask) +#define OCFS2_SET_RO_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_ro_compat |= (mask) +#define OCFS2_SET_INCOMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_incompat |= (mask) +#define OCFS2_CLEAR_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_compat &= ~(mask) +#define OCFS2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_ro_compat &= ~(mask) +#define OCFS2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + OCFS2_SB(sb)->s_feature_incompat &= ~(mask) + +#define OCFS2_FEATURE_COMPAT_SUPP 0 +#define OCFS2_FEATURE_INCOMPAT_SUPP 0 +#define OCFS2_FEATURE_RO_COMPAT_SUPP 0 + +/* + * Heartbeat-only devices are missing journals and other files. The + * filesystem driver can't load them, but the library can. Never put + * this in OCFS2_FEATURE_INCOMPAT_SUPP, *ever*. 
+ */ +#define OCFS2_FEATURE_INCOMPAT_HEARTBEAT_DEV 0x0002 + + +/* + * Flags on ocfs2_dinode.i_flags + */ +#define OCFS2_VALID_FL (0x00000001) /* Inode is valid */ +#define OCFS2_UNUSED2_FL (0x00000002) +#define OCFS2_ORPHANED_FL (0x00000004) /* On the orphan list */ +#define OCFS2_UNUSED3_FL (0x00000008) +/* System inode flags */ +#define OCFS2_SYSTEM_FL (0x00000010) /* System inode */ +#define OCFS2_SUPER_BLOCK_FL (0x00000020) /* Super block */ +#define OCFS2_LOCAL_ALLOC_FL (0x00000040) /* Slot local alloc bitmap */ +#define OCFS2_BITMAP_FL (0x00000080) /* Allocation bitmap */ +#define OCFS2_JOURNAL_FL (0x00000100) /* Slot local journal */ +#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ +#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ +#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ + +/* + * Journal Flags (ocfs2_dinode.id1.journal1.i_flags) + */ +#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */ + +/* + * superblock s_state flags + */ +#define OCFS2_ERROR_FS (0x00000001) /* FS saw errors */ + +/* Limit of space in ocfs2_dir_entry */ +#define OCFS2_MAX_FILENAME_LEN 255 + +/* Maximum slots on an ocfs2 file system */ +#define OCFS2_MAX_SLOTS 255 + +/* Slot map indicator for an empty slot */ +#define OCFS2_INVALID_SLOT -1 + +#define OCFS2_VOL_UUID_LEN 16 +#define OCFS2_MAX_VOL_LABEL_LEN 64 + +/* Journal limits (in bytes) */ +#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) +#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024) + +struct ocfs2_system_inode_info { + char *si_name; + int si_iflags; + int si_mode; +}; + +/* System file index */ +enum { + BAD_BLOCK_SYSTEM_INODE = 0, + GLOBAL_INODE_ALLOC_SYSTEM_INODE, + SLOT_MAP_SYSTEM_INODE, +#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE + HEARTBEAT_SYSTEM_INODE, + GLOBAL_BITMAP_SYSTEM_INODE, +#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE + ORPHAN_DIR_SYSTEM_INODE, + EXTENT_ALLOC_SYSTEM_INODE, + INODE_ALLOC_SYSTEM_INODE, + 
JOURNAL_SYSTEM_INODE, + LOCAL_ALLOC_SYSTEM_INODE, + TRUNCATE_LOG_SYSTEM_INODE, + NUM_SYSTEM_INODES +}; + +static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { + /* Global system inodes (single copy) */ + /* The first two are only used from userspace mfks/tunefs */ + [BAD_BLOCK_SYSTEM_INODE] = { "bad_blocks", 0, S_IFREG | 0644 }, + [GLOBAL_INODE_ALLOC_SYSTEM_INODE] = { "global_inode_alloc", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + + /* These are used by the running filesystem */ + [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, + [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, + [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, + + /* Slot-specific system inodes (one copy per slot) */ + [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, + [EXTENT_ALLOC_SYSTEM_INODE] = { "extent_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, + [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, + [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, + [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } +}; + +/* Parameter passed from mount.ocfs2 to module */ +#define OCFS2_HB_NONE "heartbeat=none" +#define OCFS2_HB_LOCAL "heartbeat=local" + +/* + * OCFS2 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
+ */ +#define OCFS2_FT_UNKNOWN 0 +#define OCFS2_FT_REG_FILE 1 +#define OCFS2_FT_DIR 2 +#define OCFS2_FT_CHRDEV 3 +#define OCFS2_FT_BLKDEV 4 +#define OCFS2_FT_FIFO 5 +#define OCFS2_FT_SOCK 6 +#define OCFS2_FT_SYMLINK 7 + +#define OCFS2_FT_MAX 8 + +/* + * OCFS2_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define OCFS2_DIR_PAD 4 +#define OCFS2_DIR_ROUND (OCFS2_DIR_PAD - 1) +#define OCFS2_DIR_MEMBER_LEN offsetof(struct ocfs2_dir_entry, name) +#define OCFS2_DIR_REC_LEN(name_len) (((name_len) + OCFS2_DIR_MEMBER_LEN + \ + OCFS2_DIR_ROUND) & \ + ~OCFS2_DIR_ROUND) + +#define OCFS2_LINK_MAX 32000 + +#define S_SHIFT 12 +static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, + [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, + [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, +}; + + +/* + * Convenience casts + */ +#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) + +/* + * On disk extent record for OCFS2 + * It describes a range of clusters on disk. + */ +struct ocfs2_extent_rec { +/*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ + __le32 e_clusters; /* Clusters covered by this extent */ + __le64 e_blkno; /* Physical disk offset, in blocks */ +/*10*/ +}; + +struct ocfs2_chain_rec { + __le32 c_free; /* Number of free bits in this chain. */ + __le32 c_total; /* Number of total bits in this chain */ + __le64 c_blkno; /* Physical disk offset (blocks) of 1st group */ +}; + +struct ocfs2_truncate_rec { + __le32 t_start; /* 1st cluster in this log */ + __le32 t_clusters; /* Number of total clusters covered */ +}; + +/* + * On disk extent list for OCFS2 (node in the tree). 
Note that this + * is contained inside ocfs2_dinode or ocfs2_extent_block, so the + * offsets are relative to ocfs2_dinode.id2.i_list or + * ocfs2_extent_block.h_list, respectively. + */ +struct ocfs2_extent_list { +/*00*/ __le16 l_tree_depth; /* Extent tree depth from this + point. 0 means data extents + hang directly off this + header (a leaf) */ + __le16 l_count; /* Number of extent records */ + __le16 l_next_free_rec; /* Next unused extent slot */ + __le16 l_reserved1; + __le64 l_reserved2; /* Pad to + sizeof(ocfs2_extent_rec) */ +/*10*/ struct ocfs2_extent_rec l_recs[0]; /* Extent records */ +}; + +/* + * On disk allocation chain list for OCFS2. Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_chain. + */ +struct ocfs2_chain_list { +/*00*/ __le16 cl_cpg; /* Clusters per Block Group */ + __le16 cl_bpc; /* Bits per cluster */ + __le16 cl_count; /* Total chains in this list */ + __le16 cl_next_free_rec; /* Next unused chain slot */ + __le64 cl_reserved1; +/*10*/ struct ocfs2_chain_rec cl_recs[0]; /* Chain records */ +}; + +/* + * On disk deallocation log for OCFS2. Note that this is + * contained inside ocfs2_dinode, so the offsets are relative to + * ocfs2_dinode.id2.i_dealloc. 
+ */ +struct ocfs2_truncate_log { +/*00*/ __le16 tl_count; /* Total records in this log */ + __le16 tl_used; /* Number of records in use */ + __le32 tl_reserved1; +/*08*/ struct ocfs2_truncate_rec tl_recs[0]; /* Truncate records */ +}; + +/* + * On disk extent block (indirect block) for OCFS2 + */ +struct ocfs2_extent_block +{ +/*00*/ __u8 h_signature[8]; /* Signature for verification */ + __le64 h_reserved1; +/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this + extent_header belongs to */ + __le16 h_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 h_fs_generation; /* Must match super block */ + __le64 h_blkno; /* Offset on disk, in blocks */ +/*20*/ __le64 h_reserved3; + __le64 h_next_leaf_blk; /* Offset on disk, in blocks, + of next leaf header pointing + to data */ +/*30*/ struct ocfs2_extent_list h_list; /* Extent record list */ +/* Actual on-disk size is one block */ +}; + +/* + * On disk superblock for OCFS2 + * Note that it is contained inside an ocfs2_dinode, so all offsets + * are relative to the start of ocfs2_dinode.id2. 
+ */ +struct ocfs2_super_block { +/*00*/ __le16 s_major_rev_level; + __le16 s_minor_rev_level; + __le16 s_mnt_count; + __le16 s_max_mnt_count; + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le32 s_checkinterval; /* Max time between checks */ +/*10*/ __le64 s_lastcheck; /* Time of last check */ + __le32 s_creator_os; /* OS */ + __le32 s_feature_compat; /* Compatible feature set */ +/*20*/ __le32 s_feature_incompat; /* Incompatible feature set */ + __le32 s_feature_ro_compat; /* Readonly-compatible feature set */ + __le64 s_root_blkno; /* Offset, in blocks, of root directory + dinode */ +/*30*/ __le64 s_system_dir_blkno; /* Offset, in blocks, of system + directory dinode */ + __le32 s_blocksize_bits; /* Blocksize for this fs */ + __le32 s_clustersize_bits; /* Clustersize for this fs */ +/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts + before tunefs required */ + __le16 s_reserved1; + __le32 s_reserved2; + __le64 s_first_cluster_group; /* Block offset of 1st cluster + * group header */ +/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ +/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ +/*A0*/ +}; + +/* + * Local allocation bitmap for OCFS2 slots + * Note that it exists inside an ocfs2_dinode, so all offsets are + * relative to the start of ocfs2_dinode.id2. 
+ */ +struct ocfs2_local_alloc +{ +/*00*/ __le32 la_bm_off; /* Starting bit offset in main bitmap */ + __le16 la_size; /* Size of included bitmap, in bytes */ + __le16 la_reserved1; + __le64 la_reserved2; +/*10*/ __u8 la_bitmap[0]; +}; + +/* + * On disk inode for OCFS2 + */ +struct ocfs2_dinode { +/*00*/ __u8 i_signature[8]; /* Signature for validation */ + __le32 i_generation; /* Generation number */ + __le16 i_suballoc_slot; /* Slot suballocator this inode + belongs to */ + __le16 i_suballoc_bit; /* Bit offset in suballocator + block group */ +/*10*/ __le32 i_reserved0; + __le32 i_clusters; /* Cluster count */ + __le32 i_uid; /* Owner UID */ + __le32 i_gid; /* Owning GID */ +/*20*/ __le64 i_size; /* Size in bytes */ + __le16 i_mode; /* File mode */ + __le16 i_links_count; /* Links count */ + __le32 i_flags; /* File flags */ +/*30*/ __le64 i_atime; /* Access time */ + __le64 i_ctime; /* Creation time */ +/*40*/ __le64 i_mtime; /* Modification time */ + __le64 i_dtime; /* Deletion time */ +/*50*/ __le64 i_blkno; /* Offset on disk, in blocks */ + __le64 i_last_eb_blk; /* Pointer to last extent + block */ +/*60*/ __le32 i_fs_generation; /* Generation per fs-instance */ + __le32 i_atime_nsec; + __le32 i_ctime_nsec; + __le32 i_mtime_nsec; +/*70*/ __le64 i_reserved1[9]; +/*B8*/ union { + __le64 i_pad1; /* Generic way to refer to this + 64bit union */ + struct { + __le64 i_rdev; /* Device number */ + } dev1; + struct { /* Info for bitmap system + inodes */ + __le32 i_used; /* Bits (ie, clusters) used */ + __le32 i_total; /* Total bits (clusters) + available */ + } bitmap1; + struct { /* Info for journal system + inodes */ + __le32 ij_flags; /* Mounted, version, etc. 
*/ + __le32 ij_pad; + } journal1; + } id1; /* Inode type dependant 1 */ +/*C0*/ union { + struct ocfs2_super_block i_super; + struct ocfs2_local_alloc i_lab; + struct ocfs2_chain_list i_chain; + struct ocfs2_extent_list i_list; + struct ocfs2_truncate_log i_dealloc; + __u8 i_symlink[0]; + } id2; +/* Actual on-disk size is one block */ +}; + +/* + * On-disk directory entry structure for OCFS2 + * + * Packed as this structure could be accessed unaligned on 64-bit platforms + */ +struct ocfs2_dir_entry { +/*00*/ __le64 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; +/*0C*/ char name[OCFS2_MAX_FILENAME_LEN]; /* File name */ +/* Actual on-disk length specified by rec_len */ +} __attribute__ ((packed)); + +/* + * On disk allocator group structure for OCFS2 + */ +struct ocfs2_group_desc +{ +/*00*/ __u8 bg_signature[8]; /* Signature for validation */ + __le16 bg_size; /* Size of included bitmap in + bytes. */ + __le16 bg_bits; /* Bits represented by this + group. */ + __le16 bg_free_bits_count; /* Free bits count */ + __le16 bg_chain; /* What chain I am in. 
*/ +/*10*/ __le32 bg_generation; + __le32 bg_reserved1; + __le64 bg_next_group; /* Next group in my list, in + blocks */ +/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in + blocks */ + __le64 bg_blkno; /* Offset on disk, in blocks */ +/*30*/ __le64 bg_reserved2[2]; +/*40*/ __u8 bg_bitmap[0]; +}; + +#ifdef __KERNEL__ +static inline int ocfs2_fast_symlink_chars(struct super_block *sb) +{ + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline u16 ocfs2_extent_recs_per_eb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline u16 ocfs2_local_alloc_size(struct super_block *sb) +{ + u16 size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} +#else +static inline int ocfs2_fast_symlink_chars(int blocksize) +{ + return blocksize - offsetof(struct ocfs2_dinode, id2.i_symlink); +} + +static inline int ocfs2_extent_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + 
offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_chain_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_chain.cl_recs); + + return size / sizeof(struct ocfs2_chain_rec); +} + +static inline int ocfs2_extent_recs_per_eb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_extent_block, h_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + +static inline int ocfs2_local_alloc_size(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_lab.la_bitmap); + + return size; +} + +static inline int ocfs2_group_bitmap_size(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_group_desc, bg_bitmap); + + return size; +} + +static inline int ocfs2_truncate_recs_per_inode(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_dinode, id2.i_dealloc.tl_recs); + + return size / sizeof(struct ocfs2_truncate_rec); +} +#endif /* __KERNEL__ */ + + +static inline int ocfs2_system_inode_is_global(int type) +{ + return ((type >= 0) && + (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)); +} + +/* + * Format the name of system inode @type into @buf, writing at most + * @len bytes as snprintf() does. Types above + * OCFS2_LAST_GLOBAL_SYSTEM_INODE use their si_name as a format that + * takes @slot; the rest are copied as plain strings. Returns the + * value snprintf() returned. + */ +static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, + int type, int slot) +{ + int chars; + + /* + * Global system inodes can only have one copy. Everything + * after OCFS2_LAST_GLOBAL_SYSTEM_INODE in the system inode + * list has a copy per slot. 
+ */ + if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE) + /* si_name is data here, not a format: print it through + * "%s" so it is never parsed for conversions. */ + chars = snprintf(buf, len, "%s", + ocfs2_system_inodes[type].si_name); + else + chars = snprintf(buf, len, + ocfs2_system_inodes[type].si_name, + slot); + + return chars; +} + +static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, + umode_t mode) +{ + de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; +} + +#endif /* _OCFS2_FS_H */ + diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h new file mode 100644 index 000000000000..7dd9e1e705b0 --- /dev/null +++ b/fs/ocfs2/ocfs2_lockid.h @@ -0,0 +1,73 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_lockid.h + * + * Defines OCFS2 lockid bits. + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_LOCKID_H +#define OCFS2_LOCKID_H + +/* lock ids are made up in the following manner: + * name[0] --> type + * name[1-6] --> 6 pad characters, reserved for now + * name[7-22] --> block number, expressed in hex as 16 chars + * name[23-30] --> i_generation, expressed in hex 8 chars + * name[31] --> '\0' */ +#define OCFS2_LOCK_ID_MAX_LEN 32 +#define OCFS2_LOCK_ID_PAD "000000" + +enum ocfs2_lock_type { + OCFS2_LOCK_TYPE_META = 0, + OCFS2_LOCK_TYPE_DATA, + OCFS2_LOCK_TYPE_SUPER, + OCFS2_LOCK_TYPE_RENAME, + OCFS2_LOCK_TYPE_RW, + OCFS2_NUM_LOCK_TYPES +}; + +static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) +{ + char c; + switch (type) { + case OCFS2_LOCK_TYPE_META: + c = 'M'; + break; + case OCFS2_LOCK_TYPE_DATA: + c = 'D'; + break; + case OCFS2_LOCK_TYPE_SUPER: + c = 'S'; + break; + case OCFS2_LOCK_TYPE_RENAME: + c = 'R'; + break; + case OCFS2_LOCK_TYPE_RW: + c = 'W'; + break; + default: + c = '\0'; + } + + return c; +} + +#endif /* OCFS2_LOCKID_H */ diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c new file mode 100644 index 000000000000..871627961d6d --- /dev/null +++ b/fs/ocfs2/slot_map.c @@ -0,0 +1,303 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slot_map.c + * + * + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/smp_lock.h> + +#define MLOG_MASK_PREFIX ML_SUPER +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global); +static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, + s16 slot_num, + s16 node_num); + +/* Use the slot information we've collected to create a map of mounted + * nodes. Should be holding an EX on super block. assumes slot info is + * up to date. Note that we call this *after* we find a slot, so our + * own node should be set in the map too... */ +void ocfs2_populate_mounted_map(struct ocfs2_super *osb) +{ + int i; + struct ocfs2_slot_info *si = osb->slot_info; + + spin_lock(&si->si_lock); + + for (i = 0; i < si->si_size; i++) + if (si->si_global_node_nums[i] != OCFS2_INVALID_SLOT) + ocfs2_node_map_set_bit(osb, &osb->mounted_map, + si->si_global_node_nums[i]); + + spin_unlock(&si->si_lock); +} + +/* post the slot information on disk into our slot_info struct. */ +void ocfs2_update_slot_info(struct ocfs2_slot_info *si) +{ + int i; + __le16 *disk_info; + + /* we don't read the slot block here as ocfs2_super_lock + * should've made sure we have the most recent copy. */ + spin_lock(&si->si_lock); + disk_info = (__le16 *) si->si_bh->b_data; + + for (i = 0; i < si->si_size; i++) + si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); + + spin_unlock(&si->si_lock); +} + +/* post the our slot info stuff into it's destination bh and write it + * out. 
*/ +int ocfs2_update_disk_slots(struct ocfs2_super *osb, + struct ocfs2_slot_info *si) +{ + int status, i; + __le16 *disk_info = (__le16 *) si->si_bh->b_data; + + spin_lock(&si->si_lock); + for (i = 0; i < si->si_size; i++) + disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); + spin_unlock(&si->si_lock); + + status = ocfs2_write_block(osb, si->si_bh, si->si_inode); + if (status < 0) + mlog_errno(status); + + return status; +} + +/* try to find global node in the slot info. Returns + * OCFS2_INVALID_SLOT if nothing is found. */ +static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global) +{ + int i; + s16 ret = OCFS2_INVALID_SLOT; + + for(i = 0; i < si->si_num_slots; i++) { + if (global == si->si_global_node_nums[i]) { + ret = (s16) i; + break; + } + } + return ret; +} + +static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si) +{ + int i; + s16 ret = OCFS2_INVALID_SLOT; + + for(i = 0; i < si->si_num_slots; i++) { + if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { + ret = (s16) i; + break; + } + } + return ret; +} + +s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global) +{ + s16 ret; + + spin_lock(&si->si_lock); + ret = __ocfs2_node_num_to_slot(si, global); + spin_unlock(&si->si_lock); + return ret; +} + +static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, + s16 slot_num, + s16 node_num) +{ + BUG_ON(slot_num == OCFS2_INVALID_SLOT); + BUG_ON(slot_num >= si->si_num_slots); + BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && + (node_num >= O2NM_MAX_NODES)); + + si->si_global_node_nums[slot_num] = node_num; +} + +void ocfs2_clear_slot(struct ocfs2_slot_info *si, + s16 slot_num) +{ + spin_lock(&si->si_lock); + __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); + spin_unlock(&si->si_lock); +} + +/* Allocate the in-memory slot info for this mount, mark the first + * osb->max_slots entries as OCFS2_INVALID_SLOT, look up the slot map + * system inode and read one of its blocks into si_bh. On success + * osb->slot_info points at the new structure. On failure a negative + * errno is returned and the structure, together with any inode or + * buffer reference already stored in it, is released. */ +int ocfs2_init_slot_info(struct ocfs2_super *osb) +{ + int status, i; + u64 blkno; + struct inode *inode = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_slot_info *si; + + si = kcalloc(1, sizeof(struct 
ocfs2_slot_info), GFP_KERNEL); + if (!si) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + spin_lock_init(&si->si_lock); + si->si_num_slots = osb->max_slots; + si->si_size = OCFS2_MAX_SLOTS; + + for(i = 0; i < si->si_num_slots; i++) + si->si_global_node_nums[i] = OCFS2_INVALID_SLOT; + + inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + /* Hand the inode reference to si right away so that + * ocfs2_free_slot_info() drops it if a later step fails. */ + si->si_inode = inode; + + status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_read_block(osb, blkno, &bh, 0, inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + si->si_bh = bh; + osb->slot_info = si; +bail: + if (status < 0 && si) + ocfs2_free_slot_info(si); + + return status; +} + +void ocfs2_free_slot_info(struct ocfs2_slot_info *si) +{ + if (si->si_inode) + iput(si->si_inode); + if (si->si_bh) + brelse(si->si_bh); + kfree(si); +} + +int ocfs2_find_slot(struct ocfs2_super *osb) +{ + int status; + s16 slot; + struct ocfs2_slot_info *si; + + mlog_entry_void(); + + si = osb->slot_info; + + ocfs2_update_slot_info(si); + + spin_lock(&si->si_lock); + /* search for ourselves first and take the slot if it already + * exists. Perhaps we need to mark this in a variable for our + * own journal recovery? Possibly not, though we certainly + * need to warn to the user */ + slot = __ocfs2_node_num_to_slot(si, osb->node_num); + if (slot == OCFS2_INVALID_SLOT) { + /* if no slot yet, then just take 1st available + * one. 
*/ + slot = __ocfs2_find_empty_slot(si); + if (slot == OCFS2_INVALID_SLOT) { + spin_unlock(&si->si_lock); + mlog(ML_ERROR, "no free slots available!\n"); + status = -EINVAL; + goto bail; + } + } else + mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", + slot); + + __ocfs2_fill_slot(si, slot, osb->node_num); + osb->slot_num = slot; + spin_unlock(&si->si_lock); + + mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num); + + status = ocfs2_update_disk_slots(osb, si); + if (status < 0) + mlog_errno(status); + +bail: + mlog_exit(status); + return status; +} + +void ocfs2_put_slot(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_slot_info *si = osb->slot_info; + + if (!si) + return; + + ocfs2_update_slot_info(si); + + spin_lock(&si->si_lock); + __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&si->si_lock); + + status = ocfs2_update_disk_slots(osb, si); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + osb->slot_info = NULL; + ocfs2_free_slot_info(si); +} + diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h new file mode 100644 index 000000000000..d8c8ceed031b --- /dev/null +++ b/fs/ocfs2/slot_map.h @@ -0,0 +1,66 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * slotmap.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef SLOTMAP_H +#define SLOTMAP_H + +struct ocfs2_slot_info { + spinlock_t si_lock; + + struct inode *si_inode; + struct buffer_head *si_bh; + unsigned int si_num_slots; + unsigned int si_size; + s16 si_global_node_nums[OCFS2_MAX_SLOTS]; +}; + +int ocfs2_init_slot_info(struct ocfs2_super *osb); +void ocfs2_free_slot_info(struct ocfs2_slot_info *si); + +int ocfs2_find_slot(struct ocfs2_super *osb); +void ocfs2_put_slot(struct ocfs2_super *osb); + +void ocfs2_update_slot_info(struct ocfs2_slot_info *si); +int ocfs2_update_disk_slots(struct ocfs2_super *osb, + struct ocfs2_slot_info *si); + +s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, + s16 global); +void ocfs2_clear_slot(struct ocfs2_slot_info *si, + s16 slot_num); + +void ocfs2_populate_mounted_map(struct ocfs2_super *osb); + +static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, + int slot_num) +{ + BUG_ON(slot_num == OCFS2_INVALID_SLOT); + assert_spin_locked(&si->si_lock); + + return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; +} + +#endif diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c new file mode 100644 index 000000000000..c46c164aefbb --- /dev/null +++ b/fs/ocfs2/suballoc.c @@ -0,0 +1,1651 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.c + * + * metadata alloc and free + * Inspired by ext3 block groups. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#define MLOG_MASK_PREFIX ML_DISK_ALLOC +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "suballoc.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" + +#include "buffer_head_io.h" + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); +static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + u16 my_chain, + struct ocfs2_chain_list *cl); +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh); + +static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac); + +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found); +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found); +static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + 
u64 *bg_blkno); +static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 *bg_blkno); +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr); +static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, + struct buffer_head *bg_bh, + unsigned int bits_wanted, + u16 *bit_off, + u16 *bits_found); +static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); +static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits); + +static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain); +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted); +static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count); +static inline u64 ocfs2_which_suballoc_group(u64 block, + unsigned int bit); +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off); +static inline u64 ocfs2_which_cluster_group(struct inode *inode, + u32 cluster); +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off); + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) +{ + if (ac->ac_inode) + iput(ac->ac_inode); + if (ac->ac_bh) + brelse(ac->ac_bh); + kfree(ac); +} + +static u32 
ocfs2_bits_per_group(struct ocfs2_chain_list *cl) +{ + return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); +} + +static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *bg_bh, + u64 group_blkno, + u16 my_chain, + struct ocfs2_chain_list *cl) +{ + int status = 0; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct super_block * sb = alloc_inode->i_sb; + + mlog_entry_void(); + + if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { + ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") " + "!= b_blocknr (%llu)", group_blkno, + (unsigned long long) bg_bh->b_blocknr); + status = -EIO; + goto bail; + } + + status = ocfs2_journal_access(handle, + alloc_inode, + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(bg, 0, sb->s_blocksize); + strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); + bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); + bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb)); + bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl)); + bg->bg_chain = cpu_to_le16(my_chain); + bg->bg_next_group = cl->cl_recs[my_chain].c_blkno; + bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno); + bg->bg_blkno = cpu_to_le64(group_blkno); + /* set the 1st bit in the bitmap to account for the descriptor block */ + ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap); + bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1); + + status = ocfs2_journal_dirty(handle, bg_bh); + if (status < 0) + mlog_errno(status); + + /* There is no need to zero out or otherwise initialize the + * other blocks in a group - All valid FS metadata in a block + * group stores the superblock fs_generation value at + * allocation time. 
*/ + +bail: + mlog_exit(status); + return status; +} + +static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_count)) { + if (le32_to_cpu(cl->cl_recs[best].c_total) > + le32_to_cpu(cl->cl_recs[curr].c_total)) + best = curr; + curr++; + } + return best; +} + +/* + * We expect the block group allocator to already be locked. + */ +static int ocfs2_block_group_alloc(struct ocfs2_super *osb, + struct inode *alloc_inode, + struct buffer_head *bh) +{ + int status, credits; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; + struct ocfs2_chain_list *cl; + struct ocfs2_alloc_context *ac = NULL; + struct ocfs2_journal_handle *handle = NULL; + u32 bit_off, num_bits; + u16 alloc_rec; + u64 bg_blkno; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + + BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); + + mlog_entry_void(); + + handle = ocfs2_alloc_handle(osb); + if (!handle) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + cl = &fe->id2.i_chain; + status = ocfs2_reserve_clusters(osb, + handle, + le16_to_cpu(cl->cl_cpg), + &ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + credits = ocfs2_calc_group_alloc_credits(osb->sb, + le16_to_cpu(cl->cl_cpg)); + handle = ocfs2_start_trans(osb, handle, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto bail; + } + + status = ocfs2_claim_clusters(osb, + handle, + ac, + le16_to_cpu(cl->cl_cpg), + &bit_off, + &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + alloc_rec = ocfs2_find_smallest_chain(cl); + + /* setup the group */ + bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "new descriptor, record %u, at block %"MLFu64"\n", + alloc_rec, bg_blkno); + + bg_bh = sb_getblk(osb->sb, bg_blkno); + if (!bg_bh) { + status = -EIO; + 
mlog_errno(status); + goto bail; + } + ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh); + + status = ocfs2_block_group_fill(handle, + alloc_inode, + bg_bh, + bg_blkno, + alloc_rec, + cl); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + status = ocfs2_journal_access(handle, alloc_inode, + bh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le32_add_cpu(&cl->cl_recs[alloc_rec].c_free, + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); + cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg_blkno); + if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) + le16_add_cpu(&cl->cl_next_free_rec, 1); + + le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) - + le16_to_cpu(bg->bg_free_bits_count)); + le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits)); + le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg)); + + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + spin_lock(&OCFS2_I(alloc_inode)->ip_lock); + OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb, + le32_to_cpu(fe->i_clusters))); + spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); + i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); + alloc_inode->i_blocks = + ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode)); + + status = 0; +bail: + if (handle) + ocfs2_commit_trans(handle); + + if (ac) + ocfs2_free_alloc_context(ac); + + if (bg_bh) + brelse(bg_bh); + + mlog_exit(status); + return status; +} + +static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + int status; + u32 bits_wanted = ac->ac_bits_wanted; + struct inode *alloc_inode = ac->ac_inode; + struct buffer_head *bh = NULL; + struct ocfs2_journal_handle *handle = 
ac->ac_handle; + struct ocfs2_dinode *fe; + u32 free_bits; + + mlog_entry_void(); + + BUG_ON(handle->flags & OCFS2_HANDLE_STARTED); + + ocfs2_handle_add_inode(handle, alloc_inode); + status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); + status = -EIO; + goto bail; + } + if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { + ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator " + "# %"MLFu64, le64_to_cpu(fe->i_blkno)); + status = -EIO; + goto bail; + } + + free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) - + le32_to_cpu(fe->id1.bitmap1.i_used); + + if (bits_wanted > free_bits) { + /* cluster bitmap never grows */ + if (ocfs2_is_cluster_bitmap(alloc_inode)) { + mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", + bits_wanted, free_bits); + status = -ENOSPC; + goto bail; + } + + status = ocfs2_block_group_alloc(osb, alloc_inode, bh); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_extends); + + /* You should never ask for this much metadata */ + BUG_ON(bits_wanted > + (le32_to_cpu(fe->id1.bitmap1.i_total) + - le32_to_cpu(fe->id1.bitmap1.i_used))); + } + + get_bh(bh); + ac->ac_bh = bh; +bail: + if (bh) + brelse(bh); + + mlog_exit(status); + return status; +} + +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *fe, + struct ocfs2_alloc_context **ac) +{ + int status; + struct inode *alloc_inode = NULL; + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); + (*ac)->ac_handle = handle; + (*ac)->ac_which = OCFS2_AC_USE_META; + +#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS + alloc_inode = 
ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + 0); +#else + alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + osb->slot_num); +#endif + if (!alloc_inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_inode = igrab(alloc_inode); + (*ac)->ac_group_search = ocfs2_block_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, (*ac)); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (alloc_inode) + iput(alloc_inode); + + mlog_exit(status); + return status; +} + +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac) +{ + int status; + struct inode *alloc_inode = NULL; + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = 1; + (*ac)->ac_handle = handle; + (*ac)->ac_which = OCFS2_AC_USE_INODE; + + alloc_inode = ocfs2_get_system_file_inode(osb, + INODE_ALLOC_SYSTEM_INODE, + osb->slot_num); + if (!alloc_inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_inode = igrab(alloc_inode); + (*ac)->ac_group_search = ocfs2_block_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + if (alloc_inode) + iput(alloc_inode); + + mlog_exit(status); + return status; +} + +/* local alloc code has to do the same thing, so rather than do this + * twice.. 
*/ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac) +{ + int status; + + ac->ac_inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!ac->ac_inode) { + status = -EINVAL; + mlog(ML_ERROR, "Could not get bitmap inode!\n"); + goto bail; + } + ac->ac_which = OCFS2_AC_USE_MAIN; + ac->ac_group_search = ocfs2_cluster_group_search; + + status = ocfs2_reserve_suballoc_bits(osb, ac); + if (status < 0 && status != -ENOSPC) + mlog_errno(status); +bail: + return status; +} + +/* Callers don't need to care which bitmap (local alloc or main) to + * use so we figure it out for them, but unfortunately this clutters + * things a bit. */ +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + int status; + + mlog_entry_void(); + + BUG_ON(!handle); + + *ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL); + if (!(*ac)) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_handle = handle; + + status = -ENOSPC; + if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { + status = ocfs2_reserve_local_alloc_bits(osb, + handle, + bits_wanted, + *ac); + if ((status < 0) && (status != -ENOSPC)) { + mlog_errno(status); + goto bail; + } else if (status == -ENOSPC) { + /* reserve_local_bits will return enospc with + * the local alloc inode still locked, so we + * can change this safely here. 
*/ + mlog(0, "Disabling local alloc\n"); + /* We set to OCFS2_LA_DISABLED so that umount + * can clean up what's left of the local + * allocation */ + osb->local_alloc_state = OCFS2_LA_DISABLED; + } + } + + if (status == -ENOSPC) { + status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + } + + status = 0; +bail: + if ((status < 0) && *ac) { + ocfs2_free_alloc_context(*ac); + *ac = NULL; + } + + mlog_exit(status); + return status; +} + +/* + * More or less lifted from ext3. I'll leave their description below: + * + * "For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes." + * + * Note: OCFS2 already does this differently for metadata vs data + * allocations, as those bitmaps are separate and undo access is never + * called on a metadata group descriptor. 
+ */ +static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, + int nr) +{ + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) + return 0; + if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data) + return 1; + + bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; + return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); +} + +static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, + struct buffer_head *bg_bh, + unsigned int bits_wanted, + u16 *bit_off, + u16 *bits_found) +{ + void *bitmap; + u16 best_offset, best_size; + int offset, start, found, status = 0; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); + return -EIO; + } + + found = start = best_offset = best_size = 0; + bitmap = bg->bg_bitmap; + + while((offset = ocfs2_find_next_zero_bit(bitmap, + le16_to_cpu(bg->bg_bits), + start)) != -1) { + if (offset == le16_to_cpu(bg->bg_bits)) + break; + + if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { + /* We found a zero, but we can't use it as it + * hasn't been put to disk yet! 
*/ + found = 0; + start = offset + 1; + } else if (offset == start) { + /* we found a zero */ + found++; + /* move start to the next bit to test */ + start++; + } else { + /* got a zero after some ones */ + found = 1; + start = offset + 1; + } + if (found > best_size) { + best_size = found; + best_offset = start - found; + } + /* we got everything we needed */ + if (found == bits_wanted) { + /* mlog(0, "Found it all!\n"); */ + break; + } + } + + /* XXX: I think the first clause is equivalent to the second + * - jlbec */ + if (found == bits_wanted) { + *bit_off = start - found; + *bits_found = found; + } else if (best_size) { + *bit_off = best_offset; + *bits_found = best_size; + } else { + status = -ENOSPC; + /* No error log here -- see the comment above + * ocfs2_test_bg_bit_allocatable */ + } + + return status; +} + +static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits) +{ + int status; + void *bitmap = bg->bg_bitmap; + int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); + + mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, + num_bits); + + if (ocfs2_is_cluster_bitmap(alloc_inode)) + journal_type = OCFS2_JOURNAL_ACCESS_UNDO; + + status = ocfs2_journal_access(handle, + alloc_inode, + group_bh, + journal_type); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le16_add_cpu(&bg->bg_free_bits_count, -num_bits); + + while(num_bits--) + ocfs2_set_bit(bit_off++, bitmap); + + status = ocfs2_journal_dirty(handle, + group_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + mlog_exit(status); + return status; +} + +/* find the one with the most empty bits 
*/ +static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl) +{ + u16 curr, best; + + BUG_ON(!cl->cl_next_free_rec); + + best = curr = 0; + while (curr < le16_to_cpu(cl->cl_next_free_rec)) { + if (le32_to_cpu(cl->cl_recs[curr].c_free) > + le32_to_cpu(cl->cl_recs[best].c_free)) + best = curr; + curr++; + } + + BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec)); + return best; +} + +static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *fe_bh, + struct buffer_head *bg_bh, + struct buffer_head *prev_bg_bh, + u16 chain) +{ + int status; + /* there is a really tiny chance the journal calls could fail, + * but we wouldn't want inconsistent blocks in *any* case. */ + u64 fe_ptr, bg_ptr, prev_bg_ptr; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; + struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); + status = -EIO; + goto out; + } + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto out; + } + if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg); + status = -EIO; + goto out; + } + + mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to " + "top, prev = %"MLFu64"\n", + fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno); + + fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); + bg_ptr = le64_to_cpu(bg->bg_next_group); + prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); + + status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + prev_bg->bg_next_group = bg->bg_next_group; + + status = ocfs2_journal_dirty(handle, prev_bg_bh); + if 
(status < 0) { + mlog_errno(status); + goto out_rollback; + } + + status = ocfs2_journal_access(handle, alloc_inode, bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno; + + status = ocfs2_journal_dirty(handle, bg_bh); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + status = ocfs2_journal_access(handle, alloc_inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno; + + status = ocfs2_journal_dirty(handle, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out_rollback; + } + + status = 0; +out_rollback: + if (status < 0) { + fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr); + bg->bg_next_group = cpu_to_le64(bg_ptr); + prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); + } +out: + mlog_exit(status); + return status; +} + +static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg, + u32 wanted) +{ + return le16_to_cpu(bg->bg_free_bits_count) > wanted; +} + +/* return 0 on success, -ENOSPC to keep searching and any other < 0 + * value on error. */ +static int ocfs2_cluster_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found) +{ + int search = -ENOSPC; + int ret; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + u16 tmp_off, tmp_found; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (bg->bg_free_bits_count) { + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + &tmp_off, &tmp_found); + if (ret) + return ret; + + /* ocfs2_block_group_find_clear_bits() might + * return success, but we still want to return + * -ENOSPC unless it found the minimum number + * of bits. 
*/ + if (min_bits <= tmp_found) { + *bit_off = tmp_off; + *bits_found = tmp_found; + search = 0; /* success */ + } + } + + return search; +} + +static int ocfs2_block_group_search(struct inode *inode, + struct buffer_head *group_bh, + u32 bits_wanted, u32 min_bits, + u16 *bit_off, u16 *bits_found) +{ + int ret = -ENOSPC; + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + + BUG_ON(min_bits != 1); + BUG_ON(ocfs2_is_cluster_bitmap(inode)); + + if (bg->bg_free_bits_count) + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), + group_bh, bits_wanted, + bit_off, bits_found); + + return ret; +} + +static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 *bg_blkno) +{ + int status; + u16 chain, tmp_bits; + u32 tmp_used; + u64 next_group; + struct ocfs2_journal_handle *handle = ac->ac_handle; + struct inode *alloc_inode = ac->ac_inode; + struct buffer_head *group_bh = NULL; + struct buffer_head *prev_group_bh = NULL; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + struct ocfs2_group_desc *bg; + + chain = ac->ac_chain; + mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n", + bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno); + + status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), + le64_to_cpu(cl->cl_recs[chain].c_blkno), + &group_bh, OCFS2_BH_CACHED, alloc_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + + status = -ENOSPC; + /* for now, the chain search is a bit simplistic. We just use + * the 1st group with any empty bits. 
*/ + while ((status = ac->ac_group_search(alloc_inode, group_bh, + bits_wanted, min_bits, bit_off, + &tmp_bits)) == -ENOSPC) { + if (!bg->bg_next_group) + break; + + if (prev_group_bh) { + brelse(prev_group_bh); + prev_group_bh = NULL; + } + next_group = le64_to_cpu(bg->bg_next_group); + prev_group_bh = group_bh; + group_bh = NULL; + status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), + next_group, &group_bh, + OCFS2_BH_CACHED, alloc_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + bg = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); + status = -EIO; + goto bail; + } + } + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n", + tmp_bits, bg->bg_blkno); + + *num_bits = tmp_bits; + + BUG_ON(*num_bits == 0); + + /* + * Keep track of previous block descriptor read. When + * we find a target, if we have read more than X + * number of descriptors, and the target is reasonably + * empty, relink him to top of his chain. + * + * We've read 0 extra blocks and only send one more to + * the transaction, yet the next guy to search has a + * much easier time. + * + * Do this *after* figuring out how many bits we're taking out + * of our target group. 
+ */ + if (ac->ac_allow_chain_relink && + (prev_group_bh) && + (ocfs2_block_group_reasonably_empty(bg, *num_bits))) { + status = ocfs2_relink_block_group(handle, alloc_inode, + ac->ac_bh, group_bh, + prev_group_bh, chain); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + /* Ok, claim our bits now: set the info on dinode, chainlist + * and then the group */ + status = ocfs2_journal_access(handle, + alloc_inode, + ac->ac_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits)); + + status = ocfs2_journal_dirty(handle, + ac->ac_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_block_group_set_bits(handle, + alloc_inode, + bg, + group_bh, + *bit_off, + *num_bits); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n", + *num_bits, fe->i_blkno); + + *bg_blkno = le64_to_cpu(bg->bg_blkno); +bail: + if (group_bh) + brelse(group_bh); + if (prev_group_bh) + brelse(prev_group_bh); + + mlog_exit(status); + return status; +} + +/* will give out up to bits_wanted contiguous bits. 
*/ +static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 *bg_blkno) +{ + int status; + u16 victim, i; + struct ocfs2_chain_list *cl; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); + BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); + BUG_ON(!ac->ac_bh); + + fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); + status = -EIO; + goto bail; + } + if (le32_to_cpu(fe->id1.bitmap1.i_used) >= + le32_to_cpu(fe->id1.bitmap1.i_total)) { + ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u" + "used bits but only %u total.", + le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); + status = -EIO; + goto bail; + } + + cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; + + victim = ocfs2_find_victim_chain(cl); + ac->ac_chain = victim; + ac->ac_allow_chain_relink = 1; + + status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off, + num_bits, bg_blkno); + if (!status) + goto bail; + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + + mlog(0, "Search of victim chain %u came up with nothing, " + "trying all chains now.\n", victim); + + /* If we didn't pick a good victim, then just default to + * searching each chain in order. Don't allow chain relinking + * because we only calculate enough journal credits for one + * relink per alloc. 
*/ + ac->ac_allow_chain_relink = 0; + for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) { + if (i == victim) + continue; + if (!cl->cl_recs[i].c_free) + continue; + + ac->ac_chain = i; + status = ocfs2_search_chain(ac, bits_wanted, min_bits, + bit_off, num_bits, + bg_blkno); + if (!status) + break; + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } +bail: + + mlog_exit(status); + return status; +} + +int ocfs2_claim_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u16 *suballoc_bit_start, + unsigned int *num_bits, + u64 *blkno_start) +{ + int status; + u64 bg_blkno; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted)); + BUG_ON(ac->ac_which != OCFS2_AC_USE_META); + BUG_ON(ac->ac_handle != handle); + + status = ocfs2_claim_suballoc_bits(osb, + ac, + bits_wanted, + 1, + suballoc_bit_start, + num_bits, + &bg_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_allocs); + + *blkno_start = bg_blkno + (u64) *suballoc_bit_start; + ac->ac_bits_given += (*num_bits); + status = 0; +bail: + mlog_exit(status); + return status; +} + +int ocfs2_claim_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u16 *suballoc_bit, + u64 *fe_blkno) +{ + int status; + unsigned int num_bits; + u64 bg_blkno; + + mlog_entry_void(); + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + BUG_ON(ac->ac_handle != handle); + + status = ocfs2_claim_suballoc_bits(osb, + ac, + 1, + 1, + suballoc_bit, + &num_bits, + &bg_blkno); + if (status < 0) { + mlog_errno(status); + goto bail; + } + atomic_inc(&osb->alloc_stats.bg_allocs); + + BUG_ON(num_bits != 1); + + *fe_blkno = bg_blkno + (u64) (*suballoc_bit); + ac->ac_bits_given++; + status = 0; +bail: + mlog_exit(status); + return 
status; +} + +/* translate a group desc. blkno and it's bitmap offset into + * disk cluster offset. */ +static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode, + u64 bg_blkno, + u16 bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 cluster = 0; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + if (bg_blkno != osb->first_cluster_group_blkno) + cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno); + cluster += (u32) bg_bit_off; + return cluster; +} + +/* given a cluster offset, calculate which block group it belongs to + * and return that block offset. */ +static inline u64 ocfs2_which_cluster_group(struct inode *inode, + u32 cluster) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 group_no; + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + group_no = cluster / osb->bitmap_cpg; + if (!group_no) + return osb->first_cluster_group_blkno; + return ocfs2_clusters_to_blocks(inode->i_sb, + group_no * osb->bitmap_cpg); +} + +/* given the block number of a cluster start, calculate which cluster + * group and descriptor bitmap offset that corresponds to. */ +static inline void ocfs2_block_to_cluster_group(struct inode *inode, + u64 data_blkno, + u64 *bg_blkno, + u16 *bg_bit_off) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno); + + BUG_ON(!ocfs2_is_cluster_bitmap(inode)); + + *bg_blkno = ocfs2_which_cluster_group(inode, + data_cluster); + + if (*bg_blkno == osb->first_cluster_group_blkno) + *bg_bit_off = (u16) data_cluster; + else + *bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb, + data_blkno - *bg_blkno); +} + +/* + * min_bits - minimum contiguous chunk from this total allocation we + * can handle. set to what we asked for originally for a full + * contig. allocation, set to '1' to indicate we can deal with extents + * of any size. 
+ *
+ * On success *cluster_start and *num_clusters describe the extent that
+ * was claimed and ac->ac_bits_given is advanced by *num_clusters.
+ * Returns 0 on success, -ENOSPC when the request cannot be satisfied,
+ * or another negative error code.
+ */
+int ocfs2_claim_clusters(struct ocfs2_super *osb,
+			 struct ocfs2_journal_handle *handle,
+			 struct ocfs2_alloc_context *ac,
+			 u32 min_clusters,
+			 u32 *cluster_start,
+			 u32 *num_clusters)
+{
+	int status;
+	unsigned int bits_wanted;
+	u64 bg_blkno;
+	u16 bg_bit_off;
+
+	mlog_entry_void();
+
+	BUG_ON(!ac);
+	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
+
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
+	       && ac->ac_which != OCFS2_AC_USE_MAIN);
+	BUG_ON(ac->ac_handle != handle);
+
+	/* Read ac only after the sanity checks above have passed. */
+	bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
+
+	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
+		status = ocfs2_claim_local_alloc_bits(osb,
+						      handle,
+						      ac,
+						      bits_wanted,
+						      cluster_start,
+						      num_clusters);
+		if (!status)
+			atomic_inc(&osb->alloc_stats.local_data);
+	} else {
+		if (min_clusters > (osb->bitmap_cpg - 1)) {
+			/* The only paths asking for contiguousness
+			 * should know about this already. */
+			mlog(ML_ERROR, "minimum allocation requested exceeds "
+				       "group bitmap size!\n");
+			status = -ENOSPC;
+			goto bail;
+		}
+		/* clamp the current request down to a realistic size. */
+		if (bits_wanted > (osb->bitmap_cpg - 1))
+			bits_wanted = osb->bitmap_cpg - 1;
+
+		status = ocfs2_claim_suballoc_bits(osb,
+						   ac,
+						   bits_wanted,
+						   min_clusters,
+						   &bg_bit_off,
+						   num_clusters,
+						   &bg_blkno);
+		if (!status) {
+			*cluster_start =
+				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
+								 bg_blkno,
+								 bg_bit_off);
+			atomic_inc(&osb->alloc_stats.bitmap_data);
+		}
+	}
+	if (status < 0) {
+		if (status != -ENOSPC)
+			mlog_errno(status);
+		goto bail;
+	}
+
+	ac->ac_bits_given += *num_clusters;
+
+bail:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Clear num_bits bits, starting at bit_off, in the group descriptor bg
+ * (backed by group_bh) and raise its free bit count.  For the cluster
+ * bitmap the buffer is journaled with undo access and the same bits are
+ * set in the journal's committed-data copy of the bitmap.
+ *
+ * Returns 0 on success, -EIO for an invalid descriptor, or the error
+ * returned by the journal calls.
+ */
+static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
+					       struct inode *alloc_inode,
+					       struct ocfs2_group_desc *bg,
+					       struct buffer_head *group_bh,
+					       unsigned int bit_off,
+					       unsigned int num_bits)
+{
+	int status;
+	unsigned int tmp;
+	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
+	struct ocfs2_group_desc *undo_bg = NULL;
+
+	mlog_entry_void();
+
+	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
+		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
+		status = -EIO;
+		goto bail;
+	}
+
+	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
+
+	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
+				      journal_type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	if (ocfs2_is_cluster_bitmap(alloc_inode))
+		undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
+
+	tmp = num_bits;
+	while(tmp--) {
+		ocfs2_clear_bit((bit_off + tmp),
+				(unsigned long *) bg->bg_bitmap);
+		if (ocfs2_is_cluster_bitmap(alloc_inode))
+			ocfs2_set_bit(bit_off + tmp,
+				      (unsigned long *) undo_bg->bg_bitmap);
+	}
+	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
+
+	status = ocfs2_journal_dirty(handle, group_bh);
+	if (status < 0)
+		mlog_errno(status);
+bail:
+	/* Pair the mlog_entry_void() above, as the other functions here do. */
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * expects the suballoc inode to already be locked. 
+ */ +static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count) +{ + int status = 0; + u32 tmp_used; + struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; + struct ocfs2_chain_list *cl = &fe->id2.i_chain; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *group; + + mlog_entry_void(); + + if (!OCFS2_IS_VALID_DINODE(fe)) { + OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); + status = -EIO; + goto bail; + } + BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); + + mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64 + ", starting at %u\n", + OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno, + start_bit); + + status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, + alloc_inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + group = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(group)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group); + status = -EIO; + goto bail; + } + BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); + + status = ocfs2_block_group_clear_bits(handle, alloc_inode, + group, group_bh, + start_bit, count); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free, + count); + tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); + fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count); + + status = ocfs2_journal_dirty(handle, alloc_bh); + if (status < 0) { + mlog_errno(status); + goto bail; + } + +bail: + if (group_bh) + brelse(group_bh); + + mlog_exit(status); + return status; +} + +static inline u64 
ocfs2_which_suballoc_group(u64 block, unsigned int bit) +{ + u64 group = block - (u64) bit; + + return group; +} + +int ocfs2_free_dinode(struct ocfs2_journal_handle *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di) +{ + u64 blk = le64_to_cpu(di->i_blkno); + u16 bit = le16_to_cpu(di->i_suballoc_bit); + u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + return ocfs2_free_suballoc_bits(handle, inode_alloc_inode, + inode_alloc_bh, bit, bg_blkno, 1); +} + +int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle, + struct inode *eb_alloc_inode, + struct buffer_head *eb_alloc_bh, + struct ocfs2_extent_block *eb) +{ + u64 blk = le64_to_cpu(eb->h_blkno); + u16 bit = le16_to_cpu(eb->h_suballoc_bit); + u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh, + bit, bg_blkno, 1); +} + +int ocfs2_free_clusters(struct ocfs2_journal_handle *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + int status; + u16 bg_start_bit; + u64 bg_blkno; + struct ocfs2_dinode *fe; + + /* You can't ever have a contiguous set of clusters + * bigger than a block group bitmap so we never have to worry + * about looping on them. */ + + mlog_entry_void(); + + /* This is expensive. We can safely remove once this stuff has + * gotten tested really well. 
*/ + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + + fe = (struct ocfs2_dinode *) bitmap_bh->b_data; + + ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, + &bg_start_bit); + + mlog(0, "want to free %u clusters starting at block %"MLFu64"\n", + num_clusters, start_blk); + mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n", + bg_blkno, bg_start_bit); + + status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, + bg_start_bit, bg_blkno, + num_clusters); + if (status < 0) + mlog_errno(status); + + mlog_exit(status); + return status; +} + +static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) +{ + printk("Block Group:\n"); + printk("bg_signature: %s\n", bg->bg_signature); + printk("bg_size: %u\n", bg->bg_size); + printk("bg_bits: %u\n", bg->bg_bits); + printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count); + printk("bg_chain: %u\n", bg->bg_chain); + printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation)); + printk("bg_next_group: %"MLFu64"\n", bg->bg_next_group); + printk("bg_parent_dinode: %"MLFu64"\n", bg->bg_parent_dinode); + printk("bg_blkno: %"MLFu64"\n", bg->bg_blkno); +} + +static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) +{ + int i; + + printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno); + printk("i_signature: %s\n", fe->i_signature); + printk("i_size: %"MLFu64"\n", fe->i_size); + printk("i_clusters: %u\n", fe->i_clusters); + printk("i_generation: %u\n", + le32_to_cpu(fe->i_generation)); + printk("id1.bitmap1.i_used: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_used)); + printk("id1.bitmap1.i_total: %u\n", + le32_to_cpu(fe->id1.bitmap1.i_total)); + printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg); + printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc); + printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count); + printk("id2.i_chain.cl_next_free_rec: %u\n", + fe->id2.i_chain.cl_next_free_rec); + 
for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) { + printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_free); + printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i, + fe->id2.i_chain.cl_recs[i].c_total); + printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i, + fe->id2.i_chain.cl_recs[i].c_blkno); + } +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h new file mode 100644 index 000000000000..a76c82a7ceac --- /dev/null +++ b/fs/ocfs2/suballoc.h @@ -0,0 +1,132 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * suballoc.h + * + * Defines sub allocator api + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef _CHAINALLOC_H_ +#define _CHAINALLOC_H_ + +typedef int (group_search_t)(struct inode *, + struct buffer_head *, + u32, + u32, + u16 *, + u16 *); + +struct ocfs2_alloc_context { + struct inode *ac_inode; /* which bitmap are we allocating from? 
*/ + struct buffer_head *ac_bh; /* file entry bh */ + u32 ac_bits_wanted; + u32 ac_bits_given; +#define OCFS2_AC_USE_LOCAL 1 +#define OCFS2_AC_USE_MAIN 2 +#define OCFS2_AC_USE_INODE 3 +#define OCFS2_AC_USE_META 4 + u32 ac_which; + struct ocfs2_journal_handle *ac_handle; + + /* these are used by the chain search */ + u16 ac_chain; + int ac_allow_chain_relink; + group_search_t *ac_group_search; +}; + +void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); +static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) +{ + return ac->ac_bits_wanted - ac->ac_bits_given; +} + +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_dinode *fe, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context **ac); +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + u32 bits_wanted, + struct ocfs2_alloc_context **ac); + +int ocfs2_claim_metadata(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u16 *suballoc_bit_start, + u32 *num_bits, + u64 *blkno_start); +int ocfs2_claim_new_inode(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u16 *suballoc_bit, + u64 *fe_blkno); +int ocfs2_claim_clusters(struct ocfs2_super *osb, + struct ocfs2_journal_handle *handle, + struct ocfs2_alloc_context *ac, + u32 min_clusters, + u32 *cluster_start, + u32 *num_clusters); + +int ocfs2_free_dinode(struct ocfs2_journal_handle *handle, + struct inode *inode_alloc_inode, + struct buffer_head *inode_alloc_bh, + struct ocfs2_dinode *di); +int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle, + struct inode *eb_alloc_inode, + struct buffer_head *eb_alloc_bh, + struct ocfs2_extent_block *eb); +int ocfs2_free_clusters(struct ocfs2_journal_handle *handle, + 
struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters); + +static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb, + u64 bg_blkno) +{ + /* This should work for all block group descriptors as only + * the 1st group descriptor of the cluster bitmap is + * different. */ + + if (bg_blkno == osb->first_cluster_group_blkno) + return 0; + + /* the rest of the block groups are located at the beginning + * of their 1st cluster, so a direct translation just + * works. */ + return ocfs2_blocks_to_clusters(osb->sb, bg_blkno); +} + +static inline int ocfs2_is_cluster_bitmap(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + return osb->bitmap_blkno == OCFS2_I(inode)->ip_blkno; +} + +/* This is for local alloc ONLY. Others should use the task-specific + * apis above. */ +int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, + struct ocfs2_alloc_context *ac); + +#endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c new file mode 100644 index 000000000000..364d64bd5f10 --- /dev/null +++ b/fs/ocfs2/super.c @@ -0,0 +1,1733 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.c + * + * load/unload driver, mount/dismount volumes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/utsname.h> +#include <linux/init.h> +#include <linux/random.h> +#include <linux/statfs.h> +#include <linux/moduleparam.h> +#include <linux/blkdev.h> +#include <linux/socket.h> +#include <linux/inet.h> +#include <linux/parser.h> +#include <linux/crc32.h> +#include <linux/debugfs.h> + +#include <cluster/nodemanager.h> + +#define MLOG_MASK_PREFIX ML_SUPER +#include <cluster/masklog.h> + +#include "ocfs2.h" + +/* this should be the only file to include a version 1 header */ +#include "ocfs1_fs_compat.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "export.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "localalloc.h" +#include "namei.h" +#include "slot_map.h" +#include "super.h" +#include "sysfile.h" +#include "uptodate.h" +#include "ver.h" +#include "vote.h" + +#include "buffer_head_io.h" + +/* + * Globals + */ +static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED; + +static u32 osb_id; /* Keeps track of next available OSB Id */ + +static kmem_cache_t *ocfs2_inode_cachep = NULL; + +kmem_cache_t *ocfs2_lock_cache = NULL; + +/* OCFS2 needs to schedule several different types of work which + * require cluster locking, disk I/O, recovery waits, etc. Since these + * types of work tend to be heavy we avoid using the kernel events + * workqueue and schedule on our own. 
*/ +struct workqueue_struct *ocfs2_wq = NULL; + +static struct dentry *ocfs2_debugfs_root = NULL; + +MODULE_AUTHOR("Oracle"); +MODULE_LICENSE("GPL"); + +static int ocfs2_parse_options(struct super_block *sb, char *options, + unsigned long *mount_opt, int is_remount); +static void ocfs2_put_super(struct super_block *sb); +static int ocfs2_mount_volume(struct super_block *sb); +static int ocfs2_remount(struct super_block *sb, int *flags, char *data); +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err); +static int ocfs2_initialize_mem_caches(void); +static void ocfs2_free_mem_caches(void); +static void ocfs2_delete_osb(struct ocfs2_super *osb); + +static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf); + +static int ocfs2_sync_fs(struct super_block *sb, int wait); + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); +static int ocfs2_release_system_inodes(struct ocfs2_super *osb); +static int ocfs2_fill_local_node_info(struct ocfs2_super *osb); +static int ocfs2_check_volume(struct ocfs2_super *osb); +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 sectsize); +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size); +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head **bh, + int block, + int sect_size); +static void ocfs2_write_super(struct super_block *sb); +static struct inode *ocfs2_alloc_inode(struct super_block *sb); +static void ocfs2_destroy_inode(struct inode *inode); + +static unsigned long long ocfs2_max_file_offset(unsigned int blockshift); + +static struct super_operations ocfs2_sops = { + .statfs = ocfs2_statfs, + .alloc_inode = ocfs2_alloc_inode, + .destroy_inode = ocfs2_destroy_inode, + .drop_inode = ocfs2_drop_inode, + .clear_inode = ocfs2_clear_inode, + .delete_inode = ocfs2_delete_inode, + .sync_fs = ocfs2_sync_fs, + 
.write_super = ocfs2_write_super, + .put_super = ocfs2_put_super, + .remount_fs = ocfs2_remount, +}; + +enum { + Opt_barrier, + Opt_err_panic, + Opt_err_ro, + Opt_intr, + Opt_nointr, + Opt_hb_none, + Opt_hb_local, + Opt_data_ordered, + Opt_data_writeback, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_barrier, "barrier=%u"}, + {Opt_err_panic, "errors=panic"}, + {Opt_err_ro, "errors=remount-ro"}, + {Opt_intr, "intr"}, + {Opt_nointr, "nointr"}, + {Opt_hb_none, OCFS2_HB_NONE}, + {Opt_hb_local, OCFS2_HB_LOCAL}, + {Opt_data_ordered, "data=ordered"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_err, NULL} +}; + +/* + * write_super and sync_fs ripped right out of ext3. + */ +static void ocfs2_write_super(struct super_block *sb) +{ + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + sb->s_dirt = 0; +} + +static int ocfs2_sync_fs(struct super_block *sb, int wait) +{ + int status = 0; + tid_t target; + struct ocfs2_super *osb = OCFS2_SB(sb); + + sb->s_dirt = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (wait) { + status = ocfs2_flush_truncate_log(osb); + if (status < 0) + mlog_errno(status); + } else { + ocfs2_schedule_truncate_log_flush(osb, 0); + } + + if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { + if (wait) + log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + target); + } + return 0; +} + +static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + mlog_entry_void(); + + new = ocfs2_iget(osb, osb->root_blkno); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->root_inode = new; + + new = ocfs2_iget(osb, osb->system_dir_blkno); + if (IS_ERR(new)) { + status = PTR_ERR(new); + mlog_errno(status); + goto bail; + } + osb->sys_root_inode = new; + + for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; + i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + 
ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog_errno(status); + /* FIXME: Should ERROR_RO_FS */ + mlog(ML_ERROR, "Unable to load system inode %d, " + "possibly corrupt fs?", i); + goto bail; + } + // the array now has one ref, so drop this one + iput(new); + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) +{ + struct inode *new = NULL; + int status = 0; + int i; + + mlog_entry_void(); + + for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; + i < NUM_SYSTEM_INODES; + i++) { + new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); + if (!new) { + ocfs2_release_system_inodes(osb); + status = -EINVAL; + mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", + status, i, osb->slot_num); + goto bail; + } + /* the array now has one ref, so drop this one */ + iput(new); + } + +bail: + mlog_exit(status); + return status; +} + +static int ocfs2_release_system_inodes(struct ocfs2_super *osb) +{ + int status = 0, i; + struct inode *inode; + + mlog_entry_void(); + + for (i = 0; i < NUM_SYSTEM_INODES; i++) { + inode = osb->system_inodes[i]; + if (inode) { + iput(inode); + osb->system_inodes[i] = NULL; + } + } + + inode = osb->sys_root_inode; + if (inode) { + iput(inode); + osb->sys_root_inode = NULL; + } + + inode = osb->root_inode; + if (inode) { + iput(inode); + osb->root_inode = NULL; + } + + mlog_exit(status); + return status; +} + +/* We're allocating fs objects, use GFP_NOFS */ +static struct inode *ocfs2_alloc_inode(struct super_block *sb) +{ + struct ocfs2_inode_info *oi; + + oi = kmem_cache_alloc(ocfs2_inode_cachep, SLAB_NOFS); + if (!oi) + return NULL; + + return &oi->vfs_inode; +} + +static void ocfs2_destroy_inode(struct inode *inode) +{ + kmem_cache_free(ocfs2_inode_cachep, OCFS2_I(inode)); +} + +/* From xfs_super.c:xfs_max_file_offset + * Copyright (c) 2000-2004 Silicon Graphics, Inc. 
+ */ +static unsigned long long ocfs2_max_file_offset(unsigned int blockshift) +{ + unsigned int pagefactor = 1; + unsigned int bitshift = BITS_PER_LONG - 1; + + /* Figure out maximum filesize, on Linux this can depend on + * the filesystem blocksize (on 32 bit platforms). + * __block_prepare_write does this in an [unsigned] long... + * page->index << (PAGE_CACHE_SHIFT - bbits) + * So, for page sized blocks (4K on 32 bit platforms), + * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is + * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) + * but for smaller blocksizes it is less (bbits = log2 bsize). + * Note1: get_block_t takes a long (implicit cast from above) + * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch + * can optionally convert the [unsigned] long from above into + * an [unsigned] long long. + */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBD) + BUG_ON(sizeof(sector_t) != 8); + pagefactor = PAGE_CACHE_SIZE; + bitshift = BITS_PER_LONG; +# else + pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); +# endif +#endif + + return (((unsigned long long)pagefactor) << bitshift) - 1; +} + +static int ocfs2_remount(struct super_block *sb, int *flags, char *data) +{ + int incompat_features; + int ret = 0; + unsigned long parsed_options; + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { + ret = -EINVAL; + goto out; + } + + if ((osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) != + (parsed_options & OCFS2_MOUNT_HB_LOCAL)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change heartbeat mode on remount\n"); + goto out; + } + + if ((osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK) != + (parsed_options & OCFS2_MOUNT_DATA_WRITEBACK)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot change data mode on remount\n"); + goto out; + } + + /* We're going to/from readonly mode. 
*/ + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + /* Lock here so the check of HARD_RO and the potential + * setting of SOFT_RO is atomic. */ + spin_lock(&osb->osb_lock); + if (osb->osb_flags & OCFS2_OSB_HARD_RO) { + mlog(ML_ERROR, "Remount on readonly device is forbidden.\n"); + ret = -EROFS; + goto unlock_osb; + } + + if (*flags & MS_RDONLY) { + mlog(0, "Going to ro mode.\n"); + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + } else { + mlog(0, "Making ro filesystem writeable.\n"); + + if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { + mlog(ML_ERROR, "Cannot remount RDWR " + "filesystem due to previous errors.\n"); + ret = -EROFS; + goto unlock_osb; + } + incompat_features = OCFS2_HAS_RO_COMPAT_FEATURE(sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP); + if (incompat_features) { + mlog(ML_ERROR, "Cannot remount RDWR because " + "of unsupported optional features " + "(%x).\n", incompat_features); + ret = -EINVAL; + goto unlock_osb; + } + sb->s_flags &= ~MS_RDONLY; + osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; + } +unlock_osb: + spin_unlock(&osb->osb_lock); + } + + if (!ret) { + if (!ocfs2_is_hard_readonly(osb)) + ocfs2_set_journal_params(osb); + + /* Only save off the new mount options in case of a successful + * remount. */ + osb->s_mount_opt = parsed_options; + } +out: + return ret; +} + +static int ocfs2_sb_probe(struct super_block *sb, + struct buffer_head **bh, + int *sector_size) +{ + int status = 0, tmpstat; + struct ocfs1_vol_disk_hdr *hdr; + struct ocfs2_dinode *di; + int blksize; + + *bh = NULL; + + /* may be > 512 */ + *sector_size = bdev_hardsect_size(sb->s_bdev); + if (*sector_size > OCFS2_MAX_BLOCKSIZE) { + mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", + *sector_size, OCFS2_MAX_BLOCKSIZE); + status = -EINVAL; + goto bail; + } + + /* Can this really happen? 
*/ + if (*sector_size < OCFS2_MIN_BLOCKSIZE) + *sector_size = OCFS2_MIN_BLOCKSIZE; + + /* check block zero for old format */ + status = ocfs2_get_sector(sb, bh, 0, *sector_size); + if (status < 0) { + mlog_errno(status); + goto bail; + } + hdr = (struct ocfs1_vol_disk_hdr *) (*bh)->b_data; + if (hdr->major_version == OCFS1_MAJOR_VERSION) { + mlog(ML_ERROR, "incompatible version: %u.%u\n", + hdr->major_version, hdr->minor_version); + status = -EINVAL; + } + if (memcmp(hdr->signature, OCFS1_VOLUME_SIGNATURE, + strlen(OCFS1_VOLUME_SIGNATURE)) == 0) { + mlog(ML_ERROR, "incompatible volume signature: %8s\n", + hdr->signature); + status = -EINVAL; + } + brelse(*bh); + *bh = NULL; + if (status < 0) { + mlog(ML_ERROR, "This is an ocfs v1 filesystem which must be " + "upgraded before mounting with ocfs v2\n"); + goto bail; + } + + /* + * Now check at magic offset for 512, 1024, 2048, 4096 + * blocksizes. 4096 is the maximum blocksize because it is + * the minimum clustersize. + */ + status = -EINVAL; + for (blksize = *sector_size; + blksize <= OCFS2_MAX_BLOCKSIZE; + blksize <<= 1) { + tmpstat = ocfs2_get_sector(sb, bh, + OCFS2_SUPER_BLOCK_BLKNO, + blksize); + if (tmpstat < 0) { + status = tmpstat; + mlog_errno(status); + goto bail; + } + di = (struct ocfs2_dinode *) (*bh)->b_data; + status = ocfs2_verify_volume(di, *bh, blksize); + if (status >= 0) + goto bail; + brelse(*bh); + *bh = NULL; + if (status != -EAGAIN) + break; + } + +bail: + return status; +} + +static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) +{ + struct dentry *root; + int status, sector_size; + unsigned long parsed_opt; + struct inode *inode = NULL; + struct ocfs2_super *osb = NULL; + struct buffer_head *bh = NULL; + + mlog_entry("%p, %p, %i", sb, data, silent); + + /* for now we only have one cluster/node, make sure we see it + * in the heartbeat universe */ + if (!o2hb_check_local_node_heartbeating()) { + status = -EINVAL; + goto read_super_error; + } + + /* probe for superblock 
*/ + status = ocfs2_sb_probe(sb, &bh, &sector_size); + if (status < 0) { + mlog(ML_ERROR, "superblock probe failed!\n"); + goto read_super_error; + } + + status = ocfs2_initialize_super(sb, bh, sector_size); + osb = OCFS2_SB(sb); + if (status < 0) { + mlog_errno(status); + goto read_super_error; + } + brelse(bh); + bh = NULL; + + if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) { + status = -EINVAL; + goto read_super_error; + } + osb->s_mount_opt = parsed_opt; + + sb->s_magic = OCFS2_SUPER_MAGIC; + + /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + * heartbeat=none */ + if (bdev_read_only(sb->s_bdev)) { + if (!(sb->s_flags & MS_RDONLY)) { + status = -EACCES; + mlog(ML_ERROR, "Readonly device detected but readonly " + "mount was not specified.\n"); + goto read_super_error; + } + + /* You should not be able to start a local heartbeat + * on a readonly device. */ + if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { + status = -EROFS; + mlog(ML_ERROR, "Local heartbeat specified on readonly " + "device.\n"); + goto read_super_error; + } + + status = ocfs2_check_journals_nolocks(osb); + if (status < 0) { + if (status == -EROFS) + mlog(ML_ERROR, "Recovery required on readonly " + "file system, but write access is " + "unavailable.\n"); + else + mlog_errno(status); + goto read_super_error; + } + + ocfs2_set_ro_flag(osb, 1); + + printk(KERN_NOTICE "Readonly device detected. No cluster " + "services will be utilized for this mount. Recovery " + "will be skipped.\n"); + } + + if (!ocfs2_is_hard_readonly(osb)) { + /* If this isn't a hard readonly mount, then we need + * to make sure that heartbeat is in a valid state, + * and that we mark ourselves soft readonly if -oro + * was specified. 
*/ + if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { + mlog(ML_ERROR, "No heartbeat for device (%s)\n", + sb->s_id); + status = -EINVAL; + goto read_super_error; + } + + if (sb->s_flags & MS_RDONLY) + ocfs2_set_ro_flag(osb, 0); + } + + osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, + ocfs2_debugfs_root); + if (!osb->osb_debug_root) { + status = -EINVAL; + mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); + goto read_super_error; + } + + status = ocfs2_mount_volume(sb); + if (osb->root_inode) + inode = igrab(osb->root_inode); + + if (status < 0) + goto read_super_error; + + if (!inode) { + status = -EIO; + mlog_errno(status); + goto read_super_error; + } + + root = d_alloc_root(inode); + if (!root) { + status = -ENOMEM; + mlog_errno(status); + goto read_super_error; + } + + sb->s_root = root; + + ocfs2_complete_mount_recovery(osb); + + printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s " + "data mode.\n", + MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num, + osb->slot_num, + osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" : + "ordered"); + + atomic_set(&osb->vol_state, VOLUME_MOUNTED); + wake_up(&osb->osb_mount_event); + + mlog_exit(status); + return status; + +read_super_error: + if (bh != NULL) + brelse(bh); + + if (inode) + iput(inode); + + if (osb) { + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + ocfs2_dismount_volume(sb, 1); + } + + mlog_exit(status); + return status; +} + +static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); +} + +static struct file_system_type ocfs2_fs_type = { + .owner = THIS_MODULE, + .name = "ocfs2", + .get_sb = ocfs2_get_sb, /* is this called when we mount + * the fs? */ + .kill_sb = kill_block_super, /* set to the generic one + * right now, but do we + * need to change that? 
*/ + .fs_flags = FS_REQUIRES_DEV, + .next = NULL +}; + +static int ocfs2_parse_options(struct super_block *sb, + char *options, + unsigned long *mount_opt, + int is_remount) +{ + int status; + char *p; + + mlog_entry("remount: %d, options: \"%s\"\n", is_remount, + options ? options : "(none)"); + + *mount_opt = 0; + + if (!options) { + status = 1; + goto bail; + } + + while ((p = strsep(&options, ",")) != NULL) { + int token, option; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_hb_local: + *mount_opt |= OCFS2_MOUNT_HB_LOCAL; + break; + case Opt_hb_none: + *mount_opt &= ~OCFS2_MOUNT_HB_LOCAL; + break; + case Opt_barrier: + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option) + *mount_opt |= OCFS2_MOUNT_BARRIER; + else + *mount_opt &= ~OCFS2_MOUNT_BARRIER; + break; + case Opt_intr: + *mount_opt &= ~OCFS2_MOUNT_NOINTR; + break; + case Opt_nointr: + *mount_opt |= OCFS2_MOUNT_NOINTR; + break; + case Opt_err_panic: + *mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_err_ro: + *mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + break; + case Opt_data_ordered: + *mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; + break; + case Opt_data_writeback: + *mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; + break; + default: + mlog(ML_ERROR, + "Unrecognized mount option \"%s\" " + "or missing value\n", p); + status = 0; + goto bail; + } + } + + status = 1; + +bail: + mlog_exit(status); + return status; +} + +static int __init ocfs2_init(void) +{ + int status; + + mlog_entry_void(); + + ocfs2_print_version(); + + if (init_ocfs2_extent_maps()) + return -ENOMEM; + + status = init_ocfs2_uptodate_cache(); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_initialize_mem_caches(); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + ocfs2_wq = create_singlethread_workqueue("ocfs2_wq"); + if (!ocfs2_wq) { + status = -ENOMEM; + goto leave; + } + + 
spin_lock(&ocfs2_globals_lock); + osb_id = 0; + spin_unlock(&ocfs2_globals_lock); + + ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); + if (!ocfs2_debugfs_root) { + status = -EFAULT; + mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); + } + +leave: + if (status < 0) { + ocfs2_free_mem_caches(); + exit_ocfs2_uptodate_cache(); + exit_ocfs2_extent_maps(); + } + + mlog_exit(status); + + if (status >= 0) { + return register_filesystem(&ocfs2_fs_type); + } else + return -1; +} + +static void __exit ocfs2_exit(void) +{ + mlog_entry_void(); + + if (ocfs2_wq) { + flush_workqueue(ocfs2_wq); + destroy_workqueue(ocfs2_wq); + } + + debugfs_remove(ocfs2_debugfs_root); + + ocfs2_free_mem_caches(); + + unregister_filesystem(&ocfs2_fs_type); + + exit_ocfs2_extent_maps(); + + exit_ocfs2_uptodate_cache(); + + mlog_exit_void(); +} + +static void ocfs2_put_super(struct super_block *sb) +{ + mlog_entry("(0x%p)\n", sb); + + ocfs2_sync_blockdev(sb); + ocfs2_dismount_volume(sb, 0); + + mlog_exit_void(); +} + +static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) +{ + struct ocfs2_super *osb; + u32 numbits, freebits; + int status; + struct ocfs2_dinode *bm_lock; + struct buffer_head *bh = NULL; + struct inode *inode = NULL; + + mlog_entry("(%p, %p)\n", sb, buf); + + osb = OCFS2_SB(sb); + + inode = ocfs2_get_system_file_inode(osb, + GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + mlog(ML_ERROR, "failed to get bitmap inode\n"); + status = -EIO; + goto bail; + } + + status = ocfs2_meta_lock(inode, NULL, &bh, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + bm_lock = (struct ocfs2_dinode *) bh->b_data; + + numbits = le32_to_cpu(bm_lock->id1.bitmap1.i_total); + freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); + + buf->f_type = OCFS2_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_namelen = OCFS2_MAX_FILENAME_LEN; + buf->f_blocks = ((sector_t) numbits) * + (osb->s_clustersize >> osb->sb->s_blocksize_bits); 
+ buf->f_bfree = ((sector_t) freebits) * + (osb->s_clustersize >> osb->sb->s_blocksize_bits); + buf->f_bavail = buf->f_bfree; + buf->f_files = numbits; + buf->f_ffree = freebits; + + brelse(bh); + + ocfs2_meta_unlock(inode, 0); + status = 0; +bail: + if (inode) + iput(inode); + + mlog_exit(status); + + return status; +} + +static void ocfs2_inode_init_once(void *data, + kmem_cache_t *cachep, + unsigned long flags) +{ + struct ocfs2_inode_info *oi = data; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + oi->ip_flags = 0; + oi->ip_open_count = 0; + spin_lock_init(&oi->ip_lock); + ocfs2_extent_map_init(&oi->vfs_inode); + INIT_LIST_HEAD(&oi->ip_handle_list); + INIT_LIST_HEAD(&oi->ip_io_markers); + oi->ip_handle = NULL; + oi->ip_created_trans = 0; + oi->ip_last_trans = 0; + oi->ip_dir_start_lookup = 0; + + init_rwsem(&oi->ip_alloc_sem); + init_MUTEX(&(oi->ip_io_sem)); + + oi->ip_blkno = 0ULL; + oi->ip_clusters = 0; + + ocfs2_lock_res_init_once(&oi->ip_rw_lockres); + ocfs2_lock_res_init_once(&oi->ip_meta_lockres); + ocfs2_lock_res_init_once(&oi->ip_data_lockres); + + ocfs2_metadata_cache_init(&oi->vfs_inode); + + inode_init_once(&oi->vfs_inode); + } +} + +static int ocfs2_initialize_mem_caches(void) +{ + ocfs2_inode_cachep = kmem_cache_create("ocfs2_inode_cache", + sizeof(struct ocfs2_inode_info), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, + ocfs2_inode_init_once, NULL); + if (!ocfs2_inode_cachep) + return -ENOMEM; + + ocfs2_lock_cache = kmem_cache_create("ocfs2_lock", + sizeof(struct ocfs2_journal_lock), + 0, + SLAB_NO_REAP|SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!ocfs2_lock_cache) + return -ENOMEM; + + return 0; +} + +static void ocfs2_free_mem_caches(void) +{ + if (ocfs2_inode_cachep) + kmem_cache_destroy(ocfs2_inode_cachep); + if (ocfs2_lock_cache) + kmem_cache_destroy(ocfs2_lock_cache); + + ocfs2_inode_cachep = NULL; + ocfs2_lock_cache = NULL; +} + +static int ocfs2_get_sector(struct super_block *sb, + struct buffer_head 
**bh, + int block, + int sect_size) +{ + if (!sb_set_blocksize(sb, sect_size)) { + mlog(ML_ERROR, "unable to set blocksize\n"); + return -EIO; + } + + *bh = sb_getblk(sb, block); + if (!*bh) { + mlog_errno(-EIO); + return -EIO; + } + lock_buffer(*bh); + if (!buffer_dirty(*bh)) + clear_buffer_uptodate(*bh); + unlock_buffer(*bh); + ll_rw_block(READ, 1, bh); + wait_on_buffer(*bh); + return 0; +} + +/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */ +static int ocfs2_fill_local_node_info(struct ocfs2_super *osb) +{ + int status; + + /* XXX hold a ref on the node while mounted? easy enough, if + * desirable. */ + osb->node_num = o2nm_this_node(); + if (osb->node_num == O2NM_MAX_NODES) { + mlog(ML_ERROR, "could not find this host's node number\n"); + status = -ENOENT; + goto bail; + } + + mlog(ML_NOTICE, "I am node %d\n", osb->node_num); + + status = 0; +bail: + return status; +} + +static int ocfs2_mount_volume(struct super_block *sb) +{ + int status = 0; + int unlock_super = 0; + struct ocfs2_super *osb = OCFS2_SB(sb); + + mlog_entry_void(); + + if (ocfs2_is_hard_readonly(osb)) + goto leave; + + status = ocfs2_fill_local_node_info(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_register_hb_callbacks(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_dlm_init(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* requires vote_thread to be running. */ + status = ocfs2_register_net_handlers(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_super_lock(osb, 1); + if (status < 0) { + mlog_errno(status); + goto leave; + } + unlock_super = 1; + + /* This will load up the node map and add ourselves to it. 
*/ + status = ocfs2_find_slot(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + ocfs2_populate_mounted_map(osb); + + /* load all node-local system inodes */ + status = ocfs2_init_local_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_check_volume(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_truncate_log_init(osb); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* This should be sent *after* we recovered our journal as it + * will cause other nodes to unmark us as needing + * recovery. However, we need to send it *before* dropping the + * super block lock as otherwise their recovery threads might + * try to clean us up while we're live! */ + status = ocfs2_request_mount_vote(osb); + if (status < 0) + mlog_errno(status); + +leave: + if (unlock_super) + ocfs2_super_unlock(osb, 1); + + mlog_exit(status); + return status; +} + +/* we can't grab the goofy sem lock from inside wait_event, so we use + * memory barriers to make sure that we'll see the null task before + * being woken up */ +static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) +{ + mb(); + return osb->recovery_thread_task != NULL; +} + +static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) +{ + int tmp; + struct ocfs2_super *osb = NULL; + + mlog_entry("(0x%p)\n", sb); + + BUG_ON(!sb); + osb = OCFS2_SB(sb); + BUG_ON(!osb); + + ocfs2_shutdown_local_alloc(osb); + + ocfs2_truncate_log_shutdown(osb); + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + down(&osb->recovery_lock); + osb->disable_recovery = 1; + up(&osb->recovery_lock); + wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); + + /* At this point, we know that no more recovery threads can be + * launched, so wait for any recovery completion work to + * complete. 
*/ + flush_workqueue(ocfs2_wq); + + ocfs2_journal_shutdown(osb); + + ocfs2_sync_blockdev(sb); + + /* No dlm means we've failed during mount, so skip all the + * steps which depended on that to complete. */ + if (osb->dlm) { + tmp = ocfs2_super_lock(osb, 1); + if (tmp < 0) { + mlog_errno(tmp); + return; + } + + tmp = ocfs2_request_umount_vote(osb); + if (tmp < 0) + mlog_errno(tmp); + + if (osb->slot_num != OCFS2_INVALID_SLOT) + ocfs2_put_slot(osb); + + ocfs2_super_unlock(osb, 1); + } + + ocfs2_release_system_inodes(osb); + + if (osb->dlm) { + ocfs2_unregister_net_handlers(osb); + + ocfs2_dlm_shutdown(osb); + } + + ocfs2_clear_hb_callbacks(osb); + + debugfs_remove(osb->osb_debug_root); + + if (!mnt_err) + ocfs2_stop_heartbeat(osb); + + atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); + + printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n", + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num); + + ocfs2_delete_osb(osb); + kfree(osb); + sb->s_dev = 0; + sb->s_fs_info = NULL; +} + +static int ocfs2_setup_osb_uuid(struct ocfs2_super *osb, const unsigned char *uuid, + unsigned uuid_bytes) +{ + int i, ret; + char *ptr; + + BUG_ON(uuid_bytes != OCFS2_VOL_UUID_LEN); + + osb->uuid_str = kcalloc(1, OCFS2_VOL_UUID_LEN * 2 + 1, GFP_KERNEL); + if (osb->uuid_str == NULL) + return -ENOMEM; + + memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN); + + for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) { + /* print with null */ + ret = snprintf(ptr, 3, "%02X", uuid[i]); + if (ret != 2) /* drop super cleans up */ + return -EINVAL; + /* then only advance past the last char */ + ptr += 2; + } + + return 0; +} + +static int ocfs2_initialize_super(struct super_block *sb, + struct buffer_head *bh, + int sector_size) +{ + int status = 0; + int i; + struct ocfs2_dinode *di = NULL; + struct inode *inode = NULL; + struct buffer_head *bitmap_bh = NULL; + struct ocfs2_journal *journal; + __le32 uuid_net_key; + struct ocfs2_super *osb; + + mlog_entry_void(); + + osb = kcalloc(1, 
sizeof(struct ocfs2_super), GFP_KERNEL); + if (!osb) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + sb->s_fs_info = osb; + sb->s_op = &ocfs2_sops; + sb->s_export_op = &ocfs2_export_ops; + sb->s_flags |= MS_NOATIME; + /* this is needed to support O_LARGEFILE */ + sb->s_maxbytes = ocfs2_max_file_offset(sb->s_blocksize_bits); + + osb->sb = sb; + /* Save off for ocfs2_rw_direct */ + osb->s_sectsize_bits = blksize_bits(sector_size); + if (!osb->s_sectsize_bits) + BUG(); + + osb->net_response_ids = 0; + spin_lock_init(&osb->net_response_lock); + INIT_LIST_HEAD(&osb->net_response_list); + + INIT_LIST_HEAD(&osb->osb_net_handlers); + init_waitqueue_head(&osb->recovery_event); + spin_lock_init(&osb->vote_task_lock); + init_waitqueue_head(&osb->vote_event); + osb->vote_work_sequence = 0; + osb->vote_wake_sequence = 0; + INIT_LIST_HEAD(&osb->blocked_lock_list); + osb->blocked_lock_count = 0; + INIT_LIST_HEAD(&osb->vote_list); + spin_lock_init(&osb->osb_lock); + + atomic_set(&osb->alloc_stats.moves, 0); + atomic_set(&osb->alloc_stats.local_data, 0); + atomic_set(&osb->alloc_stats.bitmap_data, 0); + atomic_set(&osb->alloc_stats.bg_allocs, 0); + atomic_set(&osb->alloc_stats.bg_extends, 0); + + ocfs2_init_node_maps(osb); + + snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", + MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + + init_MUTEX(&osb->recovery_lock); + + osb->disable_recovery = 0; + osb->recovery_thread_task = NULL; + + init_waitqueue_head(&osb->checkpoint_event); + atomic_set(&osb->needs_checkpoint, 0); + + osb->node_num = O2NM_INVALID_NODE_NUM; + osb->slot_num = OCFS2_INVALID_SLOT; + + osb->local_alloc_state = OCFS2_LA_UNUSED; + osb->local_alloc_bh = NULL; + + ocfs2_setup_hb_callbacks(osb); + + init_waitqueue_head(&osb->osb_mount_event); + + osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); + if (!osb->vol_label) { + mlog(ML_ERROR, "unable to alloc vol label\n"); + status = -ENOMEM; + goto bail; + } + + osb->uuid = 
kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL); + if (!osb->uuid) { + mlog(ML_ERROR, "unable to alloc uuid\n"); + status = -ENOMEM; + goto bail; + } + + di = (struct ocfs2_dinode *)bh->b_data; + + osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots); + if (osb->max_slots > OCFS2_MAX_SLOTS || osb->max_slots == 0) { + mlog(ML_ERROR, "Invalid number of node slots (%u)\n", + osb->max_slots); + status = -EINVAL; + goto bail; + } + mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots); + + osb->s_feature_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); + osb->s_feature_ro_compat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_ro_compat); + osb->s_feature_incompat = + le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_incompat); + + if ((i = OCFS2_HAS_INCOMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_INCOMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount because of unsupported " + "optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + if (!(osb->sb->s_flags & MS_RDONLY) && + (i = OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, ~OCFS2_FEATURE_RO_COMPAT_SUPP))) { + mlog(ML_ERROR, "couldn't mount RDWR because of " + "unsupported optional features (%x).\n", i); + status = -EINVAL; + goto bail; + } + + get_random_bytes(&osb->s_next_generation, sizeof(u32)); + + /* FIXME + * This should be done in ocfs2_journal_init(), but unknown + * ordering issues will cause the filesystem to crash. + * If anyone wants to figure out what part of the code + * refers to osb->journal before ocfs2_journal_init() is run, + * be my guest. 
+ */ + /* initialize our journal structure */ + + journal = kcalloc(1, sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto bail; + } + osb->journal = journal; + journal->j_osb = osb; + + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = (unsigned long) 1; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery, osb); + journal->j_state = OCFS2_JOURNAL_FREE; + + /* get some pseudo constants for clustersize bits */ + osb->s_clustersize_bits = + le32_to_cpu(di->id2.i_super.s_clustersize_bits); + osb->s_clustersize = 1 << osb->s_clustersize_bits; + mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits); + + if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || + osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { + mlog(ML_ERROR, "Volume has invalid cluster size (%d)\n", + osb->s_clustersize); + status = -EINVAL; + goto bail; + } + + if (ocfs2_clusters_to_blocks(osb->sb, le32_to_cpu(di->i_clusters) - 1) + > (u32)~0UL) { + mlog(ML_ERROR, "Volume might try to write to blocks beyond " + "what jbd can address in 32 bits.\n"); + status = -EINVAL; + goto bail; + } + + if (ocfs2_setup_osb_uuid(osb, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid))) { + mlog(ML_ERROR, "Out of memory trying to setup our uuid.\n"); + status = -ENOMEM; + goto bail; + } + + /* The net key is taken from the leading bytes of the volume + * uuid. Copy from the start of the buffer explicitly instead of + * relying on the scratch variable 'i' still being zero, and + * bound the copy by the destination's size. 
*/ + memcpy(&uuid_net_key, osb->uuid, sizeof(uuid_net_key)); + osb->net_key = le32_to_cpu(uuid_net_key); + + strncpy(osb->vol_label, di->id2.i_super.s_label, 63); + osb->vol_label[63] = '\0'; + osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); + osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); + osb->first_cluster_group_blkno = + le64_to_cpu(di->id2.i_super.s_first_cluster_group); + osb->fs_generation = 
le32_to_cpu(di->i_fs_generation); + mlog(0, "vol_label: %s\n", osb->vol_label); + mlog(0, "uuid: %s\n", osb->uuid_str); + mlog(0, "root_blkno=%"MLFu64", system_dir_blkno=%"MLFu64"\n", + osb->root_blkno, osb->system_dir_blkno); + + osb->osb_dlm_debug = ocfs2_new_dlm_debug(); + if (!osb->osb_dlm_debug) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + atomic_set(&osb->vol_state, VOLUME_INIT); + + /* load root, system_dir, and all global system inodes */ + status = ocfs2_init_global_system_inodes(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* + * global bitmap + */ + inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, + OCFS2_INVALID_SLOT); + if (!inode) { + status = -EINVAL; + mlog_errno(status); + goto bail; + } + + osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; + + status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, + inode); + iput(inode); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + di = (struct ocfs2_dinode *) bitmap_bh->b_data; + osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); + osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total); + brelse(bitmap_bh); + mlog(0, "cluster bitmap inode: %"MLFu64", clusters per group: %u\n", + osb->bitmap_blkno, osb->bitmap_cpg); + + status = ocfs2_init_slot_info(osb); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + /* Link this osb onto the global linked list of all osb structures. */ + /* The Global Link List is mainted for the whole driver . 
*/ + spin_lock(&ocfs2_globals_lock); + osb->osb_id = osb_id; + if (osb_id < OCFS2_MAX_OSB_ID) + osb_id++; + else { + mlog(ML_ERROR, "Too many volumes mounted\n"); + status = -ENOMEM; + } + spin_unlock(&ocfs2_globals_lock); + +bail: + mlog_exit(status); + return status; +} + +/* + * will return: -EAGAIN if it is ok to keep searching for superblocks + * -EINVAL if there is a bad superblock + * 0 on success + */ +static int ocfs2_verify_volume(struct ocfs2_dinode *di, + struct buffer_head *bh, + u32 blksz) +{ + int status = -EAGAIN; + + mlog_entry_void(); + + if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, + strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { + status = -EINVAL; + if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size: found %u, should be %u\n", + 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), + blksz); + } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != + OCFS2_MAJOR_REV_LEVEL || + le16_to_cpu(di->id2.i_super.s_minor_rev_level) != + OCFS2_MINOR_REV_LEVEL) { + mlog(ML_ERROR, "found superblock with bad version: " + "found %u.%u, should be %u.%u\n", + le16_to_cpu(di->id2.i_super.s_major_rev_level), + le16_to_cpu(di->id2.i_super.s_minor_rev_level), + OCFS2_MAJOR_REV_LEVEL, + OCFS2_MINOR_REV_LEVEL); + } else if (bh->b_blocknr != le64_to_cpu(di->i_blkno)) { + /* i_blkno is little-endian on disk; convert it before + * logging so the reported value matches the one that + * was just compared. */ + mlog(ML_ERROR, "bad block number on superblock: " + "found %"MLFu64", should be %llu\n", + le64_to_cpu(di->i_blkno), (unsigned long long)bh->b_blocknr); + } else if (le32_to_cpu(di->id2.i_super.s_clustersize_bits) < 12 || + le32_to_cpu(di->id2.i_super.s_clustersize_bits) > 20) { + mlog(ML_ERROR, "bad cluster size found: %u\n", + 1 << le32_to_cpu(di->id2.i_super.s_clustersize_bits)); + } else if (!le64_to_cpu(di->id2.i_super.s_root_blkno)) { + mlog(ML_ERROR, "bad root_blkno: 0\n"); + } else if (!le64_to_cpu(di->id2.i_super.s_system_dir_blkno)) { + mlog(ML_ERROR, "bad system_dir_blkno: 0\n"); + } else if 
(le16_to_cpu(di->id2.i_super.s_max_slots) > OCFS2_MAX_SLOTS) { + mlog(ML_ERROR, + "Superblock slots found greater than file system " + "maximum: found %u, max %u\n", + le16_to_cpu(di->id2.i_super.s_max_slots), + OCFS2_MAX_SLOTS); + } else { + /* found it! */ + status = 0; + } + } + + mlog_exit(status); + return status; +} + +static int ocfs2_check_volume(struct ocfs2_super *osb) +{ + int status = 0; + int dirty; + struct ocfs2_dinode *local_alloc = NULL; /* only used if we + * recover + * ourselves. */ + + mlog_entry_void(); + + /* Init our journal object. */ + status = ocfs2_journal_init(osb->journal, &dirty); + if (status < 0) { + mlog(ML_ERROR, "Could not initialize journal!\n"); + goto finally; + } + + /* If the journal was unmounted cleanly then we don't want to + * recover anything. Otherwise, journal_load will do that + * dirty work for us :) */ + if (!dirty) { + status = ocfs2_journal_wipe(osb->journal, 0); + if (status < 0) { + mlog_errno(status); + goto finally; + } + } else { + mlog(ML_NOTICE, "File system was not unmounted cleanly, " + "recovering volume.\n"); + } + + /* will play back anything left in the journal. */ + ocfs2_journal_load(osb->journal); + + if (dirty) { + /* recover my local alloc if we didn't unmount cleanly. */ + status = ocfs2_begin_local_alloc_recovery(osb, + osb->slot_num, + &local_alloc); + if (status < 0) { + mlog_errno(status); + goto finally; + } + /* we complete the recovery process after we've marked + * ourselves as mounted. */ + } + + mlog(0, "Journal loaded.\n"); + + status = ocfs2_load_local_alloc(osb); + if (status < 0) { + mlog_errno(status); + goto finally; + } + + if (dirty) { + /* Recovery will be completed after we've mounted the + * rest of the volume. */ + osb->dirty = 1; + osb->local_alloc_copy = local_alloc; + local_alloc = NULL; + } + + /* go through each journal, trylock it and if you get the + * lock, and it's marked as dirty, set the bit in the recover + * map and launch a recovery thread for it. 
*/ + status = ocfs2_mark_dead_nodes(osb); + if (status < 0) + mlog_errno(status); + +finally: + if (local_alloc) + kfree(local_alloc); + + mlog_exit(status); + return status; +} + +/* + * The routine gets called from dismount or close whenever a dismount on + * volume is requested and the osb open count becomes 1. + * It will remove the osb from the global list and also free up all the + * initialized resources and fileobject. + */ +static void ocfs2_delete_osb(struct ocfs2_super *osb) +{ + mlog_entry_void(); + + /* This function assumes that the caller has the main osb resource */ + + if (osb->slot_info) + ocfs2_free_slot_info(osb->slot_info); + + /* FIXME + * This belongs in journal shutdown, but because we have to + * allocate osb->journal at the start of ocfs2_initalize_osb(), + * we free it here. + */ + kfree(osb->journal); + if (osb->local_alloc_copy) + kfree(osb->local_alloc_copy); + kfree(osb->uuid_str); + ocfs2_put_dlm_debug(osb->osb_dlm_debug); + memset(osb, 0, sizeof(struct ocfs2_super)); + + mlog_exit_void(); +} + +/* Put OCFS2 into a readonly state, or (if the user specifies it), + * panic(). We do not support continue-on-error operation. */ +static void ocfs2_handle_error(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + + ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return; + + printk(KERN_CRIT "File system is now read-only due to the potential " + "of on-disk corruption. Please run fsck.ocfs2 once the file " + "system is unmounted.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); +} + +static char error_buf[1024]; + +void __ocfs2_error(struct super_block *sb, + const char *function, + const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + vsprintf(error_buf, fmt, args); + va_end(args); + + /* Not using mlog here because we want to show the actual + * function the error came from. */ + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + ocfs2_handle_error(sb); +} + +/* Handle critical errors. This is intentionally more drastic than + * ocfs2_handle_error, so we only use for things like journal errors, + * etc. */ +void __ocfs2_abort(struct super_block* sb, + const char *function, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vsprintf(error_buf, fmt, args); + va_end(args); + + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", + sb->s_id, function, error_buf); + + /* We don't have the cluster support yet to go straight to + * hard readonly in here. Until then, we want to keep + * ocfs2_abort() so that we can at least mark critical + * errors. + * + * TODO: This should abort the journal and alert other nodes + * that our slot needs recovery. */ + + /* Force a panic(). This stinks, but it's better than letting + * things continue without having a proper hard readonly + * here. */ + OCFS2_SB(sb)->s_mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; + ocfs2_handle_error(sb); +} + +module_init(ocfs2_init); +module_exit(ocfs2_exit); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h new file mode 100644 index 000000000000..c564177dfbdc --- /dev/null +++ b/fs/ocfs2/super.h @@ -0,0 +1,44 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * super.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SUPER_H +#define OCFS2_SUPER_H + +extern struct workqueue_struct *ocfs2_wq; + +int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, + int node_num); + +void __ocfs2_error(struct super_block *sb, + const char *function, + const char *fmt, ...); +#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) + +void __ocfs2_abort(struct super_block *sb, + const char *function, + const char *fmt, ...); +#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) + +#endif /* OCFS2_SUPER_H */ diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c new file mode 100644 index 000000000000..f6986bd79e75 --- /dev/null +++ b/fs/ocfs2/symlink.c @@ -0,0 +1,180 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * linux/cluster/ssi/cfs/symlink.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE + * or NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to ssic-linux-devel@lists.sourceforge.net + * + * Copyright (C) 1992 Rick Sladkey + * + * Optimization changes Copyright (C) 1994 Florian La Roche + * + * Jun 7 1999, cache symlink lookups in the page cache. -DaveM + * + * Portions Copyright (C) 2001 Compaq Computer Corporation + * + * ocfs2 symlink handling code. + * + * Copyright (C) 2004, 2005 Oracle. + * + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/utsname.h> + +#define MLOG_MASK_PREFIX ML_NAMEI +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "file.h" +#include "inode.h" +#include "journal.h" +#include "symlink.h" + +#include "buffer_head_io.h" + +static char *ocfs2_page_getlink(struct dentry * dentry, + struct page **ppage); +static char *ocfs2_fast_symlink_getlink(struct inode *inode, + struct buffer_head **bh); + +/* get the link contents into pagecache */ +static char *ocfs2_page_getlink(struct dentry * dentry, + struct page **ppage) +{ + struct page * page; + struct address_space *mapping = dentry->d_inode->i_mapping; + page = read_cache_page(mapping, 0, + (filler_t *)mapping->a_ops->readpage, NULL); + if (IS_ERR(page)) + goto sync_fail; + wait_on_page_locked(page); + if (!PageUptodate(page)) + goto async_fail; + *ppage = page; + return kmap(page); + +async_fail: + page_cache_release(page); + return ERR_PTR(-EIO); + +sync_fail: + return (char*)page; +} + +static char *ocfs2_fast_symlink_getlink(struct inode *inode, + struct buffer_head **bh) +{ + int status; + char *link = NULL; + struct ocfs2_dinode *fe; + + mlog_entry_void(); + + status = ocfs2_read_block(OCFS2_SB(inode->i_sb), + OCFS2_I(inode)->ip_blkno, + bh, + OCFS2_BH_CACHED, + inode); + if (status < 0) 
{ + mlog_errno(status); + link = ERR_PTR(status); + goto bail; + } + + fe = (struct ocfs2_dinode *) (*bh)->b_data; + link = (char *) fe->id2.i_symlink; +bail: + mlog_exit(status); + + return link; +} + +static int ocfs2_readlink(struct dentry *dentry, + char __user *buffer, + int buflen) +{ + int ret; + char *link; + struct buffer_head *bh = NULL; + struct inode *inode = dentry->d_inode; + + mlog_entry_void(); + + link = ocfs2_fast_symlink_getlink(inode, &bh); + if (IS_ERR(link)) { + ret = PTR_ERR(link); + goto out; + } + + ret = vfs_readlink(dentry, buffer, buflen, link); + + brelse(bh); +out: + mlog_exit(ret); + return ret; +} + +static void *ocfs2_follow_link(struct dentry *dentry, + struct nameidata *nd) +{ + int status; + char *link; + struct inode *inode = dentry->d_inode; + struct page *page = NULL; + struct buffer_head *bh = NULL; + + if (ocfs2_inode_is_fast_symlink(inode)) + link = ocfs2_fast_symlink_getlink(inode, &bh); + else + link = ocfs2_page_getlink(dentry, &page); + if (IS_ERR(link)) { + status = PTR_ERR(link); + mlog_errno(status); + goto bail; + } + + status = vfs_follow_link(nd, link); + if (status) + mlog_errno(status); +bail: + if (page) { + kunmap(page); + page_cache_release(page); + } + if (bh) + brelse(bh); + + return ERR_PTR(status); +} + +struct inode_operations ocfs2_symlink_inode_operations = { + .readlink = page_readlink, + .follow_link = ocfs2_follow_link, + .getattr = ocfs2_getattr, +}; +struct inode_operations ocfs2_fast_symlink_inode_operations = { + .readlink = ocfs2_readlink, + .follow_link = ocfs2_follow_link, + .getattr = ocfs2_getattr, +}; diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h new file mode 100644 index 000000000000..1ea9e4d9e9eb --- /dev/null +++ b/fs/ocfs2/symlink.h @@ -0,0 +1,42 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * symlink.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_SYMLINK_H +#define OCFS2_SYMLINK_H + +extern struct inode_operations ocfs2_symlink_inode_operations; +extern struct inode_operations ocfs2_fast_symlink_inode_operations; + +/* + * Test whether an inode is a fast symlink. + */ +static inline int ocfs2_inode_is_fast_symlink(struct inode *inode) +{ + return (S_ISLNK(inode->i_mode) && + inode->i_blocks == 0); +} + + +#endif /* OCFS2_SYMLINK_H */ diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c new file mode 100644 index 000000000000..600a8bc5b541 --- /dev/null +++ b/fs/ocfs2/sysfile.c @@ -0,0 +1,131 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.c + * + * Initialize, read, write, etc. system files. + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#include "ocfs2.h" + +#define MLOG_MASK_PREFIX ML_INODE +#include <cluster/masklog.h> + +#include "alloc.h" +#include "dir.h" +#include "inode.h" +#include "journal.h" +#include "sysfile.h" + +#include "buffer_head_io.h" + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +static inline int is_global_system_inode(int type); +static inline int is_in_system_inode_array(struct ocfs2_super *osb, + int type, + u32 slot); + +static inline int is_global_system_inode(int type) +{ + return type >= OCFS2_FIRST_ONLINE_SYSTEM_INODE && + type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; +} + +static inline int is_in_system_inode_array(struct ocfs2_super *osb, + int type, + u32 slot) +{ + return slot == osb->slot_num || is_global_system_inode(type); +} + +struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ + struct inode *inode = NULL; + struct inode **arr = NULL; + + /* avoid the lookup if cached in local system file array */ + if (is_in_system_inode_array(osb, type, slot)) + arr = &(osb->system_inodes[type]); + + if (arr && ((inode = *arr) != NULL)) { + /* get a ref in addition to the array ref */ + inode = igrab(inode); + if (!inode) + BUG(); + + return inode; + } + + /* this gets one ref thru iget */ + inode = _ocfs2_get_system_file_inode(osb, type, slot); + + /* add one more if putting into array for first time */ + if (arr && inode) { + *arr = igrab(inode); + if (!*arr) + BUG(); + } + return inode; +} + +static struct inode * _ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot) +{ 
+ char namebuf[40]; + struct inode *inode = NULL; + u64 blkno; + struct buffer_head *dirent_bh = NULL; + struct ocfs2_dir_entry *de = NULL; + int status = 0; + + ocfs2_sprintf_system_inode_name(namebuf, + sizeof(namebuf), + type, slot); + + status = ocfs2_find_files_on_disk(namebuf, strlen(namebuf), + &blkno, osb->sys_root_inode, + &dirent_bh, &de); + if (status < 0) { + goto bail; + } + + inode = ocfs2_iget(osb, blkno); + if (IS_ERR(inode)) { + mlog_errno(PTR_ERR(inode)); + inode = NULL; + goto bail; + } +bail: + if (dirent_bh) + brelse(dirent_bh); + return inode; +} + diff --git a/fs/ocfs2/sysfile.h b/fs/ocfs2/sysfile.h new file mode 100644 index 000000000000..cc9ea661ffc1 --- /dev/null +++ b/fs/ocfs2/sysfile.h @@ -0,0 +1,33 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * sysfile.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_SYSFILE_H +#define OCFS2_SYSFILE_H + +struct inode * ocfs2_get_system_file_inode(struct ocfs2_super *osb, + int type, + u32 slot); + +#endif /* OCFS2_SYSFILE_H */ diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c new file mode 100644 index 000000000000..3a0458fd3e1b --- /dev/null +++ b/fs/ocfs2/uptodate.c @@ -0,0 +1,544 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.c + * + * Tracking the up-to-date-ness of a local buffer_head with respect to + * the cluster. + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Standard buffer head caching flags (uptodate, etc) are insufficient + * in a clustered environment - a buffer may be marked up to date on + * our local node but could have been modified by another cluster + * member. As a result an additional (and performant) caching scheme + * is required. A further requirement is that we consume as little + * memory as possible - we never pin buffer_head structures in order + * to cache them. + * + * We track the existence of up to date buffers on the inodes which + * are associated with them. 
Because we don't want to pin + * buffer_heads, this is only a (strong) hint and several other checks + * are made in the I/O path to ensure that we don't use a stale or + * invalid buffer without going to disk: + * - buffer_jbd is used liberally - if a bh is in the journal on + * this node then it *must* be up to date. + * - the standard buffer_uptodate() macro is used to detect buffers + * which may be invalid (even if we have an up to date tracking + * item for them) + * + * For a full understanding of how this code works together, one + * should read the callers in dlmglue.c, the I/O functions in + * buffer_head_io.c and ocfs2_journal_access in journal.c + */ + +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/buffer_head.h> +#include <linux/rbtree.h> +#include <linux/jbd.h> + +#define MLOG_MASK_PREFIX ML_UPTODATE + +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "inode.h" +#include "uptodate.h" + +struct ocfs2_meta_cache_item { + struct rb_node c_node; + sector_t c_block; +}; + +static kmem_cache_t *ocfs2_uptodate_cachep = NULL; + +void ocfs2_metadata_cache_init(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + oi->ip_flags |= OCFS2_INODE_CACHE_INLINE; + ci->ci_num_cached = 0; +} + +/* No lock taken here as 'root' is not expected to be visible to other + * processes. */ +static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root) +{ + unsigned int purged = 0; + struct rb_node *node; + struct ocfs2_meta_cache_item *item; + + while ((node = rb_last(root)) != NULL) { + item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); + + mlog(0, "Purge item %llu\n", + (unsigned long long) item->c_block); + + rb_erase(&item->c_node, root); + kmem_cache_free(ocfs2_uptodate_cachep, item); + + purged++; + } + return purged; +} + +/* Called from locking and called from ocfs2_clear_inode. 
Dump the + * cache for a given inode. + * + * This function is a few more lines longer than necessary due to some + * accounting done here, but I think it's worth tracking down those + * bugs sooner -- Mark */ +void ocfs2_metadata_cache_purge(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + unsigned int tree, to_purge, purged; + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + struct rb_root root = RB_ROOT; + + spin_lock(&oi->ip_lock); + tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE); + to_purge = ci->ci_num_cached; + + /* 'tree' is set when the cache is NOT inline, so label it as + * such in the debug output. */ + mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge, + tree ? "tree" : "array", oi->ip_blkno); + + /* If we're a tree, save off the root so that we can safely + * initialize the cache. We do the work to free tree members + * without the spinlock. */ + if (tree) + root = ci->ci_cache.ci_tree; + + ocfs2_metadata_cache_init(inode); + spin_unlock(&oi->ip_lock); + + purged = ocfs2_purge_copied_metadata_tree(&root); + /* If possible, track the number wiped so that we can more + * easily detect counting errors. Unfortunately, this is only + * meaningful for trees. */ + if (tree && purged != to_purge) + mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n", + oi->ip_blkno, to_purge, purged); +} + +/* Returns the index in the cache array, -1 if not found. + * Requires ip_lock. */ +static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci, + sector_t item) +{ + int i; + + for (i = 0; i < ci->ci_num_cached; i++) { + if (item == ci->ci_cache.ci_array[i]) + return i; + } + + return -1; +} + +/* Returns the cache item if found, otherwise NULL. + * Requires ip_lock. 
*/ +static struct ocfs2_meta_cache_item * +ocfs2_search_cache_tree(struct ocfs2_caching_info *ci, + sector_t block) +{ + struct rb_node * n = ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *item = NULL; + + while (n) { + item = rb_entry(n, struct ocfs2_meta_cache_item, c_node); + + if (block < item->c_block) + n = n->rb_left; + else if (block > item->c_block) + n = n->rb_right; + else + return item; + } + + return NULL; +} + +static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi, + struct buffer_head *bh) +{ + int index = -1; + struct ocfs2_meta_cache_item *item = NULL; + + spin_lock(&oi->ip_lock); + + mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n", + oi->ip_blkno, (unsigned long long) bh->b_blocknr, + !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)); + + if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) + index = ocfs2_search_cache_array(&oi->ip_metadata_cache, + bh->b_blocknr); + else + item = ocfs2_search_cache_tree(&oi->ip_metadata_cache, + bh->b_blocknr); + + spin_unlock(&oi->ip_lock); + + mlog(0, "index = %d, item = %p\n", index, item); + + return (index != -1) || (item != NULL); +} + +/* Warning: even if it returns true, this does *not* guarantee that + * the block is stored in our inode metadata cache. */ +int ocfs2_buffer_uptodate(struct inode *inode, + struct buffer_head *bh) +{ + /* Doesn't matter if the bh is in our cache or not -- if it's + * not marked uptodate then we know it can't have correct + * data. */ + if (!buffer_uptodate(bh)) + return 0; + + /* OCFS2 does not allow multiple nodes to be changing the same + * block at the same time. */ + if (buffer_jbd(bh)) + return 1; + + /* Ok, locally the buffer is marked as up to date, now search + * our cache to see if we can trust that. 
*/ + return ocfs2_buffer_cached(OCFS2_I(inode), bh); +} + +/* Requires ip_lock */ +static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci, + sector_t block) +{ + BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY); + + mlog(0, "block %llu takes position %u\n", (unsigned long long) block, + ci->ci_num_cached); + + ci->ci_cache.ci_array[ci->ci_num_cached] = block; + ci->ci_num_cached++; +} + +/* By now the caller should have checked that the item does *not* + * exist in the tree. + * Requires ip_lock. */ +static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *new) +{ + sector_t block = new->c_block; + struct rb_node *parent = NULL; + struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; + struct ocfs2_meta_cache_item *tmp; + + mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block, + ci->ci_num_cached); + + while(*p) { + parent = *p; + + tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node); + + if (block < tmp->c_block) + p = &(*p)->rb_left; + else if (block > tmp->c_block) + p = &(*p)->rb_right; + else { + /* This should never happen! */ + mlog(ML_ERROR, "Duplicate block %llu cached!\n", + (unsigned long long) block); + BUG(); + } + } + + rb_link_node(&new->c_node, parent, p); + rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached++; +} + +static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi, + struct ocfs2_caching_info *ci) +{ + assert_spin_locked(&oi->ip_lock); + + return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) && + (ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY); +} + +/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the + * pointers in tree after we use them - this allows caller to detect + * when to free in case of error. 
*/ +static void ocfs2_expand_cache(struct ocfs2_inode_info *oi, + struct ocfs2_meta_cache_item **tree) +{ + int i; + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY, + "Inode %"MLFu64", num cached = %u, should be %u\n", + oi->ip_blkno, ci->ci_num_cached, + OCFS2_INODE_MAX_CACHE_ARRAY); + mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), + "Inode %"MLFu64" not marked as inline anymore!\n", + oi->ip_blkno); + assert_spin_locked(&oi->ip_lock); + + /* Be careful to initialize the tree members *first* because + * once the ci_tree is used, the array is junk... */ + for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) + tree[i]->c_block = ci->ci_cache.ci_array[i]; + + oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE; + ci->ci_cache.ci_tree = RB_ROOT; + /* this will be set again by __ocfs2_insert_cache_tree */ + ci->ci_num_cached = 0; + + for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { + __ocfs2_insert_cache_tree(ci, tree[i]); + tree[i] = NULL; + } + + mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n", + oi->ip_blkno, oi->ip_flags, ci->ci_num_cached); +} + +/* Slow path function - memory allocation is necessary. See the + * comment above ocfs2_set_buffer_uptodate for more information. */ +static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, + sector_t block, + int expand_tree) +{ + int i; + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + struct ocfs2_meta_cache_item *new = NULL; + struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] = + { NULL, }; + + mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n", + oi->ip_blkno, (unsigned long long) block, expand_tree); + + new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL); + if (!new) { + mlog_errno(-ENOMEM); + return; + } + new->c_block = block; + + if (expand_tree) { + /* Do *not* allocate an array here - the removal code + * has no way of tracking that. 
*/ + for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { + tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, + GFP_KERNEL); + if (!tree[i]) { + mlog_errno(-ENOMEM); + goto out_free; + } + + /* These are initialized in ocfs2_expand_cache! */ + } + } + + spin_lock(&oi->ip_lock); + if (ocfs2_insert_can_use_array(oi, ci)) { + mlog(0, "Someone cleared the tree underneath us\n"); + /* Ok, items were removed from the cache in between + * locks. Detect this and revert back to the fast path */ + ocfs2_append_cache_array(ci, block); + spin_unlock(&oi->ip_lock); + goto out_free; + } + + if (expand_tree) + ocfs2_expand_cache(oi, tree); + + __ocfs2_insert_cache_tree(ci, new); + spin_unlock(&oi->ip_lock); + + new = NULL; +out_free: + if (new) + kmem_cache_free(ocfs2_uptodate_cachep, new); + + /* If these were used, then ocfs2_expand_cache re-set them to + * NULL for us. */ + if (tree[0]) { + for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) + if (tree[i]) + kmem_cache_free(ocfs2_uptodate_cachep, + tree[i]); + } +} + +/* Item insertion is guarded by ip_io_sem, so the insertion path takes + * advantage of this by not rechecking for a duplicate insert during + * the slow case. Additionally, if the cache needs to be bumped up to + * a tree, the code will not recheck after acquiring the lock -- + * multiple paths cannot be expanding to a tree at the same time. + * + * The slow path takes into account that items can be removed + * (including the whole tree wiped and reset) when this process it out + * allocating memory. In those cases, it reverts back to the fast + * path. + * + * Note that this function may actually fail to insert the block if + * memory cannot be allocated. 
This is not fatal however (but may + * result in a performance penalty) */ +void ocfs2_set_buffer_uptodate(struct inode *inode, + struct buffer_head *bh) +{ + int expand; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + /* The block may very well exist in our cache already, so avoid + * doing any more work in that case. */ + if (ocfs2_buffer_cached(oi, bh)) + return; + + mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno, + (unsigned long long) bh->b_blocknr); + + /* No need to recheck under spinlock - insertion is guarded by + * ip_io_sem */ + spin_lock(&oi->ip_lock); + if (ocfs2_insert_can_use_array(oi, ci)) { + /* Fast case - it's an array and there's a free + * spot. */ + ocfs2_append_cache_array(ci, bh->b_blocknr); + spin_unlock(&oi->ip_lock); + return; + } + + expand = 0; + if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { + /* We need to bump things up to a tree. */ + expand = 1; + } + spin_unlock(&oi->ip_lock); + + __ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand); +} + +/* Called against a newly allocated buffer. Most likely nobody should + * be able to read this sort of metadata while it's still being + * allocated, but this is careful to take ip_io_sem anyway. */ +void ocfs2_set_new_buffer_uptodate(struct inode *inode, + struct buffer_head *bh) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + /* This should definitely *not* exist in our cache */ + BUG_ON(ocfs2_buffer_cached(oi, bh)); + + set_buffer_uptodate(bh); + + down(&oi->ip_io_sem); + ocfs2_set_buffer_uptodate(inode, bh); + up(&oi->ip_io_sem); +} + +/* Requires ip_lock. 
*/ +static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci, + int index) +{ + sector_t *array = ci->ci_cache.ci_array; + int bytes; + + BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY); + BUG_ON(index >= ci->ci_num_cached); + BUG_ON(!ci->ci_num_cached); + + mlog(0, "remove index %d (num_cached = %u\n", index, + ci->ci_num_cached); + + ci->ci_num_cached--; + + /* don't need to copy if the array is now empty, or if we + * removed at the tail */ + if (ci->ci_num_cached && index < ci->ci_num_cached) { + bytes = sizeof(sector_t) * (ci->ci_num_cached - index); + memmove(&array[index], &array[index + 1], bytes); + } +} + +/* Requires ip_lock. */ +static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, + struct ocfs2_meta_cache_item *item) +{ + mlog(0, "remove block %llu from tree\n", + (unsigned long long) item->c_block); + + rb_erase(&item->c_node, &ci->ci_cache.ci_tree); + ci->ci_num_cached--; +} + +/* Called when we remove a chunk of metadata from an inode. We don't + * bother reverting things to an inlined array in the case of a remove + * which moves us back under the limit. 
*/ +void ocfs2_remove_from_cache(struct inode *inode, + struct buffer_head *bh) +{ + int index; + sector_t block = bh->b_blocknr; + struct ocfs2_meta_cache_item *item = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; + + spin_lock(&oi->ip_lock); + mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n", + oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached, + oi->ip_flags & OCFS2_INODE_CACHE_INLINE); + + if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) { + index = ocfs2_search_cache_array(ci, block); + if (index != -1) + ocfs2_remove_metadata_array(ci, index); + } else { + item = ocfs2_search_cache_tree(ci, block); + if (item) + ocfs2_remove_metadata_tree(ci, item); + } + spin_unlock(&oi->ip_lock); + + if (item) + kmem_cache_free(ocfs2_uptodate_cachep, item); +} + +int __init init_ocfs2_uptodate_cache(void) +{ + ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", + sizeof(struct ocfs2_meta_cache_item), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ocfs2_uptodate_cachep) + return -ENOMEM; + + mlog(0, "%u inlined cache items per inode.\n", + OCFS2_INODE_MAX_CACHE_ARRAY); + + return 0; +} + +void __exit exit_ocfs2_uptodate_cache(void) +{ + if (ocfs2_uptodate_cachep) + kmem_cache_destroy(ocfs2_uptodate_cachep); +} diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h new file mode 100644 index 000000000000..e5aacdf4eabf --- /dev/null +++ b/fs/ocfs2/uptodate.h @@ -0,0 +1,44 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * uptodate.h + * + * Cluster uptodate tracking + * + * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_UPTODATE_H +#define OCFS2_UPTODATE_H + +int __init init_ocfs2_uptodate_cache(void); +void __exit exit_ocfs2_uptodate_cache(void); + +void ocfs2_metadata_cache_init(struct inode *inode); +void ocfs2_metadata_cache_purge(struct inode *inode); + +int ocfs2_buffer_uptodate(struct inode *inode, + struct buffer_head *bh); +void ocfs2_set_buffer_uptodate(struct inode *inode, + struct buffer_head *bh); +void ocfs2_set_new_buffer_uptodate(struct inode *inode, + struct buffer_head *bh); +void ocfs2_remove_from_cache(struct inode *inode, + struct buffer_head *bh); + +#endif /* OCFS2_UPTODATE_H */ diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c new file mode 100644 index 000000000000..5405ce121c99 --- /dev/null +++ b/fs/ocfs2/ver.c @@ -0,0 +1,43 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.c + * + * version string + * + * Copyright (C) 2002, 2005 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/module.h> +#include <linux/string.h> +#include <linux/kernel.h> + +#include "ver.h" + +#define OCFS2_BUILD_VERSION "1.3.3" + +#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION + +void ocfs2_print_version(void) +{ + printk(KERN_INFO "%s\n", VERSION_STR); +} + +MODULE_DESCRIPTION(VERSION_STR); + +MODULE_VERSION(OCFS2_BUILD_VERSION); diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h new file mode 100644 index 000000000000..d7395cb91d2f --- /dev/null +++ b/fs/ocfs2/ver.h @@ -0,0 +1,31 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ver.h + * + * Function prototypes + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef OCFS2_VER_H +#define OCFS2_VER_H + +void ocfs2_print_version(void); + +#endif /* OCFS2_VER_H */ diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c new file mode 100644 index 000000000000..021978e0576b --- /dev/null +++ b/fs/ocfs2/vote.c @@ -0,0 +1,1202 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * vote.c + * + * description here + * + * Copyright (C) 2003, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/smp_lock.h> +#include <linux/kthread.h> + +#include <cluster/heartbeat.h> +#include <cluster/nodemanager.h> +#include <cluster/tcp.h> + +#include <dlm/dlmapi.h> + +#define MLOG_MASK_PREFIX ML_VOTE +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "alloc.h" +#include "dlmglue.h" +#include "extent_map.h" +#include "heartbeat.h" +#include "inode.h" +#include "journal.h" +#include "slot_map.h" +#include "vote.h" + +#include "buffer_head_io.h" + +#define OCFS2_MESSAGE_TYPE_VOTE (0x1) +#define OCFS2_MESSAGE_TYPE_RESPONSE (0x2) +struct ocfs2_msg_hdr +{ + __be32 h_response_id; /* used to lookup message handle on sending + * node. 
*/ + __be32 h_request; + __be64 h_blkno; + __be32 h_generation; + __be32 h_node_num; /* node sending this particular message. */ +}; + +/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this + * for the network. */ +#define OCFS2_VOTE_FILENAME_LEN 256 +struct ocfs2_vote_msg +{ + struct ocfs2_msg_hdr v_hdr; + union { + __be32 v_generic1; + __be32 v_orphaned_slot; /* Used during delete votes */ + __be32 v_nlink; /* Used during unlink votes */ + } md1; /* Message type dependant 1 */ + __be32 v_unlink_namelen; + __be64 v_unlink_parent; + u8 v_unlink_dirent[OCFS2_VOTE_FILENAME_LEN]; +}; + +/* Responses are given these values to maintain backwards + * compatibility with older ocfs2 versions */ +#define OCFS2_RESPONSE_OK (0) +#define OCFS2_RESPONSE_BUSY (-16) +#define OCFS2_RESPONSE_BAD_MSG (-22) + +struct ocfs2_response_msg +{ + struct ocfs2_msg_hdr r_hdr; + __be32 r_response; + __be32 r_orphaned_slot; +}; + +struct ocfs2_vote_work { + struct list_head w_list; + struct ocfs2_vote_msg w_msg; +}; + +enum ocfs2_vote_request { + OCFS2_VOTE_REQ_INVALID = 0, + OCFS2_VOTE_REQ_DELETE, + OCFS2_VOTE_REQ_UNLINK, + OCFS2_VOTE_REQ_RENAME, + OCFS2_VOTE_REQ_MOUNT, + OCFS2_VOTE_REQ_UMOUNT, + OCFS2_VOTE_REQ_LAST +}; + +static inline int ocfs2_is_valid_vote_request(int request) +{ + return OCFS2_VOTE_REQ_INVALID < request && + request < OCFS2_VOTE_REQ_LAST; +} + +typedef void (*ocfs2_net_response_callback)(void *priv, + struct ocfs2_response_msg *resp); +struct ocfs2_net_response_cb { + ocfs2_net_response_callback rc_cb; + void *rc_priv; +}; + +struct ocfs2_net_wait_ctxt { + struct list_head n_list; + u32 n_response_id; + wait_queue_head_t n_event; + struct ocfs2_node_map n_node_map; + int n_response; /* an agreggate response. 0 if + * all nodes are go, < 0 on any + * negative response from any + * node or network error. 
*/ + struct ocfs2_net_response_cb *n_callback; +}; + +static void ocfs2_process_mount_request(struct ocfs2_super *osb, + unsigned int node_num) +{ + mlog(0, "MOUNT vote from node %u\n", node_num); + /* The other node only sends us this message when he has an EX + * on the superblock, so our recovery threads (if having been + * launched) are waiting on it.*/ + ocfs2_recovery_map_clear(osb, node_num); + ocfs2_node_map_set_bit(osb, &osb->mounted_map, node_num); + + /* We clear the umount map here because a node may have been + * previously mounted, safely unmounted but never stopped + * heartbeating - in which case we'd have a stale entry. */ + ocfs2_node_map_clear_bit(osb, &osb->umount_map, node_num); +} + +static void ocfs2_process_umount_request(struct ocfs2_super *osb, + unsigned int node_num) +{ + mlog(0, "UMOUNT vote from node %u\n", node_num); + ocfs2_node_map_clear_bit(osb, &osb->mounted_map, node_num); + ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); +} + +void ocfs2_mark_inode_remotely_deleted(struct inode *inode) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + assert_spin_locked(&oi->ip_lock); + /* We set the SKIP_DELETE flag on the inode so we don't try to + * delete it in delete_inode ourselves, thus avoiding + * unecessary lock pinging. If the other node failed to wipe + * the inode as a result of a crash, then recovery will pick + * up the slack. */ + oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE; +} + +static int ocfs2_process_delete_request(struct inode *inode, + int *orphaned_slot) +{ + int response = OCFS2_RESPONSE_BUSY; + + mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n", + inode->i_ino, inode->i_nlink, *orphaned_slot); + + spin_lock(&OCFS2_I(inode)->ip_lock); + + /* Whatever our vote response is, we want to make sure that + * the orphaned slot is recorded properly on this node *and* + * on the requesting node. 
Technically, if the requesting node + * did not know which slot the inode is orphaned in but we + * respond with BUSY he doesn't actually need the orphaned + * slot, but it doesn't hurt to do it here anyway. */ + if ((*orphaned_slot) != OCFS2_INVALID_SLOT) { + mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != + OCFS2_INVALID_SLOT && + OCFS2_I(inode)->ip_orphaned_slot != + (*orphaned_slot), + "Inode %"MLFu64": This node thinks it's " + "orphaned in slot %d, messaged it's in %d\n", + OCFS2_I(inode)->ip_blkno, + OCFS2_I(inode)->ip_orphaned_slot, + *orphaned_slot); + + mlog(0, "Setting orphaned slot for inode %"MLFu64" to %d\n", + OCFS2_I(inode)->ip_blkno, *orphaned_slot); + + OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot; + } else { + mlog(0, "Sending back orphaned slot %d for inode %"MLFu64"\n", + OCFS2_I(inode)->ip_orphaned_slot, + OCFS2_I(inode)->ip_blkno); + + *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + } + + /* vote no if the file is still open. */ + if (OCFS2_I(inode)->ip_open_count) { + mlog(0, "open count = %u\n", + OCFS2_I(inode)->ip_open_count); + spin_unlock(&OCFS2_I(inode)->ip_lock); + goto done; + } + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* directories are a bit ugly... What if someone is sitting in + * it? We want to make sure the inode is removed completely as + * a result of the iput in process_vote. */ + if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) { + mlog(0, "i_count = %u\n", atomic_read(&inode->i_count)); + goto done; + } + + if (filemap_fdatawrite(inode->i_mapping)) { + mlog(ML_ERROR, "Could not sync inode %"MLFu64" for delete!\n", + OCFS2_I(inode)->ip_blkno); + goto done; + } + sync_mapping_buffers(inode->i_mapping); + truncate_inode_pages(inode->i_mapping, 0); + ocfs2_extent_map_trunc(inode, 0); + + spin_lock(&OCFS2_I(inode)->ip_lock); + /* double check open count - someone might have raced this + * thread into ocfs2_file_open while we were writing out + * data. 
If we're to allow a wipe of this inode now, we *must* + * hold the spinlock until we've marked it. */ + if (OCFS2_I(inode)->ip_open_count) { + mlog(0, "Raced to wipe! open count = %u\n", + OCFS2_I(inode)->ip_open_count); + spin_unlock(&OCFS2_I(inode)->ip_lock); + goto done; + } + + /* Mark the inode as being wiped from disk. */ + ocfs2_mark_inode_remotely_deleted(inode); + spin_unlock(&OCFS2_I(inode)->ip_lock); + + /* Not sure this is necessary anymore. */ + d_prune_aliases(inode); + + /* If we get here, then we're voting 'yes', so commit the + * delete on our side. */ + response = OCFS2_RESPONSE_OK; +done: + return response; +} + +static int ocfs2_match_dentry(struct dentry *dentry, + u64 parent_blkno, + unsigned int namelen, + const char *name) +{ + struct inode *parent; + + if (!dentry->d_parent) { + mlog(0, "Detached from parent.\n"); + return 0; + } + + parent = dentry->d_parent->d_inode; + /* Negative parent dentry? */ + if (!parent) + return 0; + + /* Name is in a different directory. */ + if (OCFS2_I(parent)->ip_blkno != parent_blkno) + return 0; + + if (dentry->d_name.len != namelen) + return 0; + + /* comparison above guarantees this is safe. */ + if (memcmp(dentry->d_name.name, name, namelen)) + return 0; + + return 1; +} + +static void ocfs2_process_dentry_request(struct inode *inode, + int rename, + unsigned int new_nlink, + u64 parent_blkno, + unsigned int namelen, + const char *name) +{ + struct dentry *dentry = NULL; + struct list_head *p; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + mlog(0, "parent %"MLFu64", namelen = %u, name = %.*s\n", parent_blkno, + namelen, namelen, name); + + spin_lock(&dcache_lock); + + /* Another node is removing this name from the system. It is + * up to us to find the corresponding dentry and if it exists, + * unhash it from the dcache. 
*/ + list_for_each(p, &inode->i_dentry) { + dentry = list_entry(p, struct dentry, d_alias); + + if (ocfs2_match_dentry(dentry, parent_blkno, namelen, name)) { + mlog(0, "dentry found: %.*s\n", + dentry->d_name.len, dentry->d_name.name); + + dget_locked(dentry); + break; + } + + dentry = NULL; + } + + spin_unlock(&dcache_lock); + + if (dentry) { + d_delete(dentry); + dput(dentry); + } + + /* rename votes don't send link counts */ + if (!rename) { + mlog(0, "new_nlink = %u\n", new_nlink); + + /* We don't have the proper locks here to directly + * change i_nlink and besides, the vote is sent + * *before* the operation so it may have failed on the + * other node. This passes a hint to ocfs2_drop_inode + * to force ocfs2_delete_inode, who will take the + * proper cluster locks to sort things out. */ + if (new_nlink == 0) { + spin_lock(&oi->ip_lock); + oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; + spin_unlock(&OCFS2_I(inode)->ip_lock); + } + } +} + +static void ocfs2_process_vote(struct ocfs2_super *osb, + struct ocfs2_vote_msg *msg) +{ + int net_status, vote_response; + int orphaned_slot = 0; + int rename = 0; + unsigned int node_num, generation, new_nlink, namelen; + u64 blkno, parent_blkno; + enum ocfs2_vote_request request; + struct inode *inode = NULL; + struct ocfs2_msg_hdr *hdr = &msg->v_hdr; + struct ocfs2_response_msg response; + + /* decode the network mumbo jumbo into local variables. 
*/ + request = be32_to_cpu(hdr->h_request); + blkno = be64_to_cpu(hdr->h_blkno); + generation = be32_to_cpu(hdr->h_generation); + node_num = be32_to_cpu(hdr->h_node_num); + if (request == OCFS2_VOTE_REQ_DELETE) + orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot); + + mlog(0, "processing vote: request = %u, blkno = %"MLFu64", " + "generation = %u, node_num = %u, priv1 = %u\n", request, + blkno, generation, node_num, be32_to_cpu(msg->md1.v_generic1)); + + if (!ocfs2_is_valid_vote_request(request)) { + mlog(ML_ERROR, "Invalid vote request %d from node %u\n", + request, node_num); + vote_response = OCFS2_RESPONSE_BAD_MSG; + goto respond; + } + + vote_response = OCFS2_RESPONSE_OK; + + switch (request) { + case OCFS2_VOTE_REQ_UMOUNT: + ocfs2_process_umount_request(osb, node_num); + goto respond; + case OCFS2_VOTE_REQ_MOUNT: + ocfs2_process_mount_request(osb, node_num); + goto respond; + default: + /* avoids a gcc warning */ + break; + } + + /* We cannot process the remaining message types before we're + * fully mounted. It's perfectly safe however to send a 'yes' + * response as we can't possibly have any of the state they're + * asking us to modify yet. */ + if (atomic_read(&osb->vol_state) == VOLUME_INIT) + goto respond; + + /* If we get here, then the request is against an inode. */ + inode = ocfs2_ilookup_for_vote(osb, blkno, + request == OCFS2_VOTE_REQ_DELETE); + + /* Not finding the inode is perfectly valid - it means we're + * not interested in what the other node is about to do to it + * so in those cases we automatically respond with an + * affirmative. Cluster locking ensures that we won't race + * interest in the inode with this vote request. */ + if (!inode) + goto respond; + + /* Check generation values. It's possible for us to get a + * request against a stale inode. If so then we proceed as if + * we had not found an inode in the first place. 
*/ + if (inode->i_generation != generation) { + mlog(0, "generation passed %u != inode generation = %u, " + "ip_flags = %x, ip_blkno = %"MLFu64", msg %"MLFu64", " + "i_count = %u, message type = %u\n", + generation, inode->i_generation, OCFS2_I(inode)->ip_flags, + OCFS2_I(inode)->ip_blkno, blkno, + atomic_read(&inode->i_count), request); + iput(inode); + inode = NULL; + goto respond; + } + + switch (request) { + case OCFS2_VOTE_REQ_DELETE: + vote_response = ocfs2_process_delete_request(inode, + &orphaned_slot); + break; + case OCFS2_VOTE_REQ_RENAME: + rename = 1; + /* fall through */ + case OCFS2_VOTE_REQ_UNLINK: + parent_blkno = be64_to_cpu(msg->v_unlink_parent); + namelen = be32_to_cpu(msg->v_unlink_namelen); + /* new_nlink will be ignored in case of a rename vote */ + new_nlink = be32_to_cpu(msg->md1.v_nlink); + ocfs2_process_dentry_request(inode, rename, new_nlink, + parent_blkno, namelen, + msg->v_unlink_dirent); + break; + default: + mlog(ML_ERROR, "node %u, invalid request: %u\n", + node_num, request); + vote_response = OCFS2_RESPONSE_BAD_MSG; + } + +respond: + /* Response struture is small so we just put it on the stack + * and stuff it inline. */ + memset(&response, 0, sizeof(struct ocfs2_response_msg)); + response.r_hdr.h_response_id = hdr->h_response_id; + response.r_hdr.h_blkno = hdr->h_blkno; + response.r_hdr.h_generation = hdr->h_generation; + response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); + response.r_response = cpu_to_be32(vote_response); + response.r_orphaned_slot = cpu_to_be32(orphaned_slot); + + net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, + osb->net_key, + &response, + sizeof(struct ocfs2_response_msg), + node_num, + NULL); + /* We still want to error print for ENOPROTOOPT here. 
The + * sending node shouldn't have unregistered his net handler + * without sending an unmount vote 1st */ + if (net_status < 0 + && net_status != -ETIMEDOUT + && net_status != -ENOTCONN) + mlog(ML_ERROR, "message to node %u fails with error %d!\n", + node_num, net_status); + + if (inode) + iput(inode); +} + +static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) +{ + unsigned long processed; + struct ocfs2_lock_res *lockres; + struct ocfs2_vote_work *work; + + mlog_entry_void(); + + spin_lock(&osb->vote_task_lock); + /* grab this early so we know to try again if a state change and + * wake happens part-way through our work */ + osb->vote_work_sequence = osb->vote_wake_sequence; + + processed = osb->blocked_lock_count; + while (processed) { + BUG_ON(list_empty(&osb->blocked_lock_list)); + + lockres = list_entry(osb->blocked_lock_list.next, + struct ocfs2_lock_res, l_blocked_list); + list_del_init(&lockres->l_blocked_list); + osb->blocked_lock_count--; + spin_unlock(&osb->vote_task_lock); + + BUG_ON(!processed); + processed--; + + ocfs2_process_blocked_lock(osb, lockres); + + spin_lock(&osb->vote_task_lock); + } + + while (osb->vote_count) { + BUG_ON(list_empty(&osb->vote_list)); + work = list_entry(osb->vote_list.next, + struct ocfs2_vote_work, w_list); + list_del(&work->w_list); + osb->vote_count--; + spin_unlock(&osb->vote_task_lock); + + ocfs2_process_vote(osb, &work->w_msg); + kfree(work); + + spin_lock(&osb->vote_task_lock); + } + spin_unlock(&osb->vote_task_lock); + + mlog_exit_void(); +} + +static int ocfs2_vote_thread_lists_empty(struct ocfs2_super *osb) +{ + int empty = 0; + + spin_lock(&osb->vote_task_lock); + if (list_empty(&osb->blocked_lock_list) && + list_empty(&osb->vote_list)) + empty = 1; + + spin_unlock(&osb->vote_task_lock); + return empty; +} + +static int ocfs2_vote_thread_should_wake(struct ocfs2_super *osb) +{ + int should_wake = 0; + + spin_lock(&osb->vote_task_lock); + if (osb->vote_work_sequence != osb->vote_wake_sequence) + 
should_wake = 1; + spin_unlock(&osb->vote_task_lock); + + return should_wake; +} + +int ocfs2_vote_thread(void *arg) +{ + int status = 0; + struct ocfs2_super *osb = arg; + + /* only quit once we've been asked to stop and there is no more + * work available */ + while (!(kthread_should_stop() && + ocfs2_vote_thread_lists_empty(osb))) { + + wait_event_interruptible(osb->vote_event, + ocfs2_vote_thread_should_wake(osb) || + kthread_should_stop()); + + mlog(0, "vote_thread: awoken\n"); + + ocfs2_vote_thread_do_work(osb); + } + + osb->vote_task = NULL; + return status; +} + +static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response_id) +{ + struct ocfs2_net_wait_ctxt *w; + + w = kcalloc(1, sizeof(*w), GFP_KERNEL); + if (!w) { + mlog_errno(-ENOMEM); + goto bail; + } + + INIT_LIST_HEAD(&w->n_list); + init_waitqueue_head(&w->n_event); + ocfs2_node_map_init(&w->n_node_map); + w->n_response_id = response_id; + w->n_callback = NULL; +bail: + return w; +} + +static unsigned int ocfs2_new_response_id(struct ocfs2_super *osb) +{ + unsigned int ret; + + spin_lock(&osb->net_response_lock); + ret = ++osb->net_response_ids; + spin_unlock(&osb->net_response_lock); + + return ret; +} + +static void ocfs2_dequeue_net_wait_ctxt(struct ocfs2_super *osb, + struct ocfs2_net_wait_ctxt *w) +{ + spin_lock(&osb->net_response_lock); + list_del(&w->n_list); + spin_unlock(&osb->net_response_lock); +} + +static void ocfs2_queue_net_wait_ctxt(struct ocfs2_super *osb, + struct ocfs2_net_wait_ctxt *w) +{ + spin_lock(&osb->net_response_lock); + list_add_tail(&w->n_list, + &osb->net_response_list); + spin_unlock(&osb->net_response_lock); +} + +static void __ocfs2_mark_node_responded(struct ocfs2_super *osb, + struct ocfs2_net_wait_ctxt *w, + int node_num) +{ + assert_spin_locked(&osb->net_response_lock); + + ocfs2_node_map_clear_bit(osb, &w->n_node_map, node_num); + if (ocfs2_node_map_is_empty(osb, &w->n_node_map)) + wake_up(&w->n_event); +} + +/* Intended to be called from the 
node down callback, we fake remove + * the node from all our response contexts */ +void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, + int node_num) +{ + struct list_head *p; + struct ocfs2_net_wait_ctxt *w = NULL; + + spin_lock(&osb->net_response_lock); + + list_for_each(p, &osb->net_response_list) { + w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); + + __ocfs2_mark_node_responded(osb, w, node_num); + } + + spin_unlock(&osb->net_response_lock); +} + +static int ocfs2_broadcast_vote(struct ocfs2_super *osb, + struct ocfs2_vote_msg *request, + unsigned int response_id, + int *response, + struct ocfs2_net_response_cb *callback) +{ + int status, i, remote_err; + struct ocfs2_net_wait_ctxt *w = NULL; + int dequeued = 0; + + mlog_entry_void(); + + w = ocfs2_new_net_wait_ctxt(response_id); + if (!w) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + w->n_callback = callback; + + /* we're pretty much ready to go at this point, and this fills + * in n_response which we need anyway... 
*/ + ocfs2_queue_net_wait_ctxt(osb, w); + + i = ocfs2_node_map_iterate(osb, &osb->mounted_map, 0); + + while (i != O2NM_INVALID_NODE_NUM) { + if (i != osb->node_num) { + mlog(0, "trying to send request to node %i\n", i); + ocfs2_node_map_set_bit(osb, &w->n_node_map, i); + + remote_err = 0; + status = o2net_send_message(OCFS2_MESSAGE_TYPE_VOTE, + osb->net_key, + request, + sizeof(*request), + i, + &remote_err); + if (status == -ETIMEDOUT) { + mlog(0, "remote node %d timed out!\n", i); + status = -EAGAIN; + goto bail; + } + if (remote_err < 0) { + status = remote_err; + mlog(0, "remote error %d on node %d!\n", + remote_err, i); + mlog_errno(status); + goto bail; + } + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + i++; + i = ocfs2_node_map_iterate(osb, &osb->mounted_map, i); + mlog(0, "next is %d, i am %d\n", i, osb->node_num); + } + mlog(0, "done sending, now waiting on responses...\n"); + + wait_event(w->n_event, ocfs2_node_map_is_empty(osb, &w->n_node_map)); + + ocfs2_dequeue_net_wait_ctxt(osb, w); + dequeued = 1; + + *response = w->n_response; + status = 0; +bail: + if (w) { + if (!dequeued) + ocfs2_dequeue_net_wait_ctxt(osb, w); + kfree(w); + } + + mlog_exit(status); + return status; +} + +static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, + u64 blkno, + unsigned int generation, + enum ocfs2_vote_request type, + u32 priv) +{ + struct ocfs2_vote_msg *request; + struct ocfs2_msg_hdr *hdr; + + BUG_ON(!ocfs2_is_valid_vote_request(type)); + + request = kcalloc(1, sizeof(*request), GFP_KERNEL); + if (!request) { + mlog_errno(-ENOMEM); + } else { + hdr = &request->v_hdr; + hdr->h_node_num = cpu_to_be32(osb->node_num); + hdr->h_request = cpu_to_be32(type); + hdr->h_blkno = cpu_to_be64(blkno); + hdr->h_generation = cpu_to_be32(generation); + + request->md1.v_generic1 = cpu_to_be32(priv); + } + + return request; +} + +/* Complete the buildup of a new vote request and process the + * broadcast return value. 
*/ +static int ocfs2_do_request_vote(struct ocfs2_super *osb, + struct ocfs2_vote_msg *request, + struct ocfs2_net_response_cb *callback) +{ + int status, response; + unsigned int response_id; + struct ocfs2_msg_hdr *hdr; + + response_id = ocfs2_new_response_id(osb); + + hdr = &request->v_hdr; + hdr->h_response_id = cpu_to_be32(response_id); + + status = ocfs2_broadcast_vote(osb, request, response_id, &response, + callback); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + status = response; +bail: + + return status; +} + +static int ocfs2_request_vote(struct inode *inode, + struct ocfs2_vote_msg *request, + struct ocfs2_net_response_cb *callback) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (ocfs2_inode_is_new(inode)) + return 0; + + status = -EAGAIN; + while (status == -EAGAIN) { + if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && + signal_pending(current)) + return -ERESTARTSYS; + + status = ocfs2_super_lock(osb, 0); + if (status < 0) { + mlog_errno(status); + break; + } + + status = 0; + if (!ocfs2_node_map_is_only(osb, &osb->mounted_map, + osb->node_num)) + status = ocfs2_do_request_vote(osb, request, callback); + + ocfs2_super_unlock(osb, 0); + } + return status; +} + +static void ocfs2_delete_response_cb(void *priv, + struct ocfs2_response_msg *resp) +{ + int orphaned_slot, node; + struct inode *inode = priv; + + orphaned_slot = be32_to_cpu(resp->r_orphaned_slot); + node = be32_to_cpu(resp->r_hdr.h_node_num); + mlog(0, "node %d tells us that inode %"MLFu64" is orphaned in slot " + "%d\n", node, OCFS2_I(inode)->ip_blkno, orphaned_slot); + + /* The other node may not actually know which slot the inode + * is orphaned in. */ + if (orphaned_slot == OCFS2_INVALID_SLOT) + return; + + /* Ok, the responding node knows which slot this inode is + * orphaned in. We verify that the information is correct and + * then record this in the inode. ocfs2_delete_inode will use + * this information to determine which lock to take. 
*/ + spin_lock(&OCFS2_I(inode)->ip_lock); + mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot && + OCFS2_I(inode)->ip_orphaned_slot + != OCFS2_INVALID_SLOT, "Inode %"MLFu64": Node %d " + "says it's orphaned in slot %d, we think it's in %d\n", + OCFS2_I(inode)->ip_blkno, + be32_to_cpu(resp->r_hdr.h_node_num), + orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot); + + OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot; + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +int ocfs2_request_delete_vote(struct inode *inode) +{ + int orphaned_slot, status; + struct ocfs2_net_response_cb delete_cb; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_vote_msg *request; + + spin_lock(&OCFS2_I(inode)->ip_lock); + orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + spin_unlock(&OCFS2_I(inode)->ip_lock); + + delete_cb.rc_cb = ocfs2_delete_response_cb; + delete_cb.rc_priv = inode; + + mlog(0, "Inode %"MLFu64", we start thinking orphaned slot is %d\n", + OCFS2_I(inode)->ip_blkno, orphaned_slot); + + status = -ENOMEM; + request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, + inode->i_generation, + OCFS2_VOTE_REQ_DELETE, orphaned_slot); + if (request) { + status = ocfs2_request_vote(inode, request, &delete_cb); + + kfree(request); + } + + return status; +} + +static void ocfs2_setup_unlink_vote(struct ocfs2_vote_msg *request, + struct dentry *dentry) +{ + struct inode *parent = dentry->d_parent->d_inode; + + /* We need some values which will uniquely identify a dentry + * on the other nodes so that they can find it and run + * d_delete against it. Parent directory block and full name + * should suffice. 
*/ + + mlog(0, "unlink/rename request: parent: %"MLFu64" name: %.*s\n", + OCFS2_I(parent)->ip_blkno, dentry->d_name.len, + dentry->d_name.name); + + request->v_unlink_parent = cpu_to_be64(OCFS2_I(parent)->ip_blkno); + request->v_unlink_namelen = cpu_to_be32(dentry->d_name.len); + memcpy(request->v_unlink_dirent, dentry->d_name.name, + dentry->d_name.len); +} + +int ocfs2_request_unlink_vote(struct inode *inode, + struct dentry *dentry, + unsigned int nlink) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_vote_msg *request; + + if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) + return -ENAMETOOLONG; + + status = -ENOMEM; + request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, + inode->i_generation, + OCFS2_VOTE_REQ_UNLINK, nlink); + if (request) { + ocfs2_setup_unlink_vote(request, dentry); + + status = ocfs2_request_vote(inode, request, NULL); + + kfree(request); + } + return status; +} + +int ocfs2_request_rename_vote(struct inode *inode, + struct dentry *dentry) +{ + int status; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_vote_msg *request; + + if (dentry->d_name.len > OCFS2_VOTE_FILENAME_LEN) + return -ENAMETOOLONG; + + status = -ENOMEM; + request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, + inode->i_generation, + OCFS2_VOTE_REQ_RENAME, 0); + if (request) { + ocfs2_setup_unlink_vote(request, dentry); + + status = ocfs2_request_vote(inode, request, NULL); + + kfree(request); + } + return status; +} + +int ocfs2_request_mount_vote(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_vote_msg *request = NULL; + + request = ocfs2_new_vote_request(osb, 0ULL, 0, + OCFS2_VOTE_REQ_MOUNT, 0); + if (!request) { + status = -ENOMEM; + goto bail; + } + + status = -EAGAIN; + while (status == -EAGAIN) { + if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && + signal_pending(current)) { + status = -ERESTARTSYS; + goto bail; + } + + if (ocfs2_node_map_is_only(osb, &osb->mounted_map, + osb->node_num)) 
{ + status = 0; + goto bail; + } + + status = ocfs2_do_request_vote(osb, request, NULL); + } + +bail: + if (request) + kfree(request); + + return status; +} + +int ocfs2_request_umount_vote(struct ocfs2_super *osb) +{ + int status; + struct ocfs2_vote_msg *request = NULL; + + request = ocfs2_new_vote_request(osb, 0ULL, 0, + OCFS2_VOTE_REQ_UMOUNT, 0); + if (!request) { + status = -ENOMEM; + goto bail; + } + + status = -EAGAIN; + while (status == -EAGAIN) { + /* Do not check signals on this vote... We really want + * this one to go all the way through. */ + + if (ocfs2_node_map_is_only(osb, &osb->mounted_map, + osb->node_num)) { + status = 0; + goto bail; + } + + status = ocfs2_do_request_vote(osb, request, NULL); + } + +bail: + if (request) + kfree(request); + + return status; +} + +/* TODO: This should eventually be a hash table! */ +static struct ocfs2_net_wait_ctxt * __ocfs2_find_net_wait_ctxt(struct ocfs2_super *osb, + u32 response_id) +{ + struct list_head *p; + struct ocfs2_net_wait_ctxt *w = NULL; + + list_for_each(p, &osb->net_response_list) { + w = list_entry(p, struct ocfs2_net_wait_ctxt, n_list); + if (response_id == w->n_response_id) + break; + w = NULL; + } + + return w; +} + +/* Translate response codes into local node errno values */ +static inline int ocfs2_translate_response(int response) +{ + int ret; + + switch (response) { + case OCFS2_RESPONSE_OK: + ret = 0; + break; + + case OCFS2_RESPONSE_BUSY: + ret = -EBUSY; + break; + + default: + ret = -EINVAL; + } + + return ret; +} + +static int ocfs2_handle_response_message(struct o2net_msg *msg, + u32 len, + void *data) +{ + unsigned int response_id, node_num; + int response_status; + struct ocfs2_super *osb = data; + struct ocfs2_response_msg *resp; + struct ocfs2_net_wait_ctxt * w; + struct ocfs2_net_response_cb *resp_cb; + + resp = (struct ocfs2_response_msg *) msg->buf; + + response_id = be32_to_cpu(resp->r_hdr.h_response_id); + node_num = be32_to_cpu(resp->r_hdr.h_node_num); + response_status = + 
ocfs2_translate_response(be32_to_cpu(resp->r_response)); + + mlog(0, "received response message:\n"); + mlog(0, "h_response_id = %u\n", response_id); + mlog(0, "h_request = %u\n", be32_to_cpu(resp->r_hdr.h_request)); + mlog(0, "h_blkno = %"MLFu64"\n", be64_to_cpu(resp->r_hdr.h_blkno)); + mlog(0, "h_generation = %u\n", be32_to_cpu(resp->r_hdr.h_generation)); + mlog(0, "h_node_num = %u\n", node_num); + mlog(0, "r_response = %d\n", response_status); + + spin_lock(&osb->net_response_lock); + w = __ocfs2_find_net_wait_ctxt(osb, response_id); + if (!w) { + mlog(0, "request not found!\n"); + goto bail; + } + resp_cb = w->n_callback; + + if (response_status && (!w->n_response)) { + /* we only really need one negative response so don't + * set it twice. */ + w->n_response = response_status; + } + + if (resp_cb) { + spin_unlock(&osb->net_response_lock); + + resp_cb->rc_cb(resp_cb->rc_priv, resp); + + spin_lock(&osb->net_response_lock); + } + + __ocfs2_mark_node_responded(osb, w, node_num); +bail: + spin_unlock(&osb->net_response_lock); + + return 0; +} + +static int ocfs2_handle_vote_message(struct o2net_msg *msg, + u32 len, + void *data) +{ + int status; + struct ocfs2_super *osb = data; + struct ocfs2_vote_work *work; + + work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL); + if (!work) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + INIT_LIST_HEAD(&work->w_list); + memcpy(&work->w_msg, msg->buf, sizeof(struct ocfs2_vote_msg)); + + mlog(0, "scheduling vote request:\n"); + mlog(0, "h_response_id = %u\n", + be32_to_cpu(work->w_msg.v_hdr.h_response_id)); + mlog(0, "h_request = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_request)); + mlog(0, "h_blkno = %"MLFu64"\n", + be64_to_cpu(work->w_msg.v_hdr.h_blkno)); + mlog(0, "h_generation = %u\n", + be32_to_cpu(work->w_msg.v_hdr.h_generation)); + mlog(0, "h_node_num = %u\n", + be32_to_cpu(work->w_msg.v_hdr.h_node_num)); + mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1)); + + 
spin_lock(&osb->vote_task_lock); + list_add_tail(&work->w_list, &osb->vote_list); + osb->vote_count++; + spin_unlock(&osb->vote_task_lock); + + ocfs2_kick_vote_thread(osb); + + status = 0; +bail: + return status; +} + +void ocfs2_unregister_net_handlers(struct ocfs2_super *osb) +{ + if (!osb->net_key) + return; + + o2net_unregister_handler_list(&osb->osb_net_handlers); + + if (!list_empty(&osb->net_response_list)) + mlog(ML_ERROR, "net response list not empty!\n"); + + osb->net_key = 0; +} + +int ocfs2_register_net_handlers(struct ocfs2_super *osb) +{ + int status = 0; + + status = o2net_register_handler(OCFS2_MESSAGE_TYPE_RESPONSE, + osb->net_key, + sizeof(struct ocfs2_response_msg), + ocfs2_handle_response_message, + osb, &osb->osb_net_handlers); + if (status) { + mlog_errno(status); + goto bail; + } + + status = o2net_register_handler(OCFS2_MESSAGE_TYPE_VOTE, + osb->net_key, + sizeof(struct ocfs2_vote_msg), + ocfs2_handle_vote_message, + osb, &osb->osb_net_handlers); + if (status) { + mlog_errno(status); + goto bail; + } +bail: + if (status < 0) + ocfs2_unregister_net_handlers(osb); + + return status; +} diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h new file mode 100644 index 000000000000..9cce60703466 --- /dev/null +++ b/fs/ocfs2/vote.h @@ -0,0 +1,56 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * vote.h + * + * description here + * + * Copyright (C) 2002, 2004 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + + +#ifndef VOTE_H +#define VOTE_H + +int ocfs2_vote_thread(void *arg); +static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) +{ + spin_lock(&osb->vote_task_lock); + /* make sure the voting thread gets a swipe at whatever changes + * the caller may have made to the voting state */ + osb->vote_wake_sequence++; + spin_unlock(&osb->vote_task_lock); + wake_up(&osb->vote_event); +} + +int ocfs2_request_delete_vote(struct inode *inode); +int ocfs2_request_unlink_vote(struct inode *inode, + struct dentry *dentry, + unsigned int nlink); +int ocfs2_request_rename_vote(struct inode *inode, + struct dentry *dentry); +int ocfs2_request_mount_vote(struct ocfs2_super *osb); +int ocfs2_request_umount_vote(struct ocfs2_super *osb); +int ocfs2_register_net_handlers(struct ocfs2_super *osb); +void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); + +void ocfs2_mark_inode_remotely_deleted(struct inode *inode); + +void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, + int node_num); +#endif diff --git a/fs/open.c b/fs/open.c index f53a5b9ffb7d..8e20c1f32563 100644 --- a/fs/open.c +++ b/fs/open.c @@ -16,6 +16,7 @@ #include <linux/tty.h> #include <linux/namei.h> #include <linux/backing-dev.h> +#include <linux/capability.h> #include <linux/security.h> #include <linux/mount.h> #include <linux/vfs.h> @@ -194,7 +195,8 @@ out: return error; } -int do_truncate(struct dentry *dentry, loff_t length, struct file *filp) +int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) { int err; struct iattr newattrs; @@ -204,19 +206,19 @@ int do_truncate(struct dentry *dentry, loff_t length, struct file *filp) return -EINVAL; newattrs.ia_size = length; - newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + 
newattrs.ia_valid = ATTR_SIZE | time_attrs; if (filp) { newattrs.ia_file = filp; newattrs.ia_valid |= ATTR_FILE; } - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); err = notify_change(dentry, &newattrs); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); return err; } -static inline long do_sys_truncate(const char __user * path, loff_t length) +static long do_sys_truncate(const char __user * path, loff_t length) { struct nameidata nd; struct inode * inode; @@ -266,7 +268,7 @@ static inline long do_sys_truncate(const char __user * path, loff_t length) error = locks_verify_truncate(inode, NULL, length); if (!error) { DQUOT_INIT(inode); - error = do_truncate(nd.dentry, length, NULL); + error = do_truncate(nd.dentry, length, 0, NULL); } put_write_access(inode); @@ -282,7 +284,7 @@ asmlinkage long sys_truncate(const char __user * path, unsigned long length) return do_sys_truncate(path, (long)length); } -static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode * inode; struct dentry *dentry; @@ -318,7 +320,7 @@ static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small) error = locks_verify_truncate(inode, file, length); if (!error) - error = do_truncate(dentry, length, file); + error = do_truncate(dentry, length, 0, file); out_putf: fput(file); out: @@ -397,9 +399,9 @@ asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times) (error = vfs_permission(&nd, MAY_WRITE)) != 0) goto dput_and_out; } - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); error = notify_change(nd.dentry, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); dput_and_out: path_release(&nd); out: @@ -450,9 +452,9 @@ long do_utimes(char __user * filename, struct timeval * times) (error = vfs_permission(&nd, MAY_WRITE)) != 0) goto dput_and_out; } - down(&inode->i_sem); + 
mutex_lock(&inode->i_mutex); error = notify_change(nd.dentry, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); dput_and_out: path_release(&nd); out: @@ -619,13 +621,13 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) err = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto out_putf; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; err = notify_change(dentry, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out_putf: fput(file); @@ -653,13 +655,13 @@ asmlinkage long sys_chmod(const char __user * filename, mode_t mode) if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto dput_and_out; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(nd.dentry, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); dput_and_out: path_release(&nd); @@ -695,9 +697,9 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group) } if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID|ATTR_KILL_SGID; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); error = notify_change(dentry, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out: return error; } @@ -970,7 +972,7 @@ out: EXPORT_SYMBOL(get_unused_fd); -static inline void __put_unused_fd(struct files_struct *files, unsigned int fd) +static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __FD_CLR(fd, fdt->open_fds); diff --git a/fs/partitions/Kconfig b/fs/partitions/Kconfig index 656bc43431b9..7490cc9208b3 100644 --- a/fs/partitions/Kconfig +++ b/fs/partitions/Kconfig @@ -21,26 +21,30 @@ config ACORN_PARTITION Support hard 
disks partitioned under Acorn operating systems. config ACORN_PARTITION_CUMANA - bool "Cumana partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "Cumana partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION help Say Y here if you would like to use hard disks under Linux which were partitioned using the Cumana interface on Acorn machines. config ACORN_PARTITION_EESOX - bool "EESOX partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "EESOX partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION config ACORN_PARTITION_ICS - bool "ICS partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "ICS partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION help Say Y here if you would like to use hard disks under Linux which were partitioned using the ICS interface on Acorn machines. config ACORN_PARTITION_ADFS - bool "Native filecore partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "Native filecore partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION help The Acorn Disc Filing System is the standard file system of the RiscOS operating system which runs on Acorn's ARM-based Risc PC @@ -48,15 +52,17 @@ config ACORN_PARTITION_ADFS `Y' here, Linux will support disk partitions created under ADFS. config ACORN_PARTITION_POWERTEC - bool "PowerTec partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "PowerTec partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION help Support reading partition tables created on Acorn machines using the PowerTec SCSI drive. 
config ACORN_PARTITION_RISCIX - bool "RISCiX partition support" if PARTITION_ADVANCED && ACORN_PARTITION + bool "RISCiX partition support" if PARTITION_ADVANCED default y if ARCH_ACORN + depends on ACORN_PARTITION help Once upon a time, there was a native Unix port for the Acorn series of machines called RISCiX. If you say 'Y' here, Linux will be able @@ -85,7 +91,7 @@ config ATARI_PARTITION config IBM_PARTITION bool "IBM disk label and partition support" - depends on PARTITION_ADVANCED && ARCH_S390 + depends on PARTITION_ADVANCED && S390 help Say Y here if you would like to be able to read the hard disk partition table format used by IBM DASD disks operating under CMS. @@ -224,5 +230,3 @@ config EFI_PARTITION Say Y here if you would like to use hard disks under Linux which were partitioned using EFI GPT. Presently only useful on the IA-64 platform. - -# define_bool CONFIG_ACORN_PARTITION_CUMANA y diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 8dc1822a7022..7881ce05daef 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -226,7 +226,7 @@ static struct sysfs_ops part_sysfs_ops = { static ssize_t part_uevent_store(struct hd_struct * p, const char *page, size_t count) { - kobject_hotplug(&p->kobj, KOBJ_ADD); + kobject_uevent(&p->kobj, KOBJ_ADD); return count; } static ssize_t part_dev_read(struct hd_struct * p, char *page) @@ -336,12 +336,31 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) disk->part[part-1] = p; } +static char *make_block_name(struct gendisk *disk) +{ + char *name; + static char *block_str = "block:"; + int size; + + size = strlen(block_str) + strlen(disk->disk_name) + 1; + name = kmalloc(size, GFP_KERNEL); + if (!name) + return NULL; + strcpy(name, block_str); + strcat(name, disk->disk_name); + return name; +} + static void disk_sysfs_symlinks(struct gendisk *disk) { struct device *target = get_device(disk->driverfs_dev); if (target) { + char *disk_name = make_block_name(disk); 
sysfs_create_link(&disk->kobj,&target->kobj,"device"); - sysfs_create_link(&target->kobj,&disk->kobj,"block"); + if (disk_name) { + sysfs_create_link(&target->kobj,&disk->kobj,disk_name); + kfree(disk_name); + } } } @@ -360,7 +379,7 @@ void register_disk(struct gendisk *disk) if ((err = kobject_add(&disk->kobj))) return; disk_sysfs_symlinks(disk); - kobject_hotplug(&disk->kobj, KOBJ_ADD); + kobject_uevent(&disk->kobj, KOBJ_ADD); /* No minors to use for partitions */ if (disk->minors == 1) { @@ -461,10 +480,14 @@ void del_gendisk(struct gendisk *disk) devfs_remove_disk(disk); if (disk->driverfs_dev) { + char *disk_name = make_block_name(disk); sysfs_remove_link(&disk->kobj, "device"); - sysfs_remove_link(&disk->driverfs_dev->kobj, "block"); + if (disk_name) { + sysfs_remove_link(&disk->driverfs_dev->kobj, disk_name); + kfree(disk_name); + } put_device(disk->driverfs_dev); } - kobject_hotplug(&disk->kobj, KOBJ_REMOVE); + kobject_uevent(&disk->kobj, KOBJ_REMOVE); kobject_del(&disk->kobj); } diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 6327bcb2d73d..78010ad60e47 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -56,7 +56,10 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) struct hd_geometry *geo; char type[5] = {0,}; char name[7] = {0,}; - struct vtoc_volume_label *vlabel; + union label_t { + struct vtoc_volume_label vol; + struct vtoc_cms_label cms; + } *label; unsigned char *data; Sector sect; @@ -64,9 +67,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) goto out_noinfo; if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL) goto out_nogeo; - if ((vlabel = kmalloc(sizeof(struct vtoc_volume_label), - GFP_KERNEL)) == NULL) - goto out_novlab; + if ((label = kmalloc(sizeof(union label_t), GFP_KERNEL)) == NULL) + goto out_nolab; if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 || ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) @@ -87,7 +89,7 @@ 
ibm_partition(struct parsed_partitions *state, struct block_device *bdev) strncpy(name, data + 8, 6); else strncpy(name, data + 4, 6); - memcpy (vlabel, data, sizeof(struct vtoc_volume_label)); + memcpy(label, data, sizeof(union label_t)); put_dev_sector(sect); EBCASC(type, 4); @@ -100,14 +102,12 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) /* * VM style CMS1 labeled disk */ - int *label = (int *) vlabel; - - if (label[13] != 0) { + if (label->cms.disk_offset != 0) { printk("CMS1/%8s(MDSK):", name); /* disk is reserved minidisk */ - blocksize = label[3]; - offset = label[13]; - size = (label[7] - 1)*(blocksize >> 9); + blocksize = label->cms.block_size; + offset = label->cms.disk_offset; + size = (label->cms.block_count - 1) * (blocksize >> 9); } else { printk("CMS1/%8s:", name); offset = (info->label_block + 1); @@ -126,7 +126,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) printk("VOL1/%8s:", name); /* get block number and read then go through format1 labels */ - blk = cchhb2blk(&vlabel->vtoc, geo) + 1; + blk = cchhb2blk(&label->vol.vtoc, geo) + 1; counter = 0; while ((data = read_dev_sector(bdev, blk*(blocksize/512), §)) != NULL) { @@ -174,7 +174,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) } printk("\n"); - kfree(vlabel); + kfree(label); kfree(geo); kfree(info); return 1; @@ -182,8 +182,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) out_readerr: out_badsect: out_noioctl: - kfree(vlabel); -out_novlab: + kfree(label); +out_nolab: kfree(geo); out_nogeo: kfree(info); diff --git a/fs/pipe.c b/fs/pipe.c index 66aa0b938d6a..eef0f29e86ef 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -44,10 +44,10 @@ void pipe_wait(struct inode * inode) * is considered a noninteractive wait: */ prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); schedule(); 
finish_wait(PIPE_WAIT(*inode), &wait); - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); } static inline int @@ -136,7 +136,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); info = inode->i_pipe; for (;;) { int bufs = info->nrbufs; @@ -200,7 +200,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } pipe_wait(inode); } - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { wake_up_interruptible(PIPE_WAIT(*inode)); @@ -237,7 +237,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); info = inode->i_pipe; if (!PIPE_READERS(*inode)) { @@ -341,13 +341,13 @@ pipe_writev(struct file *filp, const struct iovec *_iov, PIPE_WAITING_WRITERS(*inode)--; } out: - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); if (do_wakeup) { wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); } if (ret > 0) - inode_update_time(inode, 1); /* mtime and ctime */ + file_update_time(filp); return ret; } @@ -381,7 +381,7 @@ pipe_ioctl(struct inode *pino, struct file *filp, switch (cmd) { case FIONREAD: - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); info = inode->i_pipe; count = 0; buf = info->curbuf; @@ -390,7 +390,7 @@ pipe_ioctl(struct inode *pino, struct file *filp, count += info->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -433,7 +433,7 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); PIPE_READERS(*inode) -= decr; PIPE_WRITERS(*inode) -= decw; if (!PIPE_READERS(*inode) && 
!PIPE_WRITERS(*inode)) { @@ -443,7 +443,7 @@ pipe_release(struct inode *inode, int decr, int decw) kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return 0; } @@ -454,9 +454,9 @@ pipe_read_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); if (retval < 0) return retval; @@ -471,9 +471,9 @@ pipe_write_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); if (retval < 0) return retval; @@ -488,14 +488,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); if (retval >= 0) retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); if (retval < 0) return retval; @@ -534,9 +534,9 @@ pipe_read_open(struct inode *inode, struct file *filp) { /* We could have perhaps used atomic_t, but this and friends below are the only places. So it doesn't seem worthwhile. 
*/ - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); PIPE_READERS(*inode)++; - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return 0; } @@ -544,9 +544,9 @@ pipe_read_open(struct inode *inode, struct file *filp) static int pipe_write_open(struct inode *inode, struct file *filp) { - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); PIPE_WRITERS(*inode)++; - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return 0; } @@ -554,12 +554,12 @@ pipe_write_open(struct inode *inode, struct file *filp) static int pipe_rdwr_open(struct inode *inode, struct file *filp) { - down(PIPE_SEM(*inode)); + mutex_lock(PIPE_MUTEX(*inode)); if (filp->f_mode & FMODE_READ) PIPE_READERS(*inode)++; if (filp->f_mode & FMODE_WRITE) PIPE_WRITERS(*inode)++; - up(PIPE_SEM(*inode)); + mutex_unlock(PIPE_MUTEX(*inode)); return 0; } diff --git a/fs/pnode.c b/fs/pnode.c index aeeec8ba8dd2..f1871f773f64 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -103,7 +103,7 @@ static struct vfsmount *propagation_next(struct vfsmount *m, struct vfsmount *next; struct vfsmount *master = m->mnt_master; - if ( master == origin->mnt_master ) { + if (master == origin->mnt_master) { next = next_peer(m); return ((next == origin) ? 
NULL : next); } else if (m->mnt_slave.next != &master->mnt_slave_list) diff --git a/fs/proc/array.c b/fs/proc/array.c index 3e1239e4b303..7eb1bd7f800c 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -308,7 +308,7 @@ int proc_pid_status(struct task_struct *task, char * buffer) buffer = task_sig(task, buffer); buffer = task_cap(task, buffer); buffer = cpuset_task_status_allowed(task, buffer); -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) buffer = task_show_regs(task, buffer); #endif return buffer - orig; @@ -330,7 +330,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) unsigned long min_flt = 0, maj_flt = 0; cputime_t cutime, cstime, utime, stime; unsigned long rsslim = 0; - unsigned long it_real_value = 0; + DEFINE_KTIME(it_real_value); struct task_struct *t; char tcomm[sizeof(task->comm)]; @@ -386,7 +386,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) utime = cputime_add(utime, task->signal->utime); stime = cputime_add(stime, task->signal->stime); } - it_real_value = task->signal->it_real_value; + it_real_value = task->signal->real_timer.expires; } ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; read_unlock(&tasklist_lock); @@ -435,7 +435,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) priority, nice, num_threads, - jiffies_to_clock_t(it_real_value), + (long) ktime_to_clock_t(it_real_value), start_time, vsize, mm ? 
get_mm_rss(mm) : 0, diff --git a/fs/proc/base.c b/fs/proc/base.c index 634355e16986..20feb7568deb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -55,6 +55,7 @@ #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/init.h> +#include <linux/capability.h> #include <linux/file.h> #include <linux/string.h> #include <linux/seq_file.h> diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 72b431d0a0a4..20e5c4509a43 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -21,6 +21,8 @@ #include <linux/bitops.h> #include <asm/uaccess.h> +#include "internal.h" + static ssize_t proc_file_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos); static ssize_t proc_file_write(struct file *file, const char __user *buffer, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e6a818a93f3d..6573f31f1fd9 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -19,7 +19,7 @@ #include <asm/system.h> #include <asm/uaccess.h> -extern void free_proc_entry(struct proc_dir_entry *); +#include "internal.h" static inline struct proc_dir_entry * de_get(struct proc_dir_entry *de) { diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3e55198f9806..95a1cf32b838 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -37,6 +37,10 @@ extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +void free_proc_entry(struct proc_dir_entry *de); + +int proc_init_inodecache(void); + static inline struct task_struct *proc_task(struct inode *inode) { return PROC_I(inode)->task; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 1c7da988fcc3..adc2cd95169a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -14,6 +14,7 @@ #include <linux/proc_fs.h> #include <linux/user.h> #include <linux/a.out.h> +#include <linux/capability.h> #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/vmalloc.h> diff --git a/fs/proc/proc_misc.c 
b/fs/proc/proc_misc.c index 5b6b0b6038a7..63bf6c00fa0c 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -323,6 +323,7 @@ static struct file_operations proc_modules_operations = { }; #endif +#ifdef CONFIG_SLAB extern struct seq_operations slabinfo_op; extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *); static int slabinfo_open(struct inode *inode, struct file *file) @@ -336,6 +337,7 @@ static struct file_operations proc_slabinfo_operations = { .llseek = seq_lseek, .release = seq_release, }; +#endif static int show_stat(struct seq_file *p, void *v) { @@ -600,7 +602,9 @@ void __init proc_misc_init(void) create_seq_entry("partitions", 0, &proc_partitions_operations); create_seq_entry("stat", 0, &proc_stat_operations); create_seq_entry("interrupts", 0, &proc_interrupts_operations); +#ifdef CONFIG_SLAB create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); +#endif create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations); create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations); diff --git a/fs/proc/root.c b/fs/proc/root.c index aef148f099a2..68896283c8ae 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,6 +18,8 @@ #include <linux/bitops.h> #include <linux/smp_lock.h> +#include "internal.h" + struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc_root_driver; #ifdef CONFIG_SYSCTL @@ -36,7 +38,6 @@ static struct file_system_type proc_fs_type = { .kill_sb = kill_anon_super, }; -extern int __init proc_init_inodecache(void); void __init proc_root_init(void) { int err = proc_init_inodecache(); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 50bd5a8f0446..0eaad41f4658 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -390,129 +390,12 @@ struct seq_operations proc_pid_smaps_op = { }; #ifdef CONFIG_NUMA - -struct numa_maps { - unsigned long pages; - unsigned long anon; - 
unsigned long mapped; - unsigned long mapcount_max; - unsigned long node[MAX_NUMNODES]; -}; - -/* - * Calculate numa node maps for a vma - */ -static struct numa_maps *get_numa_maps(struct vm_area_struct *vma) -{ - int i; - struct page *page; - unsigned long vaddr; - struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL); - - if (!md) - return NULL; - md->pages = 0; - md->anon = 0; - md->mapped = 0; - md->mapcount_max = 0; - for_each_node(i) - md->node[i] =0; - - for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { - page = follow_page(vma, vaddr, 0); - if (page) { - int count = page_mapcount(page); - - if (count) - md->mapped++; - if (count > md->mapcount_max) - md->mapcount_max = count; - md->pages++; - if (PageAnon(page)) - md->anon++; - md->node[page_to_nid(page)]++; - } - cond_resched(); - } - return md; -} - -static int show_numa_map(struct seq_file *m, void *v) -{ - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - struct mempolicy *pol; - struct numa_maps *md; - struct zone **z; - int n; - int first; - - if (!vma->vm_mm) - return 0; - - md = get_numa_maps(vma); - if (!md) - return 0; - - seq_printf(m, "%08lx", vma->vm_start); - pol = get_vma_policy(task, vma, vma->vm_start); - /* Print policy */ - switch (pol->policy) { - case MPOL_PREFERRED: - seq_printf(m, " prefer=%d", pol->v.preferred_node); - break; - case MPOL_BIND: - seq_printf(m, " bind={"); - first = 1; - for (z = pol->v.zonelist->zones; *z; z++) { - - if (!first) - seq_putc(m, ','); - else - first = 0; - seq_printf(m, "%d/%s", (*z)->zone_pgdat->node_id, - (*z)->name); - } - seq_putc(m, '}'); - break; - case MPOL_INTERLEAVE: - seq_printf(m, " interleave={"); - first = 1; - for_each_node(n) { - if (node_isset(n, pol->v.nodes)) { - if (!first) - seq_putc(m,','); - else - first = 0; - seq_printf(m, "%d",n); - } - } - seq_putc(m, '}'); - break; - default: - seq_printf(m," default"); - break; - } - seq_printf(m, " MaxRef=%lu Pages=%lu Mapped=%lu", 
- md->mapcount_max, md->pages, md->mapped); - if (md->anon) - seq_printf(m," Anon=%lu",md->anon); - - for_each_online_node(n) { - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); - } - seq_putc(m, '\n'); - kfree(md); - if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; - return 0; -} +extern int show_numa_map(struct seq_file *m, void *v); struct seq_operations proc_pid_numa_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_numa_map + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_numa_map }; #endif diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 3b2e7b69e63a..4063fb32f78c 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -14,7 +14,6 @@ #include <linux/a.out.h> #include <linux/elf.h> #include <linux/elfcore.h> -#include <linux/proc_fs.h> #include <linux/highmem.h> #include <linux/bootmem.h> #include <linux/init.h> @@ -35,11 +34,14 @@ static size_t elfcorebuf_sz; /* Total size of vmcore file. */ static u64 vmcore_size; +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + struct proc_dir_entry *proc_vmcore = NULL; /* Reads a page from the oldmem device from given offset. 
*/ static ssize_t read_from_oldmem(char *buf, size_t count, - loff_t *ppos, int userbuf) + u64 *ppos, int userbuf) { unsigned long pfn, offset; size_t nr_bytes; diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c index 991253927658..46efbf52cbec 100644 --- a/fs/qnx4/bitmap.c +++ b/fs/qnx4/bitmap.c @@ -23,10 +23,12 @@ #include <linux/buffer_head.h> #include <linux/bitops.h> +#if 0 int qnx4_new_block(struct super_block *sb) { return 0; } +#endif /* 0 */ static void count_bits(register const char *bmPart, register int size, int *const tf) diff --git a/fs/quota.c b/fs/quota.c index 612e04db4b93..ba9e0bf32f67 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -15,6 +15,7 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/buffer_head.h> +#include <linux/capability.h> #include <linux/quotaops.h> /* Check validity of generic quotactl commands */ @@ -168,7 +169,7 @@ static void quota_sync_sb(struct super_block *sb, int type) sync_blockdev(sb->s_bdev); /* Now when everything is written we can discard the pagecache so - * that userspace sees the changes. We need i_sem and so we could + * that userspace sees the changes. We need i_mutex and so we could * not do it inside dqonoff_sem. Moreover we need to be carefull * about races with quotaoff() (that is the reason why we have own * reference to inode). 
*/ @@ -184,9 +185,9 @@ static void quota_sync_sb(struct super_block *sb, int type) up(&sb_dqopt(sb)->dqonoff_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (discard[cnt]) { - down(&discard[cnt]->i_sem); + mutex_lock(&discard[cnt]->i_mutex); truncate_inode_pages(&discard[cnt]->i_data, 0); - up(&discard[cnt]->i_sem); + mutex_unlock(&discard[cnt]->i_mutex); iput(discard[cnt]); } } diff --git a/fs/ramfs/Makefile b/fs/ramfs/Makefile index f096f3007091..5a0236e02ee1 100644 --- a/fs/ramfs/Makefile +++ b/fs/ramfs/Makefile @@ -4,4 +4,6 @@ obj-$(CONFIG_RAMFS) += ramfs.o -ramfs-objs := inode.o +file-mmu-y := file-nommu.o +file-mmu-$(CONFIG_MMU) := file-mmu.o +ramfs-objs += inode.o $(file-mmu-y) diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c new file mode 100644 index 000000000000..2115383dcc8d --- /dev/null +++ b/fs/ramfs/file-mmu.c @@ -0,0 +1,57 @@ +/* file-mmu.c: ramfs MMU-based file operations + * + * Resizable simple ram filesystem for Linux. + * + * Copyright (C) 2000 Linus Torvalds. + * 2000 Transmeta Corp. + * + * Usage limits added by David Gibson, Linuxcare Australia. + * This file is released under the GPL. + */ + +/* + * NOTE! This filesystem is probably most useful + * not as a real filesystem, but as an example of + * how virtual filesystems can be written. + * + * It doesn't get much simpler than this. Consider + * that this file implements the full semantics of + * a POSIX-compliant read-write filesystem. + * + * Note in particular how the filesystem does not + * need to implement any data structures of its own + * to keep track of the virtual data: using the VFS + * caches is sufficient. 
+ */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/ramfs.h> + +#include <asm/uaccess.h> +#include "internal.h" + +struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .prepare_write = simple_prepare_write, + .commit_write = simple_commit_write +}; + +struct file_operations ramfs_file_operations = { + .read = generic_file_read, + .write = generic_file_write, + .mmap = generic_file_mmap, + .fsync = simple_sync_file, + .sendfile = generic_file_sendfile, + .llseek = generic_file_llseek, +}; + +struct inode_operations ramfs_file_inode_operations = { + .getattr = simple_getattr, +}; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c new file mode 100644 index 000000000000..3f810acd0bfa --- /dev/null +++ b/fs/ramfs/file-nommu.c @@ -0,0 +1,292 @@ +/* file-nommu.c: no-MMU version of ramfs + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/ramfs.h> +#include <linux/quotaops.h> +#include <linux/pagevec.h> +#include <linux/mman.h> + +#include <asm/uaccess.h> +#include "internal.h" + +static int ramfs_nommu_setattr(struct dentry *, struct iattr *); + +struct address_space_operations ramfs_aops = { + .readpage = simple_readpage, + .prepare_write = simple_prepare_write, + .commit_write = simple_commit_write +}; + +struct file_operations ramfs_file_operations = { + .mmap = ramfs_nommu_mmap, + .get_unmapped_area = ramfs_nommu_get_unmapped_area, + .read = generic_file_read, + .write = generic_file_write, + .fsync = simple_sync_file, + .sendfile = generic_file_sendfile, + .llseek = generic_file_llseek, +}; + +struct inode_operations ramfs_file_inode_operations = { + .setattr = ramfs_nommu_setattr, + .getattr = simple_getattr, +}; + +/*****************************************************************************/ +/* + * add a contiguous set of pages into a ramfs inode when it's truncated from + * size 0 on the assumption that it's going to be used for an mmap of shared + * memory + */ +static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +{ + struct pagevec lru_pvec; + unsigned long npages, xpages, loop, limit; + struct page *pages; + unsigned order; + void *data; + int ret; + + /* make various checks */ + order = get_order(newsize); + if (unlikely(order >= MAX_ORDER)) + goto too_big; + + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && newsize > limit) + goto fsize_exceeded; + + if (newsize > inode->i_sb->s_maxbytes) + goto too_big; + + i_size_write(inode, newsize); + + /* allocate enough contiguous pages to be able to satisfy the + * request */ + pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), 
order); + if (!pages) + return -ENOMEM; + + /* split the high-order page into an array of single pages */ + xpages = 1UL << order; + npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT; + + for (loop = 0; loop < npages; loop++) + set_page_count(pages + loop, 1); + + /* trim off any pages we don't actually require */ + for (loop = npages; loop < xpages; loop++) + __free_page(pages + loop); + + /* clear the memory we allocated */ + newsize = PAGE_SIZE * npages; + data = page_address(pages); + memset(data, 0, newsize); + + /* attach all the pages to the inode's address space */ + pagevec_init(&lru_pvec, 0); + for (loop = 0; loop < npages; loop++) { + struct page *page = pages + loop; + + ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL); + if (ret < 0) + goto add_error; + + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + + unlock_page(page); + } + + pagevec_lru_add(&lru_pvec); + return 0; + + fsize_exceeded: + send_sig(SIGXFSZ, current, 0); + too_big: + return -EFBIG; + + add_error: + page_cache_release(pages + loop); + for (loop++; loop < npages; loop++) + __free_page(pages + loop); + return ret; +} + +/*****************************************************************************/ +/* + * check that file shrinkage doesn't leave any VMAs dangling in midair + */ +static int ramfs_nommu_check_mappings(struct inode *inode, + size_t newsize, size_t size) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + + /* search for VMAs that fall within the dead zone */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + newsize >> PAGE_SHIFT, + (size + PAGE_SIZE - 1) >> PAGE_SHIFT + ) { + /* found one - only interested if it's shared out of the page + * cache */ + if (vma->vm_flags & VM_SHARED) + return -ETXTBSY; /* not quite true, but near enough */ + } + + return 0; +} + +/*****************************************************************************/ +/* + * + */ +static int ramfs_nommu_resize(struct inode *inode, loff_t 
newsize, loff_t size) +{ + int ret; + + /* assume a truncate from zero size is going to be for the purposes of + * shared mmap */ + if (size == 0) { + if (unlikely(newsize >> 32)) + return -EFBIG; + + return ramfs_nommu_expand_for_mapping(inode, newsize); + } + + /* check that a decrease in size doesn't cut off any shared mappings */ + if (newsize < size) { + ret = ramfs_nommu_check_mappings(inode, newsize, size); + if (ret < 0) + return ret; + } + + ret = vmtruncate(inode, size); + + return ret; +} + +/*****************************************************************************/ +/* + * handle a change of attributes + * - we're specifically interested in a change of size + */ +static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) +{ + struct inode *inode = dentry->d_inode; + unsigned int old_ia_valid = ia->ia_valid; + int ret = 0; + + /* by providing our own setattr() method, we skip this quotaism */ + if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) || + (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid)) + ret = DQUOT_TRANSFER(inode, ia) ? 
-EDQUOT : 0; + + /* pick out size-changing events */ + if (ia->ia_valid & ATTR_SIZE) { + loff_t size = i_size_read(inode); + if (ia->ia_size != size) { + ret = ramfs_nommu_resize(inode, ia->ia_size, size); + if (ret < 0 || ia->ia_valid == ATTR_SIZE) + goto out; + } else { + /* we skipped the truncate but must still update + * timestamps + */ + ia->ia_valid |= ATTR_MTIME|ATTR_CTIME; + } + } + + ret = inode_setattr(inode, ia); + out: + ia->ia_valid = old_ia_valid; + return ret; +} + +/*****************************************************************************/ +/* + * try to determine where a shared mapping can be made + * - we require that: + * - the pages to be mapped must exist + * - the pages be physically contiguous in sequence + */ +unsigned long ramfs_nommu_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + unsigned long maxpages, lpages, nr, loop, ret; + struct inode *inode = file->f_dentry->d_inode; + struct page **pages = NULL, **ptr, *page; + loff_t isize; + + if (!(flags & MAP_SHARED)) + return addr; + + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + isize = i_size_read(inode); + + ret = -EINVAL; + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= maxpages) + goto out; + + if (maxpages - pgoff < lpages) + goto out; + + /* gang-find the pages */ + ret = -ENOMEM; + pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out; + + nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); + if (nr != lpages) + goto out; /* leave if some pages were missing */ + + /* check the pages for physical adjacency */ + ptr = pages; + page = *ptr++; + page++; + for (loop = lpages; loop > 1; loop--) + if (*ptr++ != page++) + goto out; + + /* okay - all conditions fulfilled */ + ret = (unsigned long) page_address(pages[0]); + + out: + if (pages) { + ptr = pages; + for (loop = lpages; loop > 0; loop--) + 
put_page(*ptr++); + kfree(pages); + } + + return ret; +} + +/*****************************************************************************/ +/* + * set up a mapping + */ +int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) +{ + return 0; +} diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 0a88917605ae..c66bd5e4c05c 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -34,13 +34,12 @@ #include <linux/ramfs.h> #include <asm/uaccess.h> +#include "internal.h" /* some random number */ #define RAMFS_MAGIC 0x858458f6 static struct super_operations ramfs_ops; -static struct address_space_operations ramfs_aops; -static struct inode_operations ramfs_file_inode_operations; static struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { @@ -142,25 +141,6 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * return error; } -static struct address_space_operations ramfs_aops = { - .readpage = simple_readpage, - .prepare_write = simple_prepare_write, - .commit_write = simple_commit_write -}; - -struct file_operations ramfs_file_operations = { - .read = generic_file_read, - .write = generic_file_write, - .mmap = generic_file_mmap, - .fsync = simple_sync_file, - .sendfile = generic_file_sendfile, - .llseek = generic_file_llseek, -}; - -static struct inode_operations ramfs_file_inode_operations = { - .getattr = simple_getattr, -}; - static struct inode_operations ramfs_dir_inode_operations = { .create = ramfs_create, .lookup = simple_lookup, diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h new file mode 100644 index 000000000000..272c8a7120b0 --- /dev/null +++ b/fs/ramfs/internal.h @@ -0,0 +1,15 @@ +/* internal.h: ramfs internal definitions + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +extern struct address_space_operations ramfs_aops; +extern struct file_operations ramfs_file_operations; +extern struct inode_operations ramfs_file_inode_operations; diff --git a/fs/read_write.c b/fs/read_write.c index a091ee4f430d..3f7a1a62165f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -14,6 +14,7 @@ #include <linux/security.h> #include <linux/module.h> #include <linux/syscalls.h> +#include <linux/pagemap.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -32,7 +33,7 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) long long retval; struct inode *inode = file->f_mapping->host; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); switch (origin) { case 2: offset += inode->i_size; @@ -48,7 +49,7 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) } retval = offset; } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return retval; } @@ -182,22 +183,33 @@ bad: } #endif +/* + * rw_verify_area doesn't like huge counts. We limit + * them to something that fits in "int" so that others + * won't have to do range checks all the time. + */ +#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK) int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) { struct inode *inode; loff_t pos; - if (unlikely(count > INT_MAX)) + if (unlikely((ssize_t) count < 0)) goto Einval; pos = *ppos; if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) goto Einval; inode = file->f_dentry->d_inode; - if (inode->i_flock && MANDATORY_LOCK(inode)) - return locks_mandatory_area(read_write == READ ? 
FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); - return 0; + if (inode->i_flock && MANDATORY_LOCK(inode)) { + int retval = locks_mandatory_area( + read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, + inode, file, pos, count); + if (retval < 0) + return retval; + } + return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; Einval: return -EINVAL; @@ -244,7 +256,8 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) return -EFAULT; ret = rw_verify_area(READ, file, pos, count); - if (!ret) { + if (ret >= 0) { + count = ret; ret = security_file_permission (file, MAY_READ); if (!ret) { if (file->f_op->read) @@ -295,7 +308,8 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ return -EFAULT; ret = rw_verify_area(WRITE, file, pos, count); - if (!ret) { + if (ret >= 0) { + count = ret; ret = security_file_permission (file, MAY_WRITE); if (!ret) { if (file->f_op->write) @@ -497,7 +511,7 @@ static ssize_t do_readv_writev(int type, struct file *file, } ret = rw_verify_area(type, file, pos, tot_len); - if (ret) + if (ret < 0) goto out; ret = security_file_permission(file, type == READ ? 
MAY_READ : MAY_WRITE); if (ret) @@ -653,8 +667,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!(in_file->f_mode & FMODE_PREAD)) goto fput_in; retval = rw_verify_area(READ, in_file, ppos, count); - if (retval) + if (retval < 0) goto fput_in; + count = retval; retval = security_file_permission (in_file, MAY_READ); if (retval) @@ -674,8 +689,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, goto fput_out; out_inode = out_file->f_dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); - if (retval) + if (retval < 0) goto fput_out; + count = retval; retval = security_file_permission (out_file, MAY_WRITE); if (retval) diff --git a/fs/readdir.c b/fs/readdir.c index b03579bc0210..b6109329b607 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -30,13 +30,13 @@ int vfs_readdir(struct file *file, filldir_t filler, void *buf) if (res) goto out; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); res = -ENOENT; if (!IS_DEADDIR(inode)) { res = file->f_op->readdir(file, buf, filler); file_accessed(file); } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out: return res; } diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 7892a865b58a..ad6fa964b0e7 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -49,7 +49,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) } reiserfs_write_lock(inode->i_sb); - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); /* freeing preallocation only involves relogging blocks that * are already in the current transaction. 
preallocation gets * freed at the end of each transaction, so it is impossible for @@ -100,7 +100,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp) err = reiserfs_truncate_file(inode, 0); } out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); reiserfs_write_unlock(inode->i_sb); return err; } @@ -1342,7 +1342,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t if (unlikely(!access_ok(VERIFY_READ, buf, count))) return -EFAULT; - down(&inode->i_sem); // locks the entire file for just us + mutex_lock(&inode->i_mutex); // locks the entire file for just us pos = *ppos; @@ -1360,7 +1360,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t if (res) goto out; - inode_update_time(inode, 1); /* Both mtime and ctime */ + file_update_time(file); // Ok, we are done with all the checks. @@ -1532,12 +1532,12 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA | OSYNC_DATA); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); reiserfs_async_progress_wait(inode->i_sb); return (already_written != 0) ? already_written : res; out: - up(&inode->i_sem); // unlock the file on exit. + mutex_unlock(&inode->i_mutex); // unlock the file on exit. return res; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index a5e3a0ddbe53..ffa34b861bdb 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -40,12 +40,12 @@ void reiserfs_delete_inode(struct inode *inode) /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. 
*/ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); reiserfs_delete_xattrs(inode); if (journal_begin(&th, inode->i_sb, jbegin_count)) { - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); goto out; } reiserfs_update_inode_transaction(inode); @@ -59,11 +59,11 @@ void reiserfs_delete_inode(struct inode *inode) DQUOT_FREE_INODE(inode); if (journal_end(&th, inode->i_sb, jbegin_count)) { - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); goto out; } - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); /* check return value from reiserfs_delete_object after * ending the transaction @@ -551,7 +551,7 @@ static int convert_tail_for_hole(struct inode *inode, /* we don't have to make sure the conversion did not happen while ** we were locking the page because anyone that could convert - ** must first take i_sem. + ** must first take i_mutex. ** ** We must fix the tail page for writing because it might have buffers ** that are mapped, but have a block number of 0. This indicates tail @@ -586,7 +586,7 @@ static inline int _allocate_block(struct reiserfs_transaction_handle *th, BUG_ON(!th->t_trans_id); #ifdef REISERFS_PREALLOCATE - if (!(flags & GET_BLOCK_NO_ISEM)) { + if (!(flags & GET_BLOCK_NO_IMUX)) { return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block); } @@ -2318,7 +2318,7 @@ static int map_block_for_writepage(struct inode *inode, /* this is where we fill in holes in the file. 
*/ if (use_get_block) { retval = reiserfs_get_block(inode, block, bh_result, - GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM + GET_BLOCK_CREATE | GET_BLOCK_NO_IMUX | GET_BLOCK_NO_DANGLE); if (!retval) { if (!buffer_mapped(bh_result) diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 81fc00285f60..745c88100895 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -2,6 +2,7 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ +#include <linux/capability.h> #include <linux/fs.h> #include <linux/reiserfs_fs.h> #include <linux/time.h> @@ -120,7 +121,7 @@ static int reiserfs_unpack(struct inode *inode, struct file *filp) /* we need to make sure nobody is changing the file size beneath ** us */ - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ @@ -156,7 +157,7 @@ static int reiserfs_unpack(struct inode *inode, struct file *filp) page_cache_release(page); out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); reiserfs_write_unlock(inode->i_sb); return retval; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 3f17ef844fb6..4491fcf2a0e6 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -3925,10 +3925,13 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, flush = 1; } #ifdef REISERFS_PREALLOCATE - /* quota ops might need to nest, setup the journal_info pointer for them */ + /* quota ops might need to nest, setup the journal_info pointer for them + * and raise the refcount so that it is > 0. 
*/ current->journal_info = th; + th->t_refcount++; reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into * the transaction */ + th->t_refcount--; current->journal_info = th->t_handle_save; #endif diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 42afb5bef111..397d9590c8f2 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2211,7 +2211,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, size_t towrite = len; struct buffer_head tmp_bh, *bh; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -2250,7 +2250,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index c92e124f628e..196e971c03c9 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -205,7 +205,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_in 1) * p_s_sb->s_blocksize; pos1 = pos; - // we are protected by i_sem. The tail can not disapper, not + // we are protected by i_mutex. 
The tail can not disapper, not // append can be done either // we are in truncate or packing tail in file_release diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 72e120798677..cc061bfd437b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -30,6 +30,7 @@ */ #include <linux/reiserfs_fs.h> +#include <linux/capability.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/errno.h> @@ -67,11 +68,11 @@ static struct dentry *create_xa_root(struct super_block *sb) goto out; } else if (!xaroot->d_inode) { int err; - down(&privroot->d_inode->i_sem); + mutex_lock(&privroot->d_inode->i_mutex); err = privroot->d_inode->i_op->mkdir(privroot->d_inode, xaroot, 0700); - up(&privroot->d_inode->i_sem); + mutex_unlock(&privroot->d_inode->i_mutex); if (err) { dput(xaroot); @@ -115,8 +116,8 @@ static struct dentry *__get_xa_root(struct super_block *s) } /* Returns the dentry (or NULL) referring to the root of the extended - * attribute directory tree. If it has already been retreived, it is used. - * Otherwise, we attempt to retreive it from disk. It may also return + * attribute directory tree. If it has already been retrieved, it is used. + * Otherwise, we attempt to retrieve it from disk. It may also return * a pointer-encoded error. */ static inline struct dentry *get_xa_root(struct super_block *s) @@ -219,7 +220,7 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode, } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { goto out; } else { - /* inode->i_sem is down, so nothing else can try to create + /* inode->i_mutex is down, so nothing else can try to create * the same xattr */ err = xadir->d_inode->i_op->create(xadir->d_inode, xafile, 0700 | S_IFREG, NULL); @@ -268,7 +269,7 @@ static struct file *open_xa_file(const struct inode *inode, const char *name, * and don't mess with f->f_pos, but the idea is the same. Do some * action on each and every entry in the directory. 
* - * we're called with i_sem held, so there are no worries about the directory + * we're called with i_mutex held, so there are no worries about the directory * changing underneath us. */ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) @@ -426,7 +427,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf) int res = -ENOTDIR; if (!file->f_op || !file->f_op->readdir) goto out; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); // down(&inode->i_zombie); res = -ENOENT; if (!IS_DEADDIR(inode)) { @@ -435,7 +436,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf) unlock_kernel(); } // up(&inode->i_zombie); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out: return res; } @@ -480,7 +481,7 @@ static inline __u32 xattr_hash(const char *msg, int len) /* Generic extended attribute operations that can be used by xa plugins */ /* - * inode->i_sem: down + * inode->i_mutex: down */ int reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, @@ -497,12 +498,6 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, struct iattr newattrs; __u32 xahash = 0; - if (IS_RDONLY(inode)) - return -EROFS; - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; @@ -535,7 +530,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, /* Resize it so we're ok to write there */ newattrs.ia_size = buffer_size; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; - down(&xinode->i_sem); + mutex_lock(&xinode->i_mutex); err = notify_change(fp->f_dentry, &newattrs); if (err) goto out_filp; @@ -598,7 +593,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, } out_filp: - up(&xinode->i_sem); + mutex_unlock(&xinode->i_mutex); fput(fp); out: @@ -606,7 +601,7 @@ reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer, } /* - * 
inode->i_sem: down + * inode->i_mutex: down */ int reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer, @@ -758,9 +753,6 @@ int reiserfs_xattr_del(struct inode *inode, const char *name) struct dentry *dir; int err; - if (IS_RDONLY(inode)) - return -EROFS; - dir = open_xa_dir(inode, FL_READONLY); if (IS_ERR(dir)) { err = PTR_ERR(dir); @@ -793,7 +785,7 @@ reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen, } -/* This is called w/ inode->i_sem downed */ +/* This is called w/ inode->i_mutex downed */ int reiserfs_delete_xattrs(struct inode *inode) { struct file *fp; @@ -946,7 +938,7 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) /* * Inode operation getxattr() - * Preliminary locking: we down dentry->d_inode->i_sem + * Preliminary locking: we down dentry->d_inode->i_mutex */ ssize_t reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, @@ -970,7 +962,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, /* * Inode operation setxattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_mutex down */ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, @@ -984,12 +976,6 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - if (IS_RDONLY(dentry->d_inode)) - return -EROFS; - - if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) - return -EROFS; - reiserfs_write_lock_xattr_i(dentry->d_inode); lock = !has_xattr_dir(dentry->d_inode); if (lock) @@ -1008,7 +994,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, /* * Inode operation removexattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_mutex down */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { @@ -1019,12 +1005,6 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) 
get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) return -EOPNOTSUPP; - if (IS_RDONLY(dentry->d_inode)) - return -EROFS; - - if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) - return -EPERM; - reiserfs_write_lock_xattr_i(dentry->d_inode); reiserfs_read_lock_xattrs(dentry->d_sb); @@ -1091,7 +1071,7 @@ reiserfs_listxattr_filler(void *buf, const char *name, int namelen, /* * Inode operation listxattr() * - * Preliminary locking: we down dentry->d_inode->i_sem + * Preliminary locking: we down dentry->d_inode->i_mutex */ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) { @@ -1289,9 +1269,9 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (!IS_ERR(dentry)) { if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { struct inode *inode = dentry->d_parent->d_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); err = inode->i_op->mkdir(inode, dentry, 0700); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); if (err) { dput(dentry); dentry = NULL; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index a47ac9aac8b2..43de3ba83332 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -1,3 +1,4 @@ +#include <linux/capability.h> #include <linux/fs.h> #include <linux/posix_acl.h> #include <linux/reiserfs_fs.h> @@ -174,7 +175,7 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size) /* * Inode operation get_posix_acl(). * - * inode->i_sem: down + * inode->i_mutex: down * BKL held [before 2.5.x] */ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) @@ -237,7 +238,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) /* * Inode operation set_posix_acl(). 
* - * inode->i_sem: down + * inode->i_mutex: down * BKL held [before 2.5.x] */ static int @@ -312,7 +313,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -/* dir->i_sem: down, +/* dir->i_mutex: locked, * inode is new and not released into the wild yet */ int reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry, diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 2501f7e66ab9..024a938ca60f 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -1,4 +1,5 @@ #include <linux/reiserfs_fs.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/pagemap.h> diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 51458048ca66..073f39364b11 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -16,18 +16,10 @@ static int user_get(struct inode *inode, const char *name, void *buffer, size_t size) { - int error; - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; - if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - - error = reiserfs_permission_locked(inode, MAY_READ, NULL); - if (error) - return error; - return reiserfs_xattr_get(inode, name, buffer, size); } @@ -36,43 +28,21 @@ user_set(struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - int error; - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - - if (!S_ISREG(inode->i_mode) && - (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) - return -EPERM; - - error = reiserfs_permission_locked(inode, MAY_WRITE, NULL); - if (error) - return error; - return reiserfs_xattr_set(inode, name, buffer, size, flags); } static int user_del(struct inode *inode, const char *name) { - int error; - if (strlen(name) < sizeof(XATTR_USER_PREFIX)) return -EINVAL; if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - - if 
(!S_ISREG(inode->i_mode) && - (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) - return -EPERM; - - error = reiserfs_permission_locked(inode, MAY_WRITE, NULL); - if (error) - return error; - return 0; } diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c index 84e21ffa5ca8..10187812771e 100644 --- a/fs/relayfs/buffers.c +++ b/fs/relayfs/buffers.c @@ -185,5 +185,6 @@ void relay_destroy_buf(struct rchan_buf *buf) void relay_remove_buf(struct kref *kref) { struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); - relayfs_remove(buf->dentry); + buf->chan->cb->remove_buf_file(buf->dentry); + relay_destroy_buf(buf); } diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c index 0f7f88d067ad..383523011aad 100644 --- a/fs/relayfs/inode.c +++ b/fs/relayfs/inode.c @@ -26,31 +26,22 @@ static struct vfsmount * relayfs_mount; static int relayfs_mount_count; -static kmem_cache_t * relayfs_inode_cachep; static struct backing_dev_info relayfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, }; -static struct inode *relayfs_get_inode(struct super_block *sb, int mode, - struct rchan *chan) +static struct inode *relayfs_get_inode(struct super_block *sb, + int mode, + struct file_operations *fops, + void *data) { - struct rchan_buf *buf = NULL; struct inode *inode; - if (S_ISREG(mode)) { - BUG_ON(!chan); - buf = relay_create_buf(chan); - if (!buf) - return NULL; - } - inode = new_inode(sb); - if (!inode) { - relay_destroy_buf(buf); + if (!inode) return NULL; - } inode->i_mode = mode; inode->i_uid = 0; @@ -61,8 +52,9 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { case S_IFREG: - inode->i_fop = &relayfs_file_operations; - RELAYFS_I(inode)->buf = buf; + inode->i_fop = fops; + if (data) + inode->u.generic_ip = data; break; case S_IFDIR: inode->i_op = &simple_dir_inode_operations; @@ 
-83,7 +75,8 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, * @name: the name of the file to create * @parent: parent directory * @mode: mode - * @chan: relay channel associated with the file + * @fops: file operations to use for the file + * @data: user-associated data for this file * * Returns the new dentry, NULL on failure * @@ -92,7 +85,8 @@ static struct inode *relayfs_get_inode(struct super_block *sb, int mode, static struct dentry *relayfs_create_entry(const char *name, struct dentry *parent, int mode, - struct rchan *chan) + struct file_operations *fops, + void *data) { struct dentry *d; struct inode *inode; @@ -115,7 +109,7 @@ static struct dentry *relayfs_create_entry(const char *name, } parent = dget(parent); - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); d = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(d)) { d = NULL; @@ -127,7 +121,7 @@ static struct dentry *relayfs_create_entry(const char *name, goto release_mount; } - inode = relayfs_get_inode(parent->d_inode->i_sb, mode, chan); + inode = relayfs_get_inode(parent->d_inode->i_sb, mode, fops, data); if (!inode) { d = NULL; goto release_mount; @@ -145,7 +139,7 @@ release_mount: simple_release_fs(&relayfs_mount, &relayfs_mount_count); exit: - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); dput(parent); return d; } @@ -155,20 +149,26 @@ exit: * @name: the name of the file to create * @parent: parent directory * @mode: mode, if not specied the default perms are used - * @chan: channel associated with the file + * @fops: file operations to use for the file + * @data: user-associated data for this file * * Returns file dentry if successful, NULL otherwise. * * The file will be created user r on behalf of current user. 
*/ -struct dentry *relayfs_create_file(const char *name, struct dentry *parent, - int mode, struct rchan *chan) +struct dentry *relayfs_create_file(const char *name, + struct dentry *parent, + int mode, + struct file_operations *fops, + void *data) { + BUG_ON(!fops); + if (!mode) mode = S_IRUSR; mode = (mode & S_IALLUGO) | S_IFREG; - return relayfs_create_entry(name, parent, mode, chan); + return relayfs_create_entry(name, parent, mode, fops, data); } /** @@ -183,7 +183,7 @@ struct dentry *relayfs_create_file(const char *name, struct dentry *parent, struct dentry *relayfs_create_dir(const char *name, struct dentry *parent) { int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - return relayfs_create_entry(name, parent, mode, NULL); + return relayfs_create_entry(name, parent, mode, NULL, NULL); } /** @@ -204,7 +204,7 @@ int relayfs_remove(struct dentry *dentry) return -EINVAL; parent = dget(parent); - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); if (dentry->d_inode) { if (S_ISDIR(dentry->d_inode->i_mode)) error = simple_rmdir(parent->d_inode, dentry); @@ -215,7 +215,7 @@ int relayfs_remove(struct dentry *dentry) } if (!error) dput(dentry); - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); dput(parent); if (!error) @@ -225,6 +225,17 @@ int relayfs_remove(struct dentry *dentry) } /** + * relayfs_remove_file - remove a file from relay filesystem + * @dentry: directory dentry + * + * Returns 0 if successful, negative otherwise. + */ +int relayfs_remove_file(struct dentry *dentry) +{ + return relayfs_remove(dentry); +} + +/** * relayfs_remove_dir - remove a directory in the relay filesystem * @dentry: directory dentry * @@ -236,45 +247,45 @@ int relayfs_remove_dir(struct dentry *dentry) } /** - * relayfs_open - open file op for relayfs files + * relay_file_open - open file op for relay files * @inode: the inode * @filp: the file * * Increments the channel buffer refcount. 
*/ -static int relayfs_open(struct inode *inode, struct file *filp) +static int relay_file_open(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = inode->u.generic_ip; kref_get(&buf->kref); + filp->private_data = buf; return 0; } /** - * relayfs_mmap - mmap file op for relayfs files + * relay_file_mmap - mmap file op for relay files * @filp: the file * @vma: the vma describing what to map * * Calls upon relay_mmap_buf to map the file into user space. */ -static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma) +static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) { - struct inode *inode = filp->f_dentry->d_inode; - return relay_mmap_buf(RELAYFS_I(inode)->buf, vma); + struct rchan_buf *buf = filp->private_data; + return relay_mmap_buf(buf, vma); } /** - * relayfs_poll - poll file op for relayfs files + * relay_file_poll - poll file op for relay files * @filp: the file * @wait: poll table * * Poll implemention. */ -static unsigned int relayfs_poll(struct file *filp, poll_table *wait) +static unsigned int relay_file_poll(struct file *filp, poll_table *wait) { unsigned int mask = 0; - struct inode *inode = filp->f_dentry->d_inode; - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = filp->private_data; if (buf->finalized) return POLLERR; @@ -289,27 +300,27 @@ static unsigned int relayfs_poll(struct file *filp, poll_table *wait) } /** - * relayfs_release - release file op for relayfs files + * relay_file_release - release file op for relay files * @inode: the inode * @filp: the file * * Decrements the channel refcount, as the filesystem is * no longer using it. 
*/ -static int relayfs_release(struct inode *inode, struct file *filp) +static int relay_file_release(struct inode *inode, struct file *filp) { - struct rchan_buf *buf = RELAYFS_I(inode)->buf; + struct rchan_buf *buf = filp->private_data; kref_put(&buf->kref, relay_remove_buf); return 0; } /** - * relayfs_read_consume - update the consumed count for the buffer + * relay_file_read_consume - update the consumed count for the buffer */ -static void relayfs_read_consume(struct rchan_buf *buf, - size_t read_pos, - size_t bytes_consumed) +static void relay_file_read_consume(struct rchan_buf *buf, + size_t read_pos, + size_t bytes_consumed) { size_t subbuf_size = buf->chan->subbuf_size; size_t n_subbufs = buf->chan->n_subbufs; @@ -332,9 +343,9 @@ static void relayfs_read_consume(struct rchan_buf *buf, } /** - * relayfs_read_avail - boolean, are there unconsumed bytes available? + * relay_file_read_avail - boolean, are there unconsumed bytes available? */ -static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos) +static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) { size_t bytes_produced, bytes_consumed, write_offset; size_t subbuf_size = buf->chan->subbuf_size; @@ -365,16 +376,16 @@ static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos) if (bytes_produced == bytes_consumed) return 0; - relayfs_read_consume(buf, read_pos, 0); + relay_file_read_consume(buf, read_pos, 0); return 1; } /** - * relayfs_read_subbuf_avail - return bytes available in sub-buffer + * relay_file_read_subbuf_avail - return bytes available in sub-buffer */ -static size_t relayfs_read_subbuf_avail(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_subbuf_avail(size_t read_pos, + struct rchan_buf *buf) { size_t padding, avail = 0; size_t read_subbuf, read_offset, write_subbuf, write_offset; @@ -396,14 +407,14 @@ static size_t relayfs_read_subbuf_avail(size_t read_pos, } /** - * relayfs_read_start_pos - find the first available byte to 
read + * relay_file_read_start_pos - find the first available byte to read * * If the read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. */ -static size_t relayfs_read_start_pos(size_t read_pos, - struct rchan_buf *buf) +static size_t relay_file_read_start_pos(size_t read_pos, + struct rchan_buf *buf) { size_t read_subbuf, padding, padding_start, padding_end; size_t subbuf_size = buf->chan->subbuf_size; @@ -422,11 +433,11 @@ static size_t relayfs_read_start_pos(size_t read_pos, } /** - * relayfs_read_end_pos - return the new read position + * relay_file_read_end_pos - return the new read position */ -static size_t relayfs_read_end_pos(struct rchan_buf *buf, - size_t read_pos, - size_t count) +static size_t relay_file_read_end_pos(struct rchan_buf *buf, + size_t read_pos, + size_t count) { size_t read_subbuf, padding, end_pos; size_t subbuf_size = buf->chan->subbuf_size; @@ -445,7 +456,7 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf, } /** - * relayfs_read - read file op for relayfs files + * relay_file_read - read file op for relay files * @filp: the file * @buffer: the userspace buffer * @count: number of bytes to read @@ -454,23 +465,23 @@ static size_t relayfs_read_end_pos(struct rchan_buf *buf, * Reads count bytes or the number of bytes available in the * current sub-buffer being read, whichever is smaller. 
*/ -static ssize_t relayfs_read(struct file *filp, - char __user *buffer, - size_t count, - loff_t *ppos) +static ssize_t relay_file_read(struct file *filp, + char __user *buffer, + size_t count, + loff_t *ppos) { + struct rchan_buf *buf = filp->private_data; struct inode *inode = filp->f_dentry->d_inode; - struct rchan_buf *buf = RELAYFS_I(inode)->buf; size_t read_start, avail; ssize_t ret = 0; void *from; - down(&inode->i_sem); - if(!relayfs_read_avail(buf, *ppos)) + mutex_lock(&inode->i_mutex); + if(!relay_file_read_avail(buf, *ppos)) goto out; - read_start = relayfs_read_start_pos(*ppos, buf); - avail = relayfs_read_subbuf_avail(read_start, buf); + read_start = relay_file_read_start_pos(*ppos, buf); + avail = relay_file_read_subbuf_avail(read_start, buf); if (!avail) goto out; @@ -480,58 +491,25 @@ static ssize_t relayfs_read(struct file *filp, ret = -EFAULT; goto out; } - relayfs_read_consume(buf, read_start, count); - *ppos = relayfs_read_end_pos(buf, read_start, count); + relay_file_read_consume(buf, read_start, count); + *ppos = relay_file_read_end_pos(buf, read_start, count); out: - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return ret; } -/** - * relayfs alloc_inode() implementation - */ -static struct inode *relayfs_alloc_inode(struct super_block *sb) -{ - struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL); - if (!p) - return NULL; - p->buf = NULL; - - return &p->vfs_inode; -} - -/** - * relayfs destroy_inode() implementation - */ -static void relayfs_destroy_inode(struct inode *inode) -{ - if (RELAYFS_I(inode)->buf) - relay_destroy_buf(RELAYFS_I(inode)->buf); - - kmem_cache_free(relayfs_inode_cachep, RELAYFS_I(inode)); -} - -static void init_once(void *p, kmem_cache_t *cachep, unsigned long flags) -{ - struct relayfs_inode_info *i = p; - if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) - inode_init_once(&i->vfs_inode); -} - -struct file_operations relayfs_file_operations = { - 
.open = relayfs_open, - .poll = relayfs_poll, - .mmap = relayfs_mmap, - .read = relayfs_read, +struct file_operations relay_file_operations = { + .open = relay_file_open, + .poll = relay_file_poll, + .mmap = relay_file_mmap, + .read = relay_file_read, .llseek = no_llseek, - .release = relayfs_release, + .release = relay_file_release, }; static struct super_operations relayfs_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, - .alloc_inode = relayfs_alloc_inode, - .destroy_inode = relayfs_destroy_inode, }; static int relayfs_fill_super(struct super_block * sb, void * data, int silent) @@ -544,7 +522,7 @@ static int relayfs_fill_super(struct super_block * sb, void * data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = RELAYFS_MAGIC; sb->s_op = &relayfs_ops; - inode = relayfs_get_inode(sb, mode, NULL); + inode = relayfs_get_inode(sb, mode, NULL, NULL); if (!inode) return -ENOMEM; @@ -575,33 +553,27 @@ static struct file_system_type relayfs_fs_type = { static int __init init_relayfs_fs(void) { - int err; - - relayfs_inode_cachep = kmem_cache_create("relayfs_inode_cache", - sizeof(struct relayfs_inode_info), 0, - 0, init_once, NULL); - if (!relayfs_inode_cachep) - return -ENOMEM; - - err = register_filesystem(&relayfs_fs_type); - if (err) - kmem_cache_destroy(relayfs_inode_cachep); - - return err; + return register_filesystem(&relayfs_fs_type); } static void __exit exit_relayfs_fs(void) { + + + + + unregister_filesystem(&relayfs_fs_type); - kmem_cache_destroy(relayfs_inode_cachep); } module_init(init_relayfs_fs) module_exit(exit_relayfs_fs) -EXPORT_SYMBOL_GPL(relayfs_file_operations); +EXPORT_SYMBOL_GPL(relay_file_operations); EXPORT_SYMBOL_GPL(relayfs_create_dir); EXPORT_SYMBOL_GPL(relayfs_remove_dir); +EXPORT_SYMBOL_GPL(relayfs_create_file); +EXPORT_SYMBOL_GPL(relayfs_remove_file); MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>"); MODULE_DESCRIPTION("Relay Filesystem"); diff --git 
a/fs/relayfs/relay.c b/fs/relayfs/relay.c index 2a6f7f12b7f9..abf3ceaace49 100644 --- a/fs/relayfs/relay.c +++ b/fs/relayfs/relay.c @@ -80,11 +80,34 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf, { } +/* + * create_buf_file_create() default callback. Creates file to represent buf. + */ +static struct dentry *create_buf_file_default_callback(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + return relayfs_create_file(filename, parent, mode, + &relay_file_operations, buf); +} + +/* + * remove_buf_file() default callback. Removes file representing relay buffer. + */ +static int remove_buf_file_default_callback(struct dentry *dentry) +{ + return relayfs_remove(dentry); +} + /* relay channel default callbacks */ static struct rchan_callbacks default_channel_callbacks = { .subbuf_start = subbuf_start_default_callback, .buf_mapped = buf_mapped_default_callback, .buf_unmapped = buf_unmapped_default_callback, + .create_buf_file = create_buf_file_default_callback, + .remove_buf_file = remove_buf_file_default_callback, }; /** @@ -148,14 +171,16 @@ static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) void relay_reset(struct rchan *chan) { unsigned int i; + struct rchan_buf *prev = NULL; if (!chan) return; for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i]) - continue; + if (!chan->buf[i] || chan->buf[i] == prev) + break; __relay_reset(chan->buf[i], 0); + prev = chan->buf[i]; } } @@ -166,17 +191,27 @@ void relay_reset(struct rchan *chan) */ static struct rchan_buf *relay_open_buf(struct rchan *chan, const char *filename, - struct dentry *parent) + struct dentry *parent, + int *is_global) { struct rchan_buf *buf; struct dentry *dentry; + if (*is_global) + return chan->buf[0]; + + buf = relay_create_buf(chan); + if (!buf) + return NULL; + /* Create file in fs */ - dentry = relayfs_create_file(filename, parent, S_IRUSR, chan); - if (!dentry) + dentry = 
chan->cb->create_buf_file(filename, parent, S_IRUSR, + buf, is_global); + if (!dentry) { + relay_destroy_buf(buf); return NULL; + } - buf = RELAYFS_I(dentry->d_inode)->buf; buf->dentry = dentry; __relay_reset(buf, 1); @@ -214,6 +249,10 @@ static inline void setup_callbacks(struct rchan *chan, cb->buf_mapped = buf_mapped_default_callback; if (!cb->buf_unmapped) cb->buf_unmapped = buf_unmapped_default_callback; + if (!cb->create_buf_file) + cb->create_buf_file = create_buf_file_default_callback; + if (!cb->remove_buf_file) + cb->remove_buf_file = remove_buf_file_default_callback; chan->cb = cb; } @@ -241,6 +280,7 @@ struct rchan *relay_open(const char *base_filename, unsigned int i; struct rchan *chan; char *tmpname; + int is_global = 0; if (!base_filename) return NULL; @@ -265,7 +305,8 @@ struct rchan *relay_open(const char *base_filename, for_each_online_cpu(i) { sprintf(tmpname, "%s%d", base_filename, i); - chan->buf[i] = relay_open_buf(chan, tmpname, parent); + chan->buf[i] = relay_open_buf(chan, tmpname, parent, + &is_global); chan->buf[i]->cpu = i; if (!chan->buf[i]) goto free_bufs; @@ -279,6 +320,8 @@ free_bufs: if (!chan->buf[i]) break; relay_close_buf(chan->buf[i]); + if (is_global) + break; } kfree(tmpname); @@ -388,14 +431,16 @@ void relay_destroy_channel(struct kref *kref) void relay_close(struct rchan *chan) { unsigned int i; + struct rchan_buf *prev = NULL; if (!chan) return; for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i]) - continue; + if (!chan->buf[i] || chan->buf[i] == prev) + break; relay_close_buf(chan->buf[i]); + prev = chan->buf[i]; } if (chan->last_toobig) @@ -415,14 +460,16 @@ void relay_close(struct rchan *chan) void relay_flush(struct rchan *chan) { unsigned int i; + struct rchan_buf *prev = NULL; if (!chan) return; for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i]) - continue; + if (!chan->buf[i] || chan->buf[i] == prev) + break; relay_switch_subbuf(chan->buf[i], 0); + prev = chan->buf[i]; } } diff --git a/fs/relayfs/relay.h 
b/fs/relayfs/relay.h index 703503fa22b6..0993d3e5753b 100644 --- a/fs/relayfs/relay.h +++ b/fs/relayfs/relay.h @@ -1,10 +1,6 @@ #ifndef _RELAY_H #define _RELAY_H -struct dentry *relayfs_create_file(const char *name, - struct dentry *parent, - int mode, - struct rchan *chan); extern int relayfs_remove(struct dentry *dentry); extern int relay_buf_empty(struct rchan_buf *buf); extern void relay_destroy_channel(struct kref *kref); diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index c74f382dabba..0a13859fd57b 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -418,7 +418,7 @@ static int romfs_readpage(struct file *file, struct page * page) { struct inode *inode = page->mapping->host; - unsigned long offset, avail, readlen; + loff_t offset, avail, readlen; void *buf; int result = -EIO; @@ -429,8 +429,8 @@ romfs_readpage(struct file *file, struct page * page) goto err_out; /* 32 bit warning -- but not for us :) */ - offset = page->index << PAGE_CACHE_SHIFT; - if (offset < inode->i_size) { + offset = page_offset(page); + if (offset < i_size_read(inode)) { avail = inode->i_size-offset; readlen = min_t(unsigned long, avail, PAGE_SIZE); if (romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen) == readlen) { diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c index f3e6b81288ab..74b86d9725a6 100644 --- a/fs/smbfs/cache.c +++ b/fs/smbfs/cache.c @@ -66,7 +66,7 @@ smb_invalidate_dircache_entries(struct dentry *parent) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_child); + dentry = list_entry(next, struct dentry, d_u.d_child); dentry->d_fsdata = NULL; smb_age_dentry(server, dentry); next = next->next; @@ -100,7 +100,7 @@ smb_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) spin_lock(&dcache_lock); next = parent->d_subdirs.next; while (next != &parent->d_subdirs) { - dent = list_entry(next, struct dentry, d_child); + dent = list_entry(next, 
struct dentry, d_u.d_child); if ((unsigned long)dent->d_fsdata == fpos) { if (dent->d_inode) dget_locked(dent); diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index b4fcfa8b55a1..7042e62726a4 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -209,8 +209,8 @@ smb_updatepage(struct file *file, struct page *page, unsigned long offset, { struct dentry *dentry = file->f_dentry; - DEBUG1("(%s/%s %d@%ld)\n", DENTRY_PATH(dentry), - count, (page->index << PAGE_CACHE_SHIFT)+offset); + DEBUG1("(%s/%s %d@%lld)\n", DENTRY_PATH(dentry), count, + ((unsigned long long)page->index << PAGE_CACHE_SHIFT) + offset); return smb_writepage_sync(dentry->d_inode, page, offset, count); } @@ -374,8 +374,7 @@ smb_file_release(struct inode *inode, struct file * file) /* We must flush any dirty pages now as we won't be able to write anything after close. mmap can trigger this. "openers" should perhaps include mmap'ers ... */ - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); smb_close(inode); } unlock_kernel(); diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 10b994428fef..6ec88bf59b2d 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -697,8 +697,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr) DENTRY_PATH(dentry), (long) inode->i_size, (long) attr->ia_size); - filemap_fdatawrite(inode->i_mapping); - filemap_fdatawait(inode->i_mapping); + filemap_write_and_wait(inode->i_mapping); error = smb_open(dentry, O_WRONLY); if (error) diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index 38ab558835c4..b1b878b81730 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -8,6 +8,7 @@ */ #include <linux/types.h> +#include <linux/capability.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/fs.h> @@ -3113,7 +3114,7 @@ smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, LSET(data, 32, SMB_TIME_NO_CHANGE); LSET(data, 40, SMB_UID_NO_CHANGE); LSET(data, 48, SMB_GID_NO_CHANGE); - 
LSET(data, 56, smb_filetype_from_mode(attr->ia_mode)); + DSET(data, 56, smb_filetype_from_mode(attr->ia_mode)); LSET(data, 60, major); LSET(data, 68, minor); LSET(data, 76, 0); diff --git a/fs/super.c b/fs/super.c index 6689dded3c84..c177b92419c5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -72,7 +72,7 @@ static struct super_block *alloc_super(void) INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); init_rwsem(&s->s_umount); - sema_init(&s->s_lock, 1); + mutex_init(&s->s_lock); down_write(&s->s_umount); s->s_count = S_BIAS; atomic_set(&s->s_active, 1); @@ -665,16 +665,6 @@ static int test_bdev_super(struct super_block *s, void *data) return (void *)s->s_bdev == data; } -static void bdev_uevent(struct block_device *bdev, enum kobject_action action) -{ - if (bdev->bd_disk) { - if (bdev->bd_part) - kobject_uevent(&bdev->bd_part->kobj, action, NULL); - else - kobject_uevent(&bdev->bd_disk->kobj, action, NULL); - } -} - struct super_block *get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) @@ -710,17 +700,14 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, s->s_flags = flags; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); - s->s_old_blocksize = block_size(bdev); - sb_set_blocksize(s, s->s_old_blocksize); + sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & MS_VERBOSE ? 
1 : 0); if (error) { up_write(&s->s_umount); deactivate_super(s); s = ERR_PTR(error); - } else { + } else s->s_flags |= MS_ACTIVE; - bdev_uevent(bdev, KOBJ_MOUNT); - } } return s; @@ -736,7 +723,6 @@ void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; - bdev_uevent(bdev, KOBJ_UMOUNT); generic_shutdown_super(sb); sync_blockdev(bdev); close_bdev_excl(bdev); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 59734ba1ee60..49bd219275db 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -99,7 +99,7 @@ static int create_dir(struct kobject * k, struct dentry * p, int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; - down(&p->d_inode->i_sem); + mutex_lock(&p->d_inode->i_mutex); *d = lookup_one_len(n, p, strlen(n)); if (!IS_ERR(*d)) { error = sysfs_make_dirent(p->d_fsdata, *d, k, mode, SYSFS_DIR); @@ -112,13 +112,17 @@ static int create_dir(struct kobject * k, struct dentry * p, } } if (error && (error != -EEXIST)) { - sysfs_put((*d)->d_fsdata); + struct sysfs_dirent *sd = (*d)->d_fsdata; + if (sd) { + list_del_init(&sd->s_sibling); + sysfs_put(sd); + } d_drop(*d); } dput(*d); } else error = PTR_ERR(*d); - up(&p->d_inode->i_sem); + mutex_unlock(&p->d_inode->i_mutex); return error; } @@ -242,7 +246,7 @@ static void remove_dir(struct dentry * d) struct dentry * parent = dget(d->d_parent); struct sysfs_dirent * sd; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); d_delete(d); sd = d->d_fsdata; list_del_init(&sd->s_sibling); @@ -253,7 +257,7 @@ static void remove_dir(struct dentry * d) pr_debug(" o %s removing done (%d)\n",d->d_name.name, atomic_read(&d->d_count)); - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); dput(parent); } @@ -282,7 +286,7 @@ void sysfs_remove_dir(struct kobject * kobj) return; pr_debug("sysfs %s: removing dir\n",dentry->d_name.name); - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); parent_sd = dentry->d_fsdata; 
list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) { if (!sd->s_element || !(sd->s_type & SYSFS_NOT_PINNED)) @@ -291,7 +295,7 @@ void sysfs_remove_dir(struct kobject * kobj) sysfs_drop_dentry(sd, dentry); sysfs_put(sd); } - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); remove_dir(dentry); /** @@ -314,7 +318,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) down_write(&sysfs_rename_sem); parent = kobj->parent->dentry; - down(&parent->d_inode->i_sem); + mutex_lock(&parent->d_inode->i_mutex); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { @@ -330,7 +334,7 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) error = -EEXIST; dput(new_dentry); } - up(&parent->d_inode->i_sem); + mutex_unlock(&parent->d_inode->i_mutex); up_write(&sysfs_rename_sem); return error; @@ -341,9 +345,9 @@ static int sysfs_dir_open(struct inode *inode, struct file *file) struct dentry * dentry = file->f_dentry; struct sysfs_dirent * parent_sd = dentry->d_fsdata; - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); file->private_data = sysfs_new_dirent(parent_sd, NULL); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); return file->private_data ? 
0 : -ENOMEM; @@ -354,9 +358,9 @@ static int sysfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_dentry; struct sysfs_dirent * cursor = file->private_data; - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); list_del_init(&cursor->s_sibling); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); release_sysfs_dirent(cursor); @@ -432,7 +436,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) { struct dentry * dentry = file->f_dentry; - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); switch (origin) { case 1: offset += file->f_pos; @@ -440,7 +444,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) if (offset >= 0) break; default: - up(&file->f_dentry->d_inode->i_sem); + mutex_unlock(&file->f_dentry->d_inode->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -464,7 +468,7 @@ static loff_t sysfs_dir_lseek(struct file * file, loff_t offset, int origin) list_add_tail(&cursor->s_sibling, p); } } - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); return offset; } @@ -479,4 +483,3 @@ struct file_operations sysfs_dir_operations = { EXPORT_SYMBOL_GPL(sysfs_create_dir); EXPORT_SYMBOL_GPL(sysfs_remove_dir); EXPORT_SYMBOL_GPL(sysfs_rename_dir); - diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 4013d7905e84..d0e3d8495165 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -364,9 +364,9 @@ int sysfs_add_file(struct dentry * dir, const struct attribute * attr, int type) umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG; int error = 0; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); error = sysfs_make_dirent(parent_sd, NULL, (void *) attr, mode, type); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return error; } @@ -398,7 +398,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) struct dentry * victim; int res = 
-ENOENT; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); victim = lookup_one_len(attr->name, dir, strlen(attr->name)); if (!IS_ERR(victim)) { /* make sure dentry is really there */ @@ -420,7 +420,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr) */ dput(victim); } - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return res; } @@ -441,22 +441,22 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) struct iattr newattrs; int res = -ENOENT; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); victim = lookup_one_len(attr->name, dir, strlen(attr->name)); if (!IS_ERR(victim)) { if (victim->d_inode && (victim->d_parent->d_inode == dir->d_inode)) { inode = victim->d_inode; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; res = notify_change(victim, &newattrs); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); } dput(victim); } - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return res; } @@ -480,4 +480,3 @@ void sysfs_remove_file(struct kobject * kobj, const struct attribute * attr) EXPORT_SYMBOL_GPL(sysfs_create_file); EXPORT_SYMBOL_GPL(sysfs_remove_file); EXPORT_SYMBOL_GPL(sysfs_update_file); - diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 970a33f03299..689f7bcfaf30 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -11,6 +11,7 @@ #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/backing-dev.h> +#include <linux/capability.h> #include "sysfs.h" extern struct super_block * sysfs_sb; @@ -201,7 +202,7 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) /* * Unhashes the dentry corresponding to given sysfs_dirent - * Called with parent inode's i_sem held. + * Called with parent inode's i_mutex held. 
*/ void sysfs_drop_dentry(struct sysfs_dirent * sd, struct dentry * parent) { @@ -232,7 +233,7 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name) /* no inode means this hasn't been made visible yet */ return; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -243,7 +244,5 @@ void sysfs_hash_and_remove(struct dentry * dir, const char * name) break; } } - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); } - - diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index de402fa915f2..e38d6338a20d 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -86,9 +86,9 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char BUG_ON(!kobj || !kobj->dentry || !name); - down(&dentry->d_inode->i_sem); + mutex_lock(&dentry->d_inode->i_mutex); error = sysfs_add_link(dentry, name, target); - up(&dentry->d_inode->i_sem); + mutex_unlock(&dentry->d_inode->i_mutex); return error; } @@ -177,4 +177,3 @@ struct inode_operations sysfs_symlink_inode_operations = { EXPORT_SYMBOL_GPL(sysfs_create_link); EXPORT_SYMBOL_GPL(sysfs_remove_link); - diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 69a085abad6f..cce8b05cba5a 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -103,7 +103,7 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) offset = (char *)de - kaddr; over = filldir(dirent, name, strnlen(name,SYSV_NAMELEN), - (n<<PAGE_CACHE_SHIFT) | offset, + ((loff_t)n<<PAGE_CACHE_SHIFT) | offset, fs16_to_cpu(SYSV_SB(sb), de->inode), DT_UNKNOWN); if (over) { @@ -115,7 +115,7 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) } done: - filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; + filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset; unlock_kernel(); return 0; } diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index b9ded26b10a9..4fae57d9d115 100644 --- 
a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -4,11 +4,6 @@ * PURPOSE * Block allocation handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: @@ -46,7 +41,7 @@ #define uint(x) xuint(x) #define xuint(x) __le ## x -extern inline int find_next_one_bit (void * addr, int size, int offset) +static inline int find_next_one_bit (void * addr, int size, int offset) { uintBPL_t * p = ((uintBPL_t *) addr) + (offset / BITS_PER_LONG); int result = offset & ~(BITS_PER_LONG-1); diff --git a/fs/udf/crc.c b/fs/udf/crc.c index d95c6e38a455..1b82a4adc2f7 100644 --- a/fs/udf/crc.c +++ b/fs/udf/crc.c @@ -14,11 +14,6 @@ * * AT&T gives permission for the free use of the CRC source code. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 82440b731142..f5222527fe39 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -4,11 +4,6 @@ * PURPOSE * Directory handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). 
Copies of the GPL can be obtained from: diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 9a61ecc5451b..fe751a2a0e47 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -4,11 +4,6 @@ * PURPOSE * Directory related functions * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/file.c b/fs/udf/file.c index 01f520c71dc1..a6f2acc1f15c 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -4,11 +4,6 @@ * PURPOSE * File handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: @@ -36,6 +31,7 @@ #include <asm/uaccess.h> #include <linux/kernel.h> #include <linux/string.h> /* memset */ +#include <linux/capability.h> #include <linux/errno.h> #include <linux/smp_lock.h> #include <linux/pagemap.h> diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c index 2dde6b888c2b..5887d78cde43 100644 --- a/fs/udf/fsync.c +++ b/fs/udf/fsync.c @@ -4,11 +4,6 @@ * PURPOSE * Fsync handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). 
Copies of the GPL can be obtained from: diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index a7e5d40f1ebc..c9b707b470ca 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -4,11 +4,6 @@ * PURPOSE * Inode allocation handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/inode.c b/fs/udf/inode.c index b83890beaaac..395e582ee542 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -4,11 +4,6 @@ * PURPOSE * Inode handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: @@ -1962,11 +1957,6 @@ int8_t inode_bmap(struct inode *inode, int block, kernel_lb_addr *bloc, uint32_t printk(KERN_ERR "udf: inode_bmap: block < 0\n"); return -1; } - if (!inode) - { - printk(KERN_ERR "udf: inode_bmap: NULL inode\n"); - return -1; - } *extoffset = 0; *elen = 0; diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 2da5087dfe05..084216107667 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -4,11 +4,6 @@ * PURPOSE * Low Level Device Routines for the UDF filesystem * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). 
Copies of the GPL can be obtained from: diff --git a/fs/udf/misc.c b/fs/udf/misc.c index fd321f9ace83..cc8ca3254db1 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -4,11 +4,6 @@ * PURPOSE * Miscellaneous routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/namei.c b/fs/udf/namei.c index ac191ed7df0a..ca732e79c48b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -4,11 +4,6 @@ * PURPOSE * Inode name handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/partition.c b/fs/udf/partition.c index 4d36f264be0d..dabf2b841db8 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -4,11 +4,6 @@ * PURPOSE * Partition handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). 
Copies of the GPL can be obtained from: diff --git a/fs/udf/super.c b/fs/udf/super.c index 15bd4f24c5b7..4a6f49adc609 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -14,11 +14,6 @@ * http://www.ecma.ch/ * http://www.iso.org/ * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 43f3051ef756..674bb40edc83 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -4,11 +4,6 @@ * PURPOSE * Symlink handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 7dc8a5572ca1..e1b0e8cfecb4 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -4,11 +4,6 @@ * PURPOSE * Truncate handling routines for the OSTA-UDF(tm) filesystem. * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 5a80efd8debc..706c92e1dcc9 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -11,11 +11,6 @@ * UTF-8 is explained in the IETF RFC XXXX. 
* ftp://ftp.internic.net/rfc/rfcxxxx.txt * - * CONTACTS - * E-mail regarding any portion of the Linux UDF file system should be - * directed to the development team's mailing list (run by majordomo): - * linux_udf@hpesjro.fc.hp.com - * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index faf1512173eb..a9f4421ddb6f 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -13,6 +13,7 @@ #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> +#include <linux/capability.h> #include <linux/sched.h> #include <linux/bitops.h> #include <asm/byteorder.h> diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index d0915fba155a..7c10c68902ae 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -491,7 +491,7 @@ int ufs_delete_entry (struct inode * inode, struct ufs_dir_entry * dir, UFSD(("ino %u, reclen %u, namlen %u, name %s\n", fs32_to_cpu(sb, de->d_ino), - fs16to_cpu(sb, de->d_reclen), + fs16_to_cpu(sb, de->d_reclen), ufs_get_de_namlen(sb, de), de->d_name)) while (i < bh->b_size) { diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 54828ebcf1ba..e9a42c711a9e 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1275,7 +1275,7 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type, size_t towrite = len; struct buffer_head *bh; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? 
sb->s_blocksize - offset : towrite; @@ -1296,14 +1296,16 @@ static ssize_t ufs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return len - towrite; } diff --git a/fs/xattr.c b/fs/xattr.c index bcc2156d4d28..80eca7d3d69f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -19,6 +19,149 @@ #include <linux/fsnotify.h> #include <asm/uaccess.h> + +/* + * Check permissions for extended attribute access. This is a bit complicated + * because different namespaces have very different rules. + */ +static int +xattr_permission(struct inode *inode, const char *name, int mask) +{ + /* + * We can never set or remove an extended attribute on a read-only + * filesystem or on an immutable / append-only inode. + */ + if (mask & MAY_WRITE) { + if (IS_RDONLY(inode)) + return -EROFS; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + /* + * No restriction for security.* and system.* from the VFS. Decision + * on these is left to the underlying filesystem / security module. + */ + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return 0; + + /* + * The trusted.* namespace can only accessed by a privilegued user. + */ + if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return (capable(CAP_SYS_ADMIN) ? 
0 : -EPERM); + + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { + if (!S_ISREG(inode->i_mode) && + (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; + } + + return permission(inode, mask, NULL); +} + +int +vfs_setxattr(struct dentry *dentry, char *name, void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = security_inode_setxattr(dentry, name, value, size, flags); + if (error) + goto out; + error = -EOPNOTSUPP; + if (inode->i_op->setxattr) { + error = inode->i_op->setxattr(dentry, name, value, size, flags); + if (!error) { + fsnotify_xattr(dentry); + security_inode_post_setxattr(dentry, name, value, + size, flags); + } + } else if (!strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(inode, suffix, value, + size, flags); + if (!error) + fsnotify_xattr(dentry); + } +out: + mutex_unlock(&inode->i_mutex); + return error; +} +EXPORT_SYMBOL_GPL(vfs_setxattr); + +ssize_t +vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_READ); + if (error) + return error; + + error = security_inode_getxattr(dentry, name); + if (error) + return error; + + if (inode->i_op->getxattr) + error = inode->i_op->getxattr(dentry, name, value, size); + else + error = -EOPNOTSUPP; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + int ret = security_inode_getsecurity(inode, suffix, value, + size, error); + /* + * Only overwrite the return value if a security module + * is actually active. 
+ */ + if (ret != -EOPNOTSUPP) + error = ret; + } + + return error; +} +EXPORT_SYMBOL_GPL(vfs_getxattr); + +int +vfs_removexattr(struct dentry *dentry, char *name) +{ + struct inode *inode = dentry->d_inode; + int error; + + if (!inode->i_op->removexattr) + return -EOPNOTSUPP; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + error = security_inode_removexattr(dentry, name); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = inode->i_op->removexattr(dentry, name); + mutex_unlock(&inode->i_mutex); + + if (!error) + fsnotify_xattr(dentry); + return error; +} +EXPORT_SYMBOL_GPL(vfs_removexattr); + + /* * Extended attribute SET operations */ @@ -51,29 +194,7 @@ setxattr(struct dentry *d, char __user *name, void __user *value, } } - down(&d->d_inode->i_sem); - error = security_inode_setxattr(d, kname, kvalue, size, flags); - if (error) - goto out; - error = -EOPNOTSUPP; - if (d->d_inode->i_op && d->d_inode->i_op->setxattr) { - error = d->d_inode->i_op->setxattr(d, kname, kvalue, - size, flags); - if (!error) { - fsnotify_xattr(d); - security_inode_post_setxattr(d, kname, kvalue, - size, flags); - } - } else if (!strncmp(kname, XATTR_SECURITY_PREFIX, - sizeof XATTR_SECURITY_PREFIX - 1)) { - const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1; - error = security_inode_setsecurity(d->d_inode, suffix, kvalue, - size, flags); - if (!error) - fsnotify_xattr(d); - } -out: - up(&d->d_inode->i_sem); + error = vfs_setxattr(d, kname, kvalue, size, flags); kfree(kvalue); return error; } @@ -147,22 +268,7 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size) return -ENOMEM; } - error = security_inode_getxattr(d, kname); - if (error) - goto out; - error = -EOPNOTSUPP; - if (d->d_inode->i_op && d->d_inode->i_op->getxattr) - error = d->d_inode->i_op->getxattr(d, kname, kvalue, size); - - if (!strncmp(kname, XATTR_SECURITY_PREFIX, - sizeof XATTR_SECURITY_PREFIX - 1)) { - const char *suffix = 
kname + sizeof XATTR_SECURITY_PREFIX - 1; - int rv = security_inode_getsecurity(d->d_inode, suffix, kvalue, - size, error); - /* Security module active: overwrite error value */ - if (rv != -EOPNOTSUPP) - error = rv; - } + error = vfs_getxattr(d, kname, kvalue, size); if (error > 0) { if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; @@ -171,7 +277,6 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size) than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } -out: kfree(kvalue); return error; } @@ -318,19 +423,7 @@ removexattr(struct dentry *d, char __user *name) if (error < 0) return error; - error = -EOPNOTSUPP; - if (d->d_inode->i_op && d->d_inode->i_op->removexattr) { - error = security_inode_removexattr(d, kname); - if (error) - goto out; - down(&d->d_inode->i_sem); - error = d->d_inode->i_op->removexattr(d, kname); - up(&d->d_inode->i_sem); - if (!error) - fsnotify_xattr(d); - } -out: - return error; + return vfs_removexattr(d, kname); } asmlinkage long diff --git a/fs/xfs/Kbuild b/fs/xfs/Kbuild new file mode 100644 index 000000000000..2566e96706f1 --- /dev/null +++ b/fs/xfs/Kbuild @@ -0,0 +1,6 @@ +# +# The xfs people like to share Makefile with 2.6 and 2.4. +# Utilise file named Kbuild file which has precedence over Makefile. +# + +include $(srctree)/$(obj)/Makefile-linux-2.6 diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/linux-2.6/mutex.h index ce773d89a923..d3369b6ca168 100644 --- a/fs/xfs/linux-2.6/mutex.h +++ b/fs/xfs/linux-2.6/mutex.h @@ -19,7 +19,7 @@ #define __XFS_SUPPORT_MUTEX_H__ #include <linux/spinlock.h> -#include <asm/semaphore.h> +#include <linux/mutex.h> /* * Map the mutex'es from IRIX to Linux semaphores. @@ -28,12 +28,8 @@ * callers. */ #define MUTEX_DEFAULT 0x0 -typedef struct semaphore mutex_t; -#define mutex_init(lock, type, name) sema_init(lock, 1) -#define mutex_destroy(lock) sema_init(lock, -99) -#define mutex_lock(lock, num) down(lock) -#define mutex_trylock(lock) (down_trylock(lock) ? 
0 : 1) -#define mutex_unlock(lock) up(lock) +typedef struct mutex mutex_t; +//#define mutex_destroy(lock) do{}while(0) #endif /* __XFS_SUPPORT_MUTEX_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 94d3cdfbf9b8..d1db8c17a74e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -40,11 +40,10 @@ #include "xfs_rw.h" #include "xfs_iomap.h" #include <linux/mpage.h> +#include <linux/pagevec.h> #include <linux/writeback.h> STATIC void xfs_count_page_state(struct page *, int *, int *, int *); -STATIC void xfs_convert_page(struct inode *, struct page *, xfs_iomap_t *, - struct writeback_control *wbc, void *, int, int); #if defined(XFS_RW_TRACE) void @@ -55,17 +54,15 @@ xfs_page_trace( int mask) { xfs_inode_t *ip; - bhv_desc_t *bdp; vnode_t *vp = LINVFS_GET_VP(inode); loff_t isize = i_size_read(inode); - loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; + loff_t offset = page_offset(page); int delalloc = -1, unmapped = -1, unwritten = -1; if (page_has_buffers(page)) xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); - ip = XFS_BHVTOI(bdp); + ip = xfs_vtoi(vp); if (!ip->i_rwtrace) return; @@ -103,15 +100,56 @@ xfs_finish_ioend( queue_work(xfsdatad_workqueue, &ioend->io_work); } +/* + * We're now finished for good with this ioend structure. + * Update the page state via the associated buffer_heads, + * release holds on the inode and bio, and finally free + * up memory. Do not use the ioend after this. + */ STATIC void xfs_destroy_ioend( xfs_ioend_t *ioend) { + struct buffer_head *bh, *next; + + for (bh = ioend->io_buffer_head; bh; bh = next) { + next = bh->b_private; + bh->b_end_io(bh, ioend->io_uptodate); + } + vn_iowake(ioend->io_vnode); mempool_free(ioend, xfs_ioend_pool); } /* + * Buffered IO write completion for delayed allocate extents. + * TODO: Update ondisk isize now that we know the file data + * has been flushed (i.e. 
the notorious "NULL file" problem). + */ +STATIC void +xfs_end_bio_delalloc( + void *data) +{ + xfs_ioend_t *ioend = data; + + xfs_destroy_ioend(ioend); +} + +/* + * Buffered IO write completion for regular, written extents. + */ +STATIC void +xfs_end_bio_written( + void *data) +{ + xfs_ioend_t *ioend = data; + + xfs_destroy_ioend(ioend); +} + +/* + * IO write completion for unwritten extents. + * * Issue transactions to convert a buffer range from unwritten * to written extents. */ @@ -123,21 +161,10 @@ xfs_end_bio_unwritten( vnode_t *vp = ioend->io_vnode; xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; - struct buffer_head *bh, *next; int error; if (ioend->io_uptodate) VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error); - - /* ioend->io_buffer_head is only non-NULL for buffered I/O */ - for (bh = ioend->io_buffer_head; bh; bh = next) { - next = bh->b_private; - - bh->b_end_io = NULL; - clear_buffer_unwritten(bh); - end_buffer_async_write(bh, ioend->io_uptodate); - } - xfs_destroy_ioend(ioend); } @@ -149,7 +176,8 @@ xfs_end_bio_unwritten( */ STATIC xfs_ioend_t * xfs_alloc_ioend( - struct inode *inode) + struct inode *inode, + unsigned int type) { xfs_ioend_t *ioend; @@ -162,45 +190,25 @@ xfs_alloc_ioend( */ atomic_set(&ioend->io_remaining, 1); ioend->io_uptodate = 1; /* cleared if any I/O fails */ + ioend->io_list = NULL; + ioend->io_type = type; ioend->io_vnode = LINVFS_GET_VP(inode); ioend->io_buffer_head = NULL; + ioend->io_buffer_tail = NULL; atomic_inc(&ioend->io_vnode->v_iocount); ioend->io_offset = 0; ioend->io_size = 0; - INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend); + if (type == IOMAP_UNWRITTEN) + INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend); + else if (type == IOMAP_DELAY) + INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc, ioend); + else + INIT_WORK(&ioend->io_work, xfs_end_bio_written, ioend); return ioend; } -void -linvfs_unwritten_done( - struct buffer_head *bh, - int uptodate) -{ - xfs_ioend_t 
*ioend = bh->b_private; - static spinlock_t unwritten_done_lock = SPIN_LOCK_UNLOCKED; - unsigned long flags; - - ASSERT(buffer_unwritten(bh)); - bh->b_end_io = NULL; - - if (!uptodate) - ioend->io_uptodate = 0; - - /* - * Deep magic here. We reuse b_private in the buffer_heads to build - * a chain for completing the I/O from user context after we've issued - * a transaction to convert the unwritten extent. - */ - spin_lock_irqsave(&unwritten_done_lock, flags); - bh->b_private = ioend->io_buffer_head; - ioend->io_buffer_head = bh; - spin_unlock_irqrestore(&unwritten_done_lock, flags); - - xfs_finish_ioend(ioend); -} - STATIC int xfs_map_blocks( struct inode *inode, @@ -218,138 +226,260 @@ xfs_map_blocks( return -error; } +STATIC inline int +xfs_iomap_valid( + xfs_iomap_t *iomapp, + loff_t offset) +{ + return offset >= iomapp->iomap_offset && + offset < iomapp->iomap_offset + iomapp->iomap_bsize; +} + /* - * Finds the corresponding mapping in block @map array of the - * given @offset within a @page. + * BIO completion handler for buffered IO. 
*/ -STATIC xfs_iomap_t * -xfs_offset_to_map( +STATIC int +xfs_end_bio( + struct bio *bio, + unsigned int bytes_done, + int error) +{ + xfs_ioend_t *ioend = bio->bi_private; + + if (bio->bi_size) + return 1; + + ASSERT(ioend); + ASSERT(atomic_read(&bio->bi_cnt) >= 1); + + /* Toss bio and pass work off to an xfsdatad thread */ + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + ioend->io_uptodate = 0; + bio->bi_private = NULL; + bio->bi_end_io = NULL; + + bio_put(bio); + xfs_finish_ioend(ioend); + return 0; +} + +STATIC void +xfs_submit_ioend_bio( + xfs_ioend_t *ioend, + struct bio *bio) +{ + atomic_inc(&ioend->io_remaining); + + bio->bi_private = ioend; + bio->bi_end_io = xfs_end_bio; + + submit_bio(WRITE, bio); + ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); + bio_put(bio); +} + +STATIC struct bio * +xfs_alloc_ioend_bio( + struct buffer_head *bh) +{ + struct bio *bio; + int nvecs = bio_get_nr_vecs(bh->b_bdev); + + do { + bio = bio_alloc(GFP_NOIO, nvecs); + nvecs >>= 1; + } while (!bio); + + ASSERT(bio->bi_private == NULL); + bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + bio_get(bio); + return bio; +} + +STATIC void +xfs_start_buffer_writeback( + struct buffer_head *bh) +{ + ASSERT(buffer_mapped(bh)); + ASSERT(buffer_locked(bh)); + ASSERT(!buffer_delay(bh)); + ASSERT(!buffer_unwritten(bh)); + + mark_buffer_async_write(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); +} + +STATIC void +xfs_start_page_writeback( struct page *page, - xfs_iomap_t *iomapp, - unsigned long offset) + struct writeback_control *wbc, + int clear_dirty, + int buffers) +{ + ASSERT(PageLocked(page)); + ASSERT(!PageWriteback(page)); + set_page_writeback(page); + if (clear_dirty) + clear_page_dirty(page); + unlock_page(page); + if (!buffers) { + end_page_writeback(page); + wbc->pages_skipped++; /* We didn't write this page */ + } +} + +static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) +{ + return bio_add_page(bio, bh->b_page, 
bh->b_size, bh_offset(bh)); +} + +/* + * Submit all of the bios for all of the ioends we have saved up, + * covering the initial writepage page and also any probed pages. + */ +STATIC void +xfs_submit_ioend( + xfs_ioend_t *ioend) +{ + xfs_ioend_t *next; + struct buffer_head *bh; + struct bio *bio; + sector_t lastblock = 0; + + do { + next = ioend->io_list; + bio = NULL; + + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + xfs_start_buffer_writeback(bh); + + if (!bio) { + retry: + bio = xfs_alloc_ioend_bio(bh); + } else if (bh->b_blocknr != lastblock + 1) { + xfs_submit_ioend_bio(ioend, bio); + goto retry; + } + + if (bio_add_buffer(bio, bh) != bh->b_size) { + xfs_submit_ioend_bio(ioend, bio); + goto retry; + } + + lastblock = bh->b_blocknr; + } + if (bio) + xfs_submit_ioend_bio(ioend, bio); + xfs_finish_ioend(ioend); + } while ((ioend = next) != NULL); +} + +/* + * Cancel submission of all buffer_heads so far in this endio. + * Toss the endio too. Only ever called for the initial page + * in a writepage request, so only ever one page. + */ +STATIC void +xfs_cancel_ioend( + xfs_ioend_t *ioend) +{ + xfs_ioend_t *next; + struct buffer_head *bh, *next_bh; + + do { + next = ioend->io_list; + bh = ioend->io_buffer_head; + do { + next_bh = bh->b_private; + clear_buffer_async_write(bh); + unlock_buffer(bh); + } while ((bh = next_bh) != NULL); + + vn_iowake(ioend->io_vnode); + mempool_free(ioend, xfs_ioend_pool); + } while ((ioend = next) != NULL); +} + +/* + * Test to see if we've been building up a completion structure for + * earlier buffers -- if so, we try to append to this ioend if we + * can, otherwise we finish off any current ioend and start another. + * Return true if we've finished the given ioend. 
+ */ +STATIC void +xfs_add_to_ioend( + struct inode *inode, + struct buffer_head *bh, + xfs_off_t offset, + unsigned int type, + xfs_ioend_t **result, + int need_ioend) { - loff_t full_offset; /* offset from start of file */ + xfs_ioend_t *ioend = *result; - ASSERT(offset < PAGE_CACHE_SIZE); + if (!ioend || need_ioend || type != ioend->io_type) { + xfs_ioend_t *previous = *result; - full_offset = page->index; /* NB: using 64bit number */ - full_offset <<= PAGE_CACHE_SHIFT; /* offset from file start */ - full_offset += offset; /* offset from page start */ + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_buffer_head = bh; + ioend->io_buffer_tail = bh; + if (previous) + previous->io_list = ioend; + *result = ioend; + } else { + ioend->io_buffer_tail->b_private = bh; + ioend->io_buffer_tail = bh; + } - if (full_offset < iomapp->iomap_offset) - return NULL; - if (iomapp->iomap_offset + (iomapp->iomap_bsize -1) >= full_offset) - return iomapp; - return NULL; + bh->b_private = NULL; + ioend->io_size += bh->b_size; } STATIC void xfs_map_at_offset( - struct page *page, struct buffer_head *bh, - unsigned long offset, + loff_t offset, int block_bits, xfs_iomap_t *iomapp) { xfs_daddr_t bn; - loff_t delta; int sector_shift; ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL); - delta = page->index; - delta <<= PAGE_CACHE_SHIFT; - delta += offset; - delta -= iomapp->iomap_offset; - delta >>= block_bits; - sector_shift = block_bits - BBSHIFT; - bn = iomapp->iomap_bn >> sector_shift; - bn += delta; - BUG_ON(!bn && !(iomapp->iomap_flags & IOMAP_REALTIME)); + bn = (iomapp->iomap_bn >> sector_shift) + + ((offset - iomapp->iomap_offset) >> block_bits); + + ASSERT(bn || (iomapp->iomap_flags & IOMAP_REALTIME)); ASSERT((bn << sector_shift) >= iomapp->iomap_bn); lock_buffer(bh); bh->b_blocknr = bn; - bh->b_bdev = iomapp->iomap_target->pbr_bdev; + bh->b_bdev = 
iomapp->iomap_target->bt_bdev; set_buffer_mapped(bh); clear_buffer_delay(bh); + clear_buffer_unwritten(bh); } /* - * Look for a page at index which is unlocked and contains our - * unwritten extent flagged buffers at its head. Returns page - * locked and with an extra reference count, and length of the - * unwritten extent component on this page that we can write, - * in units of filesystem blocks. - */ -STATIC struct page * -xfs_probe_unwritten_page( - struct address_space *mapping, - pgoff_t index, - xfs_iomap_t *iomapp, - xfs_ioend_t *ioend, - unsigned long max_offset, - unsigned long *fsbs, - unsigned int bbits) -{ - struct page *page; - - page = find_trylock_page(mapping, index); - if (!page) - return NULL; - if (PageWriteback(page)) - goto out; - - if (page->mapping && page_has_buffers(page)) { - struct buffer_head *bh, *head; - unsigned long p_offset = 0; - - *fsbs = 0; - bh = head = page_buffers(page); - do { - if (!buffer_unwritten(bh) || !buffer_uptodate(bh)) - break; - if (!xfs_offset_to_map(page, iomapp, p_offset)) - break; - if (p_offset >= max_offset) - break; - xfs_map_at_offset(page, bh, p_offset, bbits, iomapp); - set_buffer_unwritten_io(bh); - bh->b_private = ioend; - p_offset += bh->b_size; - (*fsbs)++; - } while ((bh = bh->b_this_page) != head); - - if (p_offset) - return page; - } - -out: - unlock_page(page); - return NULL; -} - -/* - * Look for a page at index which is unlocked and not mapped - * yet - clustering for mmap write case. + * Look for a page at index that is suitable for clustering. 
*/ STATIC unsigned int -xfs_probe_unmapped_page( - struct address_space *mapping, - pgoff_t index, - unsigned int pg_offset) +xfs_probe_page( + struct page *page, + unsigned int pg_offset, + int mapped) { - struct page *page; int ret = 0; - page = find_trylock_page(mapping, index); - if (!page) - return 0; if (PageWriteback(page)) - goto out; + return 0; if (page->mapping && PageDirty(page)) { if (page_has_buffers(page)) { @@ -357,79 +487,101 @@ xfs_probe_unmapped_page( bh = head = page_buffers(page); do { - if (buffer_mapped(bh) || !buffer_uptodate(bh)) + if (!buffer_uptodate(bh)) + break; + if (mapped != buffer_mapped(bh)) break; ret += bh->b_size; if (ret >= pg_offset) break; } while ((bh = bh->b_this_page) != head); } else - ret = PAGE_CACHE_SIZE; + ret = mapped ? 0 : PAGE_CACHE_SIZE; } -out: - unlock_page(page); return ret; } -STATIC unsigned int -xfs_probe_unmapped_cluster( +STATIC size_t +xfs_probe_cluster( struct inode *inode, struct page *startpage, struct buffer_head *bh, - struct buffer_head *head) + struct buffer_head *head, + int mapped) { + struct pagevec pvec; pgoff_t tindex, tlast, tloff; - unsigned int pg_offset, len, total = 0; - struct address_space *mapping = inode->i_mapping; + size_t total = 0; + int done = 0, i; /* First sum forwards in this page */ do { - if (buffer_mapped(bh)) - break; + if (mapped != buffer_mapped(bh)) + return total; total += bh->b_size; } while ((bh = bh->b_this_page) != head); - /* If we reached the end of the page, sum forwards in - * following pages. 
- */ - if (bh == head) { - tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; - /* Prune this back to avoid pathological behavior */ - tloff = min(tlast, startpage->index + 64); - for (tindex = startpage->index + 1; tindex < tloff; tindex++) { - len = xfs_probe_unmapped_page(mapping, tindex, - PAGE_CACHE_SIZE); - if (!len) - return total; + /* if we reached the end of the page, sum forwards in following pages */ + tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; + tindex = startpage->index + 1; + + /* Prune this back to avoid pathological behavior */ + tloff = min(tlast, startpage->index + 64); + + pagevec_init(&pvec, 0); + while (!done && tindex <= tloff) { + unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); + + if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + size_t pg_offset, len = 0; + + if (tindex == tlast) { + pg_offset = + i_size_read(inode) & (PAGE_CACHE_SIZE - 1); + if (!pg_offset) { + done = 1; + break; + } + } else + pg_offset = PAGE_CACHE_SIZE; + + if (page->index == tindex && !TestSetPageLocked(page)) { + len = xfs_probe_page(page, pg_offset, mapped); + unlock_page(page); + } + + if (!len) { + done = 1; + break; + } + total += len; + tindex++; } - if (tindex == tlast && - (pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { - total += xfs_probe_unmapped_page(mapping, - tindex, pg_offset); - } + + pagevec_release(&pvec); + cond_resched(); } + return total; } /* - * Probe for a given page (index) in the inode and test if it is delayed - * and without unwritten buffers. Returns page locked and with an extra - * reference count. + * Test if a given page is suitable for writing as part of an unwritten + * or delayed allocate extent. 
*/ -STATIC struct page * -xfs_probe_delalloc_page( - struct inode *inode, - pgoff_t index) +STATIC int +xfs_is_delayed_page( + struct page *page, + unsigned int type) { - struct page *page; - - page = find_trylock_page(inode->i_mapping, index); - if (!page) - return NULL; if (PageWriteback(page)) - goto out; + return 0; if (page->mapping && page_has_buffers(page)) { struct buffer_head *bh, *head; @@ -437,243 +589,156 @@ xfs_probe_delalloc_page( bh = head = page_buffers(page); do { - if (buffer_unwritten(bh)) { - acceptable = 0; + if (buffer_unwritten(bh)) + acceptable = (type == IOMAP_UNWRITTEN); + else if (buffer_delay(bh)) + acceptable = (type == IOMAP_DELAY); + else if (buffer_mapped(bh)) + acceptable = (type == 0); + else break; - } else if (buffer_delay(bh)) { - acceptable = 1; - } } while ((bh = bh->b_this_page) != head); if (acceptable) - return page; - } - -out: - unlock_page(page); - return NULL; -} - -STATIC int -xfs_map_unwritten( - struct inode *inode, - struct page *start_page, - struct buffer_head *head, - struct buffer_head *curr, - unsigned long p_offset, - int block_bits, - xfs_iomap_t *iomapp, - struct writeback_control *wbc, - int startio, - int all_bh) -{ - struct buffer_head *bh = curr; - xfs_iomap_t *tmp; - xfs_ioend_t *ioend; - loff_t offset; - unsigned long nblocks = 0; - - offset = start_page->index; - offset <<= PAGE_CACHE_SHIFT; - offset += p_offset; - - ioend = xfs_alloc_ioend(inode); - - /* First map forwards in the page consecutive buffers - * covering this unwritten extent - */ - do { - if (!buffer_unwritten(bh)) - break; - tmp = xfs_offset_to_map(start_page, iomapp, p_offset); - if (!tmp) - break; - xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp); - set_buffer_unwritten_io(bh); - bh->b_private = ioend; - p_offset += bh->b_size; - nblocks++; - } while ((bh = bh->b_this_page) != head); - - atomic_add(nblocks, &ioend->io_remaining); - - /* If we reached the end of the page, map forwards in any - * following pages which 
are also covered by this extent. - */ - if (bh == head) { - struct address_space *mapping = inode->i_mapping; - pgoff_t tindex, tloff, tlast; - unsigned long bs; - unsigned int pg_offset, bbits = inode->i_blkbits; - struct page *page; - - tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; - tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT; - tloff = min(tlast, tloff); - for (tindex = start_page->index + 1; tindex < tloff; tindex++) { - page = xfs_probe_unwritten_page(mapping, - tindex, iomapp, ioend, - PAGE_CACHE_SIZE, &bs, bbits); - if (!page) - break; - nblocks += bs; - atomic_add(bs, &ioend->io_remaining); - xfs_convert_page(inode, page, iomapp, wbc, ioend, - startio, all_bh); - /* stop if converting the next page might add - * enough blocks that the corresponding byte - * count won't fit in our ulong page buf length */ - if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits)) - goto enough; - } - - if (tindex == tlast && - (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) { - page = xfs_probe_unwritten_page(mapping, - tindex, iomapp, ioend, - pg_offset, &bs, bbits); - if (page) { - nblocks += bs; - atomic_add(bs, &ioend->io_remaining); - xfs_convert_page(inode, page, iomapp, wbc, ioend, - startio, all_bh); - if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits)) - goto enough; - } - } + return 1; } -enough: - ioend->io_size = (xfs_off_t)nblocks << block_bits; - ioend->io_offset = offset; - xfs_finish_ioend(ioend); return 0; } -STATIC void -xfs_submit_page( - struct page *page, - struct writeback_control *wbc, - struct buffer_head *bh_arr[], - int bh_count, - int probed_page, - int clear_dirty) -{ - struct buffer_head *bh; - int i; - - BUG_ON(PageWriteback(page)); - if (bh_count) - set_page_writeback(page); - if (clear_dirty) - clear_page_dirty(page); - unlock_page(page); - - if (bh_count) { - for (i = 0; i < bh_count; i++) { - bh = bh_arr[i]; - mark_buffer_async_write(bh); - if (buffer_unwritten(bh)) - 
set_buffer_unwritten_io(bh); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - } - - for (i = 0; i < bh_count; i++) - submit_bh(WRITE, bh_arr[i]); - - if (probed_page && clear_dirty) - wbc->nr_to_write--; /* Wrote an "extra" page */ - } -} - /* * Allocate & map buffers for page given the extent map. Write it out. * except for the original page of a writepage, this is called on * delalloc/unwritten pages only, for the original page it is possible * that the page has no mapping at all. */ -STATIC void +STATIC int xfs_convert_page( struct inode *inode, struct page *page, - xfs_iomap_t *iomapp, + loff_t tindex, + xfs_iomap_t *mp, + xfs_ioend_t **ioendp, struct writeback_control *wbc, - void *private, int startio, int all_bh) { - struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; - xfs_iomap_t *mp = iomapp, *tmp; - unsigned long offset, end_offset; - int index = 0; + struct buffer_head *bh, *head; + xfs_off_t end_offset; + unsigned long p_offset; + unsigned int type; int bbits = inode->i_blkbits; int len, page_dirty; + int count = 0, done = 0, uptodate = 1; + xfs_off_t offset = page_offset(page); - end_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)); + if (page->index != tindex) + goto fail; + if (TestSetPageLocked(page)) + goto fail; + if (PageWriteback(page)) + goto fail_unlock_page; + if (page->mapping != inode->i_mapping) + goto fail_unlock_page; + if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) + goto fail_unlock_page; /* * page_dirty is initially a count of buffers on the page before * EOF and is decrememted as we move each into a cleanable state. + * + * Derivation: + * + * End offset is the highest offset that this page should represent. + * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) + * will evaluate non-zero and be less than PAGE_CACHE_SIZE and + * hence give us the correct page_dirty count. On any other page, + * it will be zero and in that case we need page_dirty to be the + * count of buffers on the page. 
*/ + end_offset = min_t(unsigned long long, + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + i_size_read(inode)); + len = 1 << inode->i_blkbits; - end_offset = max(end_offset, PAGE_CACHE_SIZE); - end_offset = roundup(end_offset, len); - page_dirty = end_offset / len; + p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), + PAGE_CACHE_SIZE); + p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; + page_dirty = p_offset / len; - offset = 0; bh = head = page_buffers(page); do { if (offset >= end_offset) break; - if (!(PageUptodate(page) || buffer_uptodate(bh))) + if (!buffer_uptodate(bh)) + uptodate = 0; + if (!(PageUptodate(page) || buffer_uptodate(bh))) { + done = 1; continue; - if (buffer_mapped(bh) && all_bh && - !(buffer_unwritten(bh) || buffer_delay(bh))) { + } + + if (buffer_unwritten(bh) || buffer_delay(bh)) { + if (buffer_unwritten(bh)) + type = IOMAP_UNWRITTEN; + else + type = IOMAP_DELAY; + + if (!xfs_iomap_valid(mp, offset)) { + done = 1; + continue; + } + + ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); + ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); + + xfs_map_at_offset(bh, offset, bbits, mp); if (startio) { + xfs_add_to_ioend(inode, bh, offset, + type, ioendp, done); + } else { + set_buffer_dirty(bh); + unlock_buffer(bh); + mark_buffer_dirty(bh); + } + page_dirty--; + count++; + } else { + type = 0; + if (buffer_mapped(bh) && all_bh && startio) { lock_buffer(bh); - bh_arr[index++] = bh; + xfs_add_to_ioend(inode, bh, offset, + type, ioendp, done); + count++; page_dirty--; + } else { + done = 1; } - continue; } - tmp = xfs_offset_to_map(page, mp, offset); - if (!tmp) - continue; - ASSERT(!(tmp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(tmp->iomap_flags & IOMAP_DELAY)); + } while (offset += len, (bh = bh->b_this_page) != head); - /* If this is a new unwritten extent buffer (i.e. one - * that we haven't passed in private data for, we must - * now map this buffer too. 
- */ - if (buffer_unwritten(bh) && !bh->b_end_io) { - ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN); - xfs_map_unwritten(inode, page, head, bh, offset, - bbits, tmp, wbc, startio, all_bh); - } else if (! (buffer_unwritten(bh) && buffer_locked(bh))) { - xfs_map_at_offset(page, bh, offset, bbits, tmp); - if (buffer_unwritten(bh)) { - set_buffer_unwritten_io(bh); - bh->b_private = private; - ASSERT(private); + if (uptodate && bh == head) + SetPageUptodate(page); + + if (startio) { + if (count) { + struct backing_dev_info *bdi; + + bdi = inode->i_mapping->backing_dev_info; + if (bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } else if (--wbc->nr_to_write <= 0) { + done = 1; } } - if (startio) { - bh_arr[index++] = bh; - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } - page_dirty--; - } while (offset += len, (bh = bh->b_this_page) != head); - - if (startio && index) { - xfs_submit_page(page, wbc, bh_arr, index, 1, !page_dirty); - } else { - unlock_page(page); + xfs_start_page_writeback(page, wbc, !page_dirty, count); } + + return done; + fail_unlock_page: + unlock_page(page); + fail: + return 1; } /* @@ -685,19 +750,31 @@ xfs_cluster_write( struct inode *inode, pgoff_t tindex, xfs_iomap_t *iomapp, + xfs_ioend_t **ioendp, struct writeback_control *wbc, int startio, int all_bh, pgoff_t tlast) { - struct page *page; + struct pagevec pvec; + int done = 0, i; - for (; tindex <= tlast; tindex++) { - page = xfs_probe_delalloc_page(inode, tindex); - if (!page) + pagevec_init(&pvec, 0); + while (!done && tindex <= tlast) { + unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); + + if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) break; - xfs_convert_page(inode, page, iomapp, wbc, NULL, - startio, all_bh); + + for (i = 0; i < pagevec_count(&pvec); i++) { + done = xfs_convert_page(inode, pvec.pages[i], tindex++, + iomapp, ioendp, wbc, startio, all_bh); + if (done) + break; + } + + 
pagevec_release(&pvec); + cond_resched(); } } @@ -728,18 +805,22 @@ xfs_page_state_convert( int startio, int unmapped) /* also implies page uptodate */ { - struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; - xfs_iomap_t *iomp, iomap; + struct buffer_head *bh, *head; + xfs_iomap_t iomap; + xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; unsigned long p_offset = 0; + unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index, tlast; - int len, err, i, cnt = 0, uptodate = 1; - int flags; - int page_dirty; + ssize_t size, len; + int flags, err, iomap_valid = 0, uptodate = 1; + int page_dirty, count = 0, trylock_flag = 0; + int all_bh = unmapped; /* wait for other IO threads? */ - flags = (startio && wbc->sync_mode != WB_SYNC_NONE) ? 0 : BMAPI_TRYLOCK; + if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)) + trylock_flag |= BMAPI_TRYLOCK; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -754,161 +835,173 @@ xfs_page_state_convert( } } - end_offset = min_t(unsigned long long, - (loff_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); - offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - /* * page_dirty is initially a count of buffers on the page before * EOF and is decrememted as we move each into a cleanable state. - */ + * + * Derivation: + * + * End offset is the highest offset that this page should represent. + * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) + * will evaluate non-zero and be less than PAGE_CACHE_SIZE and + * hence give us the correct page_dirty count. On any other page, + * it will be zero and in that case we need page_dirty to be the + * count of buffers on the page. 
+ */ + end_offset = min_t(unsigned long long, + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); len = 1 << inode->i_blkbits; - p_offset = max(p_offset, PAGE_CACHE_SIZE); - p_offset = roundup(p_offset, len); + p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), + PAGE_CACHE_SIZE); + p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; page_dirty = p_offset / len; - iomp = NULL; - p_offset = 0; bh = head = page_buffers(page); + offset = page_offset(page); + flags = -1; + type = 0; + + /* TODO: cleanup count and page_dirty */ do { if (offset >= end_offset) break; if (!buffer_uptodate(bh)) uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) + if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { + /* + * the iomap is actually still valid, but the ioend + * isn't. shouldn't happen too often. + */ + iomap_valid = 0; continue; - - if (iomp) { - iomp = xfs_offset_to_map(page, &iomap, p_offset); } + if (iomap_valid) + iomap_valid = xfs_iomap_valid(&iomap, offset); + /* * First case, map an unwritten extent and prepare for * extent state conversion transaction on completion. - */ - if (buffer_unwritten(bh)) { - if (!startio) - continue; - if (!iomp) { - err = xfs_map_blocks(inode, offset, len, &iomap, - BMAPI_WRITE|BMAPI_IGNSTATE); - if (err) { - goto error; - } - iomp = xfs_offset_to_map(page, &iomap, - p_offset); + * + * Second case, allocate space for a delalloc buffer. + * We can return EAGAIN here in the release page case. + * + * Third case, an unmapped buffer was found, and we are + * in a path where we need to write the whole page out. 
+ */ + if (buffer_unwritten(bh) || buffer_delay(bh) || + ((buffer_uptodate(bh) || PageUptodate(page)) && + !buffer_mapped(bh) && (unmapped || startio))) { + /* + * Make sure we don't use a read-only iomap + */ + if (flags == BMAPI_READ) + iomap_valid = 0; + + if (buffer_unwritten(bh)) { + type = IOMAP_UNWRITTEN; + flags = BMAPI_WRITE|BMAPI_IGNSTATE; + } else if (buffer_delay(bh)) { + type = IOMAP_DELAY; + flags = BMAPI_ALLOCATE; + if (!startio) + flags |= trylock_flag; + } else { + type = IOMAP_NEW; + flags = BMAPI_WRITE|BMAPI_MMAP; } - if (iomp) { - if (!bh->b_end_io) { - err = xfs_map_unwritten(inode, page, - head, bh, p_offset, - inode->i_blkbits, iomp, - wbc, startio, unmapped); - if (err) { - goto error; - } + + if (!iomap_valid) { + if (type == IOMAP_NEW) { + size = xfs_probe_cluster(inode, + page, bh, head, 0); } else { - set_bit(BH_Lock, &bh->b_state); + size = len; } - BUG_ON(!buffer_locked(bh)); - bh_arr[cnt++] = bh; - page_dirty--; - } - /* - * Second case, allocate space for a delalloc buffer. - * We can return EAGAIN here in the release page case. - */ - } else if (buffer_delay(bh)) { - if (!iomp) { - err = xfs_map_blocks(inode, offset, len, &iomap, - BMAPI_ALLOCATE | flags); - if (err) { + + err = xfs_map_blocks(inode, offset, size, + &iomap, flags); + if (err) goto error; - } - iomp = xfs_offset_to_map(page, &iomap, - p_offset); + iomap_valid = xfs_iomap_valid(&iomap, offset); } - if (iomp) { - xfs_map_at_offset(page, bh, p_offset, - inode->i_blkbits, iomp); + if (iomap_valid) { + xfs_map_at_offset(bh, offset, + inode->i_blkbits, &iomap); if (startio) { - bh_arr[cnt++] = bh; + xfs_add_to_ioend(inode, bh, offset, + type, &ioend, + !iomap_valid); } else { set_buffer_dirty(bh); unlock_buffer(bh); mark_buffer_dirty(bh); } page_dirty--; + count++; + } + } else if (buffer_uptodate(bh) && startio) { + /* + * we got here because the buffer is already mapped. + * That means it must already have extents allocated + * underneath it. 
Map the extent by reading it. + */ + if (!iomap_valid || type != 0) { + flags = BMAPI_READ; + size = xfs_probe_cluster(inode, page, bh, + head, 1); + err = xfs_map_blocks(inode, offset, size, + &iomap, flags); + if (err) + goto error; + iomap_valid = xfs_iomap_valid(&iomap, offset); } - } else if ((buffer_uptodate(bh) || PageUptodate(page)) && - (unmapped || startio)) { - if (!buffer_mapped(bh)) { - int size; - - /* - * Getting here implies an unmapped buffer - * was found, and we are in a path where we - * need to write the whole page out. - */ - if (!iomp) { - size = xfs_probe_unmapped_cluster( - inode, page, bh, head); - err = xfs_map_blocks(inode, offset, - size, &iomap, - BMAPI_WRITE|BMAPI_MMAP); - if (err) { - goto error; - } - iomp = xfs_offset_to_map(page, &iomap, - p_offset); - } - if (iomp) { - xfs_map_at_offset(page, - bh, p_offset, - inode->i_blkbits, iomp); - if (startio) { - bh_arr[cnt++] = bh; - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } - page_dirty--; - } - } else if (startio) { - if (buffer_uptodate(bh) && - !test_and_set_bit(BH_Lock, &bh->b_state)) { - bh_arr[cnt++] = bh; - page_dirty--; - } + type = 0; + if (!test_and_set_bit(BH_Lock, &bh->b_state)) { + ASSERT(buffer_mapped(bh)); + if (iomap_valid) + all_bh = 1; + xfs_add_to_ioend(inode, bh, offset, type, + &ioend, !iomap_valid); + page_dirty--; + count++; + } else { + iomap_valid = 0; } + } else if ((buffer_uptodate(bh) || PageUptodate(page)) && + (unmapped || startio)) { + iomap_valid = 0; } - } while (offset += len, p_offset += len, - ((bh = bh->b_this_page) != head)); + + if (!iohead) + iohead = ioend; + + } while (offset += len, ((bh = bh->b_this_page) != head)); if (uptodate && bh == head) SetPageUptodate(page); - if (startio) { - xfs_submit_page(page, wbc, bh_arr, cnt, 0, !page_dirty); - } + if (startio) + xfs_start_page_writeback(page, wbc, 1, count); - if (iomp) { - offset = (iomp->iomap_offset + iomp->iomap_bsize - 1) >> + if (ioend && 
iomap_valid) { + offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> PAGE_CACHE_SHIFT; tlast = min_t(pgoff_t, offset, last_index); - xfs_cluster_write(inode, page->index + 1, iomp, wbc, - startio, unmapped, tlast); + xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, + wbc, startio, all_bh, tlast); } + if (iohead) + xfs_submit_ioend(iohead); + return page_dirty; error: - for (i = 0; i < cnt; i++) { - unlock_buffer(bh_arr[i]); - } + if (iohead) + xfs_cancel_ioend(iohead); /* * If it's delalloc and we have nowhere to put it, @@ -916,9 +1009,8 @@ error: * us to try again. */ if (err != -EAGAIN) { - if (!unmapped) { + if (!unmapped) block_invalidatepage(page, 0); - } ClearPageUptodate(page); } return err; @@ -982,7 +1074,7 @@ __linvfs_get_block( } /* If this is a realtime file, data might be on a new device */ - bh_result->b_bdev = iomap.iomap_target->pbr_bdev; + bh_result->b_bdev = iomap.iomap_target->bt_bdev; /* If we previously allocated a block out beyond eof and * we are now coming back to use it then we will need to @@ -1094,10 +1186,10 @@ linvfs_direct_IO( if (error) return -error; - iocb->private = xfs_alloc_ioend(inode); + iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); ret = blockdev_direct_IO_own_locking(rw, iocb, inode, - iomap.iomap_target->pbr_bdev, + iomap.iomap_target->bt_bdev, iov, offset, nr_segs, linvfs_get_blocks_direct, linvfs_end_io_direct); diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 4720758a9ade..55339dd5a30d 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -23,14 +23,24 @@ extern mempool_t *xfs_ioend_pool; typedef void (*xfs_ioend_func_t)(void *); +/* + * xfs_ioend struct manages large extent writes for XFS. + * It can manage several multi-page bio's at once. 
+ */ typedef struct xfs_ioend { + struct xfs_ioend *io_list; /* next ioend in chain */ + unsigned int io_type; /* delalloc / unwritten */ unsigned int io_uptodate; /* I/O status register */ atomic_t io_remaining; /* hold count */ struct vnode *io_vnode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ + struct buffer_head *io_buffer_tail;/* buffer linked list tail */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ } xfs_ioend_t; +extern struct address_space_operations linvfs_aops; +extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int); + #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 6fe21d2b8847..e44b7c1a3a36 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -31,76 +31,77 @@ #include <linux/kthread.h> #include "xfs_linux.h" -STATIC kmem_cache_t *pagebuf_zone; -STATIC kmem_shaker_t pagebuf_shake; +STATIC kmem_zone_t *xfs_buf_zone; +STATIC kmem_shaker_t xfs_buf_shake; +STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); -STATIC void pagebuf_delwri_queue(xfs_buf_t *, int); +STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); STATIC struct workqueue_struct *xfslogd_workqueue; struct workqueue_struct *xfsdatad_workqueue; -#ifdef PAGEBUF_TRACE +#ifdef XFS_BUF_TRACE void -pagebuf_trace( - xfs_buf_t *pb, +xfs_buf_trace( + xfs_buf_t *bp, char *id, void *data, void *ra) { - ktrace_enter(pagebuf_trace_buf, - pb, id, - (void *)(unsigned long)pb->pb_flags, - (void *)(unsigned long)pb->pb_hold.counter, - (void *)(unsigned long)pb->pb_sema.count.counter, + ktrace_enter(xfs_buf_trace_buf, + bp, id, + (void *)(unsigned long)bp->b_flags, + (void *)(unsigned long)bp->b_hold.counter, + (void *)(unsigned long)bp->b_sema.count.counter, (void *)current, data, ra, - (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff), - 
(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff), - (void *)(unsigned long)pb->pb_buffer_length, + (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), + (void *)(unsigned long)(bp->b_file_offset & 0xffffffff), + (void *)(unsigned long)bp->b_buffer_length, NULL, NULL, NULL, NULL, NULL); } -ktrace_t *pagebuf_trace_buf; -#define PAGEBUF_TRACE_SIZE 4096 -#define PB_TRACE(pb, id, data) \ - pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0)) +ktrace_t *xfs_buf_trace_buf; +#define XFS_BUF_TRACE_SIZE 4096 +#define XB_TRACE(bp, id, data) \ + xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0)) #else -#define PB_TRACE(pb, id, data) do { } while (0) +#define XB_TRACE(bp, id, data) do { } while (0) #endif -#ifdef PAGEBUF_LOCK_TRACKING -# define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid) -# define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1) -# define PB_GET_OWNER(pb) ((pb)->pb_last_holder) +#ifdef XFS_BUF_LOCK_TRACKING +# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) +# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) +# define XB_GET_OWNER(bp) ((bp)->b_last_holder) #else -# define PB_SET_OWNER(pb) do { } while (0) -# define PB_CLEAR_OWNER(pb) do { } while (0) -# define PB_GET_OWNER(pb) do { } while (0) +# define XB_SET_OWNER(bp) do { } while (0) +# define XB_CLEAR_OWNER(bp) do { } while (0) +# define XB_GET_OWNER(bp) do { } while (0) #endif -#define pb_to_gfp(flags) \ - ((((flags) & PBF_READ_AHEAD) ? __GFP_NORETRY : \ - ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) +#define xb_to_gfp(flags) \ + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ + ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) -#define pb_to_km(flags) \ - (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) +#define xb_to_km(flags) \ + (((flags) & XBF_DONT_BLOCK) ? 
KM_NOFS : KM_SLEEP) -#define pagebuf_allocate(flags) \ - kmem_zone_alloc(pagebuf_zone, pb_to_km(flags)) -#define pagebuf_deallocate(pb) \ - kmem_zone_free(pagebuf_zone, (pb)); +#define xfs_buf_allocate(flags) \ + kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) +#define xfs_buf_deallocate(bp) \ + kmem_zone_free(xfs_buf_zone, (bp)); /* - * Page Region interfaces. + * Page Region interfaces. * - * For pages in filesystems where the blocksize is smaller than the - * pagesize, we use the page->private field (long) to hold a bitmap - * of uptodate regions within the page. + * For pages in filesystems where the blocksize is smaller than the + * pagesize, we use the page->private field (long) to hold a bitmap + * of uptodate regions within the page. * - * Each such region is "bytes per page / bits per long" bytes long. + * Each such region is "bytes per page / bits per long" bytes long. * - * NBPPR == number-of-bytes-per-page-region - * BTOPR == bytes-to-page-region (rounded up) - * BTOPRT == bytes-to-page-region-truncated (rounded down) + * NBPPR == number-of-bytes-per-page-region + * BTOPR == bytes-to-page-region (rounded up) + * BTOPRT == bytes-to-page-region-truncated (rounded down) */ #if (BITS_PER_LONG == 32) #define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ @@ -159,7 +160,7 @@ test_page_region( } /* - * Mapping of multi-page buffers into contiguous virtual space + * Mapping of multi-page buffers into contiguous virtual space */ typedef struct a_list { @@ -172,7 +173,7 @@ STATIC int as_list_len; STATIC DEFINE_SPINLOCK(as_lock); /* - * Try to batch vunmaps because they are costly. + * Try to batch vunmaps because they are costly. 
*/ STATIC void free_address( @@ -215,83 +216,83 @@ purge_addresses(void) } /* - * Internal pagebuf object manipulation + * Internal xfs_buf_t object manipulation */ STATIC void -_pagebuf_initialize( - xfs_buf_t *pb, +_xfs_buf_initialize( + xfs_buf_t *bp, xfs_buftarg_t *target, - loff_t range_base, + xfs_off_t range_base, size_t range_length, - page_buf_flags_t flags) + xfs_buf_flags_t flags) { /* - * We don't want certain flags to appear in pb->pb_flags. + * We don't want certain flags to appear in b_flags. */ - flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD); - - memset(pb, 0, sizeof(xfs_buf_t)); - atomic_set(&pb->pb_hold, 1); - init_MUTEX_LOCKED(&pb->pb_iodonesema); - INIT_LIST_HEAD(&pb->pb_list); - INIT_LIST_HEAD(&pb->pb_hash_list); - init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */ - PB_SET_OWNER(pb); - pb->pb_target = target; - pb->pb_file_offset = range_base; + flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); + + memset(bp, 0, sizeof(xfs_buf_t)); + atomic_set(&bp->b_hold, 1); + init_MUTEX_LOCKED(&bp->b_iodonesema); + INIT_LIST_HEAD(&bp->b_list); + INIT_LIST_HEAD(&bp->b_hash_list); + init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ + XB_SET_OWNER(bp); + bp->b_target = target; + bp->b_file_offset = range_base; /* * Set buffer_length and count_desired to the same value initially. * I/O routines should use count_desired, which will be the same in * most cases but may be reset (e.g. XFS recovery). 
*/ - pb->pb_buffer_length = pb->pb_count_desired = range_length; - pb->pb_flags = flags; - pb->pb_bn = XFS_BUF_DADDR_NULL; - atomic_set(&pb->pb_pin_count, 0); - init_waitqueue_head(&pb->pb_waiters); - - XFS_STATS_INC(pb_create); - PB_TRACE(pb, "initialize", target); + bp->b_buffer_length = bp->b_count_desired = range_length; + bp->b_flags = flags; + bp->b_bn = XFS_BUF_DADDR_NULL; + atomic_set(&bp->b_pin_count, 0); + init_waitqueue_head(&bp->b_waiters); + + XFS_STATS_INC(xb_create); + XB_TRACE(bp, "initialize", target); } /* - * Allocate a page array capable of holding a specified number - * of pages, and point the page buf at it. + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. */ STATIC int -_pagebuf_get_pages( - xfs_buf_t *pb, +_xfs_buf_get_pages( + xfs_buf_t *bp, int page_count, - page_buf_flags_t flags) + xfs_buf_flags_t flags) { /* Make sure that we have a page list */ - if (pb->pb_pages == NULL) { - pb->pb_offset = page_buf_poff(pb->pb_file_offset); - pb->pb_page_count = page_count; - if (page_count <= PB_PAGES) { - pb->pb_pages = pb->pb_page_array; + if (bp->b_pages == NULL) { + bp->b_offset = xfs_buf_poff(bp->b_file_offset); + bp->b_page_count = page_count; + if (page_count <= XB_PAGES) { + bp->b_pages = bp->b_page_array; } else { - pb->pb_pages = kmem_alloc(sizeof(struct page *) * - page_count, pb_to_km(flags)); - if (pb->pb_pages == NULL) + bp->b_pages = kmem_alloc(sizeof(struct page *) * + page_count, xb_to_km(flags)); + if (bp->b_pages == NULL) return -ENOMEM; } - memset(pb->pb_pages, 0, sizeof(struct page *) * page_count); + memset(bp->b_pages, 0, sizeof(struct page *) * page_count); } return 0; } /* - * Frees pb_pages if it was malloced. + * Frees b_pages if it was allocated. 
*/ STATIC void -_pagebuf_free_pages( +_xfs_buf_free_pages( xfs_buf_t *bp) { - if (bp->pb_pages != bp->pb_page_array) { - kmem_free(bp->pb_pages, - bp->pb_page_count * sizeof(struct page *)); + if (bp->b_pages != bp->b_page_array) { + kmem_free(bp->b_pages, + bp->b_page_count * sizeof(struct page *)); } } @@ -299,79 +300,79 @@ _pagebuf_free_pages( * Releases the specified buffer. * * The modification state of any associated pages is left unchanged. - * The buffer most not be on any hash - use pagebuf_rele instead for + * The buffer most not be on any hash - use xfs_buf_rele instead for * hashed and refcounted buffers */ void -pagebuf_free( +xfs_buf_free( xfs_buf_t *bp) { - PB_TRACE(bp, "free", 0); + XB_TRACE(bp, "free", 0); - ASSERT(list_empty(&bp->pb_hash_list)); + ASSERT(list_empty(&bp->b_hash_list)); - if (bp->pb_flags & _PBF_PAGE_CACHE) { + if (bp->b_flags & _XBF_PAGE_CACHE) { uint i; - if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1)) - free_address(bp->pb_addr - bp->pb_offset); + if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) + free_address(bp->b_addr - bp->b_offset); - for (i = 0; i < bp->pb_page_count; i++) - page_cache_release(bp->pb_pages[i]); - _pagebuf_free_pages(bp); - } else if (bp->pb_flags & _PBF_KMEM_ALLOC) { + for (i = 0; i < bp->b_page_count; i++) + page_cache_release(bp->b_pages[i]); + _xfs_buf_free_pages(bp); + } else if (bp->b_flags & _XBF_KMEM_ALLOC) { /* - * XXX(hch): bp->pb_count_desired might be incorrect (see - * pagebuf_associate_memory for details), but fortunately + * XXX(hch): bp->b_count_desired might be incorrect (see + * xfs_buf_associate_memory for details), but fortunately * the Linux version of kmem_free ignores the len argument.. */ - kmem_free(bp->pb_addr, bp->pb_count_desired); - _pagebuf_free_pages(bp); + kmem_free(bp->b_addr, bp->b_count_desired); + _xfs_buf_free_pages(bp); } - pagebuf_deallocate(bp); + xfs_buf_deallocate(bp); } /* * Finds all pages for buffer in question and builds it's page list. 
*/ STATIC int -_pagebuf_lookup_pages( +_xfs_buf_lookup_pages( xfs_buf_t *bp, uint flags) { - struct address_space *mapping = bp->pb_target->pbr_mapping; - size_t blocksize = bp->pb_target->pbr_bsize; - size_t size = bp->pb_count_desired; + struct address_space *mapping = bp->b_target->bt_mapping; + size_t blocksize = bp->b_target->bt_bsize; + size_t size = bp->b_count_desired; size_t nbytes, offset; - gfp_t gfp_mask = pb_to_gfp(flags); + gfp_t gfp_mask = xb_to_gfp(flags); unsigned short page_count, i; pgoff_t first; - loff_t end; + xfs_off_t end; int error; - end = bp->pb_file_offset + bp->pb_buffer_length; - page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset); + end = bp->b_file_offset + bp->b_buffer_length; + page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); - error = _pagebuf_get_pages(bp, page_count, flags); + error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) return error; - bp->pb_flags |= _PBF_PAGE_CACHE; + bp->b_flags |= _XBF_PAGE_CACHE; - offset = bp->pb_offset; - first = bp->pb_file_offset >> PAGE_CACHE_SHIFT; + offset = bp->b_offset; + first = bp->b_file_offset >> PAGE_CACHE_SHIFT; - for (i = 0; i < bp->pb_page_count; i++) { + for (i = 0; i < bp->b_page_count; i++) { struct page *page; uint retries = 0; retry: page = find_or_create_page(mapping, first + i, gfp_mask); if (unlikely(page == NULL)) { - if (flags & PBF_READ_AHEAD) { - bp->pb_page_count = i; - for (i = 0; i < bp->pb_page_count; i++) - unlock_page(bp->pb_pages[i]); + if (flags & XBF_READ_AHEAD) { + bp->b_page_count = i; + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); return -ENOMEM; } @@ -387,13 +388,13 @@ _pagebuf_lookup_pages( "deadlock in %s (mode:0x%x)\n", __FUNCTION__, gfp_mask); - XFS_STATS_INC(pb_page_retries); + XFS_STATS_INC(xb_page_retries); xfsbufd_wakeup(0, gfp_mask); blk_congestion_wait(WRITE, HZ/50); goto retry; } - XFS_STATS_INC(pb_page_found); + XFS_STATS_INC(xb_page_found); nbytes = min_t(size_t, 
size, PAGE_CACHE_SIZE - offset); size -= nbytes; @@ -401,27 +402,27 @@ _pagebuf_lookup_pages( if (!PageUptodate(page)) { page_count--; if (blocksize >= PAGE_CACHE_SIZE) { - if (flags & PBF_READ) - bp->pb_locked = 1; + if (flags & XBF_READ) + bp->b_locked = 1; } else if (!PagePrivate(page)) { if (test_page_region(page, offset, nbytes)) page_count++; } } - bp->pb_pages[i] = page; + bp->b_pages[i] = page; offset = 0; } - if (!bp->pb_locked) { - for (i = 0; i < bp->pb_page_count; i++) - unlock_page(bp->pb_pages[i]); + if (!bp->b_locked) { + for (i = 0; i < bp->b_page_count; i++) + unlock_page(bp->b_pages[i]); } - if (page_count == bp->pb_page_count) - bp->pb_flags |= PBF_DONE; + if (page_count == bp->b_page_count) + bp->b_flags |= XBF_DONE; - PB_TRACE(bp, "lookup_pages", (long)page_count); + XB_TRACE(bp, "lookup_pages", (long)page_count); return error; } @@ -429,23 +430,23 @@ _pagebuf_lookup_pages( * Map buffer into kernel address-space if nessecary. */ STATIC int -_pagebuf_map_pages( +_xfs_buf_map_pages( xfs_buf_t *bp, uint flags) { /* A single page buffer is always mappable */ - if (bp->pb_page_count == 1) { - bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset; - bp->pb_flags |= PBF_MAPPED; - } else if (flags & PBF_MAPPED) { + if (bp->b_page_count == 1) { + bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; + bp->b_flags |= XBF_MAPPED; + } else if (flags & XBF_MAPPED) { if (as_list_len > 64) purge_addresses(); - bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count, - VM_MAP, PAGE_KERNEL); - if (unlikely(bp->pb_addr == NULL)) + bp->b_addr = vmap(bp->b_pages, bp->b_page_count, + VM_MAP, PAGE_KERNEL); + if (unlikely(bp->b_addr == NULL)) return -ENOMEM; - bp->pb_addr += bp->pb_offset; - bp->pb_flags |= PBF_MAPPED; + bp->b_addr += bp->b_offset; + bp->b_flags |= XBF_MAPPED; } return 0; @@ -456,9 +457,7 @@ _pagebuf_map_pages( */ /* - * _pagebuf_find - * - * Looks up, and creates if absent, a lockable buffer for + * Look up, and creates if absent, a lockable 
buffer for * a given range of an inode. The buffer is returned * locked. If other overlapping buffers exist, they are * released before the new buffer is created and locked, @@ -466,55 +465,55 @@ _pagebuf_map_pages( * are unlocked. No I/O is implied by this call. */ xfs_buf_t * -_pagebuf_find( +_xfs_buf_find( xfs_buftarg_t *btp, /* block device target */ - loff_t ioff, /* starting offset of range */ + xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ - page_buf_flags_t flags, /* PBF_TRYLOCK */ - xfs_buf_t *new_pb)/* newly allocated buffer */ + xfs_buf_flags_t flags, + xfs_buf_t *new_bp) { - loff_t range_base; + xfs_off_t range_base; size_t range_length; xfs_bufhash_t *hash; - xfs_buf_t *pb, *n; + xfs_buf_t *bp, *n; range_base = (ioff << BBSHIFT); range_length = (isize << BBSHIFT); /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(range_length < (1 << btp->pbr_sshift))); - ASSERT(!(range_base & (loff_t)btp->pbr_smask)); + ASSERT(!(range_length < (1 << btp->bt_sshift))); + ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; spin_lock(&hash->bh_lock); - list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) { - ASSERT(btp == pb->pb_target); - if (pb->pb_file_offset == range_base && - pb->pb_buffer_length == range_length) { + list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { + ASSERT(btp == bp->b_target); + if (bp->b_file_offset == range_base && + bp->b_buffer_length == range_length) { /* - * If we look at something bring it to the + * If we look at something, bring it to the * front of the list for next time. 
*/ - atomic_inc(&pb->pb_hold); - list_move(&pb->pb_hash_list, &hash->bh_list); + atomic_inc(&bp->b_hold); + list_move(&bp->b_hash_list, &hash->bh_list); goto found; } } /* No match found */ - if (new_pb) { - _pagebuf_initialize(new_pb, btp, range_base, + if (new_bp) { + _xfs_buf_initialize(new_bp, btp, range_base, range_length, flags); - new_pb->pb_hash = hash; - list_add(&new_pb->pb_hash_list, &hash->bh_list); + new_bp->b_hash = hash; + list_add(&new_bp->b_hash_list, &hash->bh_list); } else { - XFS_STATS_INC(pb_miss_locked); + XFS_STATS_INC(xb_miss_locked); } spin_unlock(&hash->bh_lock); - return new_pb; + return new_bp; found: spin_unlock(&hash->bh_lock); @@ -523,74 +522,72 @@ found: * if this does not work then we need to drop the * spinlock and do a hard attempt on the semaphore. */ - if (down_trylock(&pb->pb_sema)) { - if (!(flags & PBF_TRYLOCK)) { + if (down_trylock(&bp->b_sema)) { + if (!(flags & XBF_TRYLOCK)) { /* wait for buffer ownership */ - PB_TRACE(pb, "get_lock", 0); - pagebuf_lock(pb); - XFS_STATS_INC(pb_get_locked_waited); + XB_TRACE(bp, "get_lock", 0); + xfs_buf_lock(bp); + XFS_STATS_INC(xb_get_locked_waited); } else { /* We asked for a trylock and failed, no need * to look at file offset and length here, we - * know that this pagebuf at least overlaps our - * pagebuf and is locked, therefore our buffer - * either does not exist, or is this buffer + * know that this buffer at least overlaps our + * buffer and is locked, therefore our buffer + * either does not exist, or is this buffer. 
*/ - - pagebuf_rele(pb); - XFS_STATS_INC(pb_busy_locked); - return (NULL); + xfs_buf_rele(bp); + XFS_STATS_INC(xb_busy_locked); + return NULL; } } else { /* trylock worked */ - PB_SET_OWNER(pb); + XB_SET_OWNER(bp); } - if (pb->pb_flags & PBF_STALE) { - ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0); - pb->pb_flags &= PBF_MAPPED; + if (bp->b_flags & XBF_STALE) { + ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + bp->b_flags &= XBF_MAPPED; } - PB_TRACE(pb, "got_lock", 0); - XFS_STATS_INC(pb_get_locked); - return (pb); + XB_TRACE(bp, "got_lock", 0); + XFS_STATS_INC(xb_get_locked); + return bp; } /* - * xfs_buf_get_flags assembles a buffer covering the specified range. - * + * Assembles a buffer covering the specified range. * Storage in memory for all portions of the buffer will be allocated, * although backing storage may not be. */ xfs_buf_t * -xfs_buf_get_flags( /* allocate a buffer */ +xfs_buf_get_flags( xfs_buftarg_t *target,/* target for buffer */ - loff_t ioff, /* starting offset of range */ + xfs_off_t ioff, /* starting offset of range */ size_t isize, /* length of range */ - page_buf_flags_t flags) /* PBF_TRYLOCK */ + xfs_buf_flags_t flags) { - xfs_buf_t *pb, *new_pb; + xfs_buf_t *bp, *new_bp; int error = 0, i; - new_pb = pagebuf_allocate(flags); - if (unlikely(!new_pb)) + new_bp = xfs_buf_allocate(flags); + if (unlikely(!new_bp)) return NULL; - pb = _pagebuf_find(target, ioff, isize, flags, new_pb); - if (pb == new_pb) { - error = _pagebuf_lookup_pages(pb, flags); + bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); + if (bp == new_bp) { + error = _xfs_buf_lookup_pages(bp, flags); if (error) goto no_buffer; } else { - pagebuf_deallocate(new_pb); - if (unlikely(pb == NULL)) + xfs_buf_deallocate(new_bp); + if (unlikely(bp == NULL)) return NULL; } - for (i = 0; i < pb->pb_page_count; i++) - mark_page_accessed(pb->pb_pages[i]); + for (i = 0; i < bp->b_page_count; i++) + mark_page_accessed(bp->b_pages[i]); - if (!(pb->pb_flags & PBF_MAPPED)) { - error = 
_pagebuf_map_pages(pb, flags); + if (!(bp->b_flags & XBF_MAPPED)) { + error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { printk(KERN_WARNING "%s: failed to map pages\n", __FUNCTION__); @@ -598,97 +595,97 @@ xfs_buf_get_flags( /* allocate a buffer */ } } - XFS_STATS_INC(pb_get); + XFS_STATS_INC(xb_get); /* * Always fill in the block number now, the mapped cases can do * their own overlay of this later. */ - pb->pb_bn = ioff; - pb->pb_count_desired = pb->pb_buffer_length; + bp->b_bn = ioff; + bp->b_count_desired = bp->b_buffer_length; - PB_TRACE(pb, "get", (unsigned long)flags); - return pb; + XB_TRACE(bp, "get", (unsigned long)flags); + return bp; no_buffer: - if (flags & (PBF_LOCK | PBF_TRYLOCK)) - pagebuf_unlock(pb); - pagebuf_rele(pb); + if (flags & (XBF_LOCK | XBF_TRYLOCK)) + xfs_buf_unlock(bp); + xfs_buf_rele(bp); return NULL; } xfs_buf_t * xfs_buf_read_flags( xfs_buftarg_t *target, - loff_t ioff, + xfs_off_t ioff, size_t isize, - page_buf_flags_t flags) + xfs_buf_flags_t flags) { - xfs_buf_t *pb; - - flags |= PBF_READ; - - pb = xfs_buf_get_flags(target, ioff, isize, flags); - if (pb) { - if (!XFS_BUF_ISDONE(pb)) { - PB_TRACE(pb, "read", (unsigned long)flags); - XFS_STATS_INC(pb_get_read); - pagebuf_iostart(pb, flags); - } else if (flags & PBF_ASYNC) { - PB_TRACE(pb, "read_async", (unsigned long)flags); + xfs_buf_t *bp; + + flags |= XBF_READ; + + bp = xfs_buf_get_flags(target, ioff, isize, flags); + if (bp) { + if (!XFS_BUF_ISDONE(bp)) { + XB_TRACE(bp, "read", (unsigned long)flags); + XFS_STATS_INC(xb_get_read); + xfs_buf_iostart(bp, flags); + } else if (flags & XBF_ASYNC) { + XB_TRACE(bp, "read_async", (unsigned long)flags); /* * Read ahead call which is already satisfied, * drop the buffer */ goto no_buffer; } else { - PB_TRACE(pb, "read_done", (unsigned long)flags); + XB_TRACE(bp, "read_done", (unsigned long)flags); /* We do not want read in the flags */ - pb->pb_flags &= ~PBF_READ; + bp->b_flags &= ~XBF_READ; } } - return pb; + return bp; 
no_buffer: - if (flags & (PBF_LOCK | PBF_TRYLOCK)) - pagebuf_unlock(pb); - pagebuf_rele(pb); + if (flags & (XBF_LOCK | XBF_TRYLOCK)) + xfs_buf_unlock(bp); + xfs_buf_rele(bp); return NULL; } /* - * If we are not low on memory then do the readahead in a deadlock - * safe manner. + * If we are not low on memory then do the readahead in a deadlock + * safe manner. */ void -pagebuf_readahead( +xfs_buf_readahead( xfs_buftarg_t *target, - loff_t ioff, + xfs_off_t ioff, size_t isize, - page_buf_flags_t flags) + xfs_buf_flags_t flags) { struct backing_dev_info *bdi; - bdi = target->pbr_mapping->backing_dev_info; + bdi = target->bt_mapping->backing_dev_info; if (bdi_read_congested(bdi)) return; - flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD); + flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); xfs_buf_read_flags(target, ioff, isize, flags); } xfs_buf_t * -pagebuf_get_empty( +xfs_buf_get_empty( size_t len, xfs_buftarg_t *target) { - xfs_buf_t *pb; + xfs_buf_t *bp; - pb = pagebuf_allocate(0); - if (pb) - _pagebuf_initialize(pb, target, 0, len, 0); - return pb; + bp = xfs_buf_allocate(0); + if (bp) + _xfs_buf_initialize(bp, target, 0, len, 0); + return bp; } static inline struct page * @@ -704,8 +701,8 @@ mem_to_page( } int -pagebuf_associate_memory( - xfs_buf_t *pb, +xfs_buf_associate_memory( + xfs_buf_t *bp, void *mem, size_t len) { @@ -722,40 +719,40 @@ pagebuf_associate_memory( page_count++; /* Free any previous set of page pointers */ - if (pb->pb_pages) - _pagebuf_free_pages(pb); + if (bp->b_pages) + _xfs_buf_free_pages(bp); - pb->pb_pages = NULL; - pb->pb_addr = mem; + bp->b_pages = NULL; + bp->b_addr = mem; - rval = _pagebuf_get_pages(pb, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count, 0); if (rval) return rval; - pb->pb_offset = offset; + bp->b_offset = offset; ptr = (size_t) mem & PAGE_CACHE_MASK; end = PAGE_CACHE_ALIGN((size_t) mem + len); end_cur = end; /* set up first page */ - pb->pb_pages[0] = mem_to_page(mem); + bp->b_pages[0] = mem_to_page(mem); 
ptr += PAGE_CACHE_SIZE; - pb->pb_page_count = ++i; + bp->b_page_count = ++i; while (ptr < end) { - pb->pb_pages[i] = mem_to_page((void *)ptr); - pb->pb_page_count = ++i; + bp->b_pages[i] = mem_to_page((void *)ptr); + bp->b_page_count = ++i; ptr += PAGE_CACHE_SIZE; } - pb->pb_locked = 0; + bp->b_locked = 0; - pb->pb_count_desired = pb->pb_buffer_length = len; - pb->pb_flags |= PBF_MAPPED; + bp->b_count_desired = bp->b_buffer_length = len; + bp->b_flags |= XBF_MAPPED; return 0; } xfs_buf_t * -pagebuf_get_no_daddr( +xfs_buf_get_noaddr( size_t len, xfs_buftarg_t *target) { @@ -764,10 +761,10 @@ pagebuf_get_no_daddr( void *data; int error; - bp = pagebuf_allocate(0); + bp = xfs_buf_allocate(0); if (unlikely(bp == NULL)) goto fail; - _pagebuf_initialize(bp, target, 0, len, 0); + _xfs_buf_initialize(bp, target, 0, len, 0); try_again: data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL); @@ -776,78 +773,73 @@ pagebuf_get_no_daddr( /* check whether alignment matches.. */ if ((__psunsigned_t)data != - ((__psunsigned_t)data & ~target->pbr_smask)) { + ((__psunsigned_t)data & ~target->bt_smask)) { /* .. else double the size and try again */ kmem_free(data, malloc_len); malloc_len <<= 1; goto try_again; } - error = pagebuf_associate_memory(bp, data, len); + error = xfs_buf_associate_memory(bp, data, len); if (error) goto fail_free_mem; - bp->pb_flags |= _PBF_KMEM_ALLOC; + bp->b_flags |= _XBF_KMEM_ALLOC; - pagebuf_unlock(bp); + xfs_buf_unlock(bp); - PB_TRACE(bp, "no_daddr", data); + XB_TRACE(bp, "no_daddr", data); return bp; fail_free_mem: kmem_free(data, malloc_len); fail_free_buf: - pagebuf_free(bp); + xfs_buf_free(bp); fail: return NULL; } /* - * pagebuf_hold - * * Increment reference count on buffer, to hold the buffer concurrently * with another thread which may release (free) the buffer asynchronously. - * * Must hold the buffer already to call this function. 
*/ void -pagebuf_hold( - xfs_buf_t *pb) +xfs_buf_hold( + xfs_buf_t *bp) { - atomic_inc(&pb->pb_hold); - PB_TRACE(pb, "hold", 0); + atomic_inc(&bp->b_hold); + XB_TRACE(bp, "hold", 0); } /* - * pagebuf_rele - * - * pagebuf_rele releases a hold on the specified buffer. If the - * the hold count is 1, pagebuf_rele calls pagebuf_free. + * Releases a hold on the specified buffer. If the + * the hold count is 1, calls xfs_buf_free. */ void -pagebuf_rele( - xfs_buf_t *pb) +xfs_buf_rele( + xfs_buf_t *bp) { - xfs_bufhash_t *hash = pb->pb_hash; + xfs_bufhash_t *hash = bp->b_hash; - PB_TRACE(pb, "rele", pb->pb_relse); + XB_TRACE(bp, "rele", bp->b_relse); - if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) { - if (pb->pb_relse) { - atomic_inc(&pb->pb_hold); + if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { + if (bp->b_relse) { + atomic_inc(&bp->b_hold); spin_unlock(&hash->bh_lock); - (*(pb->pb_relse)) (pb); - } else if (pb->pb_flags & PBF_FS_MANAGED) { + (*(bp->b_relse)) (bp); + } else if (bp->b_flags & XBF_FS_MANAGED) { spin_unlock(&hash->bh_lock); } else { - ASSERT(!(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q))); - list_del_init(&pb->pb_hash_list); + ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); + list_del_init(&bp->b_hash_list); spin_unlock(&hash->bh_lock); - pagebuf_free(pb); + xfs_buf_free(bp); } } else { /* * Catch reference count leaks */ - ASSERT(atomic_read(&pb->pb_hold) >= 0); + ASSERT(atomic_read(&bp->b_hold) >= 0); } } @@ -863,168 +855,122 @@ pagebuf_rele( */ /* - * pagebuf_cond_lock - * - * pagebuf_cond_lock locks a buffer object, if it is not already locked. - * Note that this in no way - * locks the underlying pages, so it is only useful for synchronizing - * concurrent use of page buffer objects, not for synchronizing independent - * access to the underlying pages. + * Locks a buffer object, if it is not already locked. 
+ * Note that this in no way locks the underlying pages, so it is only + * useful for synchronizing concurrent use of buffer objects, not for + * synchronizing independent access to the underlying pages. */ int -pagebuf_cond_lock( /* lock buffer, if not locked */ - /* returns -EBUSY if locked) */ - xfs_buf_t *pb) +xfs_buf_cond_lock( + xfs_buf_t *bp) { int locked; - locked = down_trylock(&pb->pb_sema) == 0; + locked = down_trylock(&bp->b_sema) == 0; if (locked) { - PB_SET_OWNER(pb); + XB_SET_OWNER(bp); } - PB_TRACE(pb, "cond_lock", (long)locked); - return(locked ? 0 : -EBUSY); + XB_TRACE(bp, "cond_lock", (long)locked); + return locked ? 0 : -EBUSY; } #if defined(DEBUG) || defined(XFS_BLI_TRACE) -/* - * pagebuf_lock_value - * - * Return lock value for a pagebuf - */ int -pagebuf_lock_value( - xfs_buf_t *pb) +xfs_buf_lock_value( + xfs_buf_t *bp) { - return(atomic_read(&pb->pb_sema.count)); + return atomic_read(&bp->b_sema.count); } #endif /* - * pagebuf_lock - * - * pagebuf_lock locks a buffer object. Note that this in no way - * locks the underlying pages, so it is only useful for synchronizing - * concurrent use of page buffer objects, not for synchronizing independent - * access to the underlying pages. + * Locks a buffer object. + * Note that this in no way locks the underlying pages, so it is only + * useful for synchronizing concurrent use of buffer objects, not for + * synchronizing independent access to the underlying pages. 
*/ -int -pagebuf_lock( - xfs_buf_t *pb) +void +xfs_buf_lock( + xfs_buf_t *bp) { - PB_TRACE(pb, "lock", 0); - if (atomic_read(&pb->pb_io_remaining)) - blk_run_address_space(pb->pb_target->pbr_mapping); - down(&pb->pb_sema); - PB_SET_OWNER(pb); - PB_TRACE(pb, "locked", 0); - return 0; + XB_TRACE(bp, "lock", 0); + if (atomic_read(&bp->b_io_remaining)) + blk_run_address_space(bp->b_target->bt_mapping); + down(&bp->b_sema); + XB_SET_OWNER(bp); + XB_TRACE(bp, "locked", 0); } /* - * pagebuf_unlock - * - * pagebuf_unlock releases the lock on the buffer object created by - * pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying pages - * created by pagebuf_pin). - * + * Releases the lock on the buffer object. * If the buffer is marked delwri but is not queued, do so before we - * unlock the buffer as we need to set flags correctly. We also need to + * unlock the buffer as we need to set flags correctly. We also need to * take a reference for the delwri queue because the unlocker is going to * drop their's and they don't know we just queued it. */ void -pagebuf_unlock( /* unlock buffer */ - xfs_buf_t *pb) /* buffer to unlock */ +xfs_buf_unlock( + xfs_buf_t *bp) { - if ((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == PBF_DELWRI) { - atomic_inc(&pb->pb_hold); - pb->pb_flags |= PBF_ASYNC; - pagebuf_delwri_queue(pb, 0); + if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { + atomic_inc(&bp->b_hold); + bp->b_flags |= XBF_ASYNC; + xfs_buf_delwri_queue(bp, 0); } - PB_CLEAR_OWNER(pb); - up(&pb->pb_sema); - PB_TRACE(pb, "unlock", 0); + XB_CLEAR_OWNER(bp); + up(&bp->b_sema); + XB_TRACE(bp, "unlock", 0); } /* * Pinning Buffer Storage in Memory - */ - -/* - * pagebuf_pin - * - * pagebuf_pin locks all of the memory represented by a buffer in - * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for - * the same or different buffers affecting a given page, will - * properly count the number of outstanding "pin" requests. 
The - * buffer may be released after the pagebuf_pin and a different - * buffer used when calling pagebuf_unpin, if desired. - * pagebuf_pin should be used by the file system when it wants be - * assured that no attempt will be made to force the affected - * memory to disk. It does not assure that a given logical page - * will not be moved to a different physical page. + * Ensure that no attempt to force a buffer to disk will succeed. */ void -pagebuf_pin( - xfs_buf_t *pb) +xfs_buf_pin( + xfs_buf_t *bp) { - atomic_inc(&pb->pb_pin_count); - PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter); + atomic_inc(&bp->b_pin_count); + XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter); } -/* - * pagebuf_unpin - * - * pagebuf_unpin reverses the locking of memory performed by - * pagebuf_pin. Note that both functions affected the logical - * pages associated with the buffer, not the buffer itself. - */ void -pagebuf_unpin( - xfs_buf_t *pb) +xfs_buf_unpin( + xfs_buf_t *bp) { - if (atomic_dec_and_test(&pb->pb_pin_count)) { - wake_up_all(&pb->pb_waiters); - } - PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter); + if (atomic_dec_and_test(&bp->b_pin_count)) + wake_up_all(&bp->b_waiters); + XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter); } int -pagebuf_ispin( - xfs_buf_t *pb) +xfs_buf_ispin( + xfs_buf_t *bp) { - return atomic_read(&pb->pb_pin_count); + return atomic_read(&bp->b_pin_count); } -/* - * pagebuf_wait_unpin - * - * pagebuf_wait_unpin waits until all of the memory associated - * with the buffer is not longer locked in memory. It returns - * immediately if none of the affected pages are locked. 
- */ -static inline void -_pagebuf_wait_unpin( - xfs_buf_t *pb) +STATIC void +xfs_buf_wait_unpin( + xfs_buf_t *bp) { DECLARE_WAITQUEUE (wait, current); - if (atomic_read(&pb->pb_pin_count) == 0) + if (atomic_read(&bp->b_pin_count) == 0) return; - add_wait_queue(&pb->pb_waiters, &wait); + add_wait_queue(&bp->b_waiters, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&pb->pb_pin_count) == 0) + if (atomic_read(&bp->b_pin_count) == 0) break; - if (atomic_read(&pb->pb_io_remaining)) - blk_run_address_space(pb->pb_target->pbr_mapping); + if (atomic_read(&bp->b_io_remaining)) + blk_run_address_space(bp->b_target->bt_mapping); schedule(); } - remove_wait_queue(&pb->pb_waiters, &wait); + remove_wait_queue(&bp->b_waiters, &wait); set_current_state(TASK_RUNNING); } @@ -1032,241 +978,216 @@ _pagebuf_wait_unpin( * Buffer Utility Routines */ -/* - * pagebuf_iodone - * - * pagebuf_iodone marks a buffer for which I/O is in progress - * done with respect to that I/O. The pb_iodone routine, if - * present, will be called as a side-effect. 
- */ STATIC void -pagebuf_iodone_work( +xfs_buf_iodone_work( void *v) { xfs_buf_t *bp = (xfs_buf_t *)v; - if (bp->pb_iodone) - (*(bp->pb_iodone))(bp); - else if (bp->pb_flags & PBF_ASYNC) + if (bp->b_iodone) + (*(bp->b_iodone))(bp); + else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); } void -pagebuf_iodone( - xfs_buf_t *pb, +xfs_buf_ioend( + xfs_buf_t *bp, int schedule) { - pb->pb_flags &= ~(PBF_READ | PBF_WRITE); - if (pb->pb_error == 0) - pb->pb_flags |= PBF_DONE; + bp->b_flags &= ~(XBF_READ | XBF_WRITE); + if (bp->b_error == 0) + bp->b_flags |= XBF_DONE; - PB_TRACE(pb, "iodone", pb->pb_iodone); + XB_TRACE(bp, "iodone", bp->b_iodone); - if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) { + if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { if (schedule) { - INIT_WORK(&pb->pb_iodone_work, pagebuf_iodone_work, pb); - queue_work(xfslogd_workqueue, &pb->pb_iodone_work); + INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work, bp); + queue_work(xfslogd_workqueue, &bp->b_iodone_work); } else { - pagebuf_iodone_work(pb); + xfs_buf_iodone_work(bp); } } else { - up(&pb->pb_iodonesema); + up(&bp->b_iodonesema); } } -/* - * pagebuf_ioerror - * - * pagebuf_ioerror sets the error code for a buffer. - */ void -pagebuf_ioerror( /* mark/clear buffer error flag */ - xfs_buf_t *pb, /* buffer to mark */ - int error) /* error to store (0 if none) */ +xfs_buf_ioerror( + xfs_buf_t *bp, + int error) { ASSERT(error >= 0 && error <= 0xffff); - pb->pb_error = (unsigned short)error; - PB_TRACE(pb, "ioerror", (unsigned long)error); + bp->b_error = (unsigned short)error; + XB_TRACE(bp, "ioerror", (unsigned long)error); } /* - * pagebuf_iostart - * - * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied. - * If necessary, it will arrange for any disk space allocation required, - * and it will break up the request if the block mappings require it. - * The pb_iodone routine in the buffer supplied will only be called + * Initiate I/O on a buffer, based on the flags supplied. 
+ * The b_iodone routine in the buffer supplied will only be called * when all of the subsidiary I/O requests, if any, have been completed. - * pagebuf_iostart calls the pagebuf_ioinitiate routine or - * pagebuf_iorequest, if the former routine is not defined, to start - * the I/O on a given low-level request. */ int -pagebuf_iostart( /* start I/O on a buffer */ - xfs_buf_t *pb, /* buffer to start */ - page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */ - /* PBF_WRITE, PBF_DELWRI, */ - /* PBF_DONT_BLOCK */ +xfs_buf_iostart( + xfs_buf_t *bp, + xfs_buf_flags_t flags) { int status = 0; - PB_TRACE(pb, "iostart", (unsigned long)flags); + XB_TRACE(bp, "iostart", (unsigned long)flags); - if (flags & PBF_DELWRI) { - pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC); - pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC); - pagebuf_delwri_queue(pb, 1); + if (flags & XBF_DELWRI) { + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); + bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); + xfs_buf_delwri_queue(bp, 1); return status; } - pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \ - PBF_READ_AHEAD | _PBF_RUN_QUEUES); - pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \ - PBF_READ_AHEAD | _PBF_RUN_QUEUES); + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ + XBF_READ_AHEAD | _XBF_RUN_QUEUES); + bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \ + XBF_READ_AHEAD | _XBF_RUN_QUEUES); - BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL); + BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL); /* For writes allow an alternate strategy routine to precede * the actual I/O request (which may not be issued at all in * a shutdown situation, for example). */ - status = (flags & PBF_WRITE) ? - pagebuf_iostrategy(pb) : pagebuf_iorequest(pb); + status = (flags & XBF_WRITE) ? + xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp); /* Wait for I/O if we are not an async request. 
* Note: async I/O request completion will release the buffer, * and that can already be done by this point. So using the * buffer pointer from here on, after async I/O, is invalid. */ - if (!status && !(flags & PBF_ASYNC)) - status = pagebuf_iowait(pb); + if (!status && !(flags & XBF_ASYNC)) + status = xfs_buf_iowait(bp); return status; } -/* - * Helper routine for pagebuf_iorequest - */ - STATIC __inline__ int -_pagebuf_iolocked( - xfs_buf_t *pb) +_xfs_buf_iolocked( + xfs_buf_t *bp) { - ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE)); - if (pb->pb_flags & PBF_READ) - return pb->pb_locked; + ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE)); + if (bp->b_flags & XBF_READ) + return bp->b_locked; return 0; } STATIC __inline__ void -_pagebuf_iodone( - xfs_buf_t *pb, +_xfs_buf_ioend( + xfs_buf_t *bp, int schedule) { - if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) { - pb->pb_locked = 0; - pagebuf_iodone(pb, schedule); + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { + bp->b_locked = 0; + xfs_buf_ioend(bp, schedule); } } STATIC int -bio_end_io_pagebuf( +xfs_buf_bio_end_io( struct bio *bio, unsigned int bytes_done, int error) { - xfs_buf_t *pb = (xfs_buf_t *)bio->bi_private; - unsigned int blocksize = pb->pb_target->pbr_bsize; + xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + unsigned int blocksize = bp->b_target->bt_bsize; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; if (bio->bi_size) return 1; if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - pb->pb_error = EIO; + bp->b_error = EIO; do { struct page *page = bvec->bv_page; - if (unlikely(pb->pb_error)) { - if (pb->pb_flags & PBF_READ) + if (unlikely(bp->b_error)) { + if (bp->b_flags & XBF_READ) ClearPageUptodate(page); SetPageError(page); - } else if (blocksize == PAGE_CACHE_SIZE) { + } else if (blocksize >= PAGE_CACHE_SIZE) { SetPageUptodate(page); } else if (!PagePrivate(page) && - (pb->pb_flags & _PBF_PAGE_CACHE)) { + (bp->b_flags & _XBF_PAGE_CACHE)) { set_page_region(page, bvec->bv_offset, bvec->bv_len); 
} if (--bvec >= bio->bi_io_vec) prefetchw(&bvec->bv_page->flags); - if (_pagebuf_iolocked(pb)) { + if (_xfs_buf_iolocked(bp)) { unlock_page(page); } } while (bvec >= bio->bi_io_vec); - _pagebuf_iodone(pb, 1); + _xfs_buf_ioend(bp, 1); bio_put(bio); return 0; } STATIC void -_pagebuf_ioapply( - xfs_buf_t *pb) +_xfs_buf_ioapply( + xfs_buf_t *bp) { int i, rw, map_i, total_nr_pages, nr_pages; struct bio *bio; - int offset = pb->pb_offset; - int size = pb->pb_count_desired; - sector_t sector = pb->pb_bn; - unsigned int blocksize = pb->pb_target->pbr_bsize; - int locking = _pagebuf_iolocked(pb); + int offset = bp->b_offset; + int size = bp->b_count_desired; + sector_t sector = bp->b_bn; + unsigned int blocksize = bp->b_target->bt_bsize; + int locking = _xfs_buf_iolocked(bp); - total_nr_pages = pb->pb_page_count; + total_nr_pages = bp->b_page_count; map_i = 0; - if (pb->pb_flags & _PBF_RUN_QUEUES) { - pb->pb_flags &= ~_PBF_RUN_QUEUES; - rw = (pb->pb_flags & PBF_READ) ? READ_SYNC : WRITE_SYNC; + if (bp->b_flags & _XBF_RUN_QUEUES) { + bp->b_flags &= ~_XBF_RUN_QUEUES; + rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC; } else { - rw = (pb->pb_flags & PBF_READ) ? READ : WRITE; + rw = (bp->b_flags & XBF_READ) ? READ : WRITE; } - if (pb->pb_flags & PBF_ORDERED) { - ASSERT(!(pb->pb_flags & PBF_READ)); + if (bp->b_flags & XBF_ORDERED) { + ASSERT(!(bp->b_flags & XBF_READ)); rw = WRITE_BARRIER; } - /* Special code path for reading a sub page size pagebuf in -- + /* Special code path for reading a sub page size buffer in -- * we populate up the whole page, and hence the other metadata * in the same page. This optimization is only valid when the - * filesystem block size and the page size are equal. + * filesystem block size is not smaller than the page size. 
*/ - if ((pb->pb_buffer_length < PAGE_CACHE_SIZE) && - (pb->pb_flags & PBF_READ) && locking && - (blocksize == PAGE_CACHE_SIZE)) { + if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && + (bp->b_flags & XBF_READ) && locking && + (blocksize >= PAGE_CACHE_SIZE)) { bio = bio_alloc(GFP_NOIO, 1); - bio->bi_bdev = pb->pb_target->pbr_bdev; + bio->bi_bdev = bp->b_target->bt_bdev; bio->bi_sector = sector - (offset >> BBSHIFT); - bio->bi_end_io = bio_end_io_pagebuf; - bio->bi_private = pb; + bio->bi_end_io = xfs_buf_bio_end_io; + bio->bi_private = bp; - bio_add_page(bio, pb->pb_pages[0], PAGE_CACHE_SIZE, 0); + bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0); size = 0; - atomic_inc(&pb->pb_io_remaining); + atomic_inc(&bp->b_io_remaining); goto submit_io; } /* Lock down the pages which we need to for the request */ - if (locking && (pb->pb_flags & PBF_WRITE) && (pb->pb_locked == 0)) { + if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) { for (i = 0; size; i++) { int nbytes = PAGE_CACHE_SIZE - offset; - struct page *page = pb->pb_pages[i]; + struct page *page = bp->b_pages[i]; if (nbytes > size) nbytes = size; @@ -1276,30 +1197,30 @@ _pagebuf_ioapply( size -= nbytes; offset = 0; } - offset = pb->pb_offset; - size = pb->pb_count_desired; + offset = bp->b_offset; + size = bp->b_count_desired; } next_chunk: - atomic_inc(&pb->pb_io_remaining); + atomic_inc(&bp->b_io_remaining); nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); if (nr_pages > total_nr_pages) nr_pages = total_nr_pages; bio = bio_alloc(GFP_NOIO, nr_pages); - bio->bi_bdev = pb->pb_target->pbr_bdev; + bio->bi_bdev = bp->b_target->bt_bdev; bio->bi_sector = sector; - bio->bi_end_io = bio_end_io_pagebuf; - bio->bi_private = pb; + bio->bi_end_io = xfs_buf_bio_end_io; + bio->bi_private = bp; for (; size && nr_pages; nr_pages--, map_i++) { - int nbytes = PAGE_CACHE_SIZE - offset; + int rbytes, nbytes = PAGE_CACHE_SIZE - offset; if (nbytes > size) nbytes = size; - if (bio_add_page(bio, pb->pb_pages[map_i], - 
nbytes, offset) < nbytes) + rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); + if (rbytes < nbytes) break; offset = 0; @@ -1315,107 +1236,102 @@ submit_io: goto next_chunk; } else { bio_put(bio); - pagebuf_ioerror(pb, EIO); + xfs_buf_ioerror(bp, EIO); } } -/* - * pagebuf_iorequest -- the core I/O request routine. - */ int -pagebuf_iorequest( /* start real I/O */ - xfs_buf_t *pb) /* buffer to convey to device */ +xfs_buf_iorequest( + xfs_buf_t *bp) { - PB_TRACE(pb, "iorequest", 0); + XB_TRACE(bp, "iorequest", 0); - if (pb->pb_flags & PBF_DELWRI) { - pagebuf_delwri_queue(pb, 1); + if (bp->b_flags & XBF_DELWRI) { + xfs_buf_delwri_queue(bp, 1); return 0; } - if (pb->pb_flags & PBF_WRITE) { - _pagebuf_wait_unpin(pb); + if (bp->b_flags & XBF_WRITE) { + xfs_buf_wait_unpin(bp); } - pagebuf_hold(pb); + xfs_buf_hold(bp); /* Set the count to 1 initially, this will stop an I/O * completion callout which happens before we have started - * all the I/O from calling pagebuf_iodone too early. + * all the I/O from calling xfs_buf_ioend too early. */ - atomic_set(&pb->pb_io_remaining, 1); - _pagebuf_ioapply(pb); - _pagebuf_iodone(pb, 0); + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + _xfs_buf_ioend(bp, 0); - pagebuf_rele(pb); + xfs_buf_rele(bp); return 0; } /* - * pagebuf_iowait - * - * pagebuf_iowait waits for I/O to complete on the buffer supplied. - * It returns immediately if no I/O is pending. In any case, it returns - * the error code, if any, or 0 if there is no error. + * Waits for I/O to complete on the buffer supplied. + * It returns immediately if no I/O is pending. + * It returns the I/O error code, if any, or 0 if there was no error. 
*/ int -pagebuf_iowait( - xfs_buf_t *pb) +xfs_buf_iowait( + xfs_buf_t *bp) { - PB_TRACE(pb, "iowait", 0); - if (atomic_read(&pb->pb_io_remaining)) - blk_run_address_space(pb->pb_target->pbr_mapping); - down(&pb->pb_iodonesema); - PB_TRACE(pb, "iowaited", (long)pb->pb_error); - return pb->pb_error; + XB_TRACE(bp, "iowait", 0); + if (atomic_read(&bp->b_io_remaining)) + blk_run_address_space(bp->b_target->bt_mapping); + down(&bp->b_iodonesema); + XB_TRACE(bp, "iowaited", (long)bp->b_error); + return bp->b_error; } -caddr_t -pagebuf_offset( - xfs_buf_t *pb, +xfs_caddr_t +xfs_buf_offset( + xfs_buf_t *bp, size_t offset) { struct page *page; - offset += pb->pb_offset; + if (bp->b_flags & XBF_MAPPED) + return XFS_BUF_PTR(bp) + offset; - page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT]; - return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1)); + offset += bp->b_offset; + page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; + return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); } /* - * pagebuf_iomove - * * Move data into or out of a buffer. 
*/ void -pagebuf_iomove( - xfs_buf_t *pb, /* buffer to process */ +xfs_buf_iomove( + xfs_buf_t *bp, /* buffer to process */ size_t boff, /* starting buffer offset */ size_t bsize, /* length to copy */ caddr_t data, /* data address */ - page_buf_rw_t mode) /* read/write flag */ + xfs_buf_rw_t mode) /* read/write/zero flag */ { size_t bend, cpoff, csize; struct page *page; bend = boff + bsize; while (boff < bend) { - page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)]; - cpoff = page_buf_poff(boff + pb->pb_offset); + page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; + cpoff = xfs_buf_poff(boff + bp->b_offset); csize = min_t(size_t, - PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff); + PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); switch (mode) { - case PBRW_ZERO: + case XBRW_ZERO: memset(page_address(page) + cpoff, 0, csize); break; - case PBRW_READ: + case XBRW_READ: memcpy(data, page_address(page) + cpoff, csize); break; - case PBRW_WRITE: + case XBRW_WRITE: memcpy(page_address(page) + cpoff, data, csize); } @@ -1425,12 +1341,12 @@ pagebuf_iomove( } /* - * Handling of buftargs. + * Handling of buffer targets (buftargs). */ /* - * Wait for any bufs with callbacks that have been submitted but - * have not yet returned... walk the hash list for the target. + * Wait for any bufs with callbacks that have been submitted but + * have not yet returned... walk the hash list for the target. 
*/ void xfs_wait_buftarg( @@ -1444,15 +1360,15 @@ xfs_wait_buftarg( hash = &btp->bt_hash[i]; again: spin_lock(&hash->bh_lock); - list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) { - ASSERT(btp == bp->pb_target); - if (!(bp->pb_flags & PBF_FS_MANAGED)) { + list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { + ASSERT(btp == bp->b_target); + if (!(bp->b_flags & XBF_FS_MANAGED)) { spin_unlock(&hash->bh_lock); /* * Catch superblock reference count leaks * immediately */ - BUG_ON(bp->pb_bn == 0); + BUG_ON(bp->b_bn == 0); delay(100); goto again; } @@ -1462,9 +1378,9 @@ again: } /* - * Allocate buffer hash table for a given target. - * For devices containing metadata (i.e. not the log/realtime devices) - * we need to allocate a much larger hash table. + * Allocate buffer hash table for a given target. + * For devices containing metadata (i.e. not the log/realtime devices) + * we need to allocate a much larger hash table. */ STATIC void xfs_alloc_bufhash( @@ -1487,11 +1403,34 @@ STATIC void xfs_free_bufhash( xfs_buftarg_t *btp) { - kmem_free(btp->bt_hash, - (1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); + kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t)); btp->bt_hash = NULL; } +/* + * buftarg list for delwrite queue processing + */ +STATIC LIST_HEAD(xfs_buftarg_list); +STATIC DEFINE_SPINLOCK(xfs_buftarg_lock); + +STATIC void +xfs_register_buftarg( + xfs_buftarg_t *btp) +{ + spin_lock(&xfs_buftarg_lock); + list_add(&btp->bt_list, &xfs_buftarg_list); + spin_unlock(&xfs_buftarg_lock); +} + +STATIC void +xfs_unregister_buftarg( + xfs_buftarg_t *btp) +{ + spin_lock(&xfs_buftarg_lock); + list_del(&btp->bt_list); + spin_unlock(&xfs_buftarg_lock); +} + void xfs_free_buftarg( xfs_buftarg_t *btp, @@ -1499,9 +1438,16 @@ xfs_free_buftarg( { xfs_flush_buftarg(btp, 1); if (external) - xfs_blkdev_put(btp->pbr_bdev); + xfs_blkdev_put(btp->bt_bdev); xfs_free_bufhash(btp); - iput(btp->pbr_mapping->host); + iput(btp->bt_mapping->host); + + /* 
Unregister the buftarg first so that we don't get a + * wakeup finding a non-existent task + */ + xfs_unregister_buftarg(btp); + kthread_stop(btp->bt_task); + kmem_free(btp, sizeof(*btp)); } @@ -1512,11 +1458,11 @@ xfs_setsize_buftarg_flags( unsigned int sectorsize, int verbose) { - btp->pbr_bsize = blocksize; - btp->pbr_sshift = ffs(sectorsize) - 1; - btp->pbr_smask = sectorsize - 1; + btp->bt_bsize = blocksize; + btp->bt_sshift = ffs(sectorsize) - 1; + btp->bt_smask = sectorsize - 1; - if (set_blocksize(btp->pbr_bdev, sectorsize)) { + if (set_blocksize(btp->bt_bdev, sectorsize)) { printk(KERN_WARNING "XFS: Cannot set_blocksize to %u on device %s\n", sectorsize, XFS_BUFTARG_NAME(btp)); @@ -1536,10 +1482,10 @@ xfs_setsize_buftarg_flags( } /* -* When allocating the initial buffer target we have not yet -* read in the superblock, so don't know what sized sectors -* are being used is at this early stage. Play safe. -*/ + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used is at this early stage. Play safe. 
+ */ STATIC int xfs_setsize_buftarg_early( xfs_buftarg_t *btp, @@ -1587,10 +1533,30 @@ xfs_mapping_buftarg( mapping->a_ops = &mapping_aops; mapping->backing_dev_info = bdi; mapping_set_gfp_mask(mapping, GFP_NOFS); - btp->pbr_mapping = mapping; + btp->bt_mapping = mapping; return 0; } +STATIC int +xfs_alloc_delwrite_queue( + xfs_buftarg_t *btp) +{ + int error = 0; + + INIT_LIST_HEAD(&btp->bt_list); + INIT_LIST_HEAD(&btp->bt_delwrite_queue); + spinlock_init(&btp->bt_delwrite_lock, "delwri_lock"); + btp->bt_flags = 0; + btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); + if (IS_ERR(btp->bt_task)) { + error = PTR_ERR(btp->bt_task); + goto out_error; + } + xfs_register_buftarg(btp); +out_error: + return error; +} + xfs_buftarg_t * xfs_alloc_buftarg( struct block_device *bdev, @@ -1600,12 +1566,14 @@ xfs_alloc_buftarg( btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); - btp->pbr_dev = bdev->bd_dev; - btp->pbr_bdev = bdev; + btp->bt_dev = bdev->bd_dev; + btp->bt_bdev = bdev; if (xfs_setsize_buftarg_early(btp, bdev)) goto error; if (xfs_mapping_buftarg(btp, bdev)) goto error; + if (xfs_alloc_delwrite_queue(btp)) + goto error; xfs_alloc_bufhash(btp, external); return btp; @@ -1616,83 +1584,81 @@ error: /* - * Pagebuf delayed write buffer handling + * Delayed write buffer handling */ - -STATIC LIST_HEAD(pbd_delwrite_queue); -STATIC DEFINE_SPINLOCK(pbd_delwrite_lock); - STATIC void -pagebuf_delwri_queue( - xfs_buf_t *pb, +xfs_buf_delwri_queue( + xfs_buf_t *bp, int unlock) { - PB_TRACE(pb, "delwri_q", (long)unlock); - ASSERT((pb->pb_flags & (PBF_DELWRI|PBF_ASYNC)) == - (PBF_DELWRI|PBF_ASYNC)); + struct list_head *dwq = &bp->b_target->bt_delwrite_queue; + spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; + + XB_TRACE(bp, "delwri_q", (long)unlock); + ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); - spin_lock(&pbd_delwrite_lock); + spin_lock(dwlk); /* If already in the queue, dequeue and place at tail */ - if (!list_empty(&pb->pb_list)) { - 
ASSERT(pb->pb_flags & _PBF_DELWRI_Q); - if (unlock) { - atomic_dec(&pb->pb_hold); - } - list_del(&pb->pb_list); + if (!list_empty(&bp->b_list)) { + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + if (unlock) + atomic_dec(&bp->b_hold); + list_del(&bp->b_list); } - pb->pb_flags |= _PBF_DELWRI_Q; - list_add_tail(&pb->pb_list, &pbd_delwrite_queue); - pb->pb_queuetime = jiffies; - spin_unlock(&pbd_delwrite_lock); + bp->b_flags |= _XBF_DELWRI_Q; + list_add_tail(&bp->b_list, dwq); + bp->b_queuetime = jiffies; + spin_unlock(dwlk); if (unlock) - pagebuf_unlock(pb); + xfs_buf_unlock(bp); } void -pagebuf_delwri_dequeue( - xfs_buf_t *pb) +xfs_buf_delwri_dequeue( + xfs_buf_t *bp) { + spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; int dequeued = 0; - spin_lock(&pbd_delwrite_lock); - if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) { - ASSERT(pb->pb_flags & _PBF_DELWRI_Q); - list_del_init(&pb->pb_list); + spin_lock(dwlk); + if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { + ASSERT(bp->b_flags & _XBF_DELWRI_Q); + list_del_init(&bp->b_list); dequeued = 1; } - pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); - spin_unlock(&pbd_delwrite_lock); + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); + spin_unlock(dwlk); if (dequeued) - pagebuf_rele(pb); + xfs_buf_rele(bp); - PB_TRACE(pb, "delwri_dq", (long)dequeued); + XB_TRACE(bp, "delwri_dq", (long)dequeued); } STATIC void -pagebuf_runall_queues( +xfs_buf_runall_queues( struct workqueue_struct *queue) { flush_workqueue(queue); } -/* Defines for pagebuf daemon */ -STATIC struct task_struct *xfsbufd_task; -STATIC int xfsbufd_force_flush; -STATIC int xfsbufd_force_sleep; - STATIC int xfsbufd_wakeup( int priority, gfp_t mask) { - if (xfsbufd_force_sleep) - return 0; - xfsbufd_force_flush = 1; - barrier(); - wake_up_process(xfsbufd_task); + xfs_buftarg_t *btp; + + spin_lock(&xfs_buftarg_lock); + list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { + if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) + continue; + 
set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); + wake_up_process(btp->bt_task); + } + spin_unlock(&xfs_buftarg_lock); return 0; } @@ -1702,67 +1668,70 @@ xfsbufd( { struct list_head tmp; unsigned long age; - xfs_buftarg_t *target; - xfs_buf_t *pb, *n; + xfs_buftarg_t *target = (xfs_buftarg_t *)data; + xfs_buf_t *bp, *n; + struct list_head *dwq = &target->bt_delwrite_queue; + spinlock_t *dwlk = &target->bt_delwrite_lock; current->flags |= PF_MEMALLOC; INIT_LIST_HEAD(&tmp); do { if (unlikely(freezing(current))) { - xfsbufd_force_sleep = 1; + set_bit(XBT_FORCE_SLEEP, &target->bt_flags); refrigerator(); } else { - xfsbufd_force_sleep = 0; + clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); } schedule_timeout_interruptible( xfs_buf_timer_centisecs * msecs_to_jiffies(10)); age = xfs_buf_age_centisecs * msecs_to_jiffies(10); - spin_lock(&pbd_delwrite_lock); - list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) { - PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb)); - ASSERT(pb->pb_flags & PBF_DELWRI); - - if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) { - if (!xfsbufd_force_flush && + spin_lock(dwlk); + list_for_each_entry_safe(bp, n, dwq, b_list) { + XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); + ASSERT(bp->b_flags & XBF_DELWRI); + + if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { + if (!test_bit(XBT_FORCE_FLUSH, + &target->bt_flags) && time_before(jiffies, - pb->pb_queuetime + age)) { - pagebuf_unlock(pb); + bp->b_queuetime + age)) { + xfs_buf_unlock(bp); break; } - pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q); - pb->pb_flags |= PBF_WRITE; - list_move(&pb->pb_list, &tmp); + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); + bp->b_flags |= XBF_WRITE; + list_move(&bp->b_list, &tmp); } } - spin_unlock(&pbd_delwrite_lock); + spin_unlock(dwlk); while (!list_empty(&tmp)) { - pb = list_entry(tmp.next, xfs_buf_t, pb_list); - target = pb->pb_target; + bp = list_entry(tmp.next, xfs_buf_t, b_list); + ASSERT(target == bp->b_target); - list_del_init(&pb->pb_list); - 
pagebuf_iostrategy(pb); + list_del_init(&bp->b_list); + xfs_buf_iostrategy(bp); - blk_run_address_space(target->pbr_mapping); + blk_run_address_space(target->bt_mapping); } if (as_list_len > 0) purge_addresses(); - xfsbufd_force_flush = 0; + clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); } while (!kthread_should_stop()); return 0; } /* - * Go through all incore buffers, and release buffers if they belong to - * the given device. This is used in filesystem error handling to - * preserve the consistency of its metadata. + * Go through all incore buffers, and release buffers if they belong to + * the given device. This is used in filesystem error handling to + * preserve the consistency of its metadata. */ int xfs_flush_buftarg( @@ -1770,73 +1739,72 @@ xfs_flush_buftarg( int wait) { struct list_head tmp; - xfs_buf_t *pb, *n; + xfs_buf_t *bp, *n; int pincount = 0; + struct list_head *dwq = &target->bt_delwrite_queue; + spinlock_t *dwlk = &target->bt_delwrite_lock; - pagebuf_runall_queues(xfsdatad_workqueue); - pagebuf_runall_queues(xfslogd_workqueue); + xfs_buf_runall_queues(xfsdatad_workqueue); + xfs_buf_runall_queues(xfslogd_workqueue); INIT_LIST_HEAD(&tmp); - spin_lock(&pbd_delwrite_lock); - list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) { - - if (pb->pb_target != target) - continue; - - ASSERT(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)); - PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb)); - if (pagebuf_ispin(pb)) { + spin_lock(dwlk); + list_for_each_entry_safe(bp, n, dwq, b_list) { + ASSERT(bp->b_target == target); + ASSERT(bp->b_flags & (XBF_DELWRI | _XBF_DELWRI_Q)); + XB_TRACE(bp, "walkq2", (long)xfs_buf_ispin(bp)); + if (xfs_buf_ispin(bp)) { pincount++; continue; } - list_move(&pb->pb_list, &tmp); + list_move(&bp->b_list, &tmp); } - spin_unlock(&pbd_delwrite_lock); + spin_unlock(dwlk); /* * Dropped the delayed write list lock, now walk the temporary list */ - list_for_each_entry_safe(pb, n, &tmp, pb_list) { - pagebuf_lock(pb); - pb->pb_flags &= 
~(PBF_DELWRI|_PBF_DELWRI_Q); - pb->pb_flags |= PBF_WRITE; + list_for_each_entry_safe(bp, n, &tmp, b_list) { + xfs_buf_lock(bp); + bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); + bp->b_flags |= XBF_WRITE; if (wait) - pb->pb_flags &= ~PBF_ASYNC; + bp->b_flags &= ~XBF_ASYNC; else - list_del_init(&pb->pb_list); + list_del_init(&bp->b_list); - pagebuf_iostrategy(pb); + xfs_buf_iostrategy(bp); } /* * Remaining list items must be flushed before returning */ while (!list_empty(&tmp)) { - pb = list_entry(tmp.next, xfs_buf_t, pb_list); + bp = list_entry(tmp.next, xfs_buf_t, b_list); - list_del_init(&pb->pb_list); - xfs_iowait(pb); - xfs_buf_relse(pb); + list_del_init(&bp->b_list); + xfs_iowait(bp); + xfs_buf_relse(bp); } if (wait) - blk_run_address_space(target->pbr_mapping); + blk_run_address_space(target->bt_mapping); return pincount; } int __init -pagebuf_init(void) +xfs_buf_init(void) { int error = -ENOMEM; -#ifdef PAGEBUF_TRACE - pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP); +#ifdef XFS_BUF_TRACE + xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP); #endif - pagebuf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf"); - if (!pagebuf_zone) + xfs_buf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf"); + if (!xfs_buf_zone) goto out_free_trace_buf; xfslogd_workqueue = create_workqueue("xfslogd"); @@ -1847,42 +1815,33 @@ pagebuf_init(void) if (!xfsdatad_workqueue) goto out_destroy_xfslogd_workqueue; - xfsbufd_task = kthread_run(xfsbufd, NULL, "xfsbufd"); - if (IS_ERR(xfsbufd_task)) { - error = PTR_ERR(xfsbufd_task); + xfs_buf_shake = kmem_shake_register(xfsbufd_wakeup); + if (!xfs_buf_shake) goto out_destroy_xfsdatad_workqueue; - } - - pagebuf_shake = kmem_shake_register(xfsbufd_wakeup); - if (!pagebuf_shake) - goto out_stop_xfsbufd; return 0; - out_stop_xfsbufd: - kthread_stop(xfsbufd_task); out_destroy_xfsdatad_workqueue: destroy_workqueue(xfsdatad_workqueue); out_destroy_xfslogd_workqueue: destroy_workqueue(xfslogd_workqueue); 
out_free_buf_zone: - kmem_zone_destroy(pagebuf_zone); + kmem_zone_destroy(xfs_buf_zone); out_free_trace_buf: -#ifdef PAGEBUF_TRACE - ktrace_free(pagebuf_trace_buf); +#ifdef XFS_BUF_TRACE + ktrace_free(xfs_buf_trace_buf); #endif return error; } void -pagebuf_terminate(void) +xfs_buf_terminate(void) { - kmem_shake_deregister(pagebuf_shake); - kthread_stop(xfsbufd_task); + kmem_shake_deregister(xfs_buf_shake); destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfslogd_workqueue); - kmem_zone_destroy(pagebuf_zone); -#ifdef PAGEBUF_TRACE - ktrace_free(pagebuf_trace_buf); + kmem_zone_destroy(xfs_buf_zone); +#ifdef XFS_BUF_TRACE + ktrace_free(xfs_buf_trace_buf); #endif } diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 237a35b915d1..4dd6592d5a4c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -32,44 +32,47 @@ * Base types */ -#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) - -#define page_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) -#define page_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -#define page_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) -#define page_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) - -typedef enum page_buf_rw_e { - PBRW_READ = 1, /* transfer into target memory */ - PBRW_WRITE = 2, /* transfer from target memory */ - PBRW_ZERO = 3 /* Zero target memory */ -} page_buf_rw_t; - - -typedef enum page_buf_flags_e { /* pb_flags values */ - PBF_READ = (1 << 0), /* buffer intended for reading from device */ - PBF_WRITE = (1 << 1), /* buffer intended for writing to device */ - PBF_MAPPED = (1 << 2), /* buffer mapped (pb_addr valid) */ - PBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ - PBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ - PBF_DELWRI = (1 << 6), /* buffer has dirty pages */ - PBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ - PBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ - PBF_ORDERED = (1 << 11), /* use ordered 
writes */ - PBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ +#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) + +#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) +#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) +#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) +#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) + +typedef enum { + XBRW_READ = 1, /* transfer into target memory */ + XBRW_WRITE = 2, /* transfer from target memory */ + XBRW_ZERO = 3, /* Zero target memory */ +} xfs_buf_rw_t; + +typedef enum { + XBF_READ = (1 << 0), /* buffer intended for reading from device */ + XBF_WRITE = (1 << 1), /* buffer intended for writing to device */ + XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */ + XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ + XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ + XBF_DELWRI = (1 << 6), /* buffer has dirty pages */ + XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ + XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ + XBF_ORDERED = (1 << 11), /* use ordered writes */ + XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ /* flags used only as arguments to access routines */ - PBF_LOCK = (1 << 14), /* lock requested */ - PBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ - PBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ + XBF_LOCK = (1 << 14), /* lock requested */ + XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ + XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ /* flags used only internally */ - _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ - _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ - _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ - _PBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ -} page_buf_flags_t; + _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ + _XBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ 
+ _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ + _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ +} xfs_buf_flags_t; +typedef enum { + XBT_FORCE_SLEEP = (0 << 1), + XBT_FORCE_FLUSH = (1 << 1), +} xfs_buftarg_flags_t; typedef struct xfs_bufhash { struct list_head bh_list; @@ -77,477 +80,350 @@ typedef struct xfs_bufhash { } xfs_bufhash_t; typedef struct xfs_buftarg { - dev_t pbr_dev; - struct block_device *pbr_bdev; - struct address_space *pbr_mapping; - unsigned int pbr_bsize; - unsigned int pbr_sshift; - size_t pbr_smask; - - /* per-device buffer hash table */ + dev_t bt_dev; + struct block_device *bt_bdev; + struct address_space *bt_mapping; + unsigned int bt_bsize; + unsigned int bt_sshift; + size_t bt_smask; + + /* per device buffer hash table */ uint bt_hashmask; uint bt_hashshift; xfs_bufhash_t *bt_hash; + + /* per device delwri queue */ + struct task_struct *bt_task; + struct list_head bt_list; + struct list_head bt_delwrite_queue; + spinlock_t bt_delwrite_lock; + unsigned long bt_flags; } xfs_buftarg_t; /* - * xfs_buf_t: Buffer structure for page cache-based buffers + * xfs_buf_t: Buffer structure for pagecache-based buffers + * + * This buffer structure is used by the pagecache buffer management routines + * to refer to an assembly of pages forming a logical buffer. * - * This buffer structure is used by the page cache buffer management routines - * to refer to an assembly of pages forming a logical buffer. The actual I/O - * is performed with buffer_head structures, as required by drivers. - * - * The buffer structure is used on temporary basis only, and discarded when - * released. The real data storage is recorded in the page cache. Metadata is + * The buffer structure is used on a temporary basis only, and discarded when + * released. The real data storage is recorded in the pagecache. Buffers are * hashed to the block device on which the file system resides. 
*/ struct xfs_buf; +typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); +typedef void (*xfs_buf_relse_t)(struct xfs_buf *); +typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); -/* call-back function on I/O completion */ -typedef void (*page_buf_iodone_t)(struct xfs_buf *); -/* call-back function on I/O completion */ -typedef void (*page_buf_relse_t)(struct xfs_buf *); -/* pre-write function */ -typedef int (*page_buf_bdstrat_t)(struct xfs_buf *); - -#define PB_PAGES 2 +#define XB_PAGES 2 typedef struct xfs_buf { - struct semaphore pb_sema; /* semaphore for lockables */ - unsigned long pb_queuetime; /* time buffer was queued */ - atomic_t pb_pin_count; /* pin count */ - wait_queue_head_t pb_waiters; /* unpin waiters */ - struct list_head pb_list; - page_buf_flags_t pb_flags; /* status flags */ - struct list_head pb_hash_list; /* hash table list */ - xfs_bufhash_t *pb_hash; /* hash table list start */ - xfs_buftarg_t *pb_target; /* buffer target (device) */ - atomic_t pb_hold; /* reference count */ - xfs_daddr_t pb_bn; /* block number for I/O */ - loff_t pb_file_offset; /* offset in file */ - size_t pb_buffer_length; /* size of buffer in bytes */ - size_t pb_count_desired; /* desired transfer size */ - void *pb_addr; /* virtual address of buffer */ - struct work_struct pb_iodone_work; - atomic_t pb_io_remaining;/* #outstanding I/O requests */ - page_buf_iodone_t pb_iodone; /* I/O completion function */ - page_buf_relse_t pb_relse; /* releasing function */ - page_buf_bdstrat_t pb_strat; /* pre-write function */ - struct semaphore pb_iodonesema; /* Semaphore for I/O waiters */ - void *pb_fspriv; - void *pb_fspriv2; - void *pb_fspriv3; - unsigned short pb_error; /* error code on I/O */ - unsigned short pb_locked; /* page array is locked */ - unsigned int pb_page_count; /* size of page array */ - unsigned int pb_offset; /* page offset in first page */ - struct page **pb_pages; /* array of page pointers */ - struct page *pb_page_array[PB_PAGES]; /* inline pages */ 
-#ifdef PAGEBUF_LOCK_TRACKING - int pb_last_holder; + struct semaphore b_sema; /* semaphore for lockables */ + unsigned long b_queuetime; /* time buffer was queued */ + atomic_t b_pin_count; /* pin count */ + wait_queue_head_t b_waiters; /* unpin waiters */ + struct list_head b_list; + xfs_buf_flags_t b_flags; /* status flags */ + struct list_head b_hash_list; /* hash table list */ + xfs_bufhash_t *b_hash; /* hash table list start */ + xfs_buftarg_t *b_target; /* buffer target (device) */ + atomic_t b_hold; /* reference count */ + xfs_daddr_t b_bn; /* block number for I/O */ + xfs_off_t b_file_offset; /* offset in file */ + size_t b_buffer_length;/* size of buffer in bytes */ + size_t b_count_desired;/* desired transfer size */ + void *b_addr; /* virtual address of buffer */ + struct work_struct b_iodone_work; + atomic_t b_io_remaining; /* #outstanding I/O requests */ + xfs_buf_iodone_t b_iodone; /* I/O completion function */ + xfs_buf_relse_t b_relse; /* releasing function */ + xfs_buf_bdstrat_t b_strat; /* pre-write function */ + struct semaphore b_iodonesema; /* Semaphore for I/O waiters */ + void *b_fspriv; + void *b_fspriv2; + void *b_fspriv3; + unsigned short b_error; /* error code on I/O */ + unsigned short b_locked; /* page array is locked */ + unsigned int b_page_count; /* size of page array */ + unsigned int b_offset; /* page offset in first page */ + struct page **b_pages; /* array of page pointers */ + struct page *b_page_array[XB_PAGES]; /* inline pages */ +#ifdef XFS_BUF_LOCK_TRACKING + int b_last_holder; #endif } xfs_buf_t; /* Finding and Reading Buffers */ - -extern xfs_buf_t *_pagebuf_find( /* find buffer for block if */ - /* the block is in memory */ - xfs_buftarg_t *, /* inode for block */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t, /* PBF_LOCK */ - xfs_buf_t *); /* newly allocated buffer */ - +extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t, xfs_buf_t *); 
#define xfs_incore(buftarg,blkno,len,lockit) \ - _pagebuf_find(buftarg, blkno ,len, lockit, NULL) - -extern xfs_buf_t *xfs_buf_get_flags( /* allocate a buffer */ - xfs_buftarg_t *, /* inode for buffer */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t); /* PBF_LOCK, PBF_READ, */ - /* PBF_ASYNC */ + _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) +extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t); #define xfs_buf_get(target, blkno, len, flags) \ - xfs_buf_get_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED) - -extern xfs_buf_t *xfs_buf_read_flags( /* allocate and read a buffer */ - xfs_buftarg_t *, /* inode for buffer */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC */ + xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) +extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t); #define xfs_buf_read(target, blkno, len, flags) \ - xfs_buf_read_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED) - -extern xfs_buf_t *pagebuf_get_empty( /* allocate pagebuf struct with */ - /* no memory or disk address */ - size_t len, - xfs_buftarg_t *); /* mount point "fake" inode */ - -extern xfs_buf_t *pagebuf_get_no_daddr(/* allocate pagebuf struct */ - /* without disk address */ - size_t len, - xfs_buftarg_t *); /* mount point "fake" inode */ - -extern int pagebuf_associate_memory( - xfs_buf_t *, - void *, - size_t); - -extern void pagebuf_hold( /* increment reference count */ - xfs_buf_t *); /* buffer to hold */ + xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) -extern void pagebuf_readahead( /* read ahead into cache */ - xfs_buftarg_t *, /* target for buffer (or NULL) */ - loff_t, /* starting offset of range */ - size_t, /* length of range */ - page_buf_flags_t); /* additional read flags */ +extern xfs_buf_t *xfs_buf_get_empty(size_t, 
xfs_buftarg_t *); +extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); +extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); +extern void xfs_buf_hold(xfs_buf_t *); +extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, + xfs_buf_flags_t); /* Releasing Buffers */ - -extern void pagebuf_free( /* deallocate a buffer */ - xfs_buf_t *); /* buffer to deallocate */ - -extern void pagebuf_rele( /* release hold on a buffer */ - xfs_buf_t *); /* buffer to release */ +extern void xfs_buf_free(xfs_buf_t *); +extern void xfs_buf_rele(xfs_buf_t *); /* Locking and Unlocking Buffers */ - -extern int pagebuf_cond_lock( /* lock buffer, if not locked */ - /* (returns -EBUSY if locked) */ - xfs_buf_t *); /* buffer to lock */ - -extern int pagebuf_lock_value( /* return count on lock */ - xfs_buf_t *); /* buffer to check */ - -extern int pagebuf_lock( /* lock buffer */ - xfs_buf_t *); /* buffer to lock */ - -extern void pagebuf_unlock( /* unlock buffer */ - xfs_buf_t *); /* buffer to unlock */ +extern int xfs_buf_cond_lock(xfs_buf_t *); +extern int xfs_buf_lock_value(xfs_buf_t *); +extern void xfs_buf_lock(xfs_buf_t *); +extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ - -extern void pagebuf_iodone( /* mark buffer I/O complete */ - xfs_buf_t *, /* buffer to mark */ - int); /* run completion locally, or in - * a helper thread. 
*/ - -extern void pagebuf_ioerror( /* mark buffer in error (or not) */ - xfs_buf_t *, /* buffer to mark */ - int); /* error to store (0 if none) */ - -extern int pagebuf_iostart( /* start I/O on a buffer */ - xfs_buf_t *, /* buffer to start */ - page_buf_flags_t); /* PBF_LOCK, PBF_ASYNC, */ - /* PBF_READ, PBF_WRITE, */ - /* PBF_DELWRI */ - -extern int pagebuf_iorequest( /* start real I/O */ - xfs_buf_t *); /* buffer to convey to device */ - -extern int pagebuf_iowait( /* wait for buffer I/O done */ - xfs_buf_t *); /* buffer to wait on */ - -extern void pagebuf_iomove( /* move data in/out of pagebuf */ - xfs_buf_t *, /* buffer to manipulate */ - size_t, /* starting buffer offset */ - size_t, /* length in buffer */ - caddr_t, /* data pointer */ - page_buf_rw_t); /* direction */ - -static inline int pagebuf_iostrategy(xfs_buf_t *pb) +extern void xfs_buf_ioend(xfs_buf_t *, int); +extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t); +extern int xfs_buf_iorequest(xfs_buf_t *); +extern int xfs_buf_iowait(xfs_buf_t *); +extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, + xfs_buf_rw_t); + +static inline int xfs_buf_iostrategy(xfs_buf_t *bp) { - return pb->pb_strat ? pb->pb_strat(pb) : pagebuf_iorequest(pb); + return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp); } -static inline int pagebuf_geterror(xfs_buf_t *pb) +static inline int xfs_buf_geterror(xfs_buf_t *bp) { - return pb ? pb->pb_error : ENOMEM; + return bp ? 
bp->b_error : ENOMEM; } /* Buffer Utility Routines */ - -extern caddr_t pagebuf_offset( /* pointer at offset in buffer */ - xfs_buf_t *, /* buffer to offset into */ - size_t); /* offset */ +extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); /* Pinning Buffer Storage in Memory */ - -extern void pagebuf_pin( /* pin buffer in memory */ - xfs_buf_t *); /* buffer to pin */ - -extern void pagebuf_unpin( /* unpin buffered data */ - xfs_buf_t *); /* buffer to unpin */ - -extern int pagebuf_ispin( /* check if buffer is pinned */ - xfs_buf_t *); /* buffer to check */ +extern void xfs_buf_pin(xfs_buf_t *); +extern void xfs_buf_unpin(xfs_buf_t *); +extern int xfs_buf_ispin(xfs_buf_t *); /* Delayed Write Buffer Routines */ - -extern void pagebuf_delwri_dequeue(xfs_buf_t *); +extern void xfs_buf_delwri_dequeue(xfs_buf_t *); /* Buffer Daemon Setup Routines */ +extern int xfs_buf_init(void); +extern void xfs_buf_terminate(void); -extern int pagebuf_init(void); -extern void pagebuf_terminate(void); - - -#ifdef PAGEBUF_TRACE -extern ktrace_t *pagebuf_trace_buf; -extern void pagebuf_trace( - xfs_buf_t *, /* buffer being traced */ - char *, /* description of operation */ - void *, /* arbitrary diagnostic value */ - void *); /* return address */ +#ifdef XFS_BUF_TRACE +extern ktrace_t *xfs_buf_trace_buf; +extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); #else -# define pagebuf_trace(pb, id, ptr, ra) do { } while (0) +#define xfs_buf_trace(bp,id,ptr,ra) do { } while (0) #endif -#define pagebuf_target_name(target) \ - ({ char __b[BDEVNAME_SIZE]; bdevname((target)->pbr_bdev, __b); __b; }) +#define xfs_buf_target_name(target) \ + ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) +#define XFS_B_ASYNC XBF_ASYNC +#define XFS_B_DELWRI XBF_DELWRI +#define XFS_B_READ XBF_READ +#define XFS_B_WRITE XBF_WRITE +#define XFS_B_STALE XBF_STALE -/* These are just for xfs_syncsub... 
it sets an internal variable - * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t - */ -#define XFS_B_ASYNC PBF_ASYNC -#define XFS_B_DELWRI PBF_DELWRI -#define XFS_B_READ PBF_READ -#define XFS_B_WRITE PBF_WRITE -#define XFS_B_STALE PBF_STALE - -#define XFS_BUF_TRYLOCK PBF_TRYLOCK -#define XFS_INCORE_TRYLOCK PBF_TRYLOCK -#define XFS_BUF_LOCK PBF_LOCK -#define XFS_BUF_MAPPED PBF_MAPPED - -#define BUF_BUSY PBF_DONT_BLOCK - -#define XFS_BUF_BFLAGS(x) ((x)->pb_flags) -#define XFS_BUF_ZEROFLAGS(x) \ - ((x)->pb_flags &= ~(PBF_READ|PBF_WRITE|PBF_ASYNC|PBF_DELWRI)) - -#define XFS_BUF_STALE(x) ((x)->pb_flags |= XFS_B_STALE) -#define XFS_BUF_UNSTALE(x) ((x)->pb_flags &= ~XFS_B_STALE) -#define XFS_BUF_ISSTALE(x) ((x)->pb_flags & XFS_B_STALE) -#define XFS_BUF_SUPER_STALE(x) do { \ - XFS_BUF_STALE(x); \ - pagebuf_delwri_dequeue(x); \ - XFS_BUF_DONE(x); \ - } while (0) +#define XFS_BUF_TRYLOCK XBF_TRYLOCK +#define XFS_INCORE_TRYLOCK XBF_TRYLOCK +#define XFS_BUF_LOCK XBF_LOCK +#define XFS_BUF_MAPPED XBF_MAPPED -#define XFS_BUF_MANAGE PBF_FS_MANAGED -#define XFS_BUF_UNMANAGE(x) ((x)->pb_flags &= ~PBF_FS_MANAGED) - -#define XFS_BUF_DELAYWRITE(x) ((x)->pb_flags |= PBF_DELWRI) -#define XFS_BUF_UNDELAYWRITE(x) pagebuf_delwri_dequeue(x) -#define XFS_BUF_ISDELAYWRITE(x) ((x)->pb_flags & PBF_DELWRI) - -#define XFS_BUF_ERROR(x,no) pagebuf_ioerror(x,no) -#define XFS_BUF_GETERROR(x) pagebuf_geterror(x) -#define XFS_BUF_ISERROR(x) (pagebuf_geterror(x)?1:0) - -#define XFS_BUF_DONE(x) ((x)->pb_flags |= PBF_DONE) -#define XFS_BUF_UNDONE(x) ((x)->pb_flags &= ~PBF_DONE) -#define XFS_BUF_ISDONE(x) ((x)->pb_flags & PBF_DONE) - -#define XFS_BUF_BUSY(x) do { } while (0) -#define XFS_BUF_UNBUSY(x) do { } while (0) -#define XFS_BUF_ISBUSY(x) (1) - -#define XFS_BUF_ASYNC(x) ((x)->pb_flags |= PBF_ASYNC) -#define XFS_BUF_UNASYNC(x) ((x)->pb_flags &= ~PBF_ASYNC) -#define XFS_BUF_ISASYNC(x) ((x)->pb_flags & PBF_ASYNC) - -#define XFS_BUF_ORDERED(x) ((x)->pb_flags |= PBF_ORDERED) 
-#define XFS_BUF_UNORDERED(x) ((x)->pb_flags &= ~PBF_ORDERED) -#define XFS_BUF_ISORDERED(x) ((x)->pb_flags & PBF_ORDERED) - -#define XFS_BUF_SHUT(x) printk("XFS_BUF_SHUT not implemented yet\n") -#define XFS_BUF_UNSHUT(x) printk("XFS_BUF_UNSHUT not implemented yet\n") -#define XFS_BUF_ISSHUT(x) (0) - -#define XFS_BUF_HOLD(x) pagebuf_hold(x) -#define XFS_BUF_READ(x) ((x)->pb_flags |= PBF_READ) -#define XFS_BUF_UNREAD(x) ((x)->pb_flags &= ~PBF_READ) -#define XFS_BUF_ISREAD(x) ((x)->pb_flags & PBF_READ) - -#define XFS_BUF_WRITE(x) ((x)->pb_flags |= PBF_WRITE) -#define XFS_BUF_UNWRITE(x) ((x)->pb_flags &= ~PBF_WRITE) -#define XFS_BUF_ISWRITE(x) ((x)->pb_flags & PBF_WRITE) - -#define XFS_BUF_ISUNINITIAL(x) (0) -#define XFS_BUF_UNUNINITIAL(x) (0) - -#define XFS_BUF_BP_ISMAPPED(bp) 1 - -#define XFS_BUF_IODONE_FUNC(buf) (buf)->pb_iodone -#define XFS_BUF_SET_IODONE_FUNC(buf, func) \ - (buf)->pb_iodone = (func) -#define XFS_BUF_CLR_IODONE_FUNC(buf) \ - (buf)->pb_iodone = NULL -#define XFS_BUF_SET_BDSTRAT_FUNC(buf, func) \ - (buf)->pb_strat = (func) -#define XFS_BUF_CLR_BDSTRAT_FUNC(buf) \ - (buf)->pb_strat = NULL - -#define XFS_BUF_FSPRIVATE(buf, type) \ - ((type)(buf)->pb_fspriv) -#define XFS_BUF_SET_FSPRIVATE(buf, value) \ - (buf)->pb_fspriv = (void *)(value) -#define XFS_BUF_FSPRIVATE2(buf, type) \ - ((type)(buf)->pb_fspriv2) -#define XFS_BUF_SET_FSPRIVATE2(buf, value) \ - (buf)->pb_fspriv2 = (void *)(value) -#define XFS_BUF_FSPRIVATE3(buf, type) \ - ((type)(buf)->pb_fspriv3) -#define XFS_BUF_SET_FSPRIVATE3(buf, value) \ - (buf)->pb_fspriv3 = (void *)(value) -#define XFS_BUF_SET_START(buf) - -#define XFS_BUF_SET_BRELSE_FUNC(buf, value) \ - (buf)->pb_relse = (value) - -#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->pb_addr) - -static inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset) -{ - if (bp->pb_flags & PBF_MAPPED) - return XFS_BUF_PTR(bp) + offset; - return (xfs_caddr_t) pagebuf_offset(bp, offset); -} +#define BUF_BUSY XBF_DONT_BLOCK + +#define 
XFS_BUF_BFLAGS(bp) ((bp)->b_flags) +#define XFS_BUF_ZEROFLAGS(bp) \ + ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI)) + +#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) +#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) +#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) +#define XFS_BUF_SUPER_STALE(bp) do { \ + XFS_BUF_STALE(bp); \ + xfs_buf_delwri_dequeue(bp); \ + XFS_BUF_DONE(bp); \ + } while (0) -#define XFS_BUF_SET_PTR(bp, val, count) \ - pagebuf_associate_memory(bp, val, count) -#define XFS_BUF_ADDR(bp) ((bp)->pb_bn) -#define XFS_BUF_SET_ADDR(bp, blk) \ - ((bp)->pb_bn = (xfs_daddr_t)(blk)) -#define XFS_BUF_OFFSET(bp) ((bp)->pb_file_offset) -#define XFS_BUF_SET_OFFSET(bp, off) \ - ((bp)->pb_file_offset = (off)) -#define XFS_BUF_COUNT(bp) ((bp)->pb_count_desired) -#define XFS_BUF_SET_COUNT(bp, cnt) \ - ((bp)->pb_count_desired = (cnt)) -#define XFS_BUF_SIZE(bp) ((bp)->pb_buffer_length) -#define XFS_BUF_SET_SIZE(bp, cnt) \ - ((bp)->pb_buffer_length = (cnt)) -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) -#define XFS_BUF_SET_VTYPE(bp, type) -#define XFS_BUF_SET_REF(bp, ref) - -#define XFS_BUF_ISPINNED(bp) pagebuf_ispin(bp) - -#define XFS_BUF_VALUSEMA(bp) pagebuf_lock_value(bp) -#define XFS_BUF_CPSEMA(bp) (pagebuf_cond_lock(bp) == 0) -#define XFS_BUF_VSEMA(bp) pagebuf_unlock(bp) -#define XFS_BUF_PSEMA(bp,x) pagebuf_lock(bp) -#define XFS_BUF_V_IODONESEMA(bp) up(&bp->pb_iodonesema); - -/* setup the buffer target from a buftarg structure */ -#define XFS_BUF_SET_TARGET(bp, target) \ - (bp)->pb_target = (target) -#define XFS_BUF_TARGET(bp) ((bp)->pb_target) -#define XFS_BUFTARG_NAME(target) \ - pagebuf_target_name(target) - -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) -#define XFS_BUF_SET_VTYPE(bp, type) -#define XFS_BUF_SET_REF(bp, ref) - -static inline int xfs_bawrite(void *mp, xfs_buf_t *bp) +#define XFS_BUF_MANAGE XBF_FS_MANAGED +#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) + +#define XFS_BUF_DELAYWRITE(bp) 
((bp)->b_flags |= XBF_DELWRI) +#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) +#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) + +#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) +#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) +#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0) + +#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) +#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) +#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) + +#define XFS_BUF_BUSY(bp) do { } while (0) +#define XFS_BUF_UNBUSY(bp) do { } while (0) +#define XFS_BUF_ISBUSY(bp) (1) + +#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) +#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) +#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) + +#define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED) +#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) +#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) + +#define XFS_BUF_SHUT(bp) do { } while (0) +#define XFS_BUF_UNSHUT(bp) do { } while (0) +#define XFS_BUF_ISSHUT(bp) (0) + +#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) +#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) +#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) +#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) + +#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) +#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) +#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) + +#define XFS_BUF_ISUNINITIAL(bp) (0) +#define XFS_BUF_UNUNINITIAL(bp) (0) + +#define XFS_BUF_BP_ISMAPPED(bp) (1) + +#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) +#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) +#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) +#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func)) +#define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL) + +#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) +#define 
XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) +#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) +#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) +#define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3) +#define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val)) +#define XFS_BUF_SET_START(bp) do { } while (0) +#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) + +#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) +#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) +#define XFS_BUF_ADDR(bp) ((bp)->b_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) +#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) +#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) +#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) +#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt)) +#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) +#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) + +#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) +#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) +#define XFS_BUF_SET_REF(bp, ref) do { } while (0) + +#define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp) + +#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) +#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) +#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) +#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) +#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema); + +#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) +#define XFS_BUF_TARGET(bp) ((bp)->b_target) +#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) + +static inline int xfs_bawrite(void *mp, xfs_buf_t *bp) { - bp->pb_fspriv3 = mp; - bp->pb_strat = xfs_bdstrat_cb; - pagebuf_delwri_dequeue(bp); - return pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC | _PBF_RUN_QUEUES); + bp->b_fspriv3 = mp; + bp->b_strat = xfs_bdstrat_cb; + 
xfs_buf_delwri_dequeue(bp); + return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); } -static inline void xfs_buf_relse(xfs_buf_t *bp) +static inline void xfs_buf_relse(xfs_buf_t *bp) { - if (!bp->pb_relse) - pagebuf_unlock(bp); - pagebuf_rele(bp); + if (!bp->b_relse) + xfs_buf_unlock(bp); + xfs_buf_rele(bp); } -#define xfs_bpin(bp) pagebuf_pin(bp) -#define xfs_bunpin(bp) pagebuf_unpin(bp) +#define xfs_bpin(bp) xfs_buf_pin(bp) +#define xfs_bunpin(bp) xfs_buf_unpin(bp) #define xfs_buftrace(id, bp) \ - pagebuf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) + xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) -#define xfs_biodone(pb) \ - pagebuf_iodone(pb, 0) +#define xfs_biodone(bp) xfs_buf_ioend(bp, 0) -#define xfs_biomove(pb, off, len, data, rw) \ - pagebuf_iomove((pb), (off), (len), (data), \ - ((rw) == XFS_B_WRITE) ? PBRW_WRITE : PBRW_READ) +#define xfs_biomove(bp, off, len, data, rw) \ + xfs_buf_iomove((bp), (off), (len), (data), \ + ((rw) == XFS_B_WRITE) ? 
XBRW_WRITE : XBRW_READ) -#define xfs_biozero(pb, off, len) \ - pagebuf_iomove((pb), (off), (len), NULL, PBRW_ZERO) +#define xfs_biozero(bp, off, len) \ + xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) -static inline int XFS_bwrite(xfs_buf_t *pb) +static inline int XFS_bwrite(xfs_buf_t *bp) { - int iowait = (pb->pb_flags & PBF_ASYNC) == 0; + int iowait = (bp->b_flags & XBF_ASYNC) == 0; int error = 0; if (!iowait) - pb->pb_flags |= _PBF_RUN_QUEUES; + bp->b_flags |= _XBF_RUN_QUEUES; - pagebuf_delwri_dequeue(pb); - pagebuf_iostrategy(pb); + xfs_buf_delwri_dequeue(bp); + xfs_buf_iostrategy(bp); if (iowait) { - error = pagebuf_iowait(pb); - xfs_buf_relse(pb); + error = xfs_buf_iowait(bp); + xfs_buf_relse(bp); } return error; } -#define XFS_bdwrite(pb) \ - pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC) +#define XFS_bdwrite(bp) xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC) static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) { - bp->pb_strat = xfs_bdstrat_cb; - bp->pb_fspriv3 = mp; - - return pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC); + bp->b_strat = xfs_bdstrat_cb; + bp->b_fspriv3 = mp; + return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); } -#define XFS_bdstrat(bp) pagebuf_iorequest(bp) +#define XFS_bdstrat(bp) xfs_buf_iorequest(bp) -#define xfs_iowait(pb) pagebuf_iowait(pb) +#define xfs_iowait(bp) xfs_buf_iowait(bp) #define xfs_baread(target, rablkno, ralen) \ - pagebuf_readahead((target), (rablkno), (ralen), PBF_DONT_BLOCK) - -#define xfs_buf_get_empty(len, target) pagebuf_get_empty((len), (target)) -#define xfs_buf_get_noaddr(len, target) pagebuf_get_no_daddr((len), (target)) -#define xfs_buf_free(bp) pagebuf_free(bp) + xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK) /* * Handling of buftargs. 
*/ - extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); extern void xfs_free_buftarg(xfs_buftarg_t *, int); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); -#define xfs_getsize_buftarg(buftarg) \ - block_size((buftarg)->pbr_bdev) -#define xfs_readonly_buftarg(buftarg) \ - bdev_read_only((buftarg)->pbr_bdev) -#define xfs_binval(buftarg) \ - xfs_flush_buftarg(buftarg, 1) -#define XFS_bflush(buftarg) \ - xfs_flush_buftarg(buftarg, 1) +#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) +#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) + +#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) +#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h index 4af491024727..e7f3da61c6c3 100644 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ b/fs/xfs/linux-2.6/xfs_cred.h @@ -18,6 +18,8 @@ #ifndef __XFS_CRED_H__ #define __XFS_CRED_H__ +#include <linux/capability.h> + /* * Credentials */ @@ -27,7 +29,7 @@ typedef struct cred { extern struct cred *sys_cred; -/* this is a hack.. (assums sys_cred is the only cred_t in the system) */ +/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ static __inline int capable_cred(cred_t *cr, int cid) { return (cr == sys_cred) ? 
1 : capable(cid); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 06111d0bbae4..ced4404339c7 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -509,16 +509,14 @@ linvfs_open_exec( vnode_t *vp = LINVFS_GET_VP(inode); xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); int error = 0; - bhv_desc_t *bdp; xfs_inode_t *ip; if (vp->v_vfsp->vfs_flag & VFS_DMI) { - bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops); - if (!bdp) { + ip = xfs_vtoi(vp); + if (!ip) { error = -EINVAL; goto open_exec_out; } - ip = XFS_BHVTOI(bdp); if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) { error = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL); diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index f89340c61bf2..4fa4b1a5187e 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -79,8 +79,7 @@ fs_flushinval_pages( struct inode *ip = LINVFS_GET_IP(vp); if (VN_CACHED(vp)) { - filemap_fdatawrite(ip->i_mapping); - filemap_fdatawait(ip->i_mapping); + filemap_write_and_wait(ip->i_mapping); truncate_inode_pages(ip->i_mapping, first); } diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index b78b5eb9e96c..4db47790415c 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -52,6 +52,7 @@ #include "xfs_dfrag.h" #include "xfs_fsops.h" +#include <linux/capability.h> #include <linux/dcache.h> #include <linux/mount.h> #include <linux/namei.h> @@ -145,13 +146,10 @@ xfs_find_handle( if (cmd != XFS_IOC_PATH_TO_FSHANDLE) { xfs_inode_t *ip; - bhv_desc_t *bhv; int lock_mode; /* need to get access to the xfs_inode to read the generation */ - bhv = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops); - ASSERT(bhv); - ip = XFS_BHVTOI(bhv); + ip = xfs_vtoi(vp); ASSERT(ip); lock_mode = xfs_ilock_map_shared(ip); @@ -530,6 +528,8 @@ xfs_attrmulti_attr_set( char *kbuf; int error = EFAULT; + if (IS_RDONLY(&vp->v_inode)) + return -EROFS; if 
(IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) return EPERM; if (len > XATTR_SIZE_MAX) @@ -557,6 +557,9 @@ xfs_attrmulti_attr_remove( { int error; + + if (IS_RDONLY(&vp->v_inode)) + return -EROFS; if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) return EPERM; @@ -745,9 +748,8 @@ xfs_ioctl( (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? mp->m_rtdev_targp : mp->m_ddev_targp; - da.d_mem = da.d_miniosz = 1 << target->pbr_sshift; - /* The size dio will do in one go */ - da.d_maxiosz = 64 * PAGE_CACHE_SIZE; + da.d_mem = da.d_miniosz = 1 << target->bt_sshift; + da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); if (copy_to_user(arg, &da, sizeof(da))) return -XFS_ERROR(EFAULT); diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index c83ae15bb0e6..a7c9ba1a9f7b 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -19,7 +19,6 @@ #include <linux/compat.h> #include <linux/init.h> #include <linux/ioctl.h> -#include <linux/ioctl32.h> #include <linux/syscalls.h> #include <linux/types.h> #include <linux/fs.h> diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 14215a7db59f..4bd3d03b23ed 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -51,8 +51,47 @@ #include "xfs_buf_item.h" #include "xfs_utils.h" +#include <linux/capability.h> #include <linux/xattr.h> #include <linux/namei.h> +#include <linux/security.h> + +#define IS_NOATIME(inode) ((inode->i_sb->s_flags & MS_NOATIME) || \ + (S_ISDIR(inode->i_mode) && inode->i_sb->s_flags & MS_NODIRATIME)) + +/* + * Get a XFS inode from a given vnode. + */ +xfs_inode_t * +xfs_vtoi( + struct vnode *vp) +{ + bhv_desc_t *bdp; + + bdp = bhv_lookup_range(VN_BHV_HEAD(vp), + VNODE_POSITION_XFS, VNODE_POSITION_XFS); + if (unlikely(bdp == NULL)) + return NULL; + return XFS_BHVTOI(bdp); +} + +/* + * Bring the atime in the XFS inode uptodate. + * Used before logging the inode to disk or when the Linux inode goes away. 
+ */ +void +xfs_synchronize_atime( + xfs_inode_t *ip) +{ + vnode_t *vp; + + vp = XFS_ITOV_NULL(ip); + if (vp) { + struct inode *inode = &vp->v_inode; + ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; + } +} /* * Change the requested timestamp in the given inode. @@ -73,23 +112,6 @@ xfs_ichgtime( struct inode *inode = LINVFS_GET_IP(XFS_ITOV(ip)); timespec_t tv; - /* - * We're not supposed to change timestamps in readonly-mounted - * filesystems. Throw it away if anyone asks us. - */ - if (unlikely(IS_RDONLY(inode))) - return; - - /* - * Don't update access timestamps on reads if mounted "noatime". - * Throw it away if anyone asks us. - */ - if (unlikely( - (ip->i_mount->m_flags & XFS_MOUNT_NOATIME || IS_NOATIME(inode)) && - (flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG)) == - XFS_ICHGTIME_ACC)) - return; - nanotime(&tv); if (flags & XFS_ICHGTIME_MOD) { inode->i_mtime = tv; @@ -126,8 +148,6 @@ xfs_ichgtime( * Variant on the above which avoids querying the system clock * in situations where we know the Linux inode timestamps have * just been updated (and so we can update our inode cheaply). - * We also skip the readonly and noatime checks here, they are - * also catered for already. */ void xfs_ichgtime_fast( @@ -138,20 +158,16 @@ xfs_ichgtime_fast( timespec_t *tvp; /* - * We're not supposed to change timestamps in readonly-mounted - * filesystems. Throw it away if anyone asks us. + * Atime updates for read() & friends are handled lazily now, and + * explicit updates must go through xfs_ichgtime() */ - if (unlikely(IS_RDONLY(inode))) - return; + ASSERT((flags & XFS_ICHGTIME_ACC) == 0); /* - * Don't update access timestamps on reads if mounted "noatime". - * Throw it away if anyone asks us. + * We're not supposed to change timestamps in readonly-mounted + * filesystems. Throw it away if anyone asks us. 
*/ - if (unlikely( - (ip->i_mount->m_flags & XFS_MOUNT_NOATIME || IS_NOATIME(inode)) && - ((flags & (XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD|XFS_ICHGTIME_CHG)) == - XFS_ICHGTIME_ACC))) + if (unlikely(IS_RDONLY(inode))) return; if (flags & XFS_ICHGTIME_MOD) { @@ -159,11 +175,6 @@ xfs_ichgtime_fast( ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec; ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec; } - if (flags & XFS_ICHGTIME_ACC) { - tvp = &inode->i_atime; - ip->i_d.di_atime.t_sec = (__int32_t)tvp->tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)tvp->tv_nsec; - } if (flags & XFS_ICHGTIME_CHG) { tvp = &inode->i_ctime; ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec; @@ -203,13 +214,46 @@ validate_fields( ip->i_nlink = va.va_nlink; ip->i_blocks = va.va_nblocks; - /* we're under i_sem so i_size can't change under us */ + /* we're under i_mutex so i_size can't change under us */ if (i_size_read(ip) != va.va_size) i_size_write(ip, va.va_size); } } /* + * Hook in SELinux. This is not quite correct yet, what we really need + * here (as we do for default ACLs) is a mechanism by which creation of + * these attrs can be journalled at inode creation time (along with the + * inode, of course, such that log replay can't cause these to be lost). + */ +STATIC int +linvfs_init_security( + struct vnode *vp, + struct inode *dir) +{ + struct inode *ip = LINVFS_GET_IP(vp); + size_t length; + void *value; + char *name; + int error; + + error = security_inode_init_security(ip, dir, &name, &value, &length); + if (error) { + if (error == -EOPNOTSUPP) + return 0; + return -error; + } + + VOP_ATTR_SET(vp, name, value, length, ATTR_SECURE, NULL, error); + if (!error) + VMODIFY(vp); + + kfree(name); + kfree(value); + return error; +} + +/* * Determine whether a process has a valid fs_struct (kernel daemons * like knfsd don't have an fs_struct). 
* @@ -274,6 +318,9 @@ linvfs_mknod( break; } + if (!error) + error = linvfs_init_security(vp, dir); + if (default_acl) { if (!error) { error = _ACL_INHERIT(vp, &va, default_acl); @@ -290,8 +337,6 @@ linvfs_mknod( teardown.d_inode = ip = LINVFS_GET_IP(vp); teardown.d_name = dentry->d_name; - vn_mark_bad(vp); - if (S_ISDIR(mode)) VOP_RMDIR(dvp, &teardown, NULL, err2); else @@ -502,7 +547,7 @@ linvfs_follow_link( ASSERT(dentry); ASSERT(nd); - link = (char *)kmalloc(MAXNAMELEN+1, GFP_KERNEL); + link = (char *)kmalloc(MAXPATHLEN+1, GFP_KERNEL); if (!link) { nd_set_link(nd, ERR_PTR(-ENOMEM)); return NULL; @@ -518,12 +563,12 @@ linvfs_follow_link( vp = LINVFS_GET_VP(dentry->d_inode); iov.iov_base = link; - iov.iov_len = MAXNAMELEN; + iov.iov_len = MAXPATHLEN; uio->uio_iov = &iov; uio->uio_offset = 0; uio->uio_segflg = UIO_SYSSPACE; - uio->uio_resid = MAXNAMELEN; + uio->uio_resid = MAXPATHLEN; uio->uio_iovcnt = 1; VOP_READLINK(vp, uio, 0, NULL, error); @@ -531,7 +576,7 @@ linvfs_follow_link( kfree(link); link = ERR_PTR(-error); } else { - link[MAXNAMELEN - uio->uio_resid] = '\0'; + link[MAXPATHLEN - uio->uio_resid] = '\0'; } kfree(uio); diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h index ee784b63acbf..6899a6b4a50a 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/linux-2.6/xfs_iops.h @@ -26,11 +26,6 @@ extern struct file_operations linvfs_file_operations; extern struct file_operations linvfs_invis_file_operations; extern struct file_operations linvfs_dir_operations; -extern struct address_space_operations linvfs_aops; - -extern int linvfs_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern void linvfs_unwritten_done(struct buffer_head *, int); - extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *, int, unsigned int, void __user *); diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index d8e21ba0cccc..67389b745526 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ 
b/fs/xfs/linux-2.6/xfs_linux.h @@ -110,10 +110,6 @@ * delalloc and these ondisk-uninitialised buffers. */ BUFFER_FNS(PrivateStart, unwritten); -static inline void set_buffer_unwritten_io(struct buffer_head *bh) -{ - bh->b_end_io = linvfs_unwritten_done; -} #define restricted_chown xfs_params.restrict_chown.val #define irix_sgid_inherit xfs_params.sgid_inherit.val @@ -232,7 +228,7 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh) #define xfs_itruncate_data(ip, off) \ (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off))) #define xfs_statvfs_fsid(statp, mp) \ - ({ u64 id = huge_encode_dev((mp)->m_dev); \ + ({ u64 id = huge_encode_dev((mp)->m_ddev_targp->bt_dev); \ __kernel_fsid_t *fsid = &(statp)->f_fsid; \ (fsid->val[0] = (u32)id, fsid->val[1] = (u32)(id >> 32)); }) diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 279e9bc92aba..e0ab45fbfebd 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -233,8 +233,8 @@ xfs_read( xfs_buftarg_t *target = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? 
mp->m_rtdev_targp : mp->m_ddev_targp; - if ((*offset & target->pbr_smask) || - (size & target->pbr_smask)) { + if ((*offset & target->bt_smask) || + (size & target->bt_smask)) { if (*offset == ip->i_d.di_size) { return (0); } @@ -254,7 +254,7 @@ xfs_read( } if (unlikely(ioflags & IO_ISDIRECT)) - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); xfs_ilock(ip, XFS_IOLOCK_SHARED); if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && @@ -281,12 +281,9 @@ xfs_read( xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (likely(!(ioflags & IO_INVIS))) - xfs_ichgtime_fast(ip, inode, XFS_ICHGTIME_ACC); - unlock_isem: if (unlikely(ioflags & IO_ISDIRECT)) - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); return ret; } @@ -346,9 +343,6 @@ xfs_sendfile( if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - if (likely(!(ioflags & IO_INVIS))) - xfs_ichgtime_fast(ip, LINVFS_GET_IP(vp), XFS_ICHGTIME_ACC); - return ret; } @@ -362,7 +356,6 @@ STATIC int /* error (positive) */ xfs_zero_last_block( struct inode *ip, xfs_iocore_t *io, - xfs_off_t offset, xfs_fsize_t isize, xfs_fsize_t end_size) { @@ -371,19 +364,16 @@ xfs_zero_last_block( int nimaps; int zero_offset; int zero_len; - int isize_fsb_offset; int error = 0; xfs_bmbt_irec_t imap; loff_t loff; - size_t lsize; ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); - ASSERT(offset > isize); mp = io->io_mount; - isize_fsb_offset = XFS_B_FSB_OFFSET(mp, isize); - if (isize_fsb_offset == 0) { + zero_offset = XFS_B_FSB_OFFSET(mp, isize); + if (zero_offset == 0) { /* * There are no extra bytes in the last block on disk to * zero, so return. 
@@ -413,10 +403,8 @@ xfs_zero_last_block( */ XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); loff = XFS_FSB_TO_B(mp, last_fsb); - lsize = XFS_FSB_TO_B(mp, 1); - zero_offset = isize_fsb_offset; - zero_len = mp->m_sb.sb_blocksize - isize_fsb_offset; + zero_len = mp->m_sb.sb_blocksize - zero_offset; error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); @@ -447,20 +435,17 @@ xfs_zero_eof( struct inode *ip = LINVFS_GET_IP(vp); xfs_fileoff_t start_zero_fsb; xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t prev_zero_fsb; xfs_fileoff_t zero_count_fsb; xfs_fileoff_t last_fsb; xfs_extlen_t buf_len_fsb; - xfs_extlen_t prev_zero_count; xfs_mount_t *mp; int nimaps; int error = 0; xfs_bmbt_irec_t imap; - loff_t loff; - size_t lsize; ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); + ASSERT(offset > isize); mp = io->io_mount; @@ -468,7 +453,7 @@ xfs_zero_eof( * First handle zeroing the block on which isize resides. * We only zero a part of that block so it is handled specially. */ - error = xfs_zero_last_block(ip, io, offset, isize, end_size); + error = xfs_zero_last_block(ip, io, isize, end_size); if (error) { ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); @@ -496,8 +481,6 @@ xfs_zero_eof( } ASSERT(start_zero_fsb <= end_zero_fsb); - prev_zero_fsb = NULLFILEOFF; - prev_zero_count = 0; while (start_zero_fsb <= end_zero_fsb) { nimaps = 1; zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; @@ -519,10 +502,7 @@ xfs_zero_eof( * that sits on a hole and sets the page as P_HOLE * and calls remapf if it is a mapped file. 
*/ - prev_zero_fsb = NULLFILEOFF; - prev_zero_count = 0; - start_zero_fsb = imap.br_startoff + - imap.br_blockcount; + start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); continue; } @@ -543,17 +523,15 @@ xfs_zero_eof( */ XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); - loff = XFS_FSB_TO_B(mp, start_zero_fsb); - lsize = XFS_FSB_TO_B(mp, buf_len_fsb); - - error = xfs_iozero(ip, loff, lsize, end_size); + error = xfs_iozero(ip, + XFS_FSB_TO_B(mp, start_zero_fsb), + XFS_FSB_TO_B(mp, buf_len_fsb), + end_size); if (error) { goto out_lock; } - prev_zero_fsb = start_zero_fsb; - prev_zero_count = buf_len_fsb; start_zero_fsb = imap.br_startoff + buf_len_fsb; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); @@ -640,7 +618,7 @@ xfs_write( (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? mp->m_rtdev_targp : mp->m_ddev_targp; - if ((pos & target->pbr_smask) || (count & target->pbr_smask)) + if ((pos & target->bt_smask) || (count & target->bt_smask)) return XFS_ERROR(-EINVAL); if (!VN_CACHED(vp) && pos < i_size_read(inode)) @@ -655,7 +633,7 @@ relock: iolock = XFS_IOLOCK_EXCL; locktype = VRWLOCK_WRITE; - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); } else { iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; @@ -686,7 +664,7 @@ start: int dmflags = FILP_DELAY_FLAG(file); if (need_isem) - dmflags |= DM_FLAGS_ISEM; + dmflags |= DM_FLAGS_IMUX; xfs_iunlock(xip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, @@ -713,7 +691,7 @@ start: } if (likely(!(ioflags & IO_INVIS))) { - inode_update_time(inode, 1); + file_update_time(file); xfs_ichgtime_fast(xip, inode, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); } @@ -772,7 +750,7 @@ retry: if (need_isem) { /* demote the lock now the cached pages are gone */ XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); iolock = XFS_IOLOCK_SHARED; locktype = VRWLOCK_WRITE_DIRECT; @@ -817,20 +795,24 @@ retry: xfs_rwunlock(bdp, 
locktype); if (need_isem) - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (error) goto out_nounlocks; if (need_isem) - down(&inode->i_sem); + mutex_lock(&inode->i_mutex); xfs_rwlock(bdp, locktype); pos = xip->i_d.di_size; ret = 0; goto retry; } + isize = i_size_read(inode); + if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) + *offset = isize; + if (*offset > xip->i_d.di_size) { xfs_ilock(xip, XFS_ILOCK_EXCL); if (*offset > xip->i_d.di_size) { @@ -926,7 +908,7 @@ retry: xfs_rwunlock(bdp, locktype); if (need_isem) - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); error = sync_page_range(inode, mapping, pos, ret); if (!error) @@ -938,7 +920,7 @@ retry: xfs_rwunlock(bdp, locktype); out_unlock_isem: if (need_isem) - up(&inode->i_sem); + mutex_unlock(&inode->i_mutex); out_nounlocks: return -error; } @@ -956,7 +938,7 @@ xfs_bdstrat_cb(struct xfs_buf *bp) mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); if (!XFS_FORCED_SHUTDOWN(mp)) { - pagebuf_iorequest(bp); + xfs_buf_iorequest(bp); return 0; } else { xfs_buftrace("XFS__BDSTRAT IOERROR", bp); @@ -1009,7 +991,7 @@ xfsbdstrat( * if (XFS_BUF_IS_GRIO(bp)) { */ - pagebuf_iorequest(bp); + xfs_buf_iorequest(bp); return 0; } diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c index 6c40a74be7c8..8955720a2c6b 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/linux-2.6/xfs_stats.c @@ -34,7 +34,7 @@ xfs_read_xfsstats( __uint64_t xs_write_bytes = 0; __uint64_t xs_read_bytes = 0; - static struct xstats_entry { + static const struct xstats_entry { char *desc; int endpoint; } xstats[] = { diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h index 50027c4a5618..8ba7a2fa6c1d 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/linux-2.6/xfs_stats.h @@ -109,15 +109,15 @@ struct xfsstats { __uint32_t vn_remove; /* # times 
vn_remove called */ __uint32_t vn_free; /* # times vn_free called */ #define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) - __uint32_t pb_get; - __uint32_t pb_create; - __uint32_t pb_get_locked; - __uint32_t pb_get_locked_waited; - __uint32_t pb_busy_locked; - __uint32_t pb_miss_locked; - __uint32_t pb_page_retries; - __uint32_t pb_page_found; - __uint32_t pb_get_read; + __uint32_t xb_get; + __uint32_t xb_create; + __uint32_t xb_get_locked; + __uint32_t xb_get_locked_waited; + __uint32_t xb_busy_locked; + __uint32_t xb_miss_locked; + __uint32_t xb_page_retries; + __uint32_t xb_page_found; + __uint32_t xb_get_read; /* Extra precision counters */ __uint64_t xs_xstrat_bytes; __uint64_t xs_write_bytes; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 6116b5bf433e..f22e426d9e42 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -306,13 +306,15 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) xfs_fs_cmn_err(CE_NOTE, mp, "Disabling barriers, not supported with external log device"); mp->m_flags &= ~XFS_MOUNT_BARRIER; + return; } - if (mp->m_ddev_targp->pbr_bdev->bd_disk->queue->ordered == + if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered == QUEUE_ORDERED_NONE) { xfs_fs_cmn_err(CE_NOTE, mp, "Disabling barriers, not supported by the underlying device"); mp->m_flags &= ~XFS_MOUNT_BARRIER; + return; } error = xfs_barrier_test(mp); @@ -320,6 +322,7 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) xfs_fs_cmn_err(CE_NOTE, mp, "Disabling barriers, trial barrier write failed"); mp->m_flags &= ~XFS_MOUNT_BARRIER; + return; } } @@ -327,7 +330,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->pbr_bdev, NULL); + blkdev_issue_flush(buftarg->bt_bdev, NULL); } STATIC struct inode * @@ -576,7 +579,7 @@ xfssyncd( timeleft = schedule_timeout_interruptible(timeleft); /* swsusp */ try_to_freeze(); - if (kthread_should_stop()) + if (kthread_should_stop() && list_empty(&vfsp->vfs_sync_list)) 
break; spin_lock(&vfsp->vfs_sync_lock); @@ -966,9 +969,9 @@ init_xfs_fs( void ) if (error < 0) goto undo_zones; - error = pagebuf_init(); + error = xfs_buf_init(); if (error < 0) - goto undo_pagebuf; + goto undo_buffers; vn_init(); xfs_init(); @@ -982,9 +985,9 @@ init_xfs_fs( void ) return 0; undo_register: - pagebuf_terminate(); + xfs_buf_terminate(); -undo_pagebuf: +undo_buffers: linvfs_destroy_zones(); undo_zones: @@ -998,7 +1001,7 @@ exit_xfs_fs( void ) XFS_DM_EXIT(&xfs_fs_type); unregister_filesystem(&xfs_fs_type); xfs_cleanup(); - pagebuf_terminate(); + xfs_buf_terminate(); linvfs_destroy_zones(); ktrace_uninit(); } diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c index e9bbcb4d6243..260dd8415dd7 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.c +++ b/fs/xfs/linux-2.6/xfs_vnode.c @@ -106,7 +106,6 @@ vn_revalidate_core( inode->i_blocks = vap->va_nblocks; inode->i_mtime = vap->va_mtime; inode->i_ctime = vap->va_ctime; - inode->i_atime = vap->va_atime; inode->i_blksize = vap->va_blocksize; if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index f2bbb327c081..0fe2419461d6 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -566,6 +566,25 @@ static inline int VN_BAD(struct vnode *vp) } /* + * Extracting atime values in various formats + */ +static inline void vn_atime_to_bstime(struct vnode *vp, xfs_bstime_t *bs_atime) +{ + bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec; + bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec; +} + +static inline void vn_atime_to_timespec(struct vnode *vp, struct timespec *ts) +{ + *ts = vp->v_inode.i_atime; +} + +static inline void vn_atime_to_time_t(struct vnode *vp, time_t *tt) +{ + *tt = vp->v_inode.i_atime.tv_sec; +} + +/* * Some useful predicates. 
*/ #define VN_MAPPED(vp) mapping_mapped(LINVFS_GET_IP(vp)->i_mapping) diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 00b5043dfa5a..772ac48329ea 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -104,7 +104,7 @@ xfs_qm_dqinit( */ if (brandnewdquot) { dqp->dq_flnext = dqp->dq_flprev = dqp; - mutex_init(&dqp->q_qlock, MUTEX_DEFAULT, "xdq"); + mutex_init(&dqp->q_qlock); initnsema(&dqp->q_flock, 1, "fdq"); sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); @@ -1382,7 +1382,7 @@ void xfs_dqlock( xfs_dquot_t *dqp) { - mutex_lock(&(dqp->q_qlock), PINOD); + mutex_lock(&(dqp->q_qlock)); } void diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 2f69822344e5..2ec6b441849c 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -239,7 +239,7 @@ xfs_qm_dquot_logitem_pushbuf( * trying to duplicate our effort. */ ASSERT(qip->qli_pushbuf_flag != 0); - ASSERT(qip->qli_push_owner == get_thread_id()); + ASSERT(qip->qli_push_owner == current_pid()); /* * If flushlock isn't locked anymore, chances are that the @@ -333,7 +333,7 @@ xfs_qm_dquot_logitem_trylock( qip->qli_pushbuf_flag = 1; ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno); #ifdef DEBUG - qip->qli_push_owner = get_thread_id(); + qip->qli_push_owner = current_pid(); #endif /* * The dquot is left locked. 
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 5328a2937127..7dcdd0640c32 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -167,7 +167,7 @@ xfs_Gqm_init(void) xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; xqm->qm_nrefs = 0; #ifdef DEBUG - mutex_init(&qcheck_lock, MUTEX_DEFAULT, "qchk"); + xfs_mutex_init(&qcheck_lock, MUTEX_DEFAULT, "qchk"); #endif return xqm; } @@ -1166,7 +1166,7 @@ xfs_qm_init_quotainfo( qinf->qi_dqreclaims = 0; /* mutex used to serialize quotaoffs */ - mutex_init(&qinf->qi_quotaofflock, MUTEX_DEFAULT, "qoff"); + mutex_init(&qinf->qi_quotaofflock); /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -1285,7 +1285,7 @@ xfs_qm_list_init( char *str, int n) { - mutex_init(&list->qh_lock, MUTEX_DEFAULT, str); + mutex_init(&list->qh_lock); list->qh_next = NULL; list->qh_version = 0; list->qh_nelems = 0; @@ -1392,11 +1392,12 @@ xfs_qm_qino_alloc( { xfs_trans_t *tp; int error; - unsigned long s; + unsigned long s; cred_t zerocr; + xfs_inode_t zeroino; int committed; - tp = xfs_trans_alloc(mp,XFS_TRANS_QM_QINOCREATE); + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); if ((error = xfs_trans_reserve(tp, XFS_QM_QINOCREATE_SPACE_RES(mp), XFS_CREATE_LOG_RES(mp), 0, @@ -1406,8 +1407,9 @@ xfs_qm_qino_alloc( return (error); } memset(&zerocr, 0, sizeof(zerocr)); + memset(&zeroino, 0, sizeof(zeroino)); - if ((error = xfs_dir_ialloc(&tp, mp->m_rootip, S_IFREG, 1, 0, + if ((error = xfs_dir_ialloc(&tp, &zeroino, S_IFREG, 1, 0, &zerocr, 0, 1, ip, &committed))) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); @@ -1918,9 +1920,7 @@ xfs_qm_quotacheck( * at this point (because we intentionally didn't in dqget_noattach). 
*/ if (error) { - xfs_qm_dqpurge_all(mp, - XFS_QMOPT_UQUOTA|XFS_QMOPT_GQUOTA| - XFS_QMOPT_PQUOTA|XFS_QMOPT_QUOTAOFF); + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); goto error_return; } /* @@ -2743,6 +2743,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode( xfs_dqunlock(udqp); ASSERT(ip->i_udquot == NULL); ip->i_udquot = udqp; + ASSERT(XFS_IS_UQUOTA_ON(tp->t_mountp)); ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } @@ -2752,7 +2753,10 @@ xfs_qm_vop_dqattach_and_dqmod_newinode( xfs_dqunlock(gdqp); ASSERT(ip->i_gdquot == NULL); ip->i_gdquot = gdqp; - ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); + ASSERT(XFS_IS_OQUOTA_ON(tp->t_mountp)); + ASSERT((XFS_IS_GQUOTA_ON(tp->t_mountp) ? + ip->i_d.di_gid : ip->i_d.di_projid) == + be32_to_cpu(gdqp->q_core.d_id)); xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } } @@ -2762,7 +2766,7 @@ STATIC void xfs_qm_freelist_init(xfs_frlist_t *ql) { ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql; - mutex_init(&ql->qh_lock, MUTEX_DEFAULT, "dqf"); + mutex_init(&ql->qh_lock); ql->qh_version = 0; ql->qh_nelems = 0; } @@ -2772,7 +2776,7 @@ xfs_qm_freelist_destroy(xfs_frlist_t *ql) { xfs_dquot_t *dqp, *nextdqp; - mutex_lock(&ql->qh_lock, PINOD); + mutex_lock(&ql->qh_lock); for (dqp = ql->qh_next; dqp != (xfs_dquot_t *)ql; ) { xfs_dqlock(dqp); diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h index 12da259f2fcb..4568deb6da86 100644 --- a/fs/xfs/quota/xfs_qm.h +++ b/fs/xfs/quota/xfs_qm.h @@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct { #define XFS_QM_IWARNLIMIT 5 #define XFS_QM_RTBWARNLIMIT 5 -#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock, PINOD)) +#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock)) #define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock)) #define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++) #define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c index 
d9d2993de435..90402a1c3983 100644 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ b/fs/xfs/quota/xfs_qm_bhv.c @@ -363,7 +363,7 @@ xfs_qm_init(void) KERN_INFO "SGI XFS Quota Management subsystem\n"; printk(message); - mutex_init(&xfs_Gqm_lock, MUTEX_DEFAULT, "xfs_qmlock"); + mutex_init(&xfs_Gqm_lock); vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs); xfs_qm_init_procfs(); } diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 24690e1af659..676884394aae 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -15,6 +15,9 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include <linux/capability.h> + #include "xfs.h" #include "xfs_fs.h" #include "xfs_bit.h" @@ -233,7 +236,7 @@ xfs_qm_scall_quotaoff( */ ASSERT(mp->m_quotainfo); if (mp->m_quotainfo) - mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); + mutex_lock(&(XFS_QI_QOFFLOCK(mp))); ASSERT(mp->m_quotainfo); @@ -508,7 +511,7 @@ xfs_qm_scall_quotaon( /* * Switch on quota enforcement in core. */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); + mutex_lock(&(XFS_QI_QOFFLOCK(mp))); mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); @@ -617,7 +620,7 @@ xfs_qm_scall_setqlim( * a quotaoff from happening). (XXXThis doesn't currently happen * because we take the vfslock before calling xfs_qm_sysent). */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp)), PINOD); + mutex_lock(&(XFS_QI_QOFFLOCK(mp))); /* * Get the dquot (locked), and join it to the transaction. @@ -1426,7 +1429,7 @@ xfs_qm_internalqcheck( xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); XFS_bflush(mp->m_ddev_targp); - mutex_lock(&qcheck_lock, PINOD); + mutex_lock(&qcheck_lock); /* There should be absolutely no quota activity while this is going on. 
*/ qmtest_udqtab = kmem_zalloc(qmtest_hashmask * diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index 7a9f3beb818c..b7ddd04aae32 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -51,7 +51,7 @@ #define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) #define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) -#define XQMLCK(h) (mutex_lock(&((h)->qh_lock), PINOD)) +#define XQMLCK(h) (mutex_lock(&((h)->qh_lock))) #define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) #ifdef DEBUG struct xfs_dqhash; diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c index bb6dc91ea261..b08b3d9345b7 100644 --- a/fs/xfs/support/debug.c +++ b/fs/xfs/support/debug.c @@ -27,45 +27,12 @@ static DEFINE_SPINLOCK(xfs_err_lock); /* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ #define XFS_MAX_ERR_LEVEL 7 #define XFS_ERR_MASK ((1 << 3) - 1) -static char *err_level[XFS_MAX_ERR_LEVEL+1] = +static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = {KERN_EMERG, KERN_ALERT, KERN_CRIT, KERN_ERR, KERN_WARNING, KERN_NOTICE, KERN_INFO, KERN_DEBUG}; void -assfail(char *a, char *f, int l) -{ - printk("XFS assertion failed: %s, file: %s, line: %d\n", a, f, l); - BUG(); -} - -#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM)) - -unsigned long -random(void) -{ - static unsigned long RandomValue = 1; - /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */ - register long rv = RandomValue; - register long lo; - register long hi; - - hi = rv / 127773; - lo = rv % 127773; - rv = 16807 * lo - 2836 * hi; - if( rv <= 0 ) rv += 2147483647; - return( RandomValue = rv ); -} - -int -get_thread_id(void) -{ - return current->pid; -} - -#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */ - -void cmn_err(register int level, char *fmt, ...) { char *fp = fmt; @@ -90,7 +57,6 @@ cmn_err(register int level, char *fmt, ...) 
BUG(); } - void icmn_err(register int level, char *fmt, va_list ap) { @@ -109,3 +75,27 @@ icmn_err(register int level, char *fmt, va_list ap) if (level == CE_PANIC) BUG(); } + +void +assfail(char *expr, char *file, int line) +{ + printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); + BUG(); +} + +#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM)) +unsigned long random(void) +{ + static unsigned long RandomValue = 1; + /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */ + register long rv = RandomValue; + register long lo; + register long hi; + + hi = rv / 127773; + lo = rv % 127773; + rv = 16807 * lo - 2836 * hi; + if (rv <= 0) rv += 2147483647; + return RandomValue = rv; +} +#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */ diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index aff558664c32..e3bf58112e7e 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -31,24 +31,23 @@ extern void icmn_err(int, char *, va_list) __attribute__ ((format (printf, 2, 0))); extern void cmn_err(int, char *, ...) __attribute__ ((format (printf, 2, 3))); +extern void assfail(char *expr, char *f, int l); -#ifndef STATIC -# define STATIC static -#endif +#define prdev(fmt,targ,args...) \ + printk("Device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args) -#ifdef DEBUG -# define ASSERT(EX) ((EX) ? ((void)0) : assfail(#EX, __FILE__, __LINE__)) -#else -# define ASSERT(x) ((void)0) -#endif +#define ASSERT_ALWAYS(expr) \ + (unlikely((expr) != 0) ? 
(void)0 : assfail(#expr, __FILE__, __LINE__)) -extern void assfail(char *, char *, int); -#ifdef DEBUG +#ifndef DEBUG +# define ASSERT(expr) ((void)0) +#else +# define ASSERT(expr) ASSERT_ALWAYS(expr) extern unsigned long random(void); -extern int get_thread_id(void); #endif -#define ASSERT_ALWAYS(EX) ((EX)?((void)0):assfail(#EX, __FILE__, __LINE__)) -#define debug_stop_all_cpus(param) /* param is "cpumask_t *" */ +#ifndef STATIC +# define STATIC static +#endif #endif /* __XFS_SUPPORT_DEBUG_H__ */ diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c index 70ce40914c8a..a3d565a67734 100644 --- a/fs/xfs/support/uuid.c +++ b/fs/xfs/support/uuid.c @@ -24,9 +24,19 @@ static uuid_t *uuid_table; void uuid_init(void) { - mutex_init(&uuid_monitor, MUTEX_DEFAULT, "uuid_monitor"); + mutex_init(&uuid_monitor); } + +/* IRIX interpretation of an uuid_t */ +typedef struct { + __be32 uu_timelow; + __be16 uu_timemid; + __be16 uu_timehi; + __be16 uu_clockseq; + __be16 uu_node[3]; +} xfs_uu_t; + /* * uuid_getnodeuniq - obtain the node unique fields of a UUID. 
* @@ -36,16 +46,11 @@ uuid_init(void) void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) { - char *uu = (char *)uuid; - - /* on IRIX, this function assumes big-endian fields within - * the uuid, so we use INT_GET to get the same result on - * little-endian systems - */ + xfs_uu_t *uup = (xfs_uu_t *)uuid; - fsid[0] = (INT_GET(*(u_int16_t*)(uu+8), ARCH_CONVERT) << 16) + - INT_GET(*(u_int16_t*)(uu+4), ARCH_CONVERT); - fsid[1] = INT_GET(*(u_int32_t*)(uu ), ARCH_CONVERT); + fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) | + be16_to_cpu(uup->uu_timemid); + fsid[1] = be32_to_cpu(uup->uu_timelow); } void @@ -94,7 +99,7 @@ uuid_table_insert(uuid_t *uuid) { int i, hole; - mutex_lock(&uuid_monitor, PVFS); + mutex_lock(&uuid_monitor); for (i = 0, hole = -1; i < uuid_table_size; i++) { if (uuid_is_nil(&uuid_table[i])) { hole = i; @@ -122,7 +127,7 @@ uuid_table_remove(uuid_t *uuid) { int i; - mutex_lock(&uuid_monitor, PVFS); + mutex_lock(&uuid_monitor); for (i = 0; i < uuid_table_size; i++) { if (uuid_is_nil(&uuid_table[i])) continue; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index cc9c91b9e771..4ff0f4e41c61 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -36,6 +36,7 @@ #include "xfs_mac.h" #include "xfs_attr.h" +#include <linux/capability.h> #include <linux/posix_acl_xattr.h> STATIC int xfs_acl_setmode(vnode_t *, xfs_acl_t *, int *); diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h index 68e5051d8e24..c4836890b726 100644 --- a/fs/xfs/xfs_arch.h +++ b/fs/xfs/xfs_arch.h @@ -40,6 +40,22 @@ #undef XFS_NATIVE_HOST #endif +#ifdef XFS_NATIVE_HOST +#define cpu_to_be16(val) ((__be16)(val)) +#define cpu_to_be32(val) ((__be32)(val)) +#define cpu_to_be64(val) ((__be64)(val)) +#define be16_to_cpu(val) ((__uint16_t)(val)) +#define be32_to_cpu(val) ((__uint32_t)(val)) +#define be64_to_cpu(val) ((__uint64_t)(val)) +#else +#define cpu_to_be16(val) (__swab16((__uint16_t)(val))) +#define cpu_to_be32(val) (__swab32((__uint32_t)(val))) +#define cpu_to_be64(val) 
(__swab64((__uint64_t)(val))) +#define be16_to_cpu(val) (__swab16((__be16)(val))) +#define be32_to_cpu(val) (__swab32((__be32)(val))) +#define be64_to_cpu(val) (__swab64((__be64)(val))) +#endif + #endif /* __KERNEL__ */ /* do we need conversion? */ @@ -186,7 +202,7 @@ static inline void be64_add(__be64 *a, __s64 b) */ #define XFS_GET_DIR_INO4(di) \ - (((u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) + (((__u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) #define XFS_PUT_DIR_INO4(from, di) \ do { \ @@ -197,9 +213,9 @@ do { \ } while (0) #define XFS_DI_HI(di) \ - (((u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) + (((__u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) #define XFS_DI_LO(di) \ - (((u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7])) + (((__u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7])) #define XFS_GET_DIR_INO8(di) \ (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 5484eeb460c8..e5e91e9c7e89 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -15,6 +15,9 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include <linux/capability.h> + #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" @@ -117,11 +120,6 @@ xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen, ip->i_d.di_anextents == 0)) return(ENOATTR); - if (!(flags & (ATTR_KERNACCESS|ATTR_SECURE))) { - if ((error = xfs_iaccess(ip, S_IRUSR, cred))) - return(XFS_ERROR(error)); - } - /* * Fill in the arg structure for this request. 
*/ @@ -425,7 +423,7 @@ xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int f struct cred *cred) { xfs_inode_t *dp; - int namelen, error; + int namelen; namelen = strlen(name); if (namelen >= MAXNAMELEN) @@ -437,14 +435,6 @@ xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int f if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return (EIO); - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!(flags & ATTR_SECURE) && - (error = xfs_iaccess(dp, S_IWUSR, cred))) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(XFS_ERROR(error)); - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return xfs_attr_set_int(dp, name, namelen, value, valuelen, flags); } @@ -579,7 +569,7 @@ int xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred) { xfs_inode_t *dp; - int namelen, error; + int namelen; namelen = strlen(name); if (namelen >= MAXNAMELEN) @@ -592,11 +582,7 @@ xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred) return (EIO); xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!(flags & ATTR_SECURE) && - (error = xfs_iaccess(dp, S_IWUSR, cred))) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(XFS_ERROR(error)); - } else if (XFS_IFORK_Q(dp) == 0 || + if (XFS_IFORK_Q(dp) == 0 || (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && dp->i_d.di_anextents == 0)) { xfs_iunlock(dp, XFS_ILOCK_SHARED); @@ -668,12 +654,6 @@ xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags, return (EIO); xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!(flags & ATTR_SECURE) && - (error = xfs_iaccess(dp, S_IRUSR, cred))) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(XFS_ERROR(error)); - } - /* * Decide on what work routines to call based on the inode size. */ diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 1c7421840c18..fe91eac4e2a7 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -128,7 +128,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return (offset >= minforkoff) ? 
minforkoff : 0; } - if (unlikely(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) { + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { if (bytes <= XFS_IFORK_ASIZE(dp)) return mp->m_attroffset >> 3; return 0; @@ -157,7 +157,7 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) { unsigned long s; - if (!(mp->m_flags & XFS_MOUNT_COMPAT_ATTR) && + if ((mp->m_flags & XFS_MOUNT_ATTR2) && !(XFS_SB_VERSION_HASATTR2(&mp->m_sb))) { s = XFS_SB_LOCK(mp); if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { @@ -311,7 +311,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) */ totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname && - !(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) { + (mp->m_flags & XFS_MOUNT_ATTR2)) { /* * Last attribute now removed, revert to original * inode format making all literal area available @@ -330,7 +330,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_d.di_forkoff); ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname || - (mp->m_flags & XFS_MOUNT_COMPAT_ATTR)); + !(mp->m_flags & XFS_MOUNT_ATTR2)); dp->i_afp->if_ext_max = XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); dp->i_df.if_ext_max = @@ -739,7 +739,7 @@ xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) + name_loc->namelen + INT_GET(name_loc->valuelen, ARCH_CONVERT); } - if (!(dp->i_mount->m_flags & XFS_MOUNT_COMPAT_ATTR) && + if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return(-1); return(xfs_attr_shortform_bytesfit(dp, bytes)); @@ -778,7 +778,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) goto out; if (forkoff == -1) { - ASSERT(!(dp->i_mount->m_flags & XFS_MOUNT_COMPAT_ATTR)); + ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); /* * Last attribute was removed, revert to original diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index f6143ff251a0..541e34109bb9 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ 
b/fs/xfs/xfs_attr_leaf.h @@ -63,7 +63,7 @@ struct xfs_trans; * the leaf_entry. The namespaces are independent only because we also look * at the namespace bit when we are looking for a matching attribute name. * - * We also store a "incomplete" bit in the leaf_entry. It shows that an + * We also store an "incomplete" bit in the leaf_entry. It shows that an * attribute is in the middle of being created and should not be shown to * the user if we crash during the time that the bit is set. We clear the * bit when we have finished setting up the attribute. We do this because @@ -72,42 +72,48 @@ struct xfs_trans; */ #define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ +typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ + __uint16_t base; /* base of free region */ + __uint16_t size; /* length of free region */ +} xfs_attr_leaf_map_t; + +typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __uint16_t count; /* count of active leaf_entry's */ + __uint16_t usedbytes; /* num bytes of names/values stored */ + __uint16_t firstused; /* first used byte in name area */ + __uint8_t holes; /* != 0 if blk needs compaction */ + __uint8_t pad1; + xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; + /* N largest free regions */ +} xfs_attr_leaf_hdr_t; + +typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ + xfs_dahash_t hashval; /* hash value of name */ + __uint16_t nameidx; /* index into buffer of name/value */ + __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ + __uint8_t pad2; /* unused pad byte */ +} xfs_attr_leaf_entry_t; + +typedef struct xfs_attr_leaf_name_local { + __uint16_t valuelen; /* number of bytes in value */ + __uint8_t namelen; /* length of name bytes */ + __uint8_t nameval[1]; /* name/value bytes */ +} xfs_attr_leaf_name_local_t; + +typedef struct xfs_attr_leaf_name_remote { + xfs_dablk_t valueblk; /* block number of value bytes */ + __uint32_t 
valuelen; /* number of bytes in value */ + __uint8_t namelen; /* length of name bytes */ + __uint8_t name[1]; /* name bytes */ +} xfs_attr_leaf_name_remote_t; + typedef struct xfs_attr_leafblock { - struct xfs_attr_leaf_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __uint16_t count; /* count of active leaf_entry's */ - __uint16_t usedbytes; /* num bytes of names/values stored */ - __uint16_t firstused; /* first used byte in name area */ - __uint8_t holes; /* != 0 if blk needs compaction */ - __uint8_t pad1; - struct xfs_attr_leaf_map { /* RLE map of free bytes */ - __uint16_t base; /* base of free region */ - __uint16_t size; /* length of free region */ - } freemap[XFS_ATTR_LEAF_MAPSIZE]; /* N largest free regions */ - } hdr; - struct xfs_attr_leaf_entry { /* sorted on key, not name */ - xfs_dahash_t hashval; /* hash value of name */ - __uint16_t nameidx; /* index into buffer of name/value */ - __uint8_t flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ - __uint8_t pad2; /* unused pad byte */ - } entries[1]; /* variable sized array */ - struct xfs_attr_leaf_name_local { - __uint16_t valuelen; /* number of bytes in value */ - __uint8_t namelen; /* length of name bytes */ - __uint8_t nameval[1]; /* name/value bytes */ - } namelist; /* grows from bottom of buf */ - struct xfs_attr_leaf_name_remote { - xfs_dablk_t valueblk; /* block number of value bytes */ - __uint32_t valuelen; /* number of bytes in value */ - __uint8_t namelen; /* length of name bytes */ - __uint8_t name[1]; /* name bytes */ - } valuelist; /* grows from bottom of buf */ + xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ + xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ + xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ } xfs_attr_leafblock_t; -typedef struct xfs_attr_leaf_hdr xfs_attr_leaf_hdr_t; -typedef struct xfs_attr_leaf_map 
xfs_attr_leaf_map_t; -typedef struct xfs_attr_leaf_entry xfs_attr_leaf_entry_t; -typedef struct xfs_attr_leaf_name_local xfs_attr_leaf_name_local_t; -typedef struct xfs_attr_leaf_name_remote xfs_attr_leaf_name_remote_t; /* * Flags used in the leaf_entry[i].flags field. @@ -150,7 +156,8 @@ xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) (leafp))[INT_GET((leafp)->entries[idx].nameidx, ARCH_CONVERT)]; } -#define XFS_ATTR_LEAF_NAME(leafp,idx) xfs_attr_leaf_name(leafp,idx) +#define XFS_ATTR_LEAF_NAME(leafp,idx) \ + xfs_attr_leaf_name(leafp,idx) static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) { return (&((char *) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index e415a4698e9c..70625e577c70 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -2146,13 +2146,176 @@ xfs_bmap_add_extent_hole_real( return 0; /* keep gcc quite */ } +/* + * Adjust the size of the new extent based on di_extsize and rt extsize. + */ +STATIC int +xfs_bmap_extsize_align( + xfs_mount_t *mp, + xfs_bmbt_irec_t *gotp, /* next extent pointer */ + xfs_bmbt_irec_t *prevp, /* previous extent pointer */ + xfs_extlen_t extsz, /* align to this extent size */ + int rt, /* is this a realtime inode? */ + int eof, /* is extent at end-of-file? */ + int delay, /* creating delalloc extent? */ + int convert, /* overwriting unwritten extent? 
*/ + xfs_fileoff_t *offp, /* in/out: aligned offset */ + xfs_extlen_t *lenp) /* in/out: aligned length */ +{ + xfs_fileoff_t orig_off; /* original offset */ + xfs_extlen_t orig_alen; /* original length */ + xfs_fileoff_t orig_end; /* original off+len */ + xfs_fileoff_t nexto; /* next file offset */ + xfs_fileoff_t prevo; /* previous file offset */ + xfs_fileoff_t align_off; /* temp for offset */ + xfs_extlen_t align_alen; /* temp for length */ + xfs_extlen_t temp; /* temp for calculations */ + + if (convert) + return 0; + + orig_off = align_off = *offp; + orig_alen = align_alen = *lenp; + orig_end = orig_off + orig_alen; + + /* + * If this request overlaps an existing extent, then don't + * attempt to perform any additional alignment. + */ + if (!delay && !eof && + (orig_off >= gotp->br_startoff) && + (orig_end <= gotp->br_startoff + gotp->br_blockcount)) { + return 0; + } + + /* + * If the file offset is unaligned vs. the extent size + * we need to align it. This will be possible unless + * the file was previously written with a kernel that didn't + * perform this alignment, or if a truncate shot us in the + * foot. + */ + temp = do_mod(orig_off, extsz); + if (temp) { + align_alen += temp; + align_off -= temp; + } + /* + * Same adjustment for the end of the requested area. + */ + if ((temp = (align_alen % extsz))) { + align_alen += extsz - temp; + } + /* + * If the previous block overlaps with this proposed allocation + * then move the start forward without adjusting the length. + */ + if (prevp->br_startoff != NULLFILEOFF) { + if (prevp->br_startblock == HOLESTARTBLOCK) + prevo = prevp->br_startoff; + else + prevo = prevp->br_startoff + prevp->br_blockcount; + } else + prevo = 0; + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + /* + * If the next block overlaps with this proposed allocation + * then move the start back without adjusting the length, + * but not before offset 0. 
+ * This may of course make the start overlap previous block, + * and if we hit the offset 0 limit then the next block + * can still overlap too. + */ + if (!eof && gotp->br_startoff != NULLFILEOFF) { + if ((delay && gotp->br_startblock == HOLESTARTBLOCK) || + (!delay && gotp->br_startblock == DELAYSTARTBLOCK)) + nexto = gotp->br_startoff + gotp->br_blockcount; + else + nexto = gotp->br_startoff; + } else + nexto = NULLFILEOFF; + if (!eof && + align_off + align_alen != orig_end && + align_off + align_alen > nexto) + align_off = nexto > align_alen ? nexto - align_alen : 0; + /* + * If we're now overlapping the next or previous extent that + * means we can't fit an extsz piece in this hole. Just move + * the start forward to the first valid spot and set + * the length so we hit the end. + */ + if (align_off != orig_off && align_off < prevo) + align_off = prevo; + if (align_off + align_alen != orig_end && + align_off + align_alen > nexto && + nexto != NULLFILEOFF) { + ASSERT(nexto > prevo); + align_alen = nexto - align_off; + } + + /* + * If realtime, and the result isn't a multiple of the realtime + * extent size we need to remove blocks until it is. + */ + if (rt && (temp = (align_alen % mp->m_sb.sb_rextsize))) { + /* + * We're not covering the original request, or + * we won't be able to once we fix the length. + */ + if (orig_off < align_off || + orig_end > align_off + align_alen || + align_alen - temp < orig_alen) + return XFS_ERROR(EINVAL); + /* + * Try to fix it by moving the start up. + */ + if (align_off + temp <= orig_off) { + align_alen -= temp; + align_off += temp; + } + /* + * Try to fix it by moving the end in. + */ + else if (align_off + align_alen - temp >= orig_end) + align_alen -= temp; + /* + * Set the start to the minimum then trim the length. + */ + else { + align_alen -= orig_off - align_off; + align_off = orig_off; + align_alen -= align_alen % mp->m_sb.sb_rextsize; + } + /* + * Result doesn't cover the request, fail it. 
+ */ + if (orig_off < align_off || orig_end > align_off + align_alen) + return XFS_ERROR(EINVAL); + } else { + ASSERT(orig_off >= align_off); + ASSERT(orig_end <= align_off + align_alen); + } + +#ifdef DEBUG + if (!eof && gotp->br_startoff != NULLFILEOFF) + ASSERT(align_off + align_alen <= gotp->br_startoff); + if (prevp->br_startoff != NULLFILEOFF) + ASSERT(align_off >= prevp->br_startoff + prevp->br_blockcount); +#endif + + *lenp = align_alen; + *offp = align_off; + return 0; +} + #define XFS_ALLOC_GAP_UNITS 4 /* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. */ -STATIC int /* error */ +STATIC int xfs_bmap_alloc( xfs_bmalloca_t *ap) /* bmap alloc argument struct */ { @@ -2163,10 +2326,10 @@ xfs_bmap_alloc( xfs_mount_t *mp; /* mount point structure */ int nullfb; /* true if ap->firstblock isn't set */ int rt; /* true if inode is realtime */ -#ifdef __KERNEL__ - xfs_extlen_t prod=0; /* product factor for allocators */ - xfs_extlen_t ralen=0; /* realtime allocation length */ -#endif + xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_rtblock_t rtx; #define ISVALID(x,y) \ (rt ? \ @@ -2182,125 +2345,25 @@ xfs_bmap_alloc( nullfb = ap->firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; fb_agno = nullfb ? 
NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); -#ifdef __KERNEL__ if (rt) { - xfs_extlen_t extsz; /* file extent size for rt */ - xfs_fileoff_t nexto; /* next file offset */ - xfs_extlen_t orig_alen; /* original ap->alen */ - xfs_fileoff_t orig_end; /* original off+len */ - xfs_fileoff_t orig_off; /* original ap->off */ - xfs_extlen_t mod_off; /* modulus calculations */ - xfs_fileoff_t prevo; /* previous file offset */ - xfs_rtblock_t rtx; /* realtime extent number */ - xfs_extlen_t temp; /* temp for rt calculations */ - - /* - * Set prod to match the realtime extent size. - */ - if (!(extsz = ap->ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - prod = extsz / mp->m_sb.sb_rextsize; - orig_off = ap->off; - orig_alen = ap->alen; - orig_end = orig_off + orig_alen; - /* - * If the file offset is unaligned vs. the extent size - * we need to align it. This will be possible unless - * the file was previously written with a kernel that didn't - * perform this alignment. - */ - mod_off = do_mod(orig_off, extsz); - if (mod_off) { - ap->alen += mod_off; - ap->off -= mod_off; - } - /* - * Same adjustment for the end of the requested area. - */ - if ((temp = (ap->alen % extsz))) - ap->alen += extsz - temp; - /* - * If the previous block overlaps with this proposed allocation - * then move the start forward without adjusting the length. - */ - prevo = - ap->prevp->br_startoff == NULLFILEOFF ? - 0 : - (ap->prevp->br_startoff + - ap->prevp->br_blockcount); - if (ap->off != orig_off && ap->off < prevo) - ap->off = prevo; - /* - * If the next block overlaps with this proposed allocation - * then move the start back without adjusting the length, - * but not before offset 0. - * This may of course make the start overlap previous block, - * and if we hit the offset 0 limit then the next block - * can still overlap too. - */ - nexto = (ap->eof || ap->gotp->br_startoff == NULLFILEOFF) ? 
- NULLFILEOFF : ap->gotp->br_startoff; - if (!ap->eof && - ap->off + ap->alen != orig_end && - ap->off + ap->alen > nexto) - ap->off = nexto > ap->alen ? nexto - ap->alen : 0; - /* - * If we're now overlapping the next or previous extent that - * means we can't fit an extsz piece in this hole. Just move - * the start forward to the first valid spot and set - * the length so we hit the end. - */ - if ((ap->off != orig_off && ap->off < prevo) || - (ap->off + ap->alen != orig_end && - ap->off + ap->alen > nexto)) { - ap->off = prevo; - ap->alen = nexto - prevo; - } - /* - * If the result isn't a multiple of rtextents we need to - * remove blocks until it is. - */ - if ((temp = (ap->alen % mp->m_sb.sb_rextsize))) { - /* - * We're not covering the original request, or - * we won't be able to once we fix the length. - */ - if (orig_off < ap->off || - orig_end > ap->off + ap->alen || - ap->alen - temp < orig_alen) - return XFS_ERROR(EINVAL); - /* - * Try to fix it by moving the start up. - */ - if (ap->off + temp <= orig_off) { - ap->alen -= temp; - ap->off += temp; - } - /* - * Try to fix it by moving the end in. - */ - else if (ap->off + ap->alen - temp >= orig_end) - ap->alen -= temp; - /* - * Set the start to the minimum then trim the length. - */ - else { - ap->alen -= orig_off - ap->off; - ap->off = orig_off; - ap->alen -= ap->alen % mp->m_sb.sb_rextsize; - } - /* - * Result doesn't cover the request, fail it. - */ - if (orig_off < ap->off || orig_end > ap->off + ap->alen) - return XFS_ERROR(EINVAL); - } + align = ap->ip->i_d.di_extsize ? 
+ ap->ip->i_d.di_extsize : mp->m_sb.sb_rextsize; + /* Set prod to match the extent size */ + prod = align / mp->m_sb.sb_rextsize; + + error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + align, rt, ap->eof, 0, + ap->conv, &ap->off, &ap->alen); + if (error) + return error; + ASSERT(ap->alen); ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); + /* * If the offset & length are not perfectly aligned * then kill prod, it will just get us in trouble. */ - if (do_mod(ap->off, extsz) || ap->alen % extsz) + if (do_mod(ap->off, align) || ap->alen % align) prod = 1; /* * Set ralen to be the actual requested length in rtextents. @@ -2326,15 +2389,24 @@ xfs_bmap_alloc( ap->rval = rtx * mp->m_sb.sb_rextsize; } else ap->rval = 0; + } else { + align = (ap->userdata && ap->ip->i_d.di_extsize && + (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)) ? + ap->ip->i_d.di_extsize : 0; + if (unlikely(align)) { + error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + align, rt, + ap->eof, 0, ap->conv, + &ap->off, &ap->alen); + ASSERT(!error); + ASSERT(ap->alen); + } + if (nullfb) + ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + else + ap->rval = ap->firstblock; } -#else - if (rt) - ap->rval = 0; -#endif /* __KERNEL__ */ - else if (nullfb) - ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); - else - ap->rval = ap->firstblock; + /* * If allocating at eof, and there's a previous real block, * try to use it's last block as our starting point. 
@@ -2598,11 +2670,12 @@ xfs_bmap_alloc( args.total = ap->total; args.minlen = ap->minlen; } - if (ap->ip->i_d.di_extsize) { + if (unlikely(ap->userdata && ap->ip->i_d.di_extsize && + (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE))) { args.prod = ap->ip->i_d.di_extsize; if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); - } else if (mp->m_sb.sb_blocksize >= NBPP) { + } else if (unlikely(mp->m_sb.sb_blocksize >= NBPP)) { args.prod = 1; args.mod = 0; } else { @@ -3580,14 +3653,16 @@ xfs_bmap_search_extents( ep = xfs_bmap_do_search_extents(base, lastx, nextents, bno, eofp, lastxp, gotp, prevp); - rt = ip->i_d.di_flags & XFS_DIFLAG_REALTIME; - if(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM)) { + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + if (unlikely(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM))) { cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld " "start_block : %llx start_off : %llx blkcnt : %llx " "extent-state : %x \n", - (ip->i_mount)->m_fsname,(long long)ip->i_ino, - gotp->br_startblock, gotp->br_startoff, - gotp->br_blockcount,gotp->br_state); + (ip->i_mount)->m_fsname, (long long)ip->i_ino, + (unsigned long long)gotp->br_startblock, + (unsigned long long)gotp->br_startoff, + (unsigned long long)gotp->br_blockcount, + gotp->br_state); } return ep; } @@ -3875,7 +3950,7 @@ xfs_bmap_add_attrfork( ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_d.di_forkoff) ip->i_d.di_forkoff = mp->m_attroffset >> 3; - else if (!(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) + else if (mp->m_flags & XFS_MOUNT_ATTR2) version = 2; break; default: @@ -4023,13 +4098,13 @@ xfs_bmap_compute_maxlevels( */ if (whichfork == XFS_DATA_FORK) { maxleafents = MAXEXTNUM; - sz = (mp->m_flags & XFS_MOUNT_COMPAT_ATTR) ? - mp->m_attroffset : XFS_BMDR_SPACE_CALC(MINDBTPTRS); + sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? 
+ XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset; } else { maxleafents = MAXAEXTNUM; - sz = (mp->m_flags & XFS_MOUNT_COMPAT_ATTR) ? - mp->m_sb.sb_inodesize - mp->m_attroffset : - XFS_BMDR_SPACE_CALC(MINABTPTRS); + sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? + XFS_BMDR_SPACE_CALC(MINABTPTRS) : + mp->m_sb.sb_inodesize - mp->m_attroffset; } maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); minleafrecs = mp->m_bmap_dmnr[0]; @@ -4418,8 +4493,8 @@ xfs_bmap_read_extents( num_recs = be16_to_cpu(block->bb_numrecs); if (unlikely(i + num_recs > room)) { ASSERT(i + num_recs <= room); - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt dinode %Lu, (btree extents). Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, (btree extents).", (unsigned long long) ip->i_ino); XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", XFS_ERRLEVEL_LOW, @@ -4590,6 +4665,7 @@ xfs_bmapi( char contig; /* allocation must be one extent */ char delay; /* this request is for delayed alloc */ char exact; /* don't do all of wasdelayed extent */ + char convert; /* unwritten extent I/O completion */ xfs_bmbt_rec_t *ep; /* extent list entry pointer */ int error; /* error return */ xfs_bmbt_irec_t got; /* current extent list record */ @@ -4643,7 +4719,7 @@ xfs_bmapi( } if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rt = XFS_IS_REALTIME_INODE(ip); + rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(ifp->if_ext_max == XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); @@ -4654,6 +4730,7 @@ xfs_bmapi( delay = (flags & XFS_BMAPI_DELAY) != 0; trim = (flags & XFS_BMAPI_ENTIRE) == 0; userdata = (flags & XFS_BMAPI_METADATA) == 0; + convert = (flags & XFS_BMAPI_CONVERT) != 0; exact = (flags & XFS_BMAPI_EXACT) != 0; rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; contig = (flags & XFS_BMAPI_CONTIG) != 0; @@ -4748,15 +4825,25 @@ xfs_bmapi( } minlen = contig ? 
alen : 1; if (delay) { - xfs_extlen_t extsz = 0; + xfs_extlen_t extsz; /* Figure out the extent size, adjust alen */ if (rt) { if (!(extsz = ip->i_d.di_extsize)) extsz = mp->m_sb.sb_rextsize; - alen = roundup(alen, extsz); - extsz = alen / mp->m_sb.sb_rextsize; + } else { + extsz = ip->i_d.di_extsize; } + if (extsz) { + error = xfs_bmap_extsize_align(mp, + &got, &prev, extsz, + rt, eof, delay, convert, + &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; /* * Make a transaction-less quota reservation for @@ -4785,32 +4872,33 @@ xfs_bmapi( xfs_bmap_worst_indlen(ip, alen); ASSERT(indlen > 0); - if (rt) + if (rt) { error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, -(extsz), rsvd); - else + } else { error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, -(alen), rsvd); + } if (!error) { error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, -(indlen), rsvd); - if (error && rt) { - xfs_mod_incore_sb(ip->i_mount, + if (error && rt) + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, rsvd); - } else if (error) { - xfs_mod_incore_sb(ip->i_mount, + else if (error) + xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, alen, rsvd); - } } if (error) { - if (XFS_IS_QUOTA_ON(ip->i_mount)) + if (XFS_IS_QUOTA_ON(mp)) /* unreserve the blocks now */ + (void) XFS_TRANS_UNRESERVE_QUOTA_NBLKS( mp, NULL, ip, (long)alen, 0, rt ? 
@@ -4849,6 +4937,7 @@ xfs_bmapi( bma.firstblock = *firstblock; bma.alen = alen; bma.off = aoff; + bma.conv = convert; bma.wasdel = wasdelay; bma.minlen = minlen; bma.low = flist->xbf_low; @@ -5270,8 +5359,7 @@ xfs_bunmapi( return 0; } XFS_STATS_INC(xs_blk_unmap); - isrt = (whichfork == XFS_DATA_FORK) && - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME); + isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); start = bno; bno = start + len - 1; ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, @@ -5443,7 +5531,7 @@ xfs_bunmapi( } if (wasdel) { ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); - /* Update realtim/data freespace, unreserve quota */ + /* Update realtime/data freespace, unreserve quota */ if (isrt) { xfs_filblks_t rtexts; @@ -5451,14 +5539,14 @@ xfs_bunmapi( do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, (int)rtexts, rsvd); - XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, - -((long)del.br_blockcount), 0, + (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, + NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)del.br_blockcount, rsvd); - XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, NULL, ip, - -((long)del.br_blockcount), 0, + (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, + NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); } ip->i_delayed_blks -= del.br_blockcount; @@ -5652,7 +5740,9 @@ xfs_getbmap( ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) return XFS_ERROR(EINVAL); if (whichfork == XFS_DATA_FORK) { - if (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC) { + if ((ip->i_d.di_extsize && (ip->i_d.di_flags & + (XFS_DIFLAG_REALTIME|XFS_DIFLAG_EXTSIZE))) || + ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ prealloced = 1; fixlen = XFS_MAXIOFFSET(mp); } else { diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 2e0717a01309..12cc63dfc2c4 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -62,6 +62,10 @@ typedef struct xfs_bmap_free #define 
XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */ /* combine contig. space */ #define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ +/* XFS_BMAPI_DIRECT_IO 0x800 */ +#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */ + /* need write cache flushing and no */ + /* additional allocation alignments */ #define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w) static inline int xfs_bmapi_aflag(int w) @@ -101,7 +105,8 @@ typedef struct xfs_bmalloca { char wasdel; /* replacing a delayed allocation */ char userdata;/* set if is user data */ char low; /* low on space, using seq'l ags */ - char aeof; /* allocated space at eof */ + char aeof; /* allocated space at eof */ + char conv; /* overwriting unwritten extents */ } xfs_bmalloca_t; #ifdef __KERNEL__ diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index 328a528b926d..f57cc9ac875e 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h @@ -57,7 +57,7 @@ struct xfs_mount_args { /* * XFS mount option flags -- args->flags1 */ -#define XFSMNT_COMPAT_ATTR 0x00000001 /* do not use ATTR2 format */ +#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */ #define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount * compatible */ #define XFSMNT_INO64 0x00000004 /* move inode numbers up diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 070259a4254c..c6191d00ad27 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -60,8 +60,6 @@ xfs_swapext( xfs_bstat_t *sbp; struct file *fp = NULL, *tfp = NULL; vnode_t *vp, *tvp; - bhv_desc_t *bdp, *tbdp; - vn_bhv_head_t *bhp, *tbhp; static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; int ilf_fields, tilf_fields; int error = 0; @@ -90,13 +88,10 @@ xfs_swapext( goto error0; } - bhp = VN_BHV_HEAD(vp); - bdp = vn_bhv_lookup(bhp, &xfs_vnodeops); - if (bdp == NULL) { + ip = xfs_vtoi(vp); + if (ip == NULL) { error = XFS_ERROR(EBADF); goto error0; - } else { - ip = XFS_BHVTOI(bdp); } if (((tfp = fget((int)sxp->sx_fdtmp)) == NULL) || @@ -105,13 +100,10 @@ xfs_swapext( 
goto error0; } - tbhp = VN_BHV_HEAD(tvp); - tbdp = vn_bhv_lookup(tbhp, &xfs_vnodeops); - if (tbdp == NULL) { + tip = xfs_vtoi(tvp); + if (tip == NULL) { error = XFS_ERROR(EBADF); goto error0; - } else { - tip = XFS_BHVTOI(tbdp); } if (ip->i_mount != tip->i_mount) { diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index c5a0e537ff1a..79d0d9e1fbab 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -199,10 +199,16 @@ typedef enum xfs_dinode_fmt #define XFS_DFORK_DSIZE(dip,mp) \ XFS_CFORK_DSIZE_DISK(&(dip)->di_core, mp) +#define XFS_DFORK_DSIZE_HOST(dip,mp) \ + XFS_CFORK_DSIZE(&(dip)->di_core, mp) #define XFS_DFORK_ASIZE(dip,mp) \ XFS_CFORK_ASIZE_DISK(&(dip)->di_core, mp) +#define XFS_DFORK_ASIZE_HOST(dip,mp) \ + XFS_CFORK_ASIZE(&(dip)->di_core, mp) #define XFS_DFORK_SIZE(dip,mp,w) \ XFS_CFORK_SIZE_DISK(&(dip)->di_core, mp, w) +#define XFS_DFORK_SIZE_HOST(dip,mp,w) \ + XFS_CFORK_SIZE(&(dip)->di_core, mp, w) #define XFS_DFORK_Q(dip) XFS_CFORK_Q_DISK(&(dip)->di_core) #define XFS_DFORK_BOFF(dip) XFS_CFORK_BOFF_DISK(&(dip)->di_core) @@ -216,6 +222,7 @@ typedef enum xfs_dinode_fmt #define XFS_CFORK_FMT_SET(dcp,w,n) \ ((w) == XFS_DATA_FORK ? \ ((dcp)->di_format = (n)) : ((dcp)->di_aformat = (n))) +#define XFS_DFORK_FORMAT(dip,w) XFS_CFORK_FORMAT(&(dip)->di_core, w) #define XFS_CFORK_NEXTENTS_DISK(dcp,w) \ ((w) == XFS_DATA_FORK ? \ @@ -223,13 +230,13 @@ typedef enum xfs_dinode_fmt INT_GET((dcp)->di_anextents, ARCH_CONVERT)) #define XFS_CFORK_NEXTENTS(dcp,w) \ ((w) == XFS_DATA_FORK ? (dcp)->di_nextents : (dcp)->di_anextents) +#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w) +#define XFS_DFORK_NEXTENTS_HOST(dip,w) XFS_CFORK_NEXTENTS(&(dip)->di_core, w) #define XFS_CFORK_NEXT_SET(dcp,w,n) \ ((w) == XFS_DATA_FORK ? 
\ ((dcp)->di_nextents = (n)) : ((dcp)->di_anextents = (n))) -#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w) - #define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) /* @@ -246,8 +253,10 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_NOATIME_BIT 6 /* do not update atime */ #define XFS_DIFLAG_NODUMP_BIT 7 /* do not dump */ #define XFS_DIFLAG_RTINHERIT_BIT 8 /* create with realtime bit set */ -#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ -#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_PROJINHERIT_BIT 9 /* create with parents projid */ +#define XFS_DIFLAG_NOSYMLINKS_BIT 10 /* disallow symlink creation */ +#define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ +#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) @@ -259,11 +268,14 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_RTINHERIT (1 << XFS_DIFLAG_RTINHERIT_BIT) #define XFS_DIFLAG_PROJINHERIT (1 << XFS_DIFLAG_PROJINHERIT_BIT) #define XFS_DIFLAG_NOSYMLINKS (1 << XFS_DIFLAG_NOSYMLINKS_BIT) +#define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) +#define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) #define XFS_DIFLAG_ANY \ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ - XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS) + XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ + XFS_DIFLAG_EXTSZINHERIT) #endif /* __XFS_DINODE_H__ */ diff --git a/fs/xfs/xfs_dir.c b/fs/xfs/xfs_dir.c index 3dd30391f551..bb87d2a700a9 100644 --- a/fs/xfs/xfs_dir.c +++ b/fs/xfs/xfs_dir.c @@ -176,7 +176,7 @@ xfs_dir_mount(xfs_mount_t *mp) uint 
shortcount, leafcount, count; mp->m_dirversion = 1; - if (mp->m_flags & XFS_MOUNT_COMPAT_ATTR) { + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { shortcount = (mp->m_attroffset - (uint)sizeof(xfs_dir_sf_hdr_t)) / (uint)sizeof(xfs_dir_sf_entry_t); diff --git a/fs/xfs/xfs_dir.h b/fs/xfs/xfs_dir.h index 488defe86ba6..8cc8afb9f6c0 100644 --- a/fs/xfs/xfs_dir.h +++ b/fs/xfs/xfs_dir.h @@ -135,6 +135,8 @@ void xfs_dir_startup(void); /* called exactly once */ ((mp)->m_dirops.xd_shortform_to_single(args)) #define XFS_DIR_IS_V1(mp) ((mp)->m_dirversion == 1) +#define XFS_DIR_IS_V2(mp) ((mp)->m_dirversion == 2) extern xfs_dirops_t xfsv1_dirops; +extern xfs_dirops_t xfsv2_dirops; #endif /* __XFS_DIR_H__ */ diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index 7e24ffeda9e1..3158f5dc431f 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -72,9 +72,6 @@ typedef struct xfs_dir2_put_args { struct uio *uio; /* uio control structure */ } xfs_dir2_put_args_t; -#define XFS_DIR_IS_V2(mp) ((mp)->m_dirversion == 2) -extern xfs_dirops_t xfsv2_dirops; - /* * Other interfaces used by the rest of the dir v2 code. */ diff --git a/fs/xfs/xfs_dir_leaf.h b/fs/xfs/xfs_dir_leaf.h index ab6b09eef9ab..eb8cd9a4667f 100644 --- a/fs/xfs/xfs_dir_leaf.h +++ b/fs/xfs/xfs_dir_leaf.h @@ -67,34 +67,38 @@ struct xfs_trans; */ #define XFS_DIR_LEAF_MAPSIZE 3 /* how many freespace slots */ +typedef struct xfs_dir_leaf_map { /* RLE map of free bytes */ + __uint16_t base; /* base of free region */ + __uint16_t size; /* run length of free region */ +} xfs_dir_leaf_map_t; + +typedef struct xfs_dir_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. 
*/ + __uint16_t count; /* count of active leaf_entry's */ + __uint16_t namebytes; /* num bytes of name strings stored */ + __uint16_t firstused; /* first used byte in name area */ + __uint8_t holes; /* != 0 if blk needs compaction */ + __uint8_t pad1; + xfs_dir_leaf_map_t freemap[XFS_DIR_LEAF_MAPSIZE]; +} xfs_dir_leaf_hdr_t; + +typedef struct xfs_dir_leaf_entry { /* sorted on key, not name */ + xfs_dahash_t hashval; /* hash value of name */ + __uint16_t nameidx; /* index into buffer of name */ + __uint8_t namelen; /* length of name string */ + __uint8_t pad2; +} xfs_dir_leaf_entry_t; + +typedef struct xfs_dir_leaf_name { + xfs_dir_ino_t inumber; /* inode number for this key */ + __uint8_t name[1]; /* name string itself */ +} xfs_dir_leaf_name_t; + typedef struct xfs_dir_leafblock { - struct xfs_dir_leaf_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __uint16_t count; /* count of active leaf_entry's */ - __uint16_t namebytes; /* num bytes of name strings stored */ - __uint16_t firstused; /* first used byte in name area */ - __uint8_t holes; /* != 0 if blk needs compaction */ - __uint8_t pad1; - struct xfs_dir_leaf_map {/* RLE map of free bytes */ - __uint16_t base; /* base of free region */ - __uint16_t size; /* run length of free region */ - } freemap[XFS_DIR_LEAF_MAPSIZE]; /* N largest free regions */ - } hdr; - struct xfs_dir_leaf_entry { /* sorted on key, not name */ - xfs_dahash_t hashval; /* hash value of name */ - __uint16_t nameidx; /* index into buffer of name */ - __uint8_t namelen; /* length of name string */ - __uint8_t pad2; - } entries[1]; /* var sized array */ - struct xfs_dir_leaf_name { - xfs_dir_ino_t inumber; /* inode number for this key */ - __uint8_t name[1]; /* name string itself */ - } namelist[1]; /* grows from bottom of buf */ + xfs_dir_leaf_hdr_t hdr; /* constant-structure header block */ + xfs_dir_leaf_entry_t entries[1]; /* var sized array */ + xfs_dir_leaf_name_t namelist[1]; /* grows 
from bottom of buf */ } xfs_dir_leafblock_t; -typedef struct xfs_dir_leaf_hdr xfs_dir_leaf_hdr_t; -typedef struct xfs_dir_leaf_map xfs_dir_leaf_map_t; -typedef struct xfs_dir_leaf_entry xfs_dir_leaf_entry_t; -typedef struct xfs_dir_leaf_name xfs_dir_leaf_name_t; /* * Length of name for which a 512-byte block filesystem @@ -126,11 +130,10 @@ typedef union { #define XFS_PUT_COOKIE(c,mp,bno,entry,hash) \ ((c).s.be = XFS_DA_MAKE_BNOENTRY(mp, bno, entry), (c).s.h = (hash)) -typedef struct xfs_dir_put_args -{ +typedef struct xfs_dir_put_args { xfs_dircook_t cook; /* cookie of (next) entry */ xfs_intino_t ino; /* inode number */ - struct xfs_dirent *dbp; /* buffer pointer */ + struct xfs_dirent *dbp; /* buffer pointer */ char *name; /* directory entry name */ int namelen; /* length of name */ int done; /* output: set if value was stored */ @@ -138,7 +141,8 @@ typedef struct xfs_dir_put_args struct uio *uio; /* uio control structure */ } xfs_dir_put_args_t; -#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) xfs_dir_leaf_entsize_byname(len) +#define XFS_DIR_LEAF_ENTSIZE_BYNAME(len) \ + xfs_dir_leaf_entsize_byname(len) static inline int xfs_dir_leaf_entsize_byname(int len) { return (uint)sizeof(xfs_dir_leaf_name_t)-1 + len; diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index 864bf6955689..b4c7f2bc55a0 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h @@ -152,7 +152,7 @@ typedef enum { #define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */ #define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */ -#define DM_FLAGS_ISEM 0x004 /* thread holds i_sem */ +#define DM_FLAGS_IMUX 0x004 /* thread holds i_mutex */ #define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */ #define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */ @@ -161,21 +161,21 @@ typedef enum { */ #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? 
\ - DM_FLAGS_ISEM : 0) -#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM) + DM_FLAGS_IMUX : 0) +#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) #endif #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \ (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,22)) #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_ISEM) -#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_ISEM) + DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_IMUX) +#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) #endif #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,21) #define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - 0 : DM_FLAGS_ISEM) -#define DM_SEM_FLAG_WR (DM_FLAGS_ISEM) + 0 : DM_FLAGS_IMUX) +#define DM_SEM_FLAG_WR (DM_FLAGS_IMUX) #endif diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index d7b6b5d16704..2a21c5024017 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -54,7 +54,6 @@ xfs_error_trap(int e) if (e != xfs_etrap[i]) continue; cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); - debug_stop_all_cpus((void *)-1LL); BUG(); break; } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 06d8a8426c16..26b8e709a569 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -18,9 +18,6 @@ #ifndef __XFS_ERROR_H__ #define __XFS_ERROR_H__ -#define prdev(fmt,targ,args...) 
\ - printk("XFS: device %s - " fmt "\n", XFS_BUFTARG_NAME(targ), ## args) - #define XFS_ERECOVER 1 /* Failure to recover log */ #define XFS_ELOGSTAT 2 /* Failure to stat log in user space */ #define XFS_ENOLOGSPACE 3 /* Reservation too large */ @@ -182,8 +179,11 @@ extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud); struct xfs_mount; /* PRINTFLIKE4 */ extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, - char *fmt, ...); + char *fmt, ...); /* PRINTFLIKE3 */ extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); +#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ + xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) + #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index ba096f80f48d..14010f1fa82f 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -3,15 +3,15 @@ * All Rights Reserved. * * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * GNU Lesser General Public License for more details. 
* - * You should have received a copy of the GNU General Public License + * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -65,6 +65,8 @@ struct fsxattr { #define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ #define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ #define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index d1236d6f4045..163031c1e394 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -540,6 +540,32 @@ xfs_reserve_blocks( return(0); } +void +xfs_fs_log_dummy(xfs_mount_t *mp) +{ + xfs_trans_t *tp; + xfs_inode_t *ip; + + + tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); + atomic_inc(&mp->m_active_trans); + if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { + xfs_trans_cancel(tp, 0); + return; + } + + ip = mp->m_rootip; + xfs_ilock(ip, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_ihold(tp, ip); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_set_sync(tp); + xfs_trans_commit(tp, 0, NULL); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +} + int xfs_fs_goingdown( xfs_mount_t *mp, diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index f32713f14f9a..300d0c9d61ad 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,5 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); +extern void xfs_fs_log_dummy(xfs_mount_t *mp); #endif /* __XFS_FSOPS_H__ */ 
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index fc19eedbd11b..8e380a1fb79b 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -493,7 +493,6 @@ xfs_iget( retry: if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) { - bhv_desc_t *bdp; xfs_inode_t *ip; vp = LINVFS_GET_VP(inode); @@ -517,14 +516,12 @@ retry: * to wait for the inode to go away. */ if (is_bad_inode(inode) || - ((bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), - &xfs_vnodeops)) == NULL)) { + ((ip = xfs_vtoi(vp)) == NULL)) { iput(inode); delay(1); goto retry; } - ip = XFS_BHVTOI(bdp); if (lock_flags != 0) xfs_ilock(ip, lock_flags); XFS_STATS_INC(xs_ig_found); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index df0d4572d70a..1d7f5a7e063e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -404,9 +404,8 @@ xfs_iformat( INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) > INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt dinode %Lu, extent total = %d, nblocks = %Lu." - " Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", (unsigned long long)ip->i_ino, (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)), @@ -418,9 +417,8 @@ xfs_iformat( } if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt dinode %Lu, forkoff = 0x%x." 
- " Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, forkoff = 0x%x.", (unsigned long long)ip->i_ino, (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT))); XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, @@ -451,8 +449,9 @@ xfs_iformat( * no local regular files yet */ if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode (local format for regular file) %Lu. Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu " + "(local format for regular file).", (unsigned long long) ip->i_ino); XFS_CORRUPTION_ERROR("xfs_iformat(4)", XFS_ERRLEVEL_LOW, @@ -462,8 +461,9 @@ xfs_iformat( di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT); if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu (bad size %Ld for local inode). Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu " + "(bad size %Ld for local inode).", (unsigned long long) ip->i_ino, (long long) di_size); XFS_CORRUPTION_ERROR("xfs_iformat(5)", @@ -551,8 +551,9 @@ xfs_iformat_local( * kmem_alloc() or memcpy() below. */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu (bad size %d for local fork, size = %d). Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu " + "(bad size %d for local fork, size = %d).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, @@ -610,8 +611,8 @@ xfs_iformat_extents( * kmem_alloc() or memcpy() below. */ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu ((a)extents = %d). 
Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu ((a)extents = %d).", (unsigned long long) ip->i_ino, nex); XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); @@ -692,8 +693,8 @@ xfs_iformat_btree( || XFS_BMDR_SPACE_CALC(nrecs) > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu (btree). Unmount and run xfs_repair.", + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, ip->i_mount); @@ -809,6 +810,10 @@ _xfs_dic2xflags( flags |= XFS_XFLAG_PROJINHERIT; if (di_flags & XFS_DIFLAG_NOSYMLINKS) flags |= XFS_XFLAG_NOSYMLINKS; + if (di_flags & XFS_DIFLAG_EXTSIZE) + flags |= XFS_XFLAG_EXTSIZE; + if (di_flags & XFS_DIFLAG_EXTSZINHERIT) + flags |= XFS_XFLAG_EXTSZINHERIT; } return flags; @@ -1192,11 +1197,19 @@ xfs_ialloc( if ((mode & S_IFMT) == S_IFDIR) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; - } else { + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } else if ((mode & S_IFMT) == S_IFREG) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) { di_flags |= XFS_DIFLAG_REALTIME; ip->i_iocore.io_flags |= XFS_IOCORE_RT; } + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } } if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && xfs_inherit_noatime) @@ -1262,7 +1275,7 @@ xfs_isize_check( if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) return; - if ( ip->i_d.di_flags & XFS_DIFLAG_REALTIME ) + if (ip->i_d.di_flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_EXTSIZE)) return; nimaps = 2; @@ -1765,22 +1778,19 @@ xfs_igrow_start( xfs_fsize_t new_size, cred_t *credp) { - xfs_fsize_t isize; 
int error; ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); ASSERT(new_size > ip->i_d.di_size); - error = 0; - isize = ip->i_d.di_size; /* * Zero any pages that may have been created by * xfs_write_file() beyond the end of the file * and any blocks between the old and new file sizes. */ - error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize, - new_size); + error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, + ip->i_d.di_size, new_size); return error; } @@ -3355,6 +3365,11 @@ xfs_iflush_int( ip->i_update_core = 0; SYNCHRONIZE(); + /* + * Make sure to get the latest atime from the Linux inode. + */ + xfs_synchronize_atime(ip); + if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 124d30e6143b..1cfbcf18ce86 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -436,6 +436,10 @@ void xfs_ichgtime(xfs_inode_t *, int); xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); void xfs_lock_inodes(xfs_inode_t **, int, int, uint); +xfs_inode_t *xfs_vtoi(struct vnode *vp); + +void xfs_synchronize_atime(xfs_inode_t *); + #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) #ifdef DEBUG diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7f3363c621e1..36aa1fcb90a5 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -271,6 +271,11 @@ xfs_inode_item_format( if (ip->i_update_size) ip->i_update_size = 0; + /* + * Make sure to get the latest atime from the Linux inode. 
+ */ + xfs_synchronize_atime(ip); + vecp->i_addr = (xfs_caddr_t)&ip->i_d; vecp->i_len = sizeof(xfs_dinode_core_t); XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); @@ -603,7 +608,7 @@ xfs_inode_item_trylock( if (iip->ili_pushbuf_flag == 0) { iip->ili_pushbuf_flag = 1; #ifdef DEBUG - iip->ili_push_owner = get_thread_id(); + iip->ili_push_owner = current_pid(); #endif /* * Inode is left locked in shared mode. @@ -782,7 +787,7 @@ xfs_inode_item_pushbuf( * trying to duplicate our effort. */ ASSERT(iip->ili_pushbuf_flag != 0); - ASSERT(iip->ili_push_owner == get_thread_id()); + ASSERT(iip->ili_push_owner == current_pid()); /* * If flushlock isn't locked anymore, chances are that the diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 45a77a3a6c07..788917f355c4 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -40,7 +40,6 @@ #include "xfs_ialloc.h" #include "xfs_btree.h" #include "xfs_bmap.h" -#include "xfs_bit.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_itable.h" @@ -263,7 +262,7 @@ phase2: case BMAPI_WRITE: /* If we found an extent, return it */ if (nimaps && - (imap.br_startblock != HOLESTARTBLOCK) && + (imap.br_startblock != HOLESTARTBLOCK) && (imap.br_startblock != DELAYSTARTBLOCK)) { xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io, offset, count, iomapp, &imap, flags); @@ -318,6 +317,58 @@ out: } STATIC int +xfs_iomap_eof_align_last_fsb( + xfs_mount_t *mp, + xfs_iocore_t *io, + xfs_fsize_t isize, + xfs_extlen_t extsize, + xfs_fileoff_t *last_fsb) +{ + xfs_fileoff_t new_last_fsb = 0; + xfs_extlen_t align; + int eof, error; + + if (io->io_flags & XFS_IOCORE_RT) + ; + /* + * If mounted with the "-o swalloc" option, roundup the allocation + * request to a stripe width boundary if the file size is >= + * stripe width and we are allocating past the allocation eof. 
+ */ + else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && + (isize >= XFS_FSB_TO_B(mp, mp->m_swidth))) + new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); + /* + * Roundup the allocation request to a stripe unit (m_dalign) boundary + * if the file size is >= stripe unit size, and we are allocating past + * the allocation eof. + */ + else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign))) + new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); + + /* + * Always round up the allocation request to an extent boundary + * (when file on a real-time subvolume or has di_extsize hint). + */ + if (extsize) { + if (new_last_fsb) + align = roundup_64(new_last_fsb, extsize); + else + align = extsize; + new_last_fsb = roundup_64(*last_fsb, align); + } + + if (new_last_fsb) { + error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof); + if (error) + return error; + if (eof) + *last_fsb = new_last_fsb; + } + return 0; +} + +STATIC int xfs_flush_space( xfs_inode_t *ip, int *fsynced, @@ -363,19 +414,20 @@ xfs_iomap_write_direct( xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb; xfs_fileoff_t last_fsb; - xfs_filblks_t count_fsb; + xfs_filblks_t count_fsb, resaligned; xfs_fsblock_t firstfsb; + xfs_extlen_t extsz, temp; + xfs_fsize_t isize; int nimaps; - int error; int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imap; xfs_bmap_free_t free_list; - xfs_filblks_t qblocks, resblks; + uint qblocks, resblks, resrtextents; int committed; - int resrtextents; + int error; /* * Make sure that the dquots are there. 
This doesn't hold @@ -385,38 +437,53 @@ xfs_iomap_write_direct( if (error) return XFS_ERROR(error); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - count_fsb = last_fsb - offset_fsb; - if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) { - xfs_fileoff_t map_last_fsb; - - map_last_fsb = ret_imap->br_blockcount + ret_imap->br_startoff; - if (map_last_fsb < last_fsb) { - last_fsb = map_last_fsb; - count_fsb = last_fsb - offset_fsb; - } - ASSERT(count_fsb > 0); + rt = XFS_IS_REALTIME_INODE(ip); + if (unlikely(rt)) { + if (!(extsz = ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + } else { + extsz = ip->i_d.di_extsize; } - /* - * Determine if reserving space on the data or realtime partition. - */ - if ((rt = XFS_IS_REALTIME_INODE(ip))) { - xfs_extlen_t extsz; + isize = ip->i_d.di_size; + if (io->io_new_size > isize) + isize = io->io_new_size; - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - resrtextents = qblocks = (count_fsb + extsz - 1); - do_div(resrtextents, mp->m_sb.sb_rextsize); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + if ((offset + count) > isize) { + error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz, + &last_fsb); + if (error) + goto error_out; } else { - resrtextents = 0; - resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, count_fsb); - quota_flag = XFS_QMOPT_RES_REGBLKS; + if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) + last_fsb = MIN(last_fsb, (xfs_fileoff_t) + ret_imap->br_blockcount + + ret_imap->br_startoff); + } + count_fsb = last_fsb - offset_fsb; + ASSERT(count_fsb > 0); + + resaligned = count_fsb; + if (unlikely(extsz)) { + if ((temp = do_mod(offset_fsb, extsz))) + resaligned += temp; + if ((temp = do_mod(resaligned, extsz))) + resaligned += extsz - temp; } + if (unlikely(rt)) { + resrtextents = 
qblocks = resaligned; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + /* * Allocate and setup the transaction */ @@ -426,7 +493,6 @@ xfs_iomap_write_direct( XFS_WRITE_LOG_RES(mp), resrtextents, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); - /* * Check for running out of space, note: need lock to return */ @@ -436,20 +502,20 @@ xfs_iomap_write_direct( if (error) goto error_out; - if (XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag)) { - error = (EDQUOT); + error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, + qblocks, 0, quota_flag); + if (error) goto error1; - } - bmapi_flag = XFS_BMAPI_WRITE; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); - if (!(flags & BMAPI_MMAP) && (offset < ip->i_d.di_size || rt)) + bmapi_flag = XFS_BMAPI_WRITE; + if ((flags & BMAPI_DIRECT) && (offset < ip->i_d.di_size || extsz)) bmapi_flag |= XFS_BMAPI_PREALLOC; /* - * Issue the bmapi() call to allocate the blocks + * Issue the xfs_bmapi() call to allocate the blocks */ XFS_BMAP_INIT(&free_list, &firstfsb); nimaps = 1; @@ -484,8 +550,10 @@ xfs_iomap_write_direct( "extent-state : %x \n", (ip->i_mount)->m_fsname, (long long)ip->i_ino, - ret_imap->br_startblock, ret_imap->br_startoff, - ret_imap->br_blockcount,ret_imap->br_state); + (unsigned long long)ret_imap->br_startblock, + (unsigned long long)ret_imap->br_startoff, + (unsigned long long)ret_imap->br_blockcount, + ret_imap->br_state); } return 0; @@ -501,6 +569,63 @@ error_out: return XFS_ERROR(error); } +/* + * If the caller is doing a write at the end of the file, + * then extend the allocation out to the file system's write + * iosize. We clean up any extra space left over when the + * file is closed in xfs_inactive(). 
+ * + * For sync writes, we are flushing delayed allocate space to + * try to make additional space available for allocation near + * the filesystem full boundary - preallocation hurts in that + * situation, of course. + */ +STATIC int +xfs_iomap_eof_want_preallocate( + xfs_mount_t *mp, + xfs_iocore_t *io, + xfs_fsize_t isize, + xfs_off_t offset, + size_t count, + int ioflag, + xfs_bmbt_irec_t *imap, + int nimaps, + int *prealloc) +{ + xfs_fileoff_t start_fsb; + xfs_filblks_t count_fsb; + xfs_fsblock_t firstblock; + int n, error, imaps; + + *prealloc = 0; + if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize) + return 0; + + /* + * If there are any real blocks past eof, then don't + * do any speculative allocation. + */ + start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); + count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + while (count_fsb > 0) { + imaps = nimaps; + firstblock = NULLFSBLOCK; + error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, + 0, &firstblock, 0, imap, &imaps, NULL); + if (error) + return error; + for (n = 0; n < imaps; n++) { + if ((imap[n].br_startblock != HOLESTARTBLOCK) && + (imap[n].br_startblock != DELAYSTARTBLOCK)) + return 0; + start_fsb += imap[n].br_blockcount; + count_fsb -= imap[n].br_blockcount; + } + } + *prealloc = 1; + return 0; +} + int xfs_iomap_write_delay( xfs_inode_t *ip, @@ -514,13 +639,15 @@ xfs_iomap_write_delay( xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb; xfs_fileoff_t last_fsb; - xfs_fsize_t isize; + xfs_off_t aligned_offset; + xfs_fileoff_t ioalign; xfs_fsblock_t firstblock; + xfs_extlen_t extsz; + xfs_fsize_t isize; int nimaps; - int error; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int aeof; - int fsynced = 0; + int prealloc, fsynced = 0; + int error; ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); @@ -528,152 +655,57 @@ xfs_iomap_write_delay( * Make sure that the dquots are there. This doesn't hold * the ilock across a disk read. 
*/ - error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED); if (error) return XFS_ERROR(error); + if (XFS_IS_REALTIME_INODE(ip)) { + if (!(extsz = ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + } else { + extsz = ip->i_d.di_extsize; + } + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + retry: isize = ip->i_d.di_size; - if (io->io_new_size > isize) { + if (io->io_new_size > isize) isize = io->io_new_size; - } - aeof = 0; - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - /* - * If the caller is doing a write at the end of the file, - * then extend the allocation (and the buffer used for the write) - * out to the file system's write iosize. We clean up any extra - * space left over when the file is closed in xfs_inactive(). - * - * For sync writes, we are flushing delayed allocate space to - * try to make additional space available for allocation near - * the filesystem full boundary - preallocation hurts in that - * situation, of course. - */ - if (!(ioflag & BMAPI_SYNC) && ((offset + count) > ip->i_d.di_size)) { - xfs_off_t aligned_offset; - xfs_filblks_t count_fsb; - unsigned int iosize; - xfs_fileoff_t ioalign; - int n; - xfs_fileoff_t start_fsb; + error = xfs_iomap_eof_want_preallocate(mp, io, isize, offset, count, + ioflag, imap, XFS_WRITE_IMAPS, &prealloc); + if (error) + return error; - /* - * If there are any real blocks past eof, then don't - * do any speculative allocation. 
- */ - start_fsb = XFS_B_TO_FSBT(mp, - ((xfs_ufsize_t)(offset + count - 1))); - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); - while (count_fsb > 0) { - nimaps = XFS_WRITE_IMAPS; - error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, - 0, &firstblock, 0, imap, &nimaps, NULL); - if (error) { - return error; - } - for (n = 0; n < nimaps; n++) { - if ( !(io->io_flags & XFS_IOCORE_RT) && - !imap[n].br_startblock) { - cmn_err(CE_PANIC,"Access to block " - "zero: fs <%s> inode: %lld " - "start_block : %llx start_off " - ": %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - imap[n].br_startblock, - imap[n].br_startoff, - imap[n].br_blockcount, - imap[n].br_state); - } - if ((imap[n].br_startblock != HOLESTARTBLOCK) && - (imap[n].br_startblock != DELAYSTARTBLOCK)) { - goto write_map; - } - start_fsb += imap[n].br_blockcount; - count_fsb -= imap[n].br_blockcount; - } - } - iosize = mp->m_writeio_blocks; + if (prealloc) { aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + iosize; - aeof = 1; + last_fsb = ioalign + mp->m_writeio_blocks; + } else { + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); } -write_map: - nimaps = XFS_WRITE_IMAPS; - firstblock = NULLFSBLOCK; - /* - * If mounted with the "-o swalloc" option, roundup the allocation - * request to a stripe width boundary if the file size is >= - * stripe width and we are allocating past the allocation eof. 
- */ - if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_swidth - && (mp->m_flags & XFS_MOUNT_SWALLOC) - && (isize >= XFS_FSB_TO_B(mp, mp->m_swidth)) && aeof) { - int eof; - xfs_fileoff_t new_last_fsb; - - new_last_fsb = roundup_64(last_fsb, mp->m_swidth); - error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); - if (error) { - return error; - } - if (eof) { - last_fsb = new_last_fsb; - } - /* - * Roundup the allocation request to a stripe unit (m_dalign) boundary - * if the file size is >= stripe unit size, and we are allocating past - * the allocation eof. - */ - } else if (!(io->io_flags & XFS_IOCORE_RT) && mp->m_dalign && - (isize >= XFS_FSB_TO_B(mp, mp->m_dalign)) && aeof) { - int eof; - xfs_fileoff_t new_last_fsb; - new_last_fsb = roundup_64(last_fsb, mp->m_dalign); - error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); - if (error) { - return error; - } - if (eof) { - last_fsb = new_last_fsb; - } - /* - * Round up the allocation request to a real-time extent boundary - * if the file is on the real-time subvolume. - */ - } else if (io->io_flags & XFS_IOCORE_RT && aeof) { - int eof; - xfs_fileoff_t new_last_fsb; - - new_last_fsb = roundup_64(last_fsb, mp->m_sb.sb_rextsize); - error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof); - if (error) { + if (prealloc || extsz) { + error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz, + &last_fsb); + if (error) return error; - } - if (eof) - last_fsb = new_last_fsb; } + + nimaps = XFS_WRITE_IMAPS; + firstblock = NULLFSBLOCK; error = xfs_bmapi(NULL, ip, offset_fsb, (xfs_filblks_t)(last_fsb - offset_fsb), XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | XFS_BMAPI_ENTIRE, &firstblock, 1, imap, &nimaps, NULL); - /* - * This can be EDQUOT, if nimaps == 0 - */ - if (error && (error != ENOSPC)) { + if (error && (error != ENOSPC)) return XFS_ERROR(error); - } + /* * If bmapi returned us nothing, and if we didn't get back EDQUOT, - * then we must have run out of space. 
+ * then we must have run out of space - flush delalloc, and retry.. */ if (nimaps == 0) { xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, @@ -685,17 +717,21 @@ write_map: goto retry; } - *ret_imap = imap[0]; - *nmaps = 1; - if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { + if (!(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " "start_block : %llx start_off : %llx blkcnt : %llx " "extent-state : %x \n", (ip->i_mount)->m_fsname, (long long)ip->i_ino, - ret_imap->br_startblock, ret_imap->br_startoff, - ret_imap->br_blockcount,ret_imap->br_state); + (unsigned long long)ret_imap->br_startblock, + (unsigned long long)ret_imap->br_startoff, + (unsigned long long)ret_imap->br_blockcount, + ret_imap->br_state); } + + *ret_imap = imap[0]; + *nmaps = 1; + return 0; } @@ -821,17 +857,21 @@ xfs_iomap_write_allocate( */ for (i = 0; i < nimaps; i++) { - if ( !(io->io_flags & XFS_IOCORE_RT) && - !imap[i].br_startblock) { + if (!(io->io_flags & XFS_IOCORE_RT) && + !imap[i].br_startblock) { cmn_err(CE_PANIC,"Access to block zero: " "fs <%s> inode: %lld " - "start_block : %llx start_off : %llx " + "start_block : %llx start_off : %llx " "blkcnt : %llx extent-state : %x \n", (ip->i_mount)->m_fsname, (long long)ip->i_ino, - imap[i].br_startblock, - imap[i].br_startoff, - imap[i].br_blockcount,imap[i].br_state); + (unsigned long long) + imap[i].br_startblock, + (unsigned long long) + imap[i].br_startoff, + (unsigned long long) + imap[i].br_blockcount, + imap[i].br_state); } if ((offset_fsb >= imap[i].br_startoff) && (offset_fsb < (imap[i].br_startoff + @@ -868,17 +908,17 @@ xfs_iomap_write_unwritten( { xfs_mount_t *mp = ip->i_mount; xfs_iocore_t *io = &ip->i_iocore; - xfs_trans_t *tp; xfs_fileoff_t offset_fsb; xfs_filblks_t count_fsb; xfs_filblks_t numblks_fsb; - xfs_bmbt_irec_t imap; + xfs_fsblock_t firstfsb; + int nimaps; + xfs_trans_t *tp; + xfs_bmbt_irec_t imap; + xfs_bmap_free_t free_list; 
+ uint resblks; int committed; int error; - int nres; - int nimaps; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, &ip->i_iocore, offset, count); @@ -887,9 +927,9 @@ xfs_iomap_write_unwritten( count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); - do { - nres = XFS_DIOSTRAT_SPACE_RES(mp, 0); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + do { /* * set up a transaction to convert the range of extents * from unwritten to real. Do allocations in a loop until @@ -897,7 +937,7 @@ xfs_iomap_write_unwritten( */ tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); - error = xfs_trans_reserve(tp, nres, + error = xfs_trans_reserve(tp, resblks, XFS_WRITE_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); @@ -916,7 +956,7 @@ xfs_iomap_write_unwritten( XFS_BMAP_INIT(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_WRITE, &firstfsb, + XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, 1, &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; @@ -930,15 +970,17 @@ xfs_iomap_write_unwritten( xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) goto error0; - + if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) { cmn_err(CE_PANIC,"Access to block zero: fs <%s> " "inode: %lld start_block : %llx start_off : " "%llx blkcnt : %llx extent-state : %x \n", (ip->i_mount)->m_fsname, (long long)ip->i_ino, - imap.br_startblock,imap.br_startoff, - imap.br_blockcount,imap.br_state); + (unsigned long long)imap.br_startblock, + (unsigned long long)imap.br_startoff, + (unsigned long long)imap.br_blockcount, + imap.br_state); } if ((numblks_fsb = imap.br_blockcount) == 0) { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f63646ead816..c59450e1be40 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -56,6 +56,7 @@ xfs_bulkstat_one_iget( { xfs_dinode_core_t *dic; /* dinode core info pointer */ 
xfs_inode_t *ip; /* incore inode pointer */ + vnode_t *vp; int error; error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno); @@ -72,6 +73,7 @@ xfs_bulkstat_one_iget( goto out_iput; } + vp = XFS_ITOV(ip); dic = &ip->i_d; /* xfs_iget returns the following without needing @@ -84,8 +86,7 @@ xfs_bulkstat_one_iget( buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - buf->bs_atime.tv_sec = dic->di_atime.t_sec; - buf->bs_atime.tv_nsec = dic->di_atime.t_nsec; + vn_atime_to_bstime(vp, &buf->bs_atime); buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 29af51275ca9..3d9a36e77363 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -178,6 +178,83 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state) #define xlog_trace_iclog(iclog,state) #endif /* XFS_LOG_TRACE */ + +static void +xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) +{ + if (*qp) { + tic->t_next = (*qp); + tic->t_prev = (*qp)->t_prev; + (*qp)->t_prev->t_next = tic; + (*qp)->t_prev = tic; + } else { + tic->t_prev = tic->t_next = tic; + *qp = tic; + } + + tic->t_flags |= XLOG_TIC_IN_Q; +} + +static void +xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) +{ + if (tic == tic->t_next) { + *qp = NULL; + } else { + *qp = tic->t_next; + tic->t_next->t_prev = tic->t_prev; + tic->t_prev->t_next = tic->t_next; + } + + tic->t_next = tic->t_prev = NULL; + tic->t_flags &= ~XLOG_TIC_IN_Q; +} + +static void +xlog_grant_sub_space(struct log *log, int bytes) +{ + log->l_grant_write_bytes -= bytes; + if (log->l_grant_write_bytes < 0) { + log->l_grant_write_bytes += log->l_logsize; + log->l_grant_write_cycle--; + } + + log->l_grant_reserve_bytes -= bytes; + if ((log)->l_grant_reserve_bytes < 0) { + log->l_grant_reserve_bytes += log->l_logsize; + log->l_grant_reserve_cycle--; + } + +} + +static void 
+xlog_grant_add_space_write(struct log *log, int bytes) +{ + log->l_grant_write_bytes += bytes; + if (log->l_grant_write_bytes > log->l_logsize) { + log->l_grant_write_bytes -= log->l_logsize; + log->l_grant_write_cycle++; + } +} + +static void +xlog_grant_add_space_reserve(struct log *log, int bytes) +{ + log->l_grant_reserve_bytes += bytes; + if (log->l_grant_reserve_bytes > log->l_logsize) { + log->l_grant_reserve_bytes -= log->l_logsize; + log->l_grant_reserve_cycle++; + } +} + +static inline void +xlog_grant_add_space(struct log *log, int bytes) +{ + xlog_grant_add_space_write(log, bytes); + xlog_grant_add_space_reserve(log, bytes); +} + + /* * NOTES: * @@ -428,7 +505,7 @@ xfs_log_mount(xfs_mount_t *mp, if (readonly) vfsp->vfs_flag &= ~VFS_RDONLY; - error = xlog_recover(mp->m_log, readonly); + error = xlog_recover(mp->m_log); if (readonly) vfsp->vfs_flag |= VFS_RDONLY; @@ -1320,8 +1397,7 @@ xlog_sync(xlog_t *log, /* move grant heads by roundoff in sync */ s = GRANT_LOCK(log); - XLOG_GRANT_ADD_SPACE(log, roundoff, 'w'); - XLOG_GRANT_ADD_SPACE(log, roundoff, 'r'); + xlog_grant_add_space(log, roundoff); GRANT_UNLOCK(log, s); /* put cycle number in every block */ @@ -1515,7 +1591,6 @@ xlog_state_finish_copy(xlog_t *log, * print out info relating to regions written which consume * the reservation */ -#if defined(XFS_LOG_RES_DEBUG) STATIC void xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) { @@ -1605,11 +1680,11 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) ticket->t_res_arr_sum, ticket->t_res_o_flow, ticket->t_res_num_ophdrs, ophdr_spc, ticket->t_res_arr_sum + - ticket->t_res_o_flow + ophdr_spc, + ticket->t_res_o_flow + ophdr_spc, ticket->t_res_num); for (i = 0; i < ticket->t_res_num; i++) { - uint r_type = ticket->t_res_arr[i].r_type; + uint r_type = ticket->t_res_arr[i].r_type; cmn_err(CE_WARN, "region[%u]: %s - %u bytes\n", i, @@ -1618,9 +1693,6 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) ticket->t_res_arr[i].r_len); 
} } -#else -#define xlog_print_tic_res(mp, ticket) -#endif /* * Write some region out to in-core log @@ -2389,7 +2461,7 @@ xlog_grant_log_space(xlog_t *log, /* something is already sleeping; insert new transaction at end */ if (log->l_reserve_headq) { - XLOG_INS_TICKETQ(log->l_reserve_headq, tic); + xlog_ins_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 1"); /* @@ -2422,7 +2494,7 @@ redo: log->l_grant_reserve_bytes); if (free_bytes < need_bytes) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - XLOG_INS_TICKETQ(log->l_reserve_headq, tic); + xlog_ins_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: sleep 2"); XFS_STATS_INC(xs_sleep_logspace); @@ -2439,11 +2511,10 @@ redo: s = GRANT_LOCK(log); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) - XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); + xlog_del_ticketq(&log->l_reserve_headq, tic); /* we've got enough space */ - XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); - XLOG_GRANT_ADD_SPACE(log, need_bytes, 'r'); + xlog_grant_add_space(log, need_bytes); #ifdef DEBUG tail_lsn = log->l_tail_lsn; /* @@ -2464,7 +2535,7 @@ redo: error_return: if (tic->t_flags & XLOG_TIC_IN_Q) - XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); + xlog_del_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret"); /* * If we are failing, make sure the ticket doesn't have any @@ -2533,7 +2604,7 @@ xlog_regrant_write_log_space(xlog_t *log, if (ntic != log->l_write_headq) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - XLOG_INS_TICKETQ(log->l_write_headq, tic); + xlog_ins_ticketq(&log->l_write_headq, tic); xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: sleep 1"); @@ -2565,7 +2636,7 @@ redo: log->l_grant_write_bytes); if (free_bytes < need_bytes) { if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - XLOG_INS_TICKETQ(log->l_write_headq, tic); + xlog_ins_ticketq(&log->l_write_headq, tic); XFS_STATS_INC(xs_sleep_logspace); 
sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); @@ -2581,9 +2652,10 @@ redo: s = GRANT_LOCK(log); goto redo; } else if (tic->t_flags & XLOG_TIC_IN_Q) - XLOG_DEL_TICKETQ(log->l_write_headq, tic); + xlog_del_ticketq(&log->l_write_headq, tic); - XLOG_GRANT_ADD_SPACE(log, need_bytes, 'w'); /* we've got enough space */ + /* we've got enough space */ + xlog_grant_add_space_write(log, need_bytes); #ifdef DEBUG tail_lsn = log->l_tail_lsn; if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { @@ -2600,7 +2672,7 @@ redo: error_return: if (tic->t_flags & XLOG_TIC_IN_Q) - XLOG_DEL_TICKETQ(log->l_reserve_headq, tic); + xlog_del_ticketq(&log->l_reserve_headq, tic); xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret"); /* * If we are failing, make sure the ticket doesn't have any @@ -2633,8 +2705,7 @@ xlog_regrant_reserve_log_space(xlog_t *log, ticket->t_cnt--; s = GRANT_LOCK(log); - XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w'); - XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r'); + xlog_grant_sub_space(log, ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; XLOG_TIC_RESET_RES(ticket); xlog_trace_loggrant(log, ticket, @@ -2647,7 +2718,7 @@ xlog_regrant_reserve_log_space(xlog_t *log, return; } - XLOG_GRANT_ADD_SPACE(log, ticket->t_unit_res, 'r'); + xlog_grant_add_space_reserve(log, ticket->t_unit_res); xlog_trace_loggrant(log, ticket, "xlog_regrant_reserve_log_space: exit"); xlog_verify_grant_head(log, 0); @@ -2683,8 +2754,7 @@ xlog_ungrant_log_space(xlog_t *log, s = GRANT_LOCK(log); xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); - XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w'); - XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r'); + xlog_grant_sub_space(log, ticket->t_curr_res); xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current"); @@ -2693,8 +2763,7 @@ xlog_ungrant_log_space(xlog_t *log, */ if (ticket->t_cnt > 0) { ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); - 
XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'w'); - XLOG_GRANT_SUB_SPACE(log, ticket->t_unit_res*ticket->t_cnt,'r'); + xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); } xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 158829ca56f6..4b2ac88dbb83 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -30,13 +30,7 @@ * By comparing each compnent, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number */ -static -#if defined(__GNUC__) && (__GNUC__ == 2) && ( (__GNUC_MINOR__ == 95) || (__GNUC_MINOR__ == 96)) -__attribute__((unused)) /* gcc 2.95, 2.96 miscompile this when inlined */ -#else -__inline__ -#endif -xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) +static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) { if (CYCLE_LSN(lsn1) != CYCLE_LSN(lsn2)) return (CYCLE_LSN(lsn1)<CYCLE_LSN(lsn2))? -999 : 999; @@ -102,7 +96,6 @@ xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) /* Region types for iovec's i_type */ -#if defined(XFS_LOG_RES_DEBUG) #define XLOG_REG_TYPE_BFORMAT 1 #define XLOG_REG_TYPE_BCHUNK 2 #define XLOG_REG_TYPE_EFI_FORMAT 3 @@ -123,21 +116,13 @@ xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XLOG_REG_TYPE_COMMIT 18 #define XLOG_REG_TYPE_TRANSHDR 19 #define XLOG_REG_TYPE_MAX 19 -#endif -#if defined(XFS_LOG_RES_DEBUG) #define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t)) -#else -#define XLOG_VEC_SET_TYPE(vecp, t) -#endif - typedef struct xfs_log_iovec { xfs_caddr_t i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ -#if defined(XFS_LOG_RES_DEBUG) - uint i_type; /* type of region */ -#endif + uint i_type; /* type of region */ } xfs_log_iovec_t; typedef void* xfs_log_ticket_t; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 4518b188ade6..34bcbf50789c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -253,7 +253,6 @@ 
typedef __uint32_t xlog_tid_t; /* Ticket reservation region accounting */ -#if defined(XFS_LOG_RES_DEBUG) #define XLOG_TIC_LEN_MAX 15 #define XLOG_TIC_RESET_RES(t) ((t)->t_res_num = \ (t)->t_res_arr_sum = (t)->t_res_num_ophdrs = 0) @@ -278,15 +277,9 @@ typedef __uint32_t xlog_tid_t; * we don't care about. */ typedef struct xlog_res { - uint r_len; - uint r_type; + uint r_len; /* region length :4 */ + uint r_type; /* region's transaction type :4 */ } xlog_res_t; -#else -#define XLOG_TIC_RESET_RES(t) -#define XLOG_TIC_ADD_OPHDR(t) -#define XLOG_TIC_ADD_REGION(t, len, type) -#endif - typedef struct xlog_ticket { sv_t t_sema; /* sleep on this semaphore : 20 */ @@ -301,14 +294,12 @@ typedef struct xlog_ticket { char t_flags; /* properties of reservation : 1 */ uint t_trans_type; /* transaction type : 4 */ -#if defined (XFS_LOG_RES_DEBUG) /* reservation array fields */ uint t_res_num; /* num in array : 4 */ - xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : X */ uint t_res_num_ophdrs; /* num op hdrs : 4 */ uint t_res_arr_sum; /* array sum : 4 */ uint t_res_o_flow; /* sum overflow : 4 */ -#endif + xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ } xlog_ticket_t; #endif @@ -494,71 +485,13 @@ typedef struct log { #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) -#define XLOG_GRANT_SUB_SPACE(log,bytes,type) \ - { \ - if (type == 'w') { \ - (log)->l_grant_write_bytes -= (bytes); \ - if ((log)->l_grant_write_bytes < 0) { \ - (log)->l_grant_write_bytes += (log)->l_logsize; \ - (log)->l_grant_write_cycle--; \ - } \ - } else { \ - (log)->l_grant_reserve_bytes -= (bytes); \ - if ((log)->l_grant_reserve_bytes < 0) { \ - (log)->l_grant_reserve_bytes += (log)->l_logsize;\ - (log)->l_grant_reserve_cycle--; \ - } \ - } \ - } -#define XLOG_GRANT_ADD_SPACE(log,bytes,type) \ - { \ - if (type == 'w') { \ - (log)->l_grant_write_bytes += (bytes); \ - if ((log)->l_grant_write_bytes > (log)->l_logsize) { \ - (log)->l_grant_write_bytes -= 
(log)->l_logsize; \ - (log)->l_grant_write_cycle++; \ - } \ - } else { \ - (log)->l_grant_reserve_bytes += (bytes); \ - if ((log)->l_grant_reserve_bytes > (log)->l_logsize) { \ - (log)->l_grant_reserve_bytes -= (log)->l_logsize;\ - (log)->l_grant_reserve_cycle++; \ - } \ - } \ - } -#define XLOG_INS_TICKETQ(q, tic) \ - { \ - if (q) { \ - (tic)->t_next = (q); \ - (tic)->t_prev = (q)->t_prev; \ - (q)->t_prev->t_next = (tic); \ - (q)->t_prev = (tic); \ - } else { \ - (tic)->t_prev = (tic)->t_next = (tic); \ - (q) = (tic); \ - } \ - (tic)->t_flags |= XLOG_TIC_IN_Q; \ - } -#define XLOG_DEL_TICKETQ(q, tic) \ - { \ - if ((tic) == (tic)->t_next) { \ - (q) = NULL; \ - } else { \ - (q) = (tic)->t_next; \ - (tic)->t_next->t_prev = (tic)->t_prev; \ - (tic)->t_prev->t_next = (tic)->t_next; \ - } \ - (tic)->t_next = (tic)->t_prev = NULL; \ - (tic)->t_flags &= ~XLOG_TIC_IN_Q; \ - } /* common routines */ extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); extern int xlog_find_tail(xlog_t *log, xfs_daddr_t *head_blk, - xfs_daddr_t *tail_blk, - int readonly); -extern int xlog_recover(xlog_t *log, int readonly); + xfs_daddr_t *tail_blk); +extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log, int mfsi_flags); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); extern void xlog_recover_process_iunlinks(xlog_t *log); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8ab7df768063..7d46cbd6a07a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -783,8 +783,7 @@ int xlog_find_tail( xlog_t *log, xfs_daddr_t *head_blk, - xfs_daddr_t *tail_blk, - int readonly) + xfs_daddr_t *tail_blk) { xlog_rec_header_t *rhead; xlog_op_header_t *op_head; @@ -2563,10 +2562,12 @@ xlog_recover_do_quotaoff_trans( /* * The logitem format's flag tells us if this was user quotaoff, - * group quotaoff or both. + * group/project quotaoff or both. 
*/ if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) log->l_quotaoffs_flag |= XFS_DQ_USER; + if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) + log->l_quotaoffs_flag |= XFS_DQ_PROJ; if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) log->l_quotaoffs_flag |= XFS_DQ_GROUP; @@ -3890,14 +3891,13 @@ xlog_do_recover( */ int xlog_recover( - xlog_t *log, - int readonly) + xlog_t *log) { xfs_daddr_t head_blk, tail_blk; int error; /* find the tail of the log */ - if ((error = xlog_find_tail(log, &head_blk, &tail_blk, readonly))) + if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) return error; if (tail_blk != head_blk) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 541d5dd474be..6088e14f84e3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -51,7 +51,7 @@ STATIC int xfs_uuid_mount(xfs_mount_t *); STATIC void xfs_uuid_unmount(xfs_mount_t *mp); STATIC void xfs_unmountfs_wait(xfs_mount_t *); -static struct { +static const struct { short offset; short type; /* 0 = integer * 1 = binary / string (no translation) @@ -117,7 +117,7 @@ xfs_mount_init(void) AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail"); spinlock_init(&mp->m_sb_lock, "xfs_sb"); - mutex_init(&mp->m_ilock, MUTEX_DEFAULT, "xfs_ilock"); + mutex_init(&mp->m_ilock); initnsema(&mp->m_growlock, 1, "xfs_grow"); /* * Initialize the AIL. 
@@ -1077,8 +1077,7 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) xfs_iflush_all(mp); - XFS_QM_DQPURGEALL(mp, - XFS_QMOPT_UQUOTA | XFS_QMOPT_GQUOTA | XFS_QMOPT_UMOUNTING); + XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); /* * Flush out the log synchronously so that we know for sure diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 08b2e0a5d807..cd3cf9613a00 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -308,7 +308,6 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ -#define m_dev m_ddev_targp->pbr_dev __uint8_t m_dircook_elog; /* log d-cookie entry bits */ __uint8_t m_blkbit_log; /* blocklog + NBBY */ __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ @@ -393,7 +392,7 @@ typedef struct xfs_mount { user */ #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment allocations */ -#define XFS_MOUNT_COMPAT_ATTR (1ULL << 8) /* do not use attr2 format */ +#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ /* (1ULL << 9) -- currently unused */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ #define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */ @@ -533,7 +532,7 @@ typedef struct xfs_mod_sb { int msb_delta; /* Change to make to specified field */ } xfs_mod_sb_t; -#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock), PINOD) +#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) #define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) #define XFS_SB_LOCK(mp) mutex_spinlock(&(mp)->m_sb_lock) #define XFS_SB_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_sb_lock,(s)) diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 4d4e8f4e768e..81a05cfd77d2 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -243,7 +243,6 @@ xfs_rename( xfs_inode_t *inodes[4]; int target_ip_dropped = 0; /* dropped target_ip link? 
*/ vnode_t *src_dir_vp; - bhv_desc_t *target_dir_bdp; int spaceres; int target_link_zero = 0; int num_inodes; @@ -260,14 +259,12 @@ xfs_rename( * Find the XFS behavior descriptor for the target directory * vnode since it was not handed to us. */ - target_dir_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(target_dir_vp), - &xfs_vnodeops); - if (target_dir_bdp == NULL) { + target_dp = xfs_vtoi(target_dir_vp); + if (target_dp == NULL) { return XFS_ERROR(EXDEV); } src_dp = XFS_BHVTOI(src_dir_bdp); - target_dp = XFS_BHVTOI(target_dir_bdp); mp = src_dp->i_mount; if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) || diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index c4b20872f07d..a59c102cf214 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -238,6 +238,7 @@ xfs_bioerror_relse( } return (EIO); } + /* * Prints out an ALERT message about I/O error. */ @@ -252,11 +253,9 @@ xfs_ioerror_alert( "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" " (\"%s\") error %d buf count %zd", (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname, - XFS_BUFTARG_NAME(bp->pb_target), - (__uint64_t)blkno, - func, - XFS_BUF_GETERROR(bp), - XFS_BUF_COUNT(bp)); + XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), + (__uint64_t)blkno, func, + XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); } /* diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index 4a17d335f897..bf168a91ddb8 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -68,18 +68,6 @@ struct xfs_mount; (XFS_SB_VERSION_NUMBITS | \ XFS_SB_VERSION_OKREALFBITS | \ XFS_SB_VERSION_OKSASHFBITS) -#define XFS_SB_VERSION_MKFS(ia,dia,extflag,dirv2,na,sflag,morebits) \ - (((ia) || (dia) || (extflag) || (dirv2) || (na) || (sflag) || \ - (morebits)) ? \ - (XFS_SB_VERSION_4 | \ - ((ia) ? XFS_SB_VERSION_ALIGNBIT : 0) | \ - ((dia) ? XFS_SB_VERSION_DALIGNBIT : 0) | \ - ((extflag) ? XFS_SB_VERSION_EXTFLGBIT : 0) | \ - ((dirv2) ? XFS_SB_VERSION_DIRV2BIT : 0) | \ - ((na) ? XFS_SB_VERSION_LOGV2BIT : 0) | \ - ((sflag) ? 
XFS_SB_VERSION_SECTORBIT : 0) | \ - ((morebits) ? XFS_SB_VERSION_MOREBITSBIT : 0)) : \ - XFS_SB_VERSION_1) /* * There are two words to hold XFS "feature" bits: the original @@ -105,11 +93,6 @@ struct xfs_mount; (XFS_SB_VERSION2_OKREALFBITS | \ XFS_SB_VERSION2_OKSASHFBITS ) -/* - * mkfs macro to set up sb_features2 word - */ -#define XFS_SB_VERSION2_MKFS(resvd1, sbcntr) 0 - typedef struct xfs_sb { __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 279e043d7323..d3d714e6b32a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1014,6 +1014,7 @@ xfs_trans_cancel( xfs_log_item_t *lip; int i; #endif + xfs_mount_t *mp = tp->t_mountp; /* * See if the caller is being too lazy to figure out if @@ -1026,9 +1027,10 @@ xfs_trans_cancel( * filesystem. This happens in paths where we detect * corruption and decide to give up. */ - if ((tp->t_flags & XFS_TRANS_DIRTY) && - !XFS_FORCED_SHUTDOWN(tp->t_mountp)) - xfs_force_shutdown(tp->t_mountp, XFS_CORRUPT_INCORE); + if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { + XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); + xfs_force_shutdown(mp, XFS_CORRUPT_INCORE); + } #ifdef DEBUG if (!(flags & XFS_TRANS_ABORT)) { licp = &(tp->t_items); @@ -1040,7 +1042,7 @@ xfs_trans_cancel( } lip = lidp->lid_item; - if (!XFS_FORCED_SHUTDOWN(tp->t_mountp)) + if (!XFS_FORCED_SHUTDOWN(mp)) ASSERT(!(lip->li_type == XFS_LI_EFD)); } licp = licp->lic_next; @@ -1048,7 +1050,7 @@ xfs_trans_cancel( } #endif xfs_trans_unreserve_and_mod_sb(tp); - XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp); + XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp); if (tp->t_ticket) { if (flags & XFS_TRANS_RELEASE_LOG_RES) { @@ -1057,7 +1059,7 @@ xfs_trans_cancel( } else { log_flags = 0; } - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); } /* mark this thread as no longer being in a transaction */ diff --git 
a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index a889963fdd14..d77901c07f63 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -973,7 +973,6 @@ void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); -void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index fefe1d60377f..34654ec6ae10 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -55,16 +55,13 @@ xfs_get_dir_entry( xfs_inode_t **ipp) { vnode_t *vp; - bhv_desc_t *bdp; vp = VNAME_TO_VNODE(dentry); - bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(vp), &xfs_vnodeops); - if (!bdp) { - *ipp = NULL; + + *ipp = xfs_vtoi(vp); + if (!*ipp) return XFS_ERROR(ENOENT); - } VN_HOLD(vp); - *ipp = XFS_BHVTOI(bdp); return 0; } diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 7bdbd991ab1c..b6ad370fab3d 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -53,6 +53,7 @@ #include "xfs_acl.h" #include "xfs_attr.h" #include "xfs_clnt.h" +#include "xfs_fsops.h" STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); @@ -290,8 +291,8 @@ xfs_start_flags( mp->m_flags |= XFS_MOUNT_IDELETE; if (ap->flags & XFSMNT_DIRSYNC) mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (ap->flags & XFSMNT_COMPAT_ATTR) - mp->m_flags |= XFS_MOUNT_COMPAT_ATTR; + if (ap->flags & XFSMNT_ATTR2) + mp->m_flags |= XFS_MOUNT_ATTR2; if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; @@ -312,6 +313,8 @@ xfs_start_flags( mp->m_flags |= XFS_MOUNT_NOUUID; if (ap->flags & XFSMNT_BARRIER) mp->m_flags |= XFS_MOUNT_BARRIER; + else + mp->m_flags &= ~XFS_MOUNT_BARRIER; return 0; } @@ -330,10 +333,11 @@ 
xfs_finish_flags( /* Fail a mount where the logbuf is smaller then the log stripe */ if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) { - if ((ap->logbufsize == -1) && + if ((ap->logbufsize <= 0) && (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { mp->m_logbsize = mp->m_sb.sb_logsunit; - } else if (ap->logbufsize < mp->m_sb.sb_logsunit) { + } else if (ap->logbufsize > 0 && + ap->logbufsize < mp->m_sb.sb_logsunit) { cmn_err(CE_WARN, "XFS: logbuf size must be greater than or equal to log stripe size"); return XFS_ERROR(EINVAL); @@ -347,6 +351,10 @@ xfs_finish_flags( } } + if (XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { + mp->m_flags |= XFS_MOUNT_ATTR2; + } + /* * prohibit r/w mounts of read-only filesystems */ @@ -382,10 +390,6 @@ xfs_finish_flags( return XFS_ERROR(EINVAL); } - if (XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { - mp->m_flags &= ~XFS_MOUNT_COMPAT_ATTR; - } - return 0; } @@ -504,13 +508,13 @@ xfs_mount( if (error) goto error2; + if ((mp->m_flags & XFS_MOUNT_BARRIER) && !(vfsp->vfs_flag & VFS_RDONLY)) + xfs_mountfs_check_barriers(mp); + error = XFS_IOINIT(vfsp, args, flags); if (error) goto error2; - if ((args->flags & XFSMNT_BARRIER) && - !(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)) - xfs_mountfs_check_barriers(mp); return 0; error2: @@ -655,6 +659,11 @@ xfs_mntupdate( else mp->m_flags &= ~XFS_MOUNT_NOATIME; + if (args->flags & XFSMNT_BARRIER) + mp->m_flags |= XFS_MOUNT_BARRIER; + else + mp->m_flags &= ~XFS_MOUNT_BARRIER; + if ((vfsp->vfs_flag & VFS_RDONLY) && !(*flags & MS_RDONLY)) { vfsp->vfs_flag &= ~VFS_RDONLY; @@ -1634,6 +1643,7 @@ xfs_vget( #define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ #define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and * unwritten extent conversion */ +#define MNTOPT_NOBARRIER "nobarrier" /* .. 
disable */ #define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ #define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ #define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ @@ -1680,7 +1690,6 @@ xfs_parseargs( int iosize; args->flags2 |= XFSMNT2_COMPAT_IOSIZE; - args->flags |= XFSMNT_COMPAT_ATTR; #if 0 /* XXX: off by default, until some remaining issues ironed out */ args->flags |= XFSMNT_IDELETE; /* default to on */ @@ -1806,6 +1815,8 @@ xfs_parseargs( args->flags |= XFSMNT_NOUUID; } else if (!strcmp(this_char, MNTOPT_BARRIER)) { args->flags |= XFSMNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { + args->flags &= ~XFSMNT_BARRIER; } else if (!strcmp(this_char, MNTOPT_IKEEP)) { args->flags &= ~XFSMNT_IDELETE; } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { @@ -1815,9 +1826,9 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { args->flags2 |= XFSMNT2_COMPAT_IOSIZE; } else if (!strcmp(this_char, MNTOPT_ATTR2)) { - args->flags &= ~XFSMNT_COMPAT_ATTR; + args->flags |= XFSMNT_ATTR2; } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { - args->flags |= XFSMNT_COMPAT_ATTR; + args->flags &= ~XFSMNT_ATTR2; } else if (!strcmp(this_char, "osyncisdsync")) { /* no-op, this is now the default */ printk("XFS: osyncisdsync is now the default, option is deprecated.\n"); @@ -1892,7 +1903,6 @@ xfs_showargs( { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, - { XFS_MOUNT_BARRIER, "," MNTOPT_BARRIER }, { XFS_MOUNT_IDELETE, "," MNTOPT_NOIKEEP }, { 0, NULL } }; @@ -1914,33 +1924,28 @@ xfs_showargs( if (mp->m_logbufs > 0) seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); - if (mp->m_logbsize > 0) seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); if (mp->m_logname) seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); - if (mp->m_rtname) seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); if 
(mp->m_dalign > 0) seq_printf(m, "," MNTOPT_SUNIT "=%d", (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); - if (mp->m_swidth > 0) seq_printf(m, "," MNTOPT_SWIDTH "=%d", (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); - if (!(mp->m_flags & XFS_MOUNT_COMPAT_ATTR)) - seq_printf(m, "," MNTOPT_ATTR2); - if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)) seq_printf(m, "," MNTOPT_LARGEIO); + if (mp->m_flags & XFS_MOUNT_BARRIER) + seq_printf(m, "," MNTOPT_BARRIER); if (!(vfsp->vfs_flag & VFS_32BITINODES)) seq_printf(m, "," MNTOPT_64BITINODE); - if (vfsp->vfs_flag & VFS_GRPID) seq_printf(m, "," MNTOPT_GRPID); @@ -1959,6 +1964,7 @@ xfs_freeze( /* Push the superblock and write an unmount record */ xfs_log_unmount_write(mp); xfs_unmountfs_writesb(mp); + xfs_fs_log_dummy(mp); } diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index e03fa2a3d5ed..8076cc981e11 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -15,6 +15,9 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ + +#include <linux/capability.h> + #include "xfs.h" #include "xfs_fs.h" #include "xfs_types.h" @@ -182,8 +185,7 @@ xfs_getattr( break; } - vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec; - vap->va_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + vn_atime_to_timespec(vp, &vap->va_atime); vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec; vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec; @@ -541,24 +543,6 @@ xfs_setattr( } /* - * Can't set extent size unless the file is marked, or - * about to be marked as a realtime file. - * - * This check will be removed when fixed size extents - * with buffered data writes is implemented. 
- * - */ - if ((mask & XFS_AT_EXTSIZE) && - ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != - vap->va_extsize) && - (!((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) || - ((mask & XFS_AT_XFLAGS) && - (vap->va_xflags & XFS_XFLAG_REALTIME))))) { - code = XFS_ERROR(EINVAL); - goto error_return; - } - - /* * Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && @@ -820,13 +804,17 @@ xfs_setattr( di_flags |= XFS_DIFLAG_RTINHERIT; if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; - } else { + if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { if (vap->va_xflags & XFS_XFLAG_REALTIME) { di_flags |= XFS_DIFLAG_REALTIME; ip->i_iocore.io_flags |= XFS_IOCORE_RT; } else { ip->i_iocore.io_flags &= ~XFS_IOCORE_RT; } + if (vap->va_xflags & XFS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; } ip->i_d.di_flags = di_flags; } @@ -996,10 +984,6 @@ xfs_readlink( goto error_return; } - if (!(ioflags & IO_INVIS)) { - xfs_ichgtime(ip, XFS_ICHGTIME_ACC); - } - /* * See if the symlink is stored inline. */ @@ -1231,7 +1215,8 @@ xfs_inactive_free_eofblocks( xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!error && (nimaps != 0) && - (imap.br_startblock != HOLESTARTBLOCK)) { + (imap.br_startblock != HOLESTARTBLOCK || + ip->i_delayed_blks)) { /* * Attach the dquots to the inode up front. 
*/ @@ -1566,9 +1551,11 @@ xfs_release( if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) && + ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 || + ip->i_delayed_blks > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)))) { + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { if ((error = xfs_inactive_free_eofblocks(mp, ip))) return (error); /* Update linux inode block count after free above */ @@ -1625,7 +1612,8 @@ xfs_inactive( * only one with a reference to the inode. */ truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) && + ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) || + (ip->i_delayed_blks > 0)) && ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); mp = ip->i_mount; @@ -1643,10 +1631,12 @@ xfs_inactive( if (ip->i_d.di_nlink != 0) { if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)) || - (ip->i_delayed_blks != 0))) { + ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 || + ip->i_delayed_blks > 0)) && + (ip->i_df.if_flags & XFS_IFEXTENTS) && + (!(ip->i_d.di_flags & + (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || + (ip->i_delayed_blks != 0)))) { if ((error = xfs_inactive_free_eofblocks(mp, ip))) return (VN_INACTIVE_CACHE); /* Update linux inode block count after free above */ @@ -2590,7 +2580,6 @@ xfs_link( int cancel_flags; int committed; vnode_t *target_dir_vp; - bhv_desc_t *src_bdp; int resblks; char *target_name = VNAME(dentry); int target_namelen; @@ -2603,8 +2592,7 @@ xfs_link( if (VN_ISDIR(src_vp)) return XFS_ERROR(EPERM); - src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops); - sip = XFS_BHVTOI(src_bdp); + sip = xfs_vtoi(src_vp); tdp = XFS_BHVTOI(target_dir_bdp); mp = tdp->i_mount; if 
(XFS_FORCED_SHUTDOWN(mp)) @@ -3237,7 +3225,6 @@ xfs_readdir( xfs_trans_t *tp = NULL; int error = 0; uint lock_mode; - xfs_off_t start_offset; vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__, (inst_t *)__return_address); @@ -3248,11 +3235,7 @@ xfs_readdir( } lock_mode = xfs_ilock_map_shared(dp); - start_offset = uiop->uio_offset; error = XFS_DIR_GETDENTS(dp->i_mount, tp, dp, uiop, eofp); - if (start_offset != uiop->uio_offset) { - xfs_ichgtime(dp, XFS_ICHGTIME_ACC); - } xfs_iunlock_map_shared(dp, lock_mode); return error; } @@ -3829,7 +3812,12 @@ xfs_reclaim( vn_iowait(vp); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); - ASSERT(VN_CACHED(vp) == 0); + + /* + * Make sure the atime in the XFS inode is correct before freeing the + * Linux inode. + */ + xfs_synchronize_atime(ip); /* If we have nothing to flush with this inode then complete the * teardown now, otherwise break the link between the xfs inode @@ -3999,42 +3987,36 @@ xfs_alloc_file_space( int alloc_type, int attr_flags) { + xfs_mount_t *mp = ip->i_mount; + xfs_off_t count; xfs_filblks_t allocated_fsb; xfs_filblks_t allocatesize_fsb; - int committed; - xfs_off_t count; - xfs_filblks_t datablocks; - int error; + xfs_extlen_t extsz, temp; + xfs_fileoff_t startoffset_fsb; xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - xfs_bmbt_irec_t *imapp; - xfs_bmbt_irec_t imaps[1]; - xfs_mount_t *mp; - int numrtextents; - int reccount; - uint resblks; + int nimaps; + int bmapi_flag; + int quota_flag; int rt; - int rtextsize; - xfs_fileoff_t startoffset_fsb; xfs_trans_t *tp; - int xfs_bmapi_flags; + xfs_bmbt_irec_t imaps[1], *imapp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); - mp = ip->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - /* - * determine if this is a realtime file - */ - if ((rt = XFS_IS_REALTIME_INODE(ip)) != 0) { - if (ip->i_d.di_extsize) - 
rtextsize = ip->i_d.di_extsize; - else - rtextsize = mp->m_sb.sb_rextsize; - } else - rtextsize = 0; + rt = XFS_IS_REALTIME_INODE(ip); + if (unlikely(rt)) { + if (!(extsz = ip->i_d.di_extsize)) + extsz = mp->m_sb.sb_rextsize; + } else { + extsz = ip->i_d.di_extsize; + } if ((error = XFS_QM_DQATTACH(mp, ip, 0))) return error; @@ -4045,8 +4027,8 @@ xfs_alloc_file_space( count = len; error = 0; imapp = &imaps[0]; - reccount = 1; - xfs_bmapi_flags = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); + nimaps = 1; + bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); startoffset_fsb = XFS_B_TO_FSBT(mp, offset); allocatesize_fsb = XFS_B_TO_FSB(mp, count); @@ -4067,43 +4049,51 @@ xfs_alloc_file_space( } /* - * allocate file space until done or until there is an error + * Allocate file space until done or until there is an error */ retry: while (allocatesize_fsb && !error) { + xfs_fileoff_t s, e; + /* - * determine if reserving space on - * the data or realtime partition. + * Determine space reservations for data/realtime. 
*/ - if (rt) { - xfs_fileoff_t s, e; - + if (unlikely(extsz)) { s = startoffset_fsb; - do_div(s, rtextsize); - s *= rtextsize; - e = roundup_64(startoffset_fsb + allocatesize_fsb, - rtextsize); - numrtextents = (int)(e - s) / mp->m_sb.sb_rextsize; - datablocks = 0; + do_div(s, extsz); + s *= extsz; + e = startoffset_fsb + allocatesize_fsb; + if ((temp = do_mod(startoffset_fsb, extsz))) + e += temp; + if ((temp = do_mod(e, extsz))) + e += extsz - temp; } else { - datablocks = allocatesize_fsb; - numrtextents = 0; + s = 0; + e = allocatesize_fsb; + } + + if (unlikely(rt)) { + resrtextents = qblocks = (uint)(e - s); + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = \ + XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s)); + quota_flag = XFS_QMOPT_RES_REGBLKS; } /* - * allocate and setup the transaction + * Allocate and setup the transaction. */ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks); - error = xfs_trans_reserve(tp, - resblks, - XFS_WRITE_LOG_RES(mp), - numrtextents, + error = xfs_trans_reserve(tp, resblks, + XFS_WRITE_LOG_RES(mp), resrtextents, XFS_TRANS_PERM_LOG_RES, XFS_WRITE_LOG_COUNT); - /* - * check for running out of space + * Check for running out of space */ if (error) { /* @@ -4114,8 +4104,8 @@ retry: break; } xfs_ilock(ip, XFS_ILOCK_EXCL); - error = XFS_TRANS_RESERVE_QUOTA(mp, tp, - ip->i_udquot, ip->i_gdquot, resblks, 0, 0); + error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, + qblocks, 0, quota_flag); if (error) goto error1; @@ -4123,19 +4113,19 @@ retry: xfs_trans_ihold(tp, ip); /* - * issue the bmapi() call to allocate the blocks + * Issue the xfs_bmapi() call to allocate the blocks */ XFS_BMAP_INIT(&free_list, &firstfsb); error = xfs_bmapi(tp, ip, startoffset_fsb, - allocatesize_fsb, xfs_bmapi_flags, - &firstfsb, 0, imapp, &reccount, + allocatesize_fsb, bmapi_flag, + &firstfsb, 0, 
imapp, &nimaps, &free_list); if (error) { goto error0; } /* - * complete the transaction + * Complete the transaction */ error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); if (error) { @@ -4150,7 +4140,7 @@ retry: allocated_fsb = imapp->br_blockcount; - if (reccount == 0) { + if (nimaps == 0) { error = XFS_ERROR(ENOSPC); break; } @@ -4173,9 +4163,11 @@ dmapi_enospc_check: return error; - error0: +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_bmap_cancel(&free_list); - error1: + XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_ILOCK_EXCL); goto dmapi_enospc_check; @@ -4420,8 +4412,8 @@ xfs_free_file_space( } xfs_ilock(ip, XFS_ILOCK_EXCL); error = XFS_TRANS_RESERVE_QUOTA(mp, tp, - ip->i_udquot, ip->i_gdquot, resblks, 0, rt ? - XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + ip->i_udquot, ip->i_gdquot, resblks, 0, + XFS_QMOPT_RES_REGBLKS); if (error) goto error1; |