diff options
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- | fs/ocfs2/acl.c | 33 | ||||
-rw-r--r-- | fs/ocfs2/alloc.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/aops.c | 2 | ||||
-rw-r--r-- | fs/ocfs2/blockcheck.c | 4 | ||||
-rw-r--r-- | fs/ocfs2/cluster/tcp.c | 17 | ||||
-rw-r--r-- | fs/ocfs2/dlm/dlmmaster.c | 9 | ||||
-rw-r--r-- | fs/ocfs2/dlm/dlmrecovery.c | 22 | ||||
-rw-r--r-- | fs/ocfs2/dlm/dlmthread.c | 114 | ||||
-rw-r--r-- | fs/ocfs2/file.c | 36 | ||||
-rw-r--r-- | fs/ocfs2/inode.c | 6 | ||||
-rw-r--r-- | fs/ocfs2/inode.h | 11 | ||||
-rw-r--r-- | fs/ocfs2/ioctl.c | 356 | ||||
-rw-r--r-- | fs/ocfs2/journal.c | 9 | ||||
-rw-r--r-- | fs/ocfs2/journal.h | 3 | ||||
-rw-r--r-- | fs/ocfs2/mmap.c | 8 | ||||
-rw-r--r-- | fs/ocfs2/namei.c | 302 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2.h | 23 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_ioctl.h | 95 | ||||
-rw-r--r-- | fs/ocfs2/refcounttree.c | 25 | ||||
-rw-r--r-- | fs/ocfs2/refcounttree.h | 4 | ||||
-rw-r--r-- | fs/ocfs2/suballoc.c | 219 | ||||
-rw-r--r-- | fs/ocfs2/suballoc.h | 21 |
22 files changed, 1076 insertions, 245 deletions
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index da702294d7e7..a76e0aa5cd3f 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -290,12 +290,30 @@ static int ocfs2_set_acl(handle_t *handle, int ocfs2_check_acl(struct inode *inode, int mask) { - struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + struct posix_acl *acl; + int ret = -EAGAIN; - if (IS_ERR(acl)) + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return ret; + + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh); + + brelse(di_bh); + + if (IS_ERR(acl)) { + mlog_errno(PTR_ERR(acl)); return PTR_ERR(acl); + } if (acl) { - int ret = posix_acl_permission(inode, acl, mask); + ret = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); return ret; } @@ -344,7 +362,7 @@ int ocfs2_init_acl(handle_t *handle, { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl = NULL; - int ret = 0; + int ret = 0, ret2; mode_t mode; if (!S_ISLNK(inode->i_mode)) { @@ -381,7 +399,12 @@ int ocfs2_init_acl(handle_t *handle, mode = inode->i_mode; ret = posix_acl_create_masq(clone, &mode); if (ret >= 0) { - ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret2) { + mlog_errno(ret2); + ret = ret2; + goto cleanup; + } if (ret > 0) { ret = ocfs2_set_acl(handle, inode, di_bh, ACL_TYPE_ACCESS, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 215e12ce1d85..592fae5007d1 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_CACHE_SHIFT; do { - pages[numpages] = grab_cache_page(mapping, index); + pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); if (!pages[numpages]) { ret = -ENOMEM; mlog_errno(ret); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7155c5a919d7..5cfeee118158 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -883,8 +883,8 @@ struct ocfs2_write_ctxt { * out in so that future reads from that region will get * zero's. */ - struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; unsigned int w_num_pages; + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; struct page *w_target_page; /* diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index ec6d12339593..c7ee03c22226 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, - "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); /* Ok, try ECC fixups */ @@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, goto out; } - mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); rc = -EIO; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index aa75ca3f78da..1361997cf205 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1759,6 +1759,7 @@ static int o2net_accept_one(struct socket *sock) struct sockaddr_in sin; struct socket *new_sock = NULL; struct o2nm_node *node = NULL; + struct o2nm_node *local_node = NULL; struct o2net_sock_container *sc = NULL; struct o2net_node *nn; @@ -1796,11 +1797,15 @@ static int o2net_accept_one(struct socket *sock) goto out; } - if (o2nm_this_node() > node->nd_num) { - mlog(ML_NOTICE, "unexpected connect attempted from a lower " - "numbered node '%s' at " "%pI4:%d with num %u\n", - node->nd_name, &sin.sin_addr.s_addr, - ntohs(sin.sin_port), node->nd_num); + if (o2nm_this_node() >= node->nd_num) { + local_node = o2nm_get_node_by_num(o2nm_this_node()); + mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" + "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", + local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), + node->nd_name, node->nd_num, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); ret = -EINVAL; goto out; } @@ -1857,6 +1862,8 @@ out: sock_release(new_sock); if (node) o2nm_node_put(node); + if (local_node) + o2nm_node_put(local_node); if (sc) sc_put(sc); return ret; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 94b97fc6a88e..ffb4c68dafa4 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref) atomic_dec(&dlm->res_cur_count); - dlm_put(dlm); - if (!hlist_unhashed(&res->hash_node) || !list_empty(&res->granted) || !list_empty(&res->converting) || @@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->migration_pending = 0; res->inflight_locks = 0; - /* put in dlm_lockres_release */ - dlm_grab(dlm); res->dlm = dlm; kref_init(&res->refs); @@ -3050,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, /* check for pre-existing lock */ spin_lock(&dlm->spinlock); res = __dlm_lookup_lockres(dlm, name, namelen, hash); - spin_lock(&dlm->master_lock); - if (res) { spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -3069,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&res->spinlock); } + spin_lock(&dlm->master_lock); /* ignore status. only nonzero status would BUG. */ ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name, namelen, migrate->new_master, migrate->master); -unlock: spin_unlock(&dlm->master_lock); +unlock: spin_unlock(&dlm->spinlock); if (oldmle) { diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 9dfaac73b36d..aaaffbcbe916 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, struct list_head *queue; struct dlm_lock *lock, *next; + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); res->state |= DLM_LOCK_RES_RECOVERING; if (!list_empty(&res->recovering)) { mlog(0, @@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) /* zero the lvb if necessary */ dlm_revalidate_lvb(dlm, res, dead_node); if (res->owner == dead_node) { - if (res->state & DLM_LOCK_RES_DROPPING_REF) - mlog(0, "%s:%.*s: owned by " - "dead node %u, this node was " - "dropping its ref when it died. " - "continue, dropping the flag.\n", - dlm->name, res->lockname.len, - res->lockname.name, dead_node); - - /* the wake_up for this will happen when the - * RECOVERING flag is dropped later */ - res->state &= ~DLM_LOCK_RES_DROPPING_REF; + if (res->state & DLM_LOCK_RES_DROPPING_REF) { + mlog(ML_NOTICE, "Ignore %.*s for " + "recovery as it is being freed\n", + res->lockname.len, + res->lockname.name); + } else + dlm_move_lockres_to_recovery_list(dlm, + res); - dlm_move_lockres_to_recovery_list(dlm, res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index d4f73ca68fe5..2211acf33d9b 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res) * truly ready to be freed. */ int __dlm_lockres_unused(struct dlm_lock_resource *res) { - if (!__dlm_lockres_has_locks(res) && - (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) { - /* try not to scan the bitmap unless the first two - * conditions are already true */ - int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); - if (bit >= O2NM_MAX_NODES) { - /* since the bit for dlm->node_num is not - * set, inflight_locks better be zero */ - BUG_ON(res->inflight_locks != 0); - return 1; - } - } - return 0; + int bit; + + if (__dlm_lockres_has_locks(res)) + return 0; + + if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) + return 0; + + if (res->state & DLM_LOCK_RES_RECOVERING) + return 0; + + bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (bit < O2NM_MAX_NODES) + return 0; + + /* + * since the bit for dlm->node_num is not set, inflight_locks better + * be zero + */ + BUG_ON(res->inflight_locks != 0); + return 1; } @@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } -static int dlm_purge_lockres(struct dlm_ctxt *dlm, +static void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int master; int ret = 0; - spin_lock(&res->spinlock); - if (!__dlm_lockres_unused(res)) { - mlog(0, "%s:%.*s: tried to purge but not unused\n", - dlm->name, res->lockname.len, res->lockname.name); - __dlm_print_one_lock_resource(res); - spin_unlock(&res->spinlock); - BUG(); - } - - if (res->state & DLM_LOCK_RES_MIGRATING) { - mlog(0, "%s:%.*s: Delay dropref as this lockres is " - "being remastered\n", dlm->name, res->lockname.len, - res->lockname.name); - /* Re-add the lockres to the end of the purge list */ - if (!list_empty(&res->purge)) { - list_del_init(&res->purge); - list_add_tail(&res->purge, &dlm->purge_list); - } - spin_unlock(&res->spinlock); - return 0; - } + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&res->spinlock); master = (res->owner == dlm->node_num); - if (!master) - res->state |= DLM_LOCK_RES_DROPPING_REF; - spin_unlock(&res->spinlock); mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len, res->lockname.name, master); if (!master) { + res->state |= DLM_LOCK_RES_DROPPING_REF; /* drop spinlock... retake below */ + spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); spin_lock(&res->spinlock); @@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n", dlm->name, res->lockname.len, res->lockname.name, ret); spin_lock(&dlm->spinlock); + spin_lock(&res->spinlock); } - spin_lock(&res->spinlock); if (!list_empty(&res->purge)) { mlog(0, "removing lockres %.*s:%p from purgelist, " "master = %d\n", res->lockname.len, res->lockname.name, res, master); list_del_init(&res->purge); - spin_unlock(&res->spinlock); dlm_lockres_put(res); dlm->purge_count--; - } else - spin_unlock(&res->spinlock); + } + + if (!__dlm_lockres_unused(res)) { + mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n", + dlm->name, res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + BUG(); + } __dlm_unhash_lockres(res); /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ if (!master) { - spin_lock(&res->spinlock); res->state &= ~DLM_LOCK_RES_DROPPING_REF; spin_unlock(&res->spinlock); wake_up(&res->wq); - } - return 0; + } else + spin_unlock(&res->spinlock); } static void dlm_run_purge_list(struct dlm_ctxt *dlm, @@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, lockres = list_entry(dlm->purge_list.next, struct dlm_lock_resource, purge); - /* Status of the lockres *might* change so double - * check. If the lockres is unused, holding the dlm - * spinlock will prevent people from getting and more - * refs on it -- there's no need to keep the lockres - * spinlock. */ spin_lock(&lockres->spinlock); - unused = __dlm_lockres_unused(lockres); - spin_unlock(&lockres->spinlock); - - if (!unused) - continue; purge_jiffies = lockres->last_used + msecs_to_jiffies(DLM_PURGE_INTERVAL_MS); @@ -273,15 +255,29 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, * in tail order, we can stop at the first * unpurgable resource -- anyone added after * him will have a greater last_used value */ + spin_unlock(&lockres->spinlock); break; } + /* Status of the lockres *might* change so double + * check. If the lockres is unused, holding the dlm + * spinlock will prevent people from getting and more + * refs on it. */ + unused = __dlm_lockres_unused(lockres); + if (!unused || + (lockres->state & DLM_LOCK_RES_MIGRATING)) { + mlog(0, "lockres %s:%.*s: is in use or " + "being remastered, used %d, state %d\n", + dlm->name, lockres->lockname.len, + lockres->lockname.name, !unused, lockres->state); + list_move_tail(&dlm->purge_list, &lockres->purge); + spin_unlock(&lockres->spinlock); + continue; + } + dlm_lockres_get(lockres); - /* This may drop and reacquire the dlm spinlock if it - * has to do migration. */ - if (dlm_purge_lockres(dlm, lockres)) - BUG(); + dlm_purge_lockres(dlm, lockres); dlm_lockres_put(lockres); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4331f57e9fde..9a74542e1a05 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -36,6 +36,7 @@ #include <linux/writeback.h> #include <linux/falloc.h> #include <linux/quotaops.h> +#include <linux/blkdev.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -63,12 +64,6 @@ #include "buffer_head_io.h" -static int ocfs2_sync_inode(struct inode *inode) -{ - filemap_fdatawrite(inode->i_mapping); - return sync_mapping_buffers(inode->i_mapping); -} - static int ocfs2_init_file_private(struct inode *inode, struct file *file) { struct ocfs2_file_private *fp; @@ -186,12 +181,16 @@ static int ocfs2_sync_file(struct file *file, int datasync) mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, dentry->d_name.len, dentry->d_name.name); - err = ocfs2_sync_inode(dentry->d_inode); - if (err) - goto bail; - - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { + /* + * We still have to flush drive's caches to get data to the + * platter + */ + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); goto bail; + } journal = osb->journal->j_journal; err = jbd2_journal_force_commit(journal); @@ -774,7 +773,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) { ret = -ENOMEM; mlog_errno(ret); @@ -2306,17 +2305,6 @@ relock: written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, ppos, count, ocount); if (written < 0) { - /* - * direct write may have instantiated a few - * blocks outside i_size. Trim these off again. - * Don't need i_size_read because we hold i_mutex. - * - * XXX(truncate): this looks buggy because ocfs2 did not - * actually implement ->truncate. Take a look at - * the new truncate sequence and update this accordingly - */ - if (*ppos + count > inode->i_size) - truncate_setsize(inode, inode->i_size); ret = written; goto out_dio; } @@ -2332,7 +2320,7 @@ out_dio: BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || - ((file->f_flags & O_DIRECT) && has_refcount)) { + ((file->f_flags & O_DIRECT) && !direct_io)) { ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1); if (ret < 0) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0492464916b1..eece3e05d9d0 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -488,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); - if (!status) + /* + * If buffer is in jbd, then its checksum may not have been + * computed as yet. + */ + if (!status && !buffer_jbd(bh)) status = ocfs2_validate_inode_block(osb->sb, bh); } if (status < 0) { diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 6de5a869db30..0bc477a3aeb8 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -46,27 +46,24 @@ struct ocfs2_inode_info /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; - u32 ip_clusters; struct list_head ip_io_markers; + u32 ip_clusters; + u16 ip_dyn_features; struct mutex ip_io_mutex; - u32 ip_flags; /* see below */ u32 ip_attr; /* inode attributes */ - u16 ip_dyn_features; /* protected by recovery_lock. */ struct inode *ip_next_orphan; - u32 ip_dir_start_lookup; - struct ocfs2_caching_info ip_metadata_cache; - struct ocfs2_extent_map ip_extent_map; - struct inode vfs_inode; struct jbd2_inode ip_jinode; + u32 ip_dir_start_lookup; + /* Only valid if the inode is the dir. */ u32 ip_last_used_slot; u64 ip_last_used_group; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7d9d9c132cef..7a4868196152 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -26,6 +26,26 @@ #include <linux/ext2_fs.h> +#define o2info_from_user(a, b) \ + copy_from_user(&(a), (b), sizeof(a)) +#define o2info_to_user(a, b) \ + copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) + +/* + * This call is void because we are already reporting an error that may + * be -EFAULT. The error will be returned from the ioctl(2) call. It's + * just a best-effort to tell userspace that this request caused the error. + */ +static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq, + struct ocfs2_info_request __user *req) +{ + kreq->ir_flags |= OCFS2_INFO_FL_ERROR; + (void)put_user(kreq->ir_flags, (__u32 __user *)&(req->ir_flags)); +} + +#define o2info_set_request_error(a, b) \ + __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) + static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) { int status; @@ -109,6 +129,328 @@ bail: return status; } +int ocfs2_info_handle_blocksize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_blocksize oib; + + if (o2info_from_user(oib, req)) + goto bail; + + oib.ib_blocksize = inode->i_sb->s_blocksize; + oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oib, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oib, req); + + return status; +} + +int ocfs2_info_handle_clustersize(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_clustersize oic; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oic, req)) + goto bail; + + oic.ic_clustersize = osb->s_clustersize; + oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oic, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oic, req); + + return status; +} + +int ocfs2_info_handle_maxslots(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_maxslots oim; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oim, req)) + goto bail; + + oim.im_max_slots = osb->max_slots; + oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oim, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oim, req); + + return status; +} + +int ocfs2_info_handle_label(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_label oil; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oil, req)) + goto bail; + + memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); + oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oil, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oil, req); + + return status; +} + +int ocfs2_info_handle_uuid(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_uuid oiu; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oiu, req)) + goto bail; + + memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); + oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oiu, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oiu, req); + + return status; +} + +int ocfs2_info_handle_fs_features(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_fs_features oif; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oif, req)) + goto bail; + + oif.if_compat_features = osb->s_feature_compat; + oif.if_incompat_features = osb->s_feature_incompat; + oif.if_ro_compat_features = osb->s_feature_ro_compat; + oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oif, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oif, req); + + return status; +} + +int ocfs2_info_handle_journal_size(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_journal_size oij; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (o2info_from_user(oij, req)) + goto bail; + + oij.ij_journal_size = osb->journal->j_inode->i_size; + + oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oij, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oij, req); + + return status; +} + +int ocfs2_info_handle_unknown(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + oir.ir_flags &= ~OCFS2_INFO_FL_FILLED; + + if (o2info_to_user(oir, req)) + goto bail; + + status = 0; +bail: + if (status) + o2info_set_request_error(oir, req); + + return status; +} + +/* + * Validate and distinguish OCFS2_IOC_INFO requests. + * + * - validate the magic number. + * - distinguish different requests. + * - validate size of different requests. + */ +int ocfs2_info_handle_request(struct inode *inode, + struct ocfs2_info_request __user *req) +{ + int status = -EFAULT; + struct ocfs2_info_request oir; + + if (o2info_from_user(oir, req)) + goto bail; + + status = -EINVAL; + if (oir.ir_magic != OCFS2_INFO_MAGIC) + goto bail; + + switch (oir.ir_code) { + case OCFS2_INFO_BLOCKSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_blocksize)) + status = ocfs2_info_handle_blocksize(inode, req); + break; + case OCFS2_INFO_CLUSTERSIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_clustersize)) + status = ocfs2_info_handle_clustersize(inode, req); + break; + case OCFS2_INFO_MAXSLOTS: + if (oir.ir_size == sizeof(struct ocfs2_info_maxslots)) + status = ocfs2_info_handle_maxslots(inode, req); + break; + case OCFS2_INFO_LABEL: + if (oir.ir_size == sizeof(struct ocfs2_info_label)) + status = ocfs2_info_handle_label(inode, req); + break; + case OCFS2_INFO_UUID: + if (oir.ir_size == sizeof(struct ocfs2_info_uuid)) + status = ocfs2_info_handle_uuid(inode, req); + break; + case OCFS2_INFO_FS_FEATURES: + if (oir.ir_size == sizeof(struct ocfs2_info_fs_features)) + status = ocfs2_info_handle_fs_features(inode, req); + break; + case OCFS2_INFO_JOURNAL_SIZE: + if (oir.ir_size == sizeof(struct ocfs2_info_journal_size)) + status = ocfs2_info_handle_journal_size(inode, req); + break; + default: + status = ocfs2_info_handle_unknown(inode, req); + break; + } + +bail: + return status; +} + +int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, + u64 *req_addr, int compat_flag) +{ + int status = -EFAULT; + u64 __user *bp = NULL; + + if (compat_flag) { +#ifdef CONFIG_COMPAT + /* + * pointer bp stores the base address of a pointers array, + * which collects all addresses of separate request. + */ + bp = (u64 __user *)(unsigned long)compat_ptr(info->oi_requests); +#else + BUG(); +#endif + } else + bp = (u64 __user *)(unsigned long)(info->oi_requests); + + if (o2info_from_user(*req_addr, bp + idx)) + goto bail; + + status = 0; +bail: + return status; +} + +/* + * OCFS2_IOC_INFO handles an array of requests passed from userspace. + * + * ocfs2_info_handle() recevies a large info aggregation, grab and + * validate the request count from header, then break it into small + * pieces, later specific handlers can handle them one by one. + * + * Idea here is to make each separate request small enough to ensure + * a better backward&forward compatibility, since a small piece of + * request will be less likely to be broken if disk layout get changed. + */ +int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, + int compat_flag) +{ + int i, status = 0; + u64 req_addr; + struct ocfs2_info_request __user *reqp; + + if ((info->oi_count > OCFS2_INFO_MAX_REQUEST) || + (!info->oi_requests)) { + status = -EINVAL; + goto bail; + } + + for (i = 0; i < info->oi_count; i++) { + + status = ocfs2_get_request_ptr(info, i, &req_addr, compat_flag); + if (status) + break; + + reqp = (struct ocfs2_info_request *)(unsigned long)req_addr; + if (!reqp) { + status = -EINVAL; + goto bail; + } + + status = ocfs2_info_handle_request(inode, reqp); + if (status) + break; + } + +bail: + return status; +} + long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_path.dentry->d_inode; @@ -120,6 +462,7 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct reflink_arguments args; const char *old_path, *new_path; bool preserve; + struct ocfs2_info info; switch (cmd) { case OCFS2_IOC_GETFLAGS: @@ -174,6 +517,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) preserve = (args.preserve != 0); return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, (struct ocfs2_info __user *)arg, + sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 0); default: return -ENOTTY; } @@ -185,6 +534,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) bool preserve; struct reflink_arguments args; struct inode *inode = file->f_path.dentry->d_inode; + struct ocfs2_info info; switch (cmd) { case OCFS2_IOC32_GETFLAGS: @@ -209,6 +559,12 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path), compat_ptr(args.new_path), preserve); + case OCFS2_IOC_INFO: + if (copy_from_user(&info, (struct ocfs2_info __user *)arg, + sizeof(struct ocfs2_info))) + return -EFAULT; + + return ocfs2_info_handle(inode, &info, 1); default: return -ENOIOCTLCMD; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 9b57c0350ff9..faa2303dbf0a 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -301,7 +301,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) { int status = 0; unsigned int flushed; - unsigned long old_id; struct ocfs2_journal *journal = NULL; mlog_entry_void(); @@ -326,7 +325,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) goto finally; } - old_id = ocfs2_inc_trans_id(journal); + ocfs2_inc_trans_id(journal); flushed = atomic_read(&journal->j_num_trans); atomic_set(&journal->j_num_trans, 0); @@ -342,9 +341,6 @@ finally: return status; } -/* pass it NULL and it will allocate a new handle object for you. If - * you pass it a handle however, it may still return error, in which - * case it has free'd the passed handle for you. */ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) { journal_t *journal = osb->journal->j_journal; @@ -1888,6 +1884,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) os = &osb->osb_orphan_scan; + mlog(0, "Begin orphan scan\n"); + if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) goto out; @@ -1920,6 +1918,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) unlock: ocfs2_orphan_scan_unlock(osb, seqno); out: + mlog(0, "Orphan scan completed\n"); return; } diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index b5baaa8e710f..43e56b97f9c0 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -67,11 +67,12 @@ struct ocfs2_journal { struct buffer_head *j_bh; /* Journal disk inode block */ atomic_t j_num_trans; /* Number of transactions * currently in the system. */ + spinlock_t j_lock; unsigned long j_trans_id; struct rw_semaphore j_trans_barrier; wait_queue_head_t j_checkpointed; - spinlock_t j_lock; + /* both fields protected by j_lock*/ struct list_head j_la_cleanups; struct work_struct j_recovery_work; }; diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index b04d6961c0d4..7e32db9c2c99 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -75,9 +75,11 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, /* * Another node might have truncated while we were waiting on * cluster locks. + * We don't check size == 0 before the shift. This is borrowed + * from do_generic_file_read. */ - last_index = size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) { + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!size || page->index > last_index)) { ret = -EINVAL; goto out; } @@ -108,7 +110,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, * because the "write" would invalidate their data. */ if (page->index == last_index) - len = size & ~PAGE_CACHE_MASK; + len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; ret = ocfs2_write_begin_nolock(file, mapping, pos, len, 0, &locked_page, &fsdata, di_bh, page); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f171b51a74f7..a00dda2e4f16 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -472,32 +472,23 @@ leave: return status; } -static int ocfs2_mknod_locked(struct ocfs2_super *osb, - struct inode *dir, - struct inode *inode, - dev_t dev, - struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *inode_ac) +static int __ocfs2_mknod_locked(struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac, + u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) { int status = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; - u64 suballoc_loc, fe_blkno = 0; - u16 suballoc_bit; u16 feat; *new_fe_bh = NULL; - status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, - inode_ac, &suballoc_loc, - &suballoc_bit, &fe_blkno); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* populate as many fields early on as possible - many of * these are used by the support functions here and in * callers. */ @@ -591,6 +582,34 @@ leave: return status; } +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + u64 suballoc_loc, fe_blkno = 0; + u16 suballoc_bit; + + *new_fe_bh = NULL; + + status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, + inode_ac, &suballoc_loc, + &suballoc_bit, &fe_blkno); + if (status < 0) { + mlog_errno(status); + return status; + } + + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + parent_fe_bh, handle, inode_ac, + fe_blkno, suballoc_loc, suballoc_bit); +} + static int ocfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) @@ -1852,61 +1871,117 @@ bail: return status; } -static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, - struct inode **ret_orphan_dir, - u64 blkno, - char *name, - struct ocfs2_dir_lookup_result *lookup) +static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + struct buffer_head **ret_orphan_dir_bh) { struct inode *orphan_dir_inode; struct buffer_head *orphan_dir_bh = NULL; - int status = 0; - - status = ocfs2_blkno_stringify(blkno, name); - if (status < 0) { - mlog_errno(status); - return status; - } + int ret = 0; orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, osb->slot_num); if (!orphan_dir_inode) { - status = -ENOENT; - mlog_errno(status); - return status; + ret = -ENOENT; + mlog_errno(ret); + return ret; } mutex_lock(&orphan_dir_inode->i_mutex); - status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); - if (status < 0) { - mlog_errno(status); - goto leave; + ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (ret < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + mlog_errno(ret); + return ret; } - status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, - orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); - if (status < 0) { - ocfs2_inode_unlock(orphan_dir_inode, 1); + *ret_orphan_dir = orphan_dir_inode; + *ret_orphan_dir_bh = orphan_dir_bh; - mlog_errno(status); - goto leave; + return 0; +} + +static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + + ret = ocfs2_blkno_stringify(blkno, name); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + OCFS2_ORPHAN_NAMELEN, lookup); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return 0; +} + +/** + * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for + * insertion of an orphan. + * @osb: ocfs2 file system + * @ret_orphan_dir: Orphan dir inode - returned locked! + * @blkno: Actual block number of the inode to be inserted into orphan dir. + * @lookup: dir lookup result, to be passed back into functions like + * ocfs2_orphan_add + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode, + &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, + blkno, name, lookup); + if (ret < 0) { + mlog_errno(ret); + goto out; } *ret_orphan_dir = orphan_dir_inode; -leave: - if (status) { +out: + brelse(orphan_dir_bh); + + if (ret) { + ocfs2_inode_unlock(orphan_dir_inode, 1); mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); } - brelse(orphan_dir_bh); - - mlog_exit(status); - return status; + mlog_exit(ret); + return ret; } static int ocfs2_orphan_add(struct ocfs2_super *osb, @@ -2053,6 +2128,99 @@ leave: return status; } +/** + * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly + * allocated file. This is different from the typical 'add to orphan dir' + * operation in that the inode does not yet exist. This is a problem because + * the orphan dir stringifies the inode block number to come up with it's + * dirent. Obviously if the inode does not yet exist we have a chicken and egg + * problem. This function works around it by calling deeper into the orphan + * and suballoc code than other callers. Use this only by necessity. + * @dir: The directory which this inode will ultimately wind up under - not the + * orphan dir! + * @dir_bh: buffer_head the @dir inode block + * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled + * with the string to be used for orphan dirent. Pass back to the orphan dir + * code. + * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan + * dir code. + * @ret_di_blkno: block number where the new inode will be allocated. + * @orphan_insert: Dir insert context to be passed back into orphan dir code. + * @ret_inode_ac: Inode alloc context to be passed back to the allocator. + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prep_new_orphaned_file(struct inode *dir, + struct buffer_head *dir_bh, + char *orphan_name, + struct inode **ret_orphan_dir, + u64 *ret_di_blkno, + struct ocfs2_dir_lookup_result *orphan_insert, + struct ocfs2_alloc_context **ret_inode_ac) +{ + int ret; + u64 di_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct inode *orphan_dir = NULL; + struct buffer_head *orphan_dir_bh = NULL; + struct ocfs2_alloc_context *inode_ac = NULL; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir, &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* reserve an inode spot */ + ret = ocfs2_reserve_new_inode(osb, &inode_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_new_inode_loc(dir, dir_bh, inode_ac, + &di_blkno); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, + di_blkno, orphan_name, orphan_insert); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + +out: + if (ret == 0) { + *ret_orphan_dir = orphan_dir; + *ret_di_blkno = di_blkno; + *ret_inode_ac = inode_ac; + /* + * orphan_name and orphan_insert are already up to + * date via prepare_orphan_dir + */ + } else { + /* Unroll reserve_new_inode* */ + if (inode_ac) + ocfs2_free_alloc_context(inode_ac); + + /* Unroll orphan dir locking */ + mutex_unlock(&orphan_dir->i_mutex); + ocfs2_inode_unlock(orphan_dir, 1); + iput(orphan_dir); + } + + brelse(orphan_dir_bh); + + return 0; +} + int ocfs2_create_inode_in_orphan(struct inode *dir, int mode, struct inode **new_inode) @@ -2068,6 +2236,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, struct buffer_head *new_di_bh = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; + u64 uninitialized_var(di_blkno), suballoc_loc; + u16 suballoc_bit; status = ocfs2_inode_lock(dir, &parent_di_bh, 1); if (status < 0) { @@ -2076,20 +2246,9 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, return status; } - /* - * We give the orphan dir the root blkno to fake an orphan name, - * and allocate enough space for our insertion. - */ - status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, - osb->root_blkno, - orphan_name, &orphan_insert); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - /* reserve an inode spot */ - status = ocfs2_reserve_new_inode(osb, &inode_ac); + status = ocfs2_prep_new_orphaned_file(dir, parent_di_bh, + orphan_name, &orphan_dir, + &di_blkno, &orphan_insert, &inode_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -2116,17 +2275,20 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; did_quota_inode = 1; - inode->i_nlink = 0; - /* do the real work now. */ - status = ocfs2_mknod_locked(osb, dir, inode, - 0, &new_di_bh, parent_di_bh, handle, - inode_ac); + status = ocfs2_claim_new_inode_at_loc(handle, dir, inode_ac, + &suballoc_loc, + &suballoc_bit, di_blkno); if (status < 0) { mlog_errno(status); goto leave; } - status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, orphan_name); + inode->i_nlink = 0; + /* do the real work now. */ + status = __ocfs2_mknod_locked(dir, inode, + 0, &new_di_bh, parent_di_bh, handle, + inode_ac, di_blkno, suballoc_loc, + suballoc_bit); if (status < 0) { mlog_errno(status); goto leave; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c67003b6b5a2..65739b3b3276 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status, unsigned long data); struct ocfs2_lock_res { void *l_priv; struct ocfs2_lock_res_ops *l_ops; - spinlock_t l_lock; + struct list_head l_blocked_list; struct list_head l_mask_waiters; - enum ocfs2_lock_type l_type; unsigned long l_flags; char l_name[OCFS2_LOCK_ID_MAX_LEN]; - int l_level; unsigned int l_ro_holders; unsigned int l_ex_holders; - struct ocfs2_dlm_lksb l_lksb; + unsigned char l_level; + + /* Data packed - type enum ocfs2_lock_type */ + unsigned char l_type; /* used from AST/BAST funcs. */ - enum ocfs2_ast_action l_action; - enum ocfs2_unlock_action l_unlock_action; - int l_requested; - int l_blocking; + /* Data packed - enum type ocfs2_ast_action */ + unsigned char l_action; + /* Data packed - enum type ocfs2_unlock_action */ + unsigned char l_unlock_action; + unsigned char l_requested; + unsigned char l_blocking; unsigned int l_pending_gen; + spinlock_t l_lock; + + struct ocfs2_dlm_lksb l_lksb; + wait_queue_head_t l_event; struct list_head l_debug_list; diff --git a/fs/ocfs2/ocfs2_ioctl.h b/fs/ocfs2/ocfs2_ioctl.h index 2d3420af1a83..9bc535499868 100644 --- a/fs/ocfs2/ocfs2_ioctl.h +++ b/fs/ocfs2/ocfs2_ioctl.h @@ -76,4 +76,99 @@ struct reflink_arguments { }; #define OCFS2_IOC_REFLINK _IOW('o', 4, struct reflink_arguments) +/* Following definitions dedicated for ocfs2_info_request ioctls. */ +#define OCFS2_INFO_MAX_REQUEST (50) +#define OCFS2_TEXT_UUID_LEN (OCFS2_VOL_UUID_LEN * 2) + +/* Magic number of all requests */ +#define OCFS2_INFO_MAGIC (0x4F32494E) + +/* + * Always try to separate info request into small pieces to + * guarantee the backward&forward compatibility. + */ +struct ocfs2_info { + __u64 oi_requests; /* Array of __u64 pointers to requests */ + __u32 oi_count; /* Number of requests in info_requests */ + __u32 oi_pad; +}; + +struct ocfs2_info_request { +/*00*/ __u32 ir_magic; /* Magic number */ + __u32 ir_code; /* Info request code */ + __u32 ir_size; /* Size of request */ + __u32 ir_flags; /* Request flags */ +/*10*/ /* Request specific fields */ +}; + +struct ocfs2_info_clustersize { + struct ocfs2_info_request ic_req; + __u32 ic_clustersize; + __u32 ic_pad; +}; + +struct ocfs2_info_blocksize { + struct ocfs2_info_request ib_req; + __u32 ib_blocksize; + __u32 ib_pad; +}; + +struct ocfs2_info_maxslots { + struct ocfs2_info_request im_req; + __u32 im_max_slots; + __u32 im_pad; +}; + +struct ocfs2_info_label { + struct ocfs2_info_request il_req; + __u8 il_label[OCFS2_MAX_VOL_LABEL_LEN]; +} __attribute__ ((packed)); + +struct ocfs2_info_uuid { + struct ocfs2_info_request iu_req; + __u8 iu_uuid_str[OCFS2_TEXT_UUID_LEN + 1]; +} __attribute__ ((packed)); + +struct ocfs2_info_fs_features { + struct ocfs2_info_request if_req; + __u32 if_compat_features; + __u32 if_incompat_features; + __u32 if_ro_compat_features; + __u32 if_pad; +}; + +struct ocfs2_info_journal_size { + struct ocfs2_info_request ij_req; + __u64 ij_journal_size; +}; + +/* Codes for ocfs2_info_request */ +enum ocfs2_info_type { + OCFS2_INFO_CLUSTERSIZE = 1, + OCFS2_INFO_BLOCKSIZE, + OCFS2_INFO_MAXSLOTS, + OCFS2_INFO_LABEL, + OCFS2_INFO_UUID, + OCFS2_INFO_FS_FEATURES, + OCFS2_INFO_JOURNAL_SIZE, + OCFS2_INFO_NUM_TYPES +}; + +/* Flags for struct ocfs2_info_request */ +/* Filled by the caller */ +#define OCFS2_INFO_FL_NON_COHERENT (0x00000001) /* Cluster coherency not + required. This is a hint. + It is up to ocfs2 whether + the request can be fulfilled + without locking. */ +/* Filled by ocfs2 */ +#define OCFS2_INFO_FL_FILLED (0x40000000) /* Filesystem understood + this request and + filled in the answer */ + +#define OCFS2_INFO_FL_ERROR (0x80000000) /* Error happened during + request handling. */ + +#define OCFS2_IOC_INFO _IOR('o', 5, struct ocfs2_info) + #endif /* OCFS2_IOCTL_H */ diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 47549f64224c..a120cfcf69bf 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2437,16 +2437,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + le32_to_cpu(rec.r_clusters)) - cpos; /* - * If the refcount rec already exist, cool. We just need - * to check whether there is a split. Otherwise we just need - * to increase the refcount. - * If we will insert one, increases recs_add. - * * We record all the records which will be inserted to the * same refcount block, so that we can tell exactly whether * we need a new refcount block or not. + * + * If we will insert a new one, this is easy and only happens + * during adding refcounted flag to the extent, so we don't + * have a chance of spliting. We just need one record. + * + * If the refcount rec already exists, that would be a little + * complicated. we may have to: + * 1) split at the beginning if the start pos isn't aligned. + * we need 1 more record in this case. + * 2) split int the end if the end pos isn't aligned. + * we need 1 more record in this case. + * 3) split in the middle because of file system fragmentation. + * we need 2 more records in this case(we can't detect this + * beforehand, so always think of the worst case). */ if (rec.r_refcount) { + recs_add += 2; /* Check whether we need a split at the beginning. */ if (cpos == start_cpos && cpos != le64_to_cpu(rec.r_cpos)) @@ -2954,7 +2964,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (map_end & (PAGE_CACHE_SIZE - 1)) to = map_end & (PAGE_CACHE_SIZE - 1); - page = grab_cache_page(mapping, page_index); + page = find_or_create_page(mapping, page_index, GFP_NOFS); /* * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page @@ -3181,7 +3191,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb, if (map_end > end) map_end = end; - page = grab_cache_page(context->inode->i_mapping, page_index); + page = find_or_create_page(context->inode->i_mapping, + page_index, GFP_NOFS); BUG_ON(!page); wait_on_page_writeback(page); diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 29cba0eaa927..c8ce46f7d8e3 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -21,14 +21,14 @@ struct ocfs2_refcount_tree { struct rb_node rf_node; u64 rf_blkno; u32 rf_generation; + struct kref rf_getcnt; struct rw_semaphore rf_sem; struct ocfs2_lock_res rf_lockres; - struct kref rf_getcnt; int rf_removed; /* the following 4 fields are used by caching_info. */ - struct ocfs2_caching_info rf_ci; spinlock_t rf_lock; + struct ocfs2_caching_info rf_ci; struct mutex rf_io_mutex; struct super_block *rf_sb; }; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index a8e6a95a353f..8a286f54dca1 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -57,11 +57,28 @@ struct ocfs2_suballoc_result { u64 sr_bg_blkno; /* The bg we allocated from. Set to 0 when a block group is contiguous. */ + u64 sr_bg_stable_blkno; /* + * Doesn't change, always + * set to target block + * group descriptor + * block. + */ u64 sr_blkno; /* The first allocated block */ unsigned int sr_bit_offset; /* The bit in the bg */ unsigned int sr_bits; /* How many bits we claimed */ }; +static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) +{ + if (res->sr_blkno == 0) + return 0; + + if (res->sr_bg_blkno) + return res->sr_bg_blkno; + + return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); +} + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); @@ -138,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; + if (ac->ac_find_loc_priv) { + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; + } } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -1678,6 +1699,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, if (!ret) ocfs2_bg_discontig_fix_result(ac, gd, res); + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + if (ac->ac_find_loc_only) + goto out_loc_only; + ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, res->sr_bits, le16_to_cpu(gd->bg_chain)); @@ -1691,6 +1721,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, if (ret < 0) mlog_errno(ret); +out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); out: @@ -1708,7 +1739,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, { int status; u16 chain; - u32 tmp_used; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; @@ -1770,6 +1800,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, if (!status) ocfs2_bg_discontig_fix_result(ac, bg, res); + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; /* * Keep track of previous block descriptor read. When @@ -1796,22 +1831,17 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } } - /* Ok, claim our bits now: set the info on dinode, chainlist - * and then the group */ - status = ocfs2_journal_access_di(handle, - INODE_CACHE(alloc_inode), - ac->ac_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { + if (ac->ac_find_loc_only) + goto out_loc_only; + + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (status) { mlog_errno(status); goto bail; } - tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); - fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits); - ocfs2_journal_dirty(handle, ac->ac_bh); - status = ocfs2_block_group_set_bits(handle, alloc_inode, bg, @@ -1826,6 +1856,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, (unsigned long long)le64_to_cpu(fe->i_blkno)); +out_loc_only: *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: brelse(group_bh); @@ -1845,6 +1876,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, int status; u16 victim, i; u16 bits_left = 0; + u64 hint = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; @@ -1872,7 +1904,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, goto bail; } - res->sr_bg_blkno = ac->ac_last_group; + res->sr_bg_blkno = hint; if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used @@ -1896,8 +1928,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); - if (!status) + if (!status) { + hint = ocfs2_group_from_res(res); goto set_hint; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1920,8 +1954,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, ac->ac_chain = i; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); - if (!status) + if (!status) { + hint = ocfs2_group_from_res(res); break; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1936,7 +1972,7 @@ set_hint: if (bits_left < min_bits) ac->ac_last_group = 0; else - ac->ac_last_group = res->sr_bg_blkno; + ac->ac_last_group = hint; } bail: @@ -2016,6 +2052,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir, OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; } +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_suballoc_result *res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + res = kzalloc(sizeof(*res), GFP_NOFS); + if (res == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + /* + * The handle started here is for chain relink. Alternatively, + * we could just disable relink for these calls. + */ + handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + /* + * This will instruct ocfs2_claim_suballoc_bits and + * ocfs2_search_one_group to search but save actual allocation + * for later. + */ + ac->ac_find_loc_only = 1; + + ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ac->ac_find_loc_priv = res; + *fe_blkno = res->sr_blkno; + +out: + if (handle) + ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); + + if (ret) + kfree(res); + + return ret; +} + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno) +{ + int ret; + u16 chain; + struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* + * Since di_blkno is being passed back in, we check for any + * inconsistencies which may have happened between + * calls. These are code bugs as di_blkno is not expected to + * change once returned from ocfs2_find_new_inode_loc() + */ + BUG_ON(res->sr_blkno != di_blkno); + + ret = ocfs2_read_group_descriptor(ac->ac_inode, di, + res->sr_bg_stable_blkno, &bg_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + chain = le16_to_cpu(bg->bg_chain); + + ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, + ac->ac_inode, + bg, + bg_bh, + res->sr_bit_offset, + res->sr_bits); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, + (unsigned long long)di_blkno); + + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res->sr_bits != 1); + + *suballoc_loc = res->sr_bg_blkno; + *suballoc_bit = res->sr_bit_offset; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + +out: + brelse(bg_bh); + + return ret; +} + int ocfs2_claim_new_inode(handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, @@ -2567,7 +2733,8 @@ out: * suballoc_bit. */ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, - u16 *suballoc_slot, u16 *suballoc_bit) + u16 *suballoc_slot, u64 *group_blkno, + u16 *suballoc_bit) { int status; struct buffer_head *inode_bh = NULL; @@ -2604,6 +2771,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); if (suballoc_bit) *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + if (group_blkno) + *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); bail: brelse(inode_bh); @@ -2621,7 +2790,8 @@ bail: */ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, - struct buffer_head *alloc_bh, u64 blkno, + struct buffer_head *alloc_bh, + u64 group_blkno, u64 blkno, u16 bit, int *res) { struct ocfs2_dinode *alloc_di; @@ -2642,10 +2812,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, goto bail; } - if (alloc_di->i_suballoc_loc) - bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc); - else - bg_blkno = ocfs2_which_suballoc_group(blkno, bit); + bg_blkno = group_blkno ? group_blkno : + ocfs2_which_suballoc_group(blkno, bit); status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, &group_bh); if (status < 0) { @@ -2680,6 +2848,7 @@ bail: int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) { int status; + u64 group_blkno = 0; u16 suballoc_bit = 0, suballoc_slot = 0; struct inode *inode_alloc_inode; struct buffer_head *alloc_bh = NULL; @@ -2687,7 +2856,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog_entry("blkno: %llu", (unsigned long long)blkno); status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, - &suballoc_bit); + &group_blkno, &suballoc_bit); if (status < 0) { mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); goto bail; @@ -2715,7 +2884,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) } status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, - blkno, suballoc_bit, res); + group_blkno, blkno, suballoc_bit, res); if (status < 0) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a017dd3ee7d9..b8afabfeede4 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -56,6 +56,9 @@ struct ocfs2_alloc_context { u64 ac_max_block; /* Highest block number to allocate. 0 is is the same as ~0 - unlimited */ + int ac_find_loc_only; /* hack for reflink operation ordering */ + struct ocfs2_suballoc_result *ac_find_loc_priv; /* */ + struct ocfs2_alloc_reservation *ac_resv; }; @@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, struct ocfs2_alloc_context **meta_ac); int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); + + + +/* + * The following two interfaces are for ocfs2_create_inode_in_orphan(). + */ +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno); + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno); + #endif /* _CHAINALLOC_H_ */ |