diff options
Diffstat (limited to 'fs/ocfs2')
35 files changed, 4832 insertions, 2313 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f27e5378caf2..a0c8667caa72 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -27,6 +27,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/swap.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -34,6 +35,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "aops.h" #include "dlmglue.h" #include "extent_map.h" #include "inode.h" @@ -47,63 +49,243 @@ #include "buffer_head_io.h" -static int ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - u64 blkno); +static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); -static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - int wanted, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head *bhs[]); +/* + * Structures which describe a path through a btree, and functions to + * manipulate them. + * + * The idea here is to be as generic as possible with the tree + * manipulation code. + */ +struct ocfs2_path_item { + struct buffer_head *bh; + struct ocfs2_extent_list *el; +}; -static int ocfs2_add_branch(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - struct buffer_head *eb_bh, - struct buffer_head *last_eb_bh, - struct ocfs2_alloc_context *meta_ac); +#define OCFS2_MAX_PATH_DEPTH 5 -static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - struct ocfs2_alloc_context *meta_ac, - struct buffer_head **ret_new_eb_bh); +struct ocfs2_path { + int p_tree_depth; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; +}; -static int ocfs2_do_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 blkno, - u32 new_clusters); +#define path_root_bh(_path) ((_path)->p_node[0].bh) +#define path_root_el(_path) ((_path)->p_node[0].el) +#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) +#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) +#define path_num_items(_path) ((_path)->p_tree_depth + 1) -static int ocfs2_find_branch_target(struct ocfs2_super *osb, - struct inode *inode, - struct buffer_head *fe_bh, - struct buffer_head **target_bh); +/* + * Reset the actual path elements so that we can re-use the structure + * to build another path. Generally, this involves freeing the buffer + * heads. + */ +static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) +{ + int i, start = 0, depth = 0; + struct ocfs2_path_item *node; -static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_dinode *fe, - unsigned int new_i_clusters, - struct buffer_head *old_last_eb, - struct buffer_head **new_last_eb); + if (keep_root) + start = 1; + + for(i = start; i < path_num_items(path); i++) { + node = &path->p_node[i]; + + brelse(node->bh); + node->bh = NULL; + node->el = NULL; + } + + /* + * Tree depth may change during truncate, or insert. If we're + * keeping the root extent list, then make sure that our path + * structure reflects the proper depth. + */ + if (keep_root) + depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + + path->p_tree_depth = depth; +} + +static void ocfs2_free_path(struct ocfs2_path *path) +{ + if (path) { + ocfs2_reinit_path(path, 0); + kfree(path); + } +} + +/* + * Make the *dest path the same as src and re-initialize src path to + * have a root only. + */ +static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) +{ + int i; + + BUG_ON(path_root_bh(dest) != path_root_bh(src)); + + for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { + brelse(dest->p_node[i].bh); + + dest->p_node[i].bh = src->p_node[i].bh; + dest->p_node[i].el = src->p_node[i].el; + + src->p_node[i].bh = NULL; + src->p_node[i].el = NULL; + } +} + +/* + * Insert an extent block at given index. + * + * This will not take an additional reference on eb_bh. + */ +static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, + struct buffer_head *eb_bh) +{ + struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; + + /* + * Right now, no root bh is an extent block, so this helps + * catch code errors with dinode trees. The assertion can be + * safely removed if we ever need to insert extent block + * structures at the root. + */ + BUG_ON(index == 0); + + path->p_node[index].bh = eb_bh; + path->p_node[index].el = &eb->h_list; +} + +static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, + struct ocfs2_extent_list *root_el) +{ + struct ocfs2_path *path; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); + + path = kzalloc(sizeof(*path), GFP_NOFS); + if (path) { + path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); + get_bh(root_bh); + path_root_bh(path) = root_bh; + path_root_el(path) = root_el; + } + + return path; +} + +/* + * Allocate and initialize a new path based on a disk inode tree. + */ +static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_list *el = &di->id2.i_list; + + return ocfs2_new_path(di_bh, el); +} + +/* + * Convenience function to journal all components in a path. + */ +static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, + struct ocfs2_path *path) +{ + int i, ret = 0; + + if (!path) + goto out; + + for(i = 0; i < path_num_items(path); i++) { + ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + +out: + return ret; +} + +enum ocfs2_contig_type { + CONTIG_NONE = 0, + CONTIG_LEFT, + CONTIG_RIGHT +}; -static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); -static int ocfs2_extent_contig(struct inode *inode, - struct ocfs2_extent_rec *ext, - u64 blkno) +/* + * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and + * ocfs2_extent_contig only work properly against leaf nodes! + */ +static int ocfs2_block_extent_contig(struct super_block *sb, + struct ocfs2_extent_rec *ext, + u64 blkno) +{ + u64 blk_end = le64_to_cpu(ext->e_blkno); + + blk_end += ocfs2_clusters_to_blocks(sb, + le16_to_cpu(ext->e_leaf_clusters)); + + return blkno == blk_end; +} + +static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, + struct ocfs2_extent_rec *right) +{ + u32 left_range; + + left_range = le32_to_cpu(left->e_cpos) + + le16_to_cpu(left->e_leaf_clusters); + + return (left_range == le32_to_cpu(right->e_cpos)); +} + +static enum ocfs2_contig_type + ocfs2_extent_contig(struct inode *inode, + struct ocfs2_extent_rec *ext, + struct ocfs2_extent_rec *insert_rec) { - return blkno == (le64_to_cpu(ext->e_blkno) + - ocfs2_clusters_to_blocks(inode->i_sb, - le32_to_cpu(ext->e_clusters))); + u64 blkno = le64_to_cpu(insert_rec->e_blkno); + + if (ocfs2_extents_adjacent(ext, insert_rec) && + ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) + return CONTIG_RIGHT; + + blkno = le64_to_cpu(ext->e_blkno); + if (ocfs2_extents_adjacent(insert_rec, ext) && + ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) + return CONTIG_LEFT; + + return CONTIG_NONE; } /* + * NOTE: We can have pretty much any combination of contiguousness and + * appending. + * + * The usefulness of APPEND_TAIL is more in that it lets us know that + * we'll have to update the path to that leaf. + */ +enum ocfs2_append_type { + APPEND_NONE = 0, + APPEND_TAIL, +}; + +struct ocfs2_insert_type { + enum ocfs2_append_type ins_appending; + enum ocfs2_contig_type ins_contig; + int ins_contig_index; + int ins_free_records; + int ins_tree_depth; +}; + +/* * How many free extents have we got before we need more meta data? */ int ocfs2_num_free_extents(struct ocfs2_super *osb, @@ -242,6 +424,28 @@ bail: } /* + * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). + * + * Returns the sum of the rightmost extent rec logical offset and + * cluster count. + * + * ocfs2_add_branch() uses this to determine what logical cluster + * value should be populated into the leftmost new branch records. + * + * ocfs2_shift_tree_depth() uses this to determine the # clusters + * value for the new topmost tree record. + */ +static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) +{ + int i; + + i = le16_to_cpu(el->l_next_free_rec) - 1; + + return le32_to_cpu(el->l_recs[i].e_cpos) + + ocfs2_rec_clusters(el, &el->l_recs[i]); +} + +/* * Add an entire tree branch to our inode. eb_bh is the extent block * to start at, if we don't want to start the branch at the dinode * structure. @@ -250,7 +454,7 @@ bail: * for the new last extent block. * * the new branch will be 'empty' in the sense that every block will - * contain a single record with e_clusters == 0. + * contain a single record with cluster count == 0. */ static int ocfs2_add_branch(struct ocfs2_super *osb, handle_t *handle, @@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; + u32 new_cpos; mlog_entry_void(); @@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, goto bail; } + eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; + new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); + /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be * linked with the rest of the tree. * conversly, new_eb_bhs[0] is the new bottommost leaf. @@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, eb->h_next_leaf_blk = 0; eb_el->l_tree_depth = cpu_to_le16(i); eb_el->l_next_free_rec = cpu_to_le16(1); - eb_el->l_recs[0].e_cpos = fe->i_clusters; + /* + * This actually counts as an empty extent as + * c_clusters == 0 + */ + eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); - eb_el->l_recs[0].e_clusters = cpu_to_le32(0); + /* + * eb_el isn't always an interior node, but even leaf + * nodes want a zero'd flags and reserved field so + * this gets the whole 32 bits regardless of use. + */ + eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); if (!eb_el->l_tree_depth) new_last_eb_blk = le64_to_cpu(eb->h_blkno); @@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * either be on the fe, or the extent block passed in. */ i = le16_to_cpu(el->l_next_free_rec); el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); - el->l_recs[i].e_cpos = fe->i_clusters; - el->l_recs[i].e_clusters = 0; + el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); + el->l_recs[i].e_int_clusters = 0; le16_add_cpu(&el->l_next_free_rec, 1); /* fe needs a new last extent block pointer, as does the @@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, struct buffer_head **ret_new_eb_bh) { int status, i; + u32 new_clusters; struct buffer_head *new_eb_bh = NULL; struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; @@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, /* copy the fe data into the new extent block */ eb_el->l_tree_depth = fe_el->l_tree_depth; eb_el->l_next_free_rec = fe_el->l_next_free_rec; - for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { - eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos; - eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters; - eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno; - } + for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) + eb_el->l_recs[i] = fe_el->l_recs[i]; status = ocfs2_journal_dirty(handle, new_eb_bh); if (status < 0) { @@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } + new_clusters = ocfs2_sum_rightmost_rec(eb_el); + /* update fe now */ le16_add_cpu(&fe_el->l_tree_depth, 1); fe_el->l_recs[0].e_cpos = 0; fe_el->l_recs[0].e_blkno = eb->h_blkno; - fe_el->l_recs[0].e_clusters = fe->i_clusters; - for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) { - fe_el->l_recs[i].e_cpos = 0; - fe_el->l_recs[i].e_clusters = 0; - fe_el->l_recs[i].e_blkno = 0; - } + fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) + memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); fe_el->l_next_free_rec = cpu_to_le16(1); /* If this is our 1st tree depth shift, then last_eb_blk @@ -515,199 +729,6 @@ bail: } /* - * Expects the tree to already have room in the rightmost leaf for the - * extent. Updates all the extent blocks (and the dinode) on the way - * down. - */ -static int ocfs2_do_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 start_blk, - u32 new_clusters) -{ - int status, i, num_bhs = 0; - u64 next_blkno; - u16 next_free; - struct buffer_head **eb_bhs = NULL; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; - - mlog_entry_void(); - - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; - if (el->l_tree_depth) { - /* This is another operation where we want to be - * careful about our tree updates. An error here means - * none of the previous changes we made should roll - * forward. As a result, we have to record the buffers - * for this part of the tree in an array and reserve a - * journal write to them before making any changes. */ - num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth); - eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *), - GFP_KERNEL); - if (!eb_bhs) { - status = -ENOMEM; - mlog_errno(status); - goto bail; - } - - i = 0; - while(el->l_tree_depth) { - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = -EIO; - goto bail; - } - next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno); - - BUG_ON(i >= num_bhs); - status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i], - OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, - eb); - status = -EIO; - goto bail; - } - - status = ocfs2_journal_access(handle, inode, eb_bhs[i], - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } - - el = &eb->h_list; - i++; - /* When we leave this loop, eb_bhs[num_bhs - 1] will - * hold the bottom-most leaf extent block. */ - } - BUG_ON(el->l_tree_depth); - - el = &fe->id2.i_list; - /* If we have tree depth, then the fe update is - * trivial, and we want to switch el out for the - * bottom-most leaf in order to update it with the - * actual extent data below. */ - next_free = le16_to_cpu(el->l_next_free_rec); - if (next_free == 0) { - ocfs2_error(inode->i_sb, - "Dinode %llu has a bad extent list", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = -EIO; - goto bail; - } - le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, - new_clusters); - /* (num_bhs - 1) to avoid the leaf */ - for(i = 0; i < (num_bhs - 1); i++) { - eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data; - el = &eb->h_list; - - /* finally, make our actual change to the - * intermediate extent blocks. */ - next_free = le16_to_cpu(el->l_next_free_rec); - le32_add_cpu(&el->l_recs[next_free - 1].e_clusters, - new_clusters); - - status = ocfs2_journal_dirty(handle, eb_bhs[i]); - if (status < 0) - mlog_errno(status); - } - BUG_ON(i != (num_bhs - 1)); - /* note that the leaf block wasn't touched in - * the loop above */ - eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data; - el = &eb->h_list; - BUG_ON(el->l_tree_depth); - } - - /* yay, we can finally add the actual extent now! */ - i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le16_to_cpu(el->l_next_free_rec) && - ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) { - le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters); - } else if (le16_to_cpu(el->l_next_free_rec) && - (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) { - /* having an empty extent at eof is legal. */ - if (el->l_recs[i].e_cpos != fe->i_clusters) { - ocfs2_error(inode->i_sb, - "Dinode %llu trailing extent is bad: " - "cpos (%u) != number of clusters (%u)", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - le32_to_cpu(el->l_recs[i].e_cpos), - le32_to_cpu(fe->i_clusters)); - status = -EIO; - goto bail; - } - el->l_recs[i].e_blkno = cpu_to_le64(start_blk); - el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); - } else { - /* No contiguous record, or no empty record at eof, so - * we add a new one. */ - - BUG_ON(le16_to_cpu(el->l_next_free_rec) >= - le16_to_cpu(el->l_count)); - i = le16_to_cpu(el->l_next_free_rec); - - el->l_recs[i].e_blkno = cpu_to_le64(start_blk); - el->l_recs[i].e_clusters = cpu_to_le32(new_clusters); - el->l_recs[i].e_cpos = fe->i_clusters; - le16_add_cpu(&el->l_next_free_rec, 1); - } - - /* - * extent_map errors are not fatal, so they are ignored outside - * of flushing the thing. - */ - status = ocfs2_extent_map_append(inode, &el->l_recs[i], - new_clusters); - if (status) { - mlog_errno(status); - ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters)); - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) - mlog_errno(status); - if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]); - if (status < 0) - mlog_errno(status); - } - - status = 0; -bail: - if (eb_bhs) { - for (i = 0; i < num_bhs; i++) - if (eb_bhs[i]) - brelse(eb_bhs[i]); - kfree(eb_bhs); - } - - mlog_exit(status); - return status; -} - -/* * Should only be called when there is no space left in any of the * leaf nodes. What we want to do is find the lowest tree depth * non-leaf extent block with room for new records. There are three @@ -807,53 +828,1548 @@ bail: return status; } -/* the caller needs to update fe->i_clusters */ -int ocfs2_insert_extent(struct ocfs2_super *osb, - handle_t *handle, - struct inode *inode, - struct buffer_head *fe_bh, - u64 start_blk, - u32 new_clusters, - struct ocfs2_alloc_context *meta_ac) +/* + * This is only valid for leaf nodes, which are the only ones that can + * have empty extents anyway. + */ +static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) { - int status, i, shift; - struct buffer_head *last_eb_bh = NULL; + return !rec->e_leaf_clusters; +} + +/* + * This function will discard the rightmost extent record. + */ +static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + int count = le16_to_cpu(el->l_count); + unsigned int num_bytes; + + BUG_ON(!next_free); + /* This will cause us to go off the end of our extent list. */ + BUG_ON(next_free >= count); + + num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; + + memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); +} + +static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i, insert_index, next_free, has_empty, num_bytes; + u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + next_free = le16_to_cpu(el->l_next_free_rec); + has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); + + BUG_ON(!next_free); + + /* The tree code before us didn't allow enough room in the leaf. */ + if (el->l_next_free_rec == el->l_count && !has_empty) + BUG(); + + /* + * The easiest way to approach this is to just remove the + * empty extent and temporarily decrement next_free. + */ + if (has_empty) { + /* + * If next_free was 1 (only an empty extent), this + * loop won't execute, which is fine. We still want + * the decrement above to happen. + */ + for(i = 0; i < (next_free - 1); i++) + el->l_recs[i] = el->l_recs[i+1]; + + next_free--; + } + + /* + * Figure out what the new record index should be. + */ + for(i = 0; i < next_free; i++) { + rec = &el->l_recs[i]; + + if (insert_cpos < le32_to_cpu(rec->e_cpos)) + break; + } + insert_index = i; + + mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", + insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); + + BUG_ON(insert_index < 0); + BUG_ON(insert_index >= le16_to_cpu(el->l_count)); + BUG_ON(insert_index > next_free); + + /* + * No need to memmove if we're just adding to the tail. + */ + if (insert_index != next_free) { + BUG_ON(next_free >= le16_to_cpu(el->l_count)); + + num_bytes = next_free - insert_index; + num_bytes *= sizeof(struct ocfs2_extent_rec); + memmove(&el->l_recs[insert_index + 1], + &el->l_recs[insert_index], + num_bytes); + } + + /* + * Either we had an empty extent, and need to re-increment or + * there was no empty extent on a non full rightmost leaf node, + * in which case we still need to increment. + */ + next_free++; + el->l_next_free_rec = cpu_to_le16(next_free); + /* + * Make sure none of the math above just messed up our tree. + */ + BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); + + el->l_recs[insert_index] = *insert_rec; + +} + +/* + * Create an empty extent record . + * + * l_next_free_rec may be updated. + * + * If an empty extent already exists do nothing. + */ +static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) +{ + int next_free = le16_to_cpu(el->l_next_free_rec); + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (next_free == 0) + goto set_and_inc; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) + return; + + mlog_bug_on_msg(el->l_count == el->l_next_free_rec, + "Asked to create an empty extent in a full list:\n" + "count = %u, tree depth = %u", + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_tree_depth)); + + ocfs2_shift_records_right(el); + +set_and_inc: + le16_add_cpu(&el->l_next_free_rec, 1); + memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); +} + +/* + * For a rotation which involves two leaf nodes, the "root node" is + * the lowest level tree node which contains a path to both leafs. This + * resulting set of information can be used to form a complete "subtree" + * + * This function is passed two full paths from the dinode down to a + * pair of adjacent leaves. It's task is to figure out which path + * index contains the subtree root - this can be the root index itself + * in a worst-case rotation. + * + * The array index of the subtree root is passed back. + */ +static int ocfs2_find_subtree_root(struct inode *inode, + struct ocfs2_path *left, + struct ocfs2_path *right) +{ + int i = 0; + + /* + * Check that the caller passed in two paths from the same tree. + */ + BUG_ON(path_root_bh(left) != path_root_bh(right)); + + do { + i++; + + /* + * The caller didn't pass two adjacent paths. + */ + mlog_bug_on_msg(i > left->p_tree_depth, + "Inode %lu, left depth %u, right depth %u\n" + "left leaf blk %llu, right leaf blk %llu\n", + inode->i_ino, left->p_tree_depth, + right->p_tree_depth, + (unsigned long long)path_leaf_bh(left)->b_blocknr, + (unsigned long long)path_leaf_bh(right)->b_blocknr); + } while (left->p_node[i].bh->b_blocknr == + right->p_node[i].bh->b_blocknr); + + return i - 1; +} + +typedef void (path_insert_t)(void *, struct buffer_head *); + +/* + * Traverse a btree path in search of cpos, starting at root_el. + * + * This code can be called with a cpos larger than the tree, in which + * case it will return the rightmost path. + */ +static int __ocfs2_find_path(struct inode *inode, + struct ocfs2_extent_list *root_el, u32 cpos, + path_insert_t *func, void *data) +{ + int i, ret = 0; + u32 range; + u64 blkno; struct buffer_head *bh = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *el; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + struct ocfs2_inode_info *oi = OCFS2_I(inode); - mlog_entry_void(); + el = root_el; + while (el->l_tree_depth) { + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent list at " + "depth %u\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; - mlog(0, "add %u clusters starting at block %llu to inode %llu\n", - new_clusters, (unsigned long long)start_blk, - (unsigned long long)OCFS2_I(inode)->ip_blkno); + } - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; + for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { + rec = &el->l_recs[i]; + + /* + * In the case that cpos is off the allocation + * tree, this should just wind up returning the + * rightmost record. + */ + range = le32_to_cpu(rec->e_cpos) + + ocfs2_rec_clusters(el, rec); + if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) + break; + } - if (el->l_tree_depth) { - /* jump to end of tree */ - status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); - if (status < 0) { - mlog_exit(status); - goto bail; + blkno = le64_to_cpu(el->l_recs[i].e_blkno); + if (blkno == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has bad blkno in extent list " + "at depth %u (index %d)\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(el->l_tree_depth), i); + ret = -EROFS; + goto out; } - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + + brelse(bh); + bh = NULL; + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, + &bh, OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EIO; + goto out; + } + + if (le16_to_cpu(el->l_next_free_rec) > + le16_to_cpu(el->l_count)) { + ocfs2_error(inode->i_sb, + "Inode %llu has bad count in extent list " + "at block %llu (next free=%u, count=%u)\n", + (unsigned long long)oi->ip_blkno, + (unsigned long long)bh->b_blocknr, + le16_to_cpu(el->l_next_free_rec), + le16_to_cpu(el->l_count)); + ret = -EROFS; + goto out; + } + + if (func) + func(data, bh); + } + +out: + /* + * Catch any trailing bh that the loop didn't handle. + */ + brelse(bh); + + return ret; +} + +/* + * Given an initialized path (that is, it has a valid root extent + * list), this function will traverse the btree in search of the path + * which would contain cpos. + * + * The path traveled is recorded in the path structure. + * + * Note that this will not do any comparisons on leaf node extent + * records, so it will work fine in the case that we just added a tree + * branch. + */ +struct find_path_data { + int index; + struct ocfs2_path *path; +}; +static void find_path_ins(void *data, struct buffer_head *bh) +{ + struct find_path_data *fp = data; + + get_bh(bh); + ocfs2_path_insert_eb(fp->path, fp->index, bh); + fp->index++; +} +static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, + u32 cpos) +{ + struct find_path_data data; + + data.index = 1; + data.path = path; + return __ocfs2_find_path(inode, path_root_el(path), cpos, + find_path_ins, &data); +} + +static void find_leaf_ins(void *data, struct buffer_head *bh) +{ + struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; + struct ocfs2_extent_list *el = &eb->h_list; + struct buffer_head **ret = data; + + /* We want to retain only the leaf block. */ + if (le16_to_cpu(el->l_tree_depth) == 0) { + get_bh(bh); + *ret = bh; + } +} +/* + * Find the leaf block in the tree which would contain cpos. No + * checking of the actual leaf is done. + * + * Some paths want to call this instead of allocating a path structure + * and calling ocfs2_find_path(). + * + * This function doesn't handle non btree extent lists. + */ +int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, + u32 cpos, struct buffer_head **leaf_bh) +{ + int ret; + struct buffer_head *bh = NULL; + + ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + *leaf_bh = bh; +out: + return ret; +} + +/* + * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. + * + * Basically, we've moved stuff around at the bottom of the tree and + * we need to fix up the extent records above the changes to reflect + * the new changes. + * + * left_rec: the record on the left. + * left_child_el: is the child list pointed to by left_rec + * right_rec: the record to the right of left_rec + * right_child_el: is the child list pointed to by right_rec + * + * By definition, this only works on interior nodes. + */ +static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, + struct ocfs2_extent_list *left_child_el, + struct ocfs2_extent_rec *right_rec, + struct ocfs2_extent_list *right_child_el) +{ + u32 left_clusters, right_end; + + /* + * Interior nodes never have holes. Their cpos is the cpos of + * the leftmost record in their child list. Their cluster + * count covers the full theoretical range of their child list + * - the range between their cpos and the cpos of the record + * immediately to their right. + */ + left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); + left_clusters -= le32_to_cpu(left_rec->e_cpos); + left_rec->e_int_clusters = cpu_to_le32(left_clusters); + + /* + * Calculate the rightmost cluster count boundary before + * moving cpos - we will need to adjust clusters after + * updating e_cpos to keep the same highest cluster count. + */ + right_end = le32_to_cpu(right_rec->e_cpos); + right_end += le32_to_cpu(right_rec->e_int_clusters); + + right_rec->e_cpos = left_rec->e_cpos; + le32_add_cpu(&right_rec->e_cpos, left_clusters); + + right_end -= le32_to_cpu(right_rec->e_cpos); + right_rec->e_int_clusters = cpu_to_le32(right_end); +} + +/* + * Adjust the adjacent root node records involved in a + * rotation. left_el_blkno is passed in as a key so that we can easily + * find it's index in the root list. + */ +static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, + struct ocfs2_extent_list *left_el, + struct ocfs2_extent_list *right_el, + u64 left_el_blkno) +{ + int i; + + BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= + le16_to_cpu(left_el->l_tree_depth)); + + for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { + if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) + break; + } + + /* + * The path walking code should have never returned a root and + * two paths which are not adjacent. + */ + BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); + + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + &root_el->l_recs[i + 1], right_el); +} + +/* + * We've changed a leaf block (in right_path) and need to reflect that + * change back up the subtree. + * + * This happens in multiple places: + * - When we've moved an extent record from the left path leaf to the right + * path leaf to make room for an empty extent in the left path leaf. + * - When our insert into the right path leaf is at the leftmost edge + * and requires an update of the path immediately to it's left. This + * can occur at the end of some types of rotation and appending inserts. + */ +static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int ret, i, idx; + struct ocfs2_extent_list *el, *left_el, *right_el; + struct ocfs2_extent_rec *left_rec, *right_rec; + struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; + + /* + * Update the counts and position values within all the + * interior nodes to reflect the leaf rotation we just did. + * + * The root node is handled below the loop. + * + * We begin the loop with right_el and left_el pointing to the + * leaf lists and work our way up. + * + * NOTE: within this loop, left_el and right_el always refer + * to the *child* lists. + */ + left_el = path_leaf_el(left_path); + right_el = path_leaf_el(right_path); + for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { + mlog(0, "Adjust records at index %u\n", i); + + /* + * One nice property of knowing that all of these + * nodes are below the root is that we only deal with + * the leftmost right node record and the rightmost + * left node record. + */ + el = left_path->p_node[i].el; + idx = le16_to_cpu(left_el->l_next_free_rec) - 1; + left_rec = &el->l_recs[idx]; + + el = right_path->p_node[i].el; + right_rec = &el->l_recs[0]; + + ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, + right_el); + + ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); + if (ret) + mlog_errno(ret); + + ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); + if (ret) + mlog_errno(ret); + + /* + * Setup our list pointers now so that the current + * parents become children in the next iteration. + */ + left_el = left_path->p_node[i].el; + right_el = right_path->p_node[i].el; + } + + /* + * At the root node, adjust the two adjacent records which + * begin our path to the leaves. + */ + + el = left_path->p_node[subtree_index].el; + left_el = left_path->p_node[subtree_index + 1].el; + right_el = right_path->p_node[subtree_index + 1].el; + + ocfs2_adjust_root_records(el, left_el, right_el, + left_path->p_node[subtree_index + 1].bh->b_blocknr); + + root_bh = left_path->p_node[subtree_index].bh; + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) + mlog_errno(ret); +} + +static int ocfs2_rotate_subtree_right(struct inode *inode, + handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + int subtree_index) +{ + int ret, i; + struct buffer_head *right_leaf_bh; + struct buffer_head *left_leaf_bh = NULL; + struct buffer_head *root_bh; + struct ocfs2_extent_list *right_el, *left_el; + struct ocfs2_extent_rec move_rec; + + left_leaf_bh = path_leaf_bh(left_path); + left_el = path_leaf_el(left_path); + + if (left_el->l_next_free_rec != left_el->l_count) { + ocfs2_error(inode->i_sb, + "Inode %llu has non-full interior leaf node %llu" + "(next free = %u)", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)left_leaf_bh->b_blocknr, + le16_to_cpu(left_el->l_next_free_rec)); + return -EROFS; + } + + /* + * This extent block may already have an empty record, so we + * return early if so. + */ + if (ocfs2_is_empty_extent(&left_el->l_recs[0])) + return 0; + + root_bh = left_path->p_node[subtree_index].bh; + BUG_ON(root_bh != right_path->p_node[subtree_index].bh); + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for(i = subtree_index + 1; i < path_num_items(right_path); i++) { + ret = ocfs2_journal_access(handle, inode, + right_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, + left_path->p_node[i].bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + right_leaf_bh = path_leaf_bh(right_path); + right_el = path_leaf_el(right_path); + + /* This is a code error, not a disk corruption. */ + mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " + "because rightmost leaf block %llu is empty\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)right_leaf_bh->b_blocknr); + + ocfs2_create_empty_extent(right_el); + + ret = ocfs2_journal_dirty(handle, right_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Do the copy now. */ + i = le16_to_cpu(left_el->l_next_free_rec) - 1; + move_rec = left_el->l_recs[i]; + right_el->l_recs[0] = move_rec; + + /* + * Clear out the record we just copied and shift everything + * over, leaving an empty extent in the left leaf. + * + * We temporarily subtract from next_free_rec so that the + * shift will lose the tail record (which is now defunct). + */ + le16_add_cpu(&left_el->l_next_free_rec, -1); + ocfs2_shift_records_right(left_el); + memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&left_el->l_next_free_rec, 1); + + ret = ocfs2_journal_dirty(handle, left_leaf_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + ocfs2_complete_edge_insert(inode, handle, left_path, right_path, + subtree_index); + +out: + return ret; +} + +/* + * Given a full path, determine what cpos value would return us a path + * containing the leaf immediately to the left of the current one. + * + * Will return zero if the path passed in is already the leftmost path. + */ +static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) +{ + int i, j, ret = 0; + u64 blkno; + struct ocfs2_extent_list *el; + + BUG_ON(path->p_tree_depth == 0); + + *cpos = 0; + + blkno = path_leaf_bh(path)->b_blocknr; + + /* Start at the tree node just above the leaf and work our way up. */ + i = path->p_tree_depth - 1; + while (i >= 0) { + el = path->p_node[i].el; + + /* + * Find the extent record just before the one in our + * path. + */ + for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { + if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { + if (j == 0) { + if (i == 0) { + /* + * We've determined that the + * path specified is already + * the leftmost one - return a + * cpos of zero. + */ + goto out; + } + /* + * The leftmost record points to our + * leaf - we need to travel up the + * tree one level. + */ + goto next_node; + } + + *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); + *cpos = *cpos + ocfs2_rec_clusters(el, + &el->l_recs[j - 1]); + *cpos = *cpos - 1; + goto out; + } + } + + /* + * If we got here, we never found a valid node where + * the tree indicated one should be. + */ + ocfs2_error(sb, + "Invalid extent tree at extent block %llu\n", + (unsigned long long)blkno); + ret = -EROFS; + goto out; + +next_node: + blkno = path->p_node[i].bh->b_blocknr; + i--; + } + +out: + return ret; +} + +static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, + struct ocfs2_path *path) +{ + int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; + + if (handle->h_buffer_credits < credits) + return ocfs2_extend_trans(handle, credits); + + return 0; +} + +/* + * Trap the case where we're inserting into the theoretical range past + * the _actual_ left leaf range. Otherwise, we'll rotate a record + * whose cpos is less than ours into the right leaf. + * + * It's only necessary to look at the rightmost record of the left + * leaf because the logic that calls us should ensure that the + * theoretical ranges in the path components above the leaves are + * correct. + */ +static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, + u32 insert_cpos) +{ + struct ocfs2_extent_list *left_el; + struct ocfs2_extent_rec *rec; + int next_free; + + left_el = path_leaf_el(left_path); + next_free = le16_to_cpu(left_el->l_next_free_rec); + rec = &left_el->l_recs[next_free - 1]; + + if (insert_cpos > le32_to_cpu(rec->e_cpos)) + return 1; + return 0; +} + +/* + * Rotate all the records in a btree right one record, starting at insert_cpos. + * + * The path to the rightmost leaf should be passed in. + * + * The array is assumed to be large enough to hold an entire path (tree depth). + * + * Upon succesful return from this function: + * + * - The 'right_path' array will contain a path to the leaf block + * whose range contains e_cpos. + * - That leaf block will have a single empty extent in list index 0. + * - In the case that the rotation requires a post-insert update, + * *ret_left_path will contain a valid path which can be passed to + * ocfs2_insert_path(). + */ +static int ocfs2_rotate_tree_right(struct inode *inode, + handle_t *handle, + u32 insert_cpos, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, start; + u32 cpos; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + left_path = ocfs2_new_path(path_root_bh(right_path), + path_root_el(right_path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); + + /* + * What we want to do here is: + * + * 1) Start with the rightmost path. + * + * 2) Determine a path to the leaf block directly to the left + * of that leaf. + * + * 3) Determine the 'subtree root' - the lowest level tree node + * which contains a path to both leaves. + * + * 4) Rotate the subtree. + * + * 5) Find the next subtree by considering the left path to be + * the new right path. + * + * The check at the top of this while loop also accepts + * insert_cpos == cpos because cpos is only a _theoretical_ + * value to get us the left path - insert_cpos might very well + * be filling that hole. + * + * Stop at a cpos of '0' because we either started at the + * leftmost branch (i.e., a tree with one branch and a + * rotation inside of it), or we've gone as far as we can in + * rotating subtrees. + */ + while (cpos && insert_cpos <= cpos) { + mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", + insert_cpos, cpos); + + ret = ocfs2_find_path(inode, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog_bug_on_msg(path_leaf_bh(left_path) == + path_leaf_bh(right_path), + "Inode %lu: error during insert of %u " + "(left path cpos %u) results in two identical " + "paths ending at %llu\n", + inode->i_ino, insert_cpos, cpos, + (unsigned long long) + path_leaf_bh(left_path)->b_blocknr); + + if (ocfs2_rotate_requires_path_adjustment(left_path, + insert_cpos)) { + mlog(0, "Path adjustment required\n"); + + /* + * We've rotated the tree as much as we + * should. The rest is up to + * ocfs2_insert_path() to complete, after the + * record insertion. We indicate this + * situation by returning the left path. + * + * The reason we don't adjust the records here + * before the record insert is that an error + * later might break the rule where a parent + * record e_cpos will reflect the actual + * e_cpos of the 1st nonempty record of the + * child list. + */ + *ret_left_path = left_path; + goto out_ret_path; + } + + start = ocfs2_find_subtree_root(inode, left_path, right_path); + + mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", + start, + (unsigned long long) right_path->p_node[start].bh->b_blocknr, + right_path->p_tree_depth); + + ret = ocfs2_extend_rotate_transaction(handle, start, + right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rotate_subtree_right(inode, handle, left_path, + right_path, start); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * There is no need to re-read the next right path + * as we know that it'll be our current left + * path. Optimize by copying values instead. + */ + ocfs2_mv_path(right_path, left_path); + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, + &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + } + +out: + ocfs2_free_path(left_path); + +out_ret_path: + return ret; +} + +/* + * Do the final bits of extent record insertion at the target leaf + * list. If this leaf is part of an allocation tree, it is assumed + * that the tree above has been prepared. + */ +static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_list *el, + struct ocfs2_insert_type *insert, + struct inode *inode) +{ + int i = insert->ins_contig_index; + unsigned int range; + struct ocfs2_extent_rec *rec; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + /* + * Contiguous insert - either left or right. + */ + if (insert->ins_contig != CONTIG_NONE) { + rec = &el->l_recs[i]; + if (insert->ins_contig == CONTIG_LEFT) { + rec->e_blkno = insert_rec->e_blkno; + rec->e_cpos = insert_rec->e_cpos; + } + le16_add_cpu(&rec->e_leaf_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + return; + } + + /* + * Handle insert into an empty leaf. + */ + if (le16_to_cpu(el->l_next_free_rec) == 0 || + ((le16_to_cpu(el->l_next_free_rec) == 1) && + ocfs2_is_empty_extent(&el->l_recs[0]))) { + el->l_recs[0] = *insert_rec; + el->l_next_free_rec = cpu_to_le16(1); + return; + } + + /* + * Appending insert. + */ + if (insert->ins_appending == APPEND_TAIL) { + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + range = le32_to_cpu(rec->e_cpos) + + le16_to_cpu(rec->e_leaf_clusters); + BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); + + mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= + le16_to_cpu(el->l_count), + "inode %lu, depth %u, count %u, next free %u, " + "rec.cpos %u, rec.clusters %u, " + "insert.cpos %u, insert.clusters %u\n", + inode->i_ino, + le16_to_cpu(el->l_tree_depth), + le16_to_cpu(el->l_count), + le16_to_cpu(el->l_next_free_rec), + le32_to_cpu(el->l_recs[i].e_cpos), + le16_to_cpu(el->l_recs[i].e_leaf_clusters), + le32_to_cpu(insert_rec->e_cpos), + le16_to_cpu(insert_rec->e_leaf_clusters)); + i++; + el->l_recs[i] = *insert_rec; + le16_add_cpu(&el->l_next_free_rec, 1); + return; + } + + /* + * Ok, we have to rotate. + * + * At this point, it is safe to assume that inserting into an + * empty leaf and appending to a leaf have both been handled + * above. + * + * This leaf needs to have space, either by the empty 1st + * extent record, or by virtue of an l_next_rec < l_count. + */ + ocfs2_rotate_leaf(el, insert_rec); +} + +static inline void ocfs2_update_dinode_clusters(struct inode *inode, + struct ocfs2_dinode *di, + u32 clusters) +{ + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_path *right_path, + struct ocfs2_path **ret_left_path) +{ + int ret, i, next_free; + struct buffer_head *bh; + struct ocfs2_extent_list *el; + struct ocfs2_path *left_path = NULL; + + *ret_left_path = NULL; + + /* + * This shouldn't happen for non-trees. The extent rec cluster + * count manipulation below only works for interior nodes. + */ + BUG_ON(right_path->p_tree_depth == 0); + + /* + * If our appending insert is at the leftmost edge of a leaf, + * then we might need to update the rightmost records of the + * neighboring path. + */ + el = path_leaf_el(right_path); + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { + u32 left_cpos; + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, + &left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Append may need a left path update. cpos: %u, " + "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), + left_cpos); + + /* + * No need to worry if the append is already in the + * leftmost leaf. + */ + if (left_cpos) { + left_path = ocfs2_new_path(path_root_bh(right_path), + path_root_el(right_path)); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(inode, left_path, left_cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * ocfs2_insert_path() will pass the left_path to the + * journal for us. + */ + } + } + + ret = ocfs2_journal_access_path(inode, handle, right_path); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_root_el(right_path); + bh = path_root_bh(right_path); + i = 0; + while (1) { + struct ocfs2_extent_rec *rec; + + next_free = le16_to_cpu(el->l_next_free_rec); + if (next_free == 0) { + ocfs2_error(inode->i_sb, + "Dinode %llu has a bad extent list", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + ret = -EIO; + goto out; + } + + rec = &el->l_recs[next_free - 1]; + + rec->e_int_clusters = insert_rec->e_cpos; + le32_add_cpu(&rec->e_int_clusters, + le16_to_cpu(insert_rec->e_leaf_clusters)); + le32_add_cpu(&rec->e_int_clusters, + -le32_to_cpu(rec->e_cpos)); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret) + mlog_errno(ret); + + /* Don't touch the leaf node */ + if (++i >= right_path->p_tree_depth) + break; + + bh = right_path->p_node[i].bh; + el = right_path->p_node[i].el; + } + + *ret_left_path = left_path; + ret = 0; +out: + if (ret != 0) + ocfs2_free_path(left_path); + + return ret; +} + +/* + * This function only does inserts on an allocation b-tree. For dinode + * lists, ocfs2_insert_at_leaf() is called directly. + * + * right_path is the path we want to do the actual insert + * in. left_path should only be passed in if we need to update that + * portion of the tree after an edge insert. + */ +static int ocfs2_insert_path(struct inode *inode, + handle_t *handle, + struct ocfs2_path *left_path, + struct ocfs2_path *right_path, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *insert) +{ + int ret, subtree_index; + struct buffer_head *leaf_bh = path_leaf_bh(right_path); + struct ocfs2_extent_list *el; + + /* + * Pass both paths to the journal. The majority of inserts + * will be touching all components anyway. + */ + ret = ocfs2_journal_access_path(inode, handle, right_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (left_path) { + int credits = handle->h_buffer_credits; + + /* + * There's a chance that left_path got passed back to + * us without being accounted for in the + * journal. Extend our transaction here to be sure we + * can change those blocks. + */ + credits += left_path->p_tree_depth; + + ret = ocfs2_extend_trans(handle, credits); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, left_path); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + el = path_leaf_el(right_path); + + ocfs2_insert_at_leaf(insert_rec, el, insert, inode); + ret = ocfs2_journal_dirty(handle, leaf_bh); + if (ret) + mlog_errno(ret); + + if (left_path) { + /* + * The rotate code has indicated that we need to fix + * up portions of the tree after the insert. + * + * XXX: Should we extend the transaction here? + */ + subtree_index = ocfs2_find_subtree_root(inode, left_path, + right_path); + ocfs2_complete_edge_insert(inode, handle, left_path, + right_path, subtree_index); + } + + ret = 0; +out: + return ret; +} + +static int ocfs2_do_insert_extent(struct inode *inode, + handle_t *handle, + struct buffer_head *di_bh, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *type) +{ + int ret, rotate = 0; + u32 cpos; + struct ocfs2_path *right_path = NULL; + struct ocfs2_path *left_path = NULL; + struct ocfs2_dinode *di; + struct ocfs2_extent_list *el; + + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; + + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (le16_to_cpu(el->l_tree_depth) == 0) { + ocfs2_insert_at_leaf(insert_rec, el, type, inode); + goto out_update_clusters; + } + + right_path = ocfs2_new_inode_path(di_bh); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * Determine the path to start with. Rotations need the + * rightmost path, everything else can go directly to the + * target leaf. + */ + cpos = le32_to_cpu(insert_rec->e_cpos); + if (type->ins_appending == APPEND_NONE && + type->ins_contig == CONTIG_NONE) { + rotate = 1; + cpos = UINT_MAX; + } + + ret = ocfs2_find_path(inode, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * Rotations and appends need special treatment - they modify + * parts of the tree's above them. + * + * Both might pass back a path immediate to the left of the + * one being inserted to. This will be cause + * ocfs2_insert_path() to modify the rightmost records of + * left_path to account for an edge insert. + * + * XXX: When modifying this code, keep in mind that an insert + * can wind up skipping both of these two special cases... + */ + if (rotate) { + ret = ocfs2_rotate_tree_right(inode, handle, + le32_to_cpu(insert_rec->e_cpos), + right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (type->ins_appending == APPEND_TAIL + && type->ins_contig != CONTIG_LEFT) { + ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, + right_path, &left_path); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_insert_path(inode, handle, left_path, right_path, + insert_rec, type); + if (ret) { + mlog_errno(ret); + goto out; + } + +out_update_clusters: + ocfs2_update_dinode_clusters(inode, di, + le16_to_cpu(insert_rec->e_leaf_clusters)); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret) + mlog_errno(ret); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + + return ret; +} + +static void ocfs2_figure_contig_type(struct inode *inode, + struct ocfs2_insert_type *insert, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i; + enum ocfs2_contig_type contig_type = CONTIG_NONE; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], + insert_rec); + if (contig_type != CONTIG_NONE) { + insert->ins_contig_index = i; + break; + } + } + insert->ins_contig = contig_type; +} + +/* + * This should only be called against the righmost leaf extent list. + * + * ocfs2_figure_appending_type() will figure out whether we'll have to + * insert at the tail of the rightmost leaf. + * + * This should also work against the dinode list for tree's with 0 + * depth. If we consider the dinode list to be the rightmost leaf node + * then the logic here makes sense. + */ +static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, + struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *insert_rec) +{ + int i; + u32 cpos = le32_to_cpu(insert_rec->e_cpos); + struct ocfs2_extent_rec *rec; + + insert->ins_appending = APPEND_NONE; + + BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); + + if (!el->l_next_free_rec) + goto set_tail_append; + + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + /* Were all records empty? */ + if (le16_to_cpu(el->l_next_free_rec) == 1) + goto set_tail_append; } - /* Can we allocate without adding/shifting tree bits? */ i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le16_to_cpu(el->l_next_free_rec) == 0 - || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) - || le32_to_cpu(el->l_recs[i].e_clusters) == 0 - || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) - goto out_add; + rec = &el->l_recs[i]; + + if (cpos >= + (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) + goto set_tail_append; + + return; + +set_tail_append: + insert->ins_appending = APPEND_TAIL; +} + +/* + * Helper function called at the begining of an insert. + * + * This computes a few things that are commonly used in the process of + * inserting into the btree: + * - Whether the new extent is contiguous with an existing one. + * - The current tree depth. + * - Whether the insert is an appending one. + * - The total # of free records in the tree. + * + * All of the information is stored on the ocfs2_insert_type + * structure. + */ +static int ocfs2_figure_insert_type(struct inode *inode, + struct buffer_head *di_bh, + struct buffer_head **last_eb_bh, + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_insert_type *insert) +{ + int ret; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + struct ocfs2_path *path = NULL; + struct buffer_head *bh = NULL; + + el = &di->id2.i_list; + insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); + + if (el->l_tree_depth) { + /* + * If we have tree depth, we read in the + * rightmost extent block ahead of time as + * ocfs2_figure_insert_type() and ocfs2_add_branch() + * may want it later. + */ + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + le64_to_cpu(di->i_last_eb_blk), &bh, + OCFS2_BH_CACHED, inode); + if (ret) { + mlog_exit(ret); + goto out; + } + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + } + + /* + * Unless we have a contiguous insert, we'll need to know if + * there is room left in our allocation tree for another + * extent record. + * + * XXX: This test is simplistic, we can search for empty + * extent records too. + */ + insert->ins_free_records = le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec); + + if (!insert->ins_tree_depth) { + ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_appending_type(insert, el, insert_rec); + return 0; + } + + path = ocfs2_new_inode_path(di_bh); + if (!path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + /* + * In the case that we're inserting past what the tree + * currently accounts for, ocfs2_find_path() will return for + * us the rightmost tree path. This is accounted for below in + * the appending code. + */ + ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); + if (ret) { + mlog_errno(ret); + goto out; + } + + el = path_leaf_el(path); + + /* + * Now that we have the path, there's two things we want to determine: + * 1) Contiguousness (also set contig_index if this is so) + * + * 2) Are we doing an append? We can trivially break this up + * into two types of appends: simple record append, or a + * rotate inside the tail leaf. + */ + ocfs2_figure_contig_type(inode, insert, el, insert_rec); + + /* + * The insert code isn't quite ready to deal with all cases of + * left contiguousness. Specifically, if it's an insert into + * the 1st record in a leaf, it will require the adjustment of + * cluster count on the last record of the path directly to it's + * left. For now, just catch that case and fool the layers + * above us. This works just fine for tree_depth == 0, which + * is why we allow that above. + */ + if (insert->ins_contig == CONTIG_LEFT && + insert->ins_contig_index == 0) + insert->ins_contig = CONTIG_NONE; + + /* + * Ok, so we can simply compare against last_eb to figure out + * whether the path doesn't exist. This will only happen in + * the case that we're doing a tail append, so maybe we can + * take advantage of that information somehow. + */ + if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { + /* + * Ok, ocfs2_find_path() returned us the rightmost + * tree path. This might be an appending insert. There are + * two cases: + * 1) We're doing a true append at the tail: + * -This might even be off the end of the leaf + * 2) We're "appending" by rotating in the tail + */ + ocfs2_figure_appending_type(insert, el, insert_rec); + } + +out: + ocfs2_free_path(path); + + if (ret == 0) + *last_eb_bh = bh; + else + brelse(bh); + return ret; +} + +/* + * Insert an extent into an inode btree. + * + * The caller needs to update fe->i_clusters + */ +int ocfs2_insert_extent(struct ocfs2_super *osb, + handle_t *handle, + struct inode *inode, + struct buffer_head *fe_bh, + u32 cpos, + u64 start_blk, + u32 new_clusters, + struct ocfs2_alloc_context *meta_ac) +{ + int status, shift; + struct buffer_head *last_eb_bh = NULL; + struct buffer_head *bh = NULL; + struct ocfs2_insert_type insert = {0, }; + struct ocfs2_extent_rec rec; + + mlog(0, "add %u clusters at position %u to inode %llu\n", + new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); + + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (OCFS2_I(inode)->ip_clusters != cpos), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, + OCFS2_I(inode)->ip_clusters); + + memset(&rec, 0, sizeof(rec)); + rec.e_cpos = cpu_to_le32(cpos); + rec.e_blkno = cpu_to_le64(start_blk); + rec.e_leaf_clusters = cpu_to_le16(new_clusters); + + status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, + &insert); + if (status < 0) { + mlog_errno(status); + goto bail; + } - mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing " - "tree now.\n"); + mlog(0, "Insert.appending: %u, Insert.Contig: %u, " + "Insert.contig_index: %d, Insert.free_records: %d, " + "Insert.tree_depth: %d\n", + insert.ins_appending, insert.ins_contig, insert.ins_contig_index, + insert.ins_free_records, insert.ins_tree_depth); + + /* + * Avoid growing the tree unless we're out of records and the + * insert type requres one. + */ + if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) + goto out_add; shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); if (shift < 0) { @@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, * and didn't find room for any more extents - we need to add * another tree level */ if (shift) { - /* if we hit a leaf, we'd better be empty :) */ - BUG_ON(le16_to_cpu(el->l_next_free_rec) != - le16_to_cpu(el->l_count)); BUG_ON(bh); - mlog(0, "ocfs2_allocate_extent: need to shift tree depth " - "(current = %u)\n", - le16_to_cpu(fe->id2.i_list.l_tree_depth)); + mlog(0, "need to shift tree depth " + "(current = %d)\n", insert.ins_tree_depth); /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to @@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, mlog_errno(status); goto bail; } + insert.ins_tree_depth++; /* Special case: we have room now if we shifted from * tree_depth 0 */ - if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1)) + if (insert.ins_tree_depth == 1) goto out_add; } /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ - mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh); + mlog(0, "add branch. bh = %p\n", bh); status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, meta_ac); if (status < 0) { @@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, } out_add: - /* Finally, we can add clusters. */ - status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh, - start_blk, new_clusters); + /* Finally, we can add clusters. This might rotate the tree for us. */ + status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); if (status < 0) mlog_errno(status); + else + ocfs2_extent_map_insert_rec(inode, &rec); bail: if (bh) @@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) * block will be deleted, and if it will, what the new last extent * block will be so we can update his h_next_leaf_blk field, as well * as the dinodes i_last_eb_blk */ -static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb, - struct inode *inode, - struct ocfs2_dinode *fe, - u32 new_i_clusters, - struct buffer_head *old_last_eb, +static int ocfs2_find_new_last_ext_blk(struct inode *inode, + unsigned int clusters_to_del, + struct ocfs2_path *path, struct buffer_head **new_last_eb) { - int i, status = 0; - u64 block = 0; + int next_free, ret = 0; + u32 cpos; + struct ocfs2_extent_rec *rec; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct buffer_head *bh = NULL; *new_last_eb = NULL; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto bail; - } - /* we have no tree, so of course, no last_eb. */ - if (!fe->id2.i_list.l_tree_depth) - goto bail; + if (!path->p_tree_depth) + goto out; /* trunc to zero special case - this makes tree_depth = 0 * regardless of what it is. */ - if (!new_i_clusters) - goto bail; + if (OCFS2_I(inode)->ip_clusters == clusters_to_del) + goto out; - eb = (struct ocfs2_extent_block *) old_last_eb->b_data; - el = &(eb->h_list); + el = path_leaf_el(path); BUG_ON(!el->l_next_free_rec); - /* Make sure that this guy will actually be empty after we - * clear away the data. */ - if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters) - goto bail; + /* + * Make sure that this extent list will actually be empty + * after we clear away the data. We can shortcut out if + * there's more than one non-empty extent in the + * list. Otherwise, a check of the remaining extent is + * necessary. + */ + next_free = le16_to_cpu(el->l_next_free_rec); + rec = NULL; + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + if (next_free > 2) + goto out; - /* Ok, at this point, we know that last_eb will definitely - * change, so lets traverse the tree and find the second to - * last extent block. */ - el = &(fe->id2.i_list); - /* go down the tree, */ - do { - for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) { - if (le32_to_cpu(el->l_recs[i].e_cpos) < - new_i_clusters) { - block = le64_to_cpu(el->l_recs[i].e_blkno); - break; - } + /* We may have a valid extent in index 1, check it. */ + if (next_free == 2) + rec = &el->l_recs[1]; + + /* + * Fall through - no more nonempty extents, so we want + * to delete this leaf. + */ + } else { + if (next_free > 1) + goto out; + + rec = &el->l_recs[0]; + } + + if (rec) { + /* + * Check it we'll only be trimming off the end of this + * cluster. + */ + if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) + goto out; + } + + ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) bh->b_data; + el = &eb->h_list; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + ret = -EROFS; + goto out; + } + + *new_last_eb = bh; + get_bh(*new_last_eb); + mlog(0, "returning block %llu, (cpos: %u)\n", + (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); +out: + brelse(bh); + + return ret; +} + +/* + * Trim some clusters off the rightmost edge of a tree. Only called + * during truncate. + * + * The caller needs to: + * - start journaling of each path component. + * - compute and fully set up any new last ext block + */ +static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, + handle_t *handle, struct ocfs2_truncate_context *tc, + u32 clusters_to_del, u64 *delete_start) +{ + int ret, i, index = path->p_tree_depth; + u32 new_edge = 0; + u64 deleted_eb = 0; + struct buffer_head *bh; + struct ocfs2_extent_list *el; + struct ocfs2_extent_rec *rec; + + *delete_start = 0; + + while (index >= 0) { + bh = path->p_node[index].bh; + el = path->p_node[index].el; + + mlog(0, "traveling tree (index = %d, block = %llu)\n", + index, (unsigned long long)bh->b_blocknr); + + BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); + + if (index != + (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { + ocfs2_error(inode->i_sb, + "Inode %lu has invalid ext. block %llu", + inode->i_ino, + (unsigned long long)bh->b_blocknr); + ret = -EROFS; + goto out; } - BUG_ON(i < 0); - if (bh) { - brelse(bh); - bh = NULL; +find_tail_record: + i = le16_to_cpu(el->l_next_free_rec) - 1; + rec = &el->l_recs[i]; + + mlog(0, "Extent list before: record %d: (%u, %u, %llu), " + "next = %u\n", i, le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec), + (unsigned long long)le64_to_cpu(rec->e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); + + if (le16_to_cpu(el->l_tree_depth) == 0) { + /* + * If the leaf block contains a single empty + * extent and no records, we can just remove + * the block. + */ + if (i == 0 && ocfs2_is_empty_extent(rec)) { + memset(rec, 0, + sizeof(struct ocfs2_extent_rec)); + el->l_next_free_rec = cpu_to_le16(0); + + goto delete; + } + + /* + * Remove any empty extents by shifting things + * left. That should make life much easier on + * the code below. This condition is rare + * enough that we shouldn't see a performance + * hit. + */ + if (ocfs2_is_empty_extent(&el->l_recs[0])) { + le16_add_cpu(&el->l_next_free_rec, -1); + + for(i = 0; + i < le16_to_cpu(el->l_next_free_rec); i++) + el->l_recs[i] = el->l_recs[i + 1]; + + memset(&el->l_recs[i], 0, + sizeof(struct ocfs2_extent_rec)); + + /* + * We've modified our extent list. The + * simplest way to handle this change + * is to being the search from the + * start again. + */ + goto find_tail_record; + } + + le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); + + /* + * We'll use "new_edge" on our way back up the + * tree to know what our rightmost cpos is. + */ + new_edge = le16_to_cpu(rec->e_leaf_clusters); + new_edge += le32_to_cpu(rec->e_cpos); + + /* + * The caller will use this to delete data blocks. + */ + *delete_start = le64_to_cpu(rec->e_blkno) + + ocfs2_clusters_to_blocks(inode->i_sb, + le16_to_cpu(rec->e_leaf_clusters)); + + /* + * If it's now empty, remove this record. + */ + if (le16_to_cpu(rec->e_leaf_clusters) == 0) { + memset(rec, 0, + sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&el->l_next_free_rec, -1); + } + } else { + if (le64_to_cpu(rec->e_blkno) == deleted_eb) { + memset(rec, 0, + sizeof(struct ocfs2_extent_rec)); + le16_add_cpu(&el->l_next_free_rec, -1); + + goto delete; + } + + /* Can this actually happen? */ + if (le16_to_cpu(el->l_next_free_rec) == 0) + goto delete; + + /* + * We never actually deleted any clusters + * because our leaf was empty. There's no + * reason to adjust the rightmost edge then. + */ + if (new_edge == 0) + goto delete; + + rec->e_int_clusters = cpu_to_le32(new_edge); + le32_add_cpu(&rec->e_int_clusters, + -le32_to_cpu(rec->e_cpos)); + + /* + * A deleted child record should have been + * caught above. + */ + BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); } - status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED, - inode); - if (status < 0) { - mlog_errno(status); - goto bail; +delete: + ret = ocfs2_journal_dirty(handle, bh); + if (ret) { + mlog_errno(ret); + goto out; } - eb = (struct ocfs2_extent_block *) bh->b_data; - el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; + + mlog(0, "extent list container %llu, after: record %d: " + "(%u, %u, %llu), next = %u.\n", + (unsigned long long)bh->b_blocknr, i, + le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), + (unsigned long long)le64_to_cpu(rec->e_blkno), + le16_to_cpu(el->l_next_free_rec)); + + /* + * We must be careful to only attempt delete of an + * extent block (and not the root inode block). + */ + if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { + struct ocfs2_extent_block *eb = + (struct ocfs2_extent_block *)bh->b_data; + + /* + * Save this for use when processing the + * parent block. + */ + deleted_eb = le64_to_cpu(eb->h_blkno); + + mlog(0, "deleting this extent block.\n"); + + ocfs2_remove_from_cache(inode, bh); + + BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); + BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); + BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); + + if (le16_to_cpu(eb->h_suballoc_slot) == 0) { + /* + * This code only understands how to + * lock the suballocator in slot 0, + * which is fine because allocation is + * only ever done out of that + * suballocator too. A future version + * might change that however, so avoid + * a free if we don't know how to + * handle it. This way an fs incompat + * bit will not be necessary. + */ + ret = ocfs2_free_extent_block(handle, + tc->tc_ext_alloc_inode, + tc->tc_ext_alloc_bh, + eb); + + /* An error here is not fatal. */ + if (ret < 0) + mlog_errno(ret); + } + } else { + deleted_eb = 0; } - } while (el->l_tree_depth); - *new_last_eb = bh; - get_bh(*new_last_eb); - mlog(0, "returning block %llu\n", - (unsigned long long)le64_to_cpu(eb->h_blkno)); -bail: - if (bh) - brelse(bh); + index--; + } - return status; + ret = 0; +out: + return ret; } static int ocfs2_do_truncate(struct ocfs2_super *osb, unsigned int clusters_to_del, struct inode *inode, struct buffer_head *fe_bh, - struct buffer_head *old_last_eb_bh, handle_t *handle, - struct ocfs2_truncate_context *tc) + struct ocfs2_truncate_context *tc, + struct ocfs2_path *path) { - int status, i, depth; + int status; struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; struct ocfs2_extent_block *last_eb = NULL; struct ocfs2_extent_list *el; - struct buffer_head *eb_bh = NULL; struct buffer_head *last_eb_bh = NULL; - u64 next_eb = 0; u64 delete_blk = 0; fe = (struct ocfs2_dinode *) fe_bh->b_data; - status = ocfs2_find_new_last_ext_blk(osb, - inode, - fe, - le32_to_cpu(fe->i_clusters) - - clusters_to_del, - old_last_eb_bh, - &last_eb_bh); + status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, + path, &last_eb_bh); if (status < 0) { mlog_errno(status); goto bail; } - if (last_eb_bh) - last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + /* + * Each component will be touched, so we might as well journal + * here to avoid having to handle errors later. + */ + status = ocfs2_journal_access_path(inode, handle, path); if (status < 0) { mlog_errno(status); goto bail; } + + if (last_eb_bh) { + status = ocfs2_journal_access(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; + } + el = &(fe->id2.i_list); + /* + * Lower levels depend on this never happening, but it's best + * to check it up here before changing the tree. + */ + if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { + ocfs2_error(inode->i_sb, + "Inode %lu has an empty extent record, depth %u\n", + inode->i_ino, le16_to_cpu(el->l_tree_depth)); + status = -EROFS; + goto bail; + } + spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - clusters_to_del; spin_unlock(&OCFS2_I(inode)->ip_lock); le32_add_cpu(&fe->i_clusters, -clusters_to_del); - fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec); - fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec); - - i = le16_to_cpu(el->l_next_free_rec) - 1; - - BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); - le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); - /* tree depth zero, we can just delete the clusters, otherwise - * we need to record the offset of the next level extent block - * as we may overwrite it. */ - if (!el->l_tree_depth) - delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) - + ocfs2_clusters_to_blocks(osb->sb, - le32_to_cpu(el->l_recs[i].e_clusters)); - else - next_eb = le64_to_cpu(el->l_recs[i].e_blkno); - if (!el->l_recs[i].e_clusters) { - /* if we deleted the whole extent record, then clear - * out the other fields and update the extent - * list. For depth > 0 trees, we've already recorded - * the extent block in 'next_eb' */ - el->l_recs[i].e_cpos = 0; - el->l_recs[i].e_blkno = 0; - BUG_ON(!el->l_next_free_rec); - le16_add_cpu(&el->l_next_free_rec, -1); + status = ocfs2_trim_tree(inode, path, handle, tc, + clusters_to_del, &delete_blk); + if (status) { + mlog_errno(status); + goto bail; } - depth = le16_to_cpu(el->l_tree_depth); - if (!fe->i_clusters) { + if (le32_to_cpu(fe->i_clusters) == 0) { /* trunc to zero is a special case. */ el->l_tree_depth = 0; fe->i_last_eb_blk = 0; @@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, /* If there will be a new last extent block, then by * definition, there cannot be any leaves to the right of * him. */ - status = ocfs2_journal_access(handle, inode, last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; - } last_eb->h_next_leaf_blk = 0; status = ocfs2_journal_dirty(handle, last_eb_bh); if (status < 0) { @@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } } - /* if our tree depth > 0, update all the tree blocks below us. */ - while (depth) { - mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n", - depth, (unsigned long long)next_eb); - status = ocfs2_read_block(osb, next_eb, &eb_bh, - OCFS2_BH_CACHED, inode); + if (delete_blk) { + status = ocfs2_truncate_log_append(osb, handle, delete_blk, + clusters_to_del); if (status < 0) { mlog_errno(status); goto bail; } - eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; + } + status = 0; +bail: + + mlog_exit(status); + return status; +} + +static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) +{ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + return 0; +} + +static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) +{ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + return ocfs2_journal_dirty_data(handle, bh); +} + +static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, + struct page **pages, int numpages, + u64 phys, handle_t *handle) +{ + int i, ret, partial = 0; + void *kaddr; + struct page *page; + unsigned int from, to = PAGE_CACHE_SIZE; + struct super_block *sb = inode->i_sb; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); + + if (numpages == 0) + goto out; + + from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ + if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) { + /* + * Since 'from' has been capped to a value below page + * size, this calculation won't be able to overflow + * 'to' + */ + to = ocfs2_align_bytes_to_clusters(sb, from); + + /* + * The truncate tail in this case should never contain + * more than one page at maximum. The loop below also + * assumes this. + */ + BUG_ON(numpages != 1); + } + + for(i = 0; i < numpages; i++) { + page = pages[i]; + + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + + ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0); + if (ret) + mlog_errno(ret); + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + from, 0, to - from); + kunmap_atomic(kaddr, KM_USER0); + + /* + * Need to set the buffers we zero'd into uptodate + * here if they aren't - ocfs2_map_page_blocks() + * might've skipped some + */ + if (ocfs2_should_order_data(inode)) { + ret = walk_page_buffers(handle, + page_buffers(page), + from, to, &partial, + ocfs2_ordered_zero_func); + if (ret < 0) + mlog_errno(ret); + } else { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_writeback_zero_func); + if (ret < 0) + mlog_errno(ret); } - el = &(eb->h_list); - status = ocfs2_journal_access(handle, inode, eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto bail; + if (!partial) + SetPageUptodate(page); + + flush_dcache_page(page); + + /* + * Every page after the 1st one should be completely zero'd. + */ + from = 0; + } +out: + if (pages) { + for (i = 0; i < numpages; i++) { + page = pages[i]; + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); } + } +} - BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); - BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1)); +static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, + int *num, u64 *phys) +{ + int i, numpages = 0, ret = 0; + unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize; + unsigned int ext_flags; + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + unsigned long index; + u64 next_cluster_bytes; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); + + /* Cluster boundary, so we don't need to grab any pages. */ + if ((isize & (csize - 1)) == 0) + goto out; - i = le16_to_cpu(el->l_next_free_rec) - 1; + ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, + phys, NULL, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } - mlog(0, "extent block %llu, before: record %d: " - "(%u, %u, %llu), next = %u\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), i, - le32_to_cpu(el->l_recs[i].e_cpos), - le32_to_cpu(el->l_recs[i].e_clusters), - (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno), - le16_to_cpu(el->l_next_free_rec)); + /* Tail is a hole. */ + if (*phys == 0) + goto out; - BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del); - le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del); - - next_eb = le64_to_cpu(el->l_recs[i].e_blkno); - /* bottom-most block requires us to delete data.*/ - if (!el->l_tree_depth) - delete_blk = le64_to_cpu(el->l_recs[i].e_blkno) - + ocfs2_clusters_to_blocks(osb->sb, - le32_to_cpu(el->l_recs[i].e_clusters)); - if (!el->l_recs[i].e_clusters) { - el->l_recs[i].e_cpos = 0; - el->l_recs[i].e_blkno = 0; - BUG_ON(!el->l_next_free_rec); - le16_add_cpu(&el->l_next_free_rec, -1); - } - mlog(0, "extent block %llu, after: record %d: " - "(%u, %u, %llu), next = %u\n", - (unsigned long long)le64_to_cpu(eb->h_blkno), i, - le32_to_cpu(el->l_recs[i].e_cpos), - le32_to_cpu(el->l_recs[i].e_clusters), - (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno), - le16_to_cpu(el->l_next_free_rec)); + /* Tail is marked as unwritten, we can count on write to zero + * in that case. */ + if (ext_flags & OCFS2_EXT_UNWRITTEN) + goto out; - status = ocfs2_journal_dirty(handle, eb_bh); - if (status < 0) { - mlog_errno(status); - goto bail; + next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); + index = isize >> PAGE_CACHE_SHIFT; + do { + pages[numpages] = grab_cache_page(mapping, index); + if (!pages[numpages]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; } - if (!el->l_next_free_rec) { - mlog(0, "deleting this extent block.\n"); - - ocfs2_remove_from_cache(inode, eb_bh); + numpages++; + index++; + } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); - BUG_ON(el->l_recs[0].e_clusters); - BUG_ON(el->l_recs[0].e_cpos); - BUG_ON(el->l_recs[0].e_blkno); - if (eb->h_suballoc_slot == 0) { - /* - * This code only understands how to - * lock the suballocator in slot 0, - * which is fine because allocation is - * only ever done out of that - * suballocator too. A future version - * might change that however, so avoid - * a free if we don't know how to - * handle it. This way an fs incompat - * bit will not be necessary. - */ - status = ocfs2_free_extent_block(handle, - tc->tc_ext_alloc_inode, - tc->tc_ext_alloc_bh, - eb); - if (status < 0) { - mlog_errno(status); - goto bail; +out: + if (ret != 0) { + if (pages) { + for (i = 0; i < numpages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + page_cache_release(pages[i]); } } } - brelse(eb_bh); - eb_bh = NULL; - depth--; + numpages = 0; } - BUG_ON(!delete_blk); - status = ocfs2_truncate_log_append(osb, handle, delete_blk, - clusters_to_del); - if (status < 0) { - mlog_errno(status); - goto bail; + *num = numpages; + + return ret; +} + +/* + * Zero the area past i_size but still within an allocated + * cluster. This avoids exposing nonzero data on subsequent file + * extends. + * + * We need to call this before i_size is updated on the inode because + * otherwise block_write_full_page() will skip writeout of pages past + * i_size. The new_i_size parameter is passed for this reason. + */ +int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, + u64 new_i_size) +{ + int ret, numpages; + loff_t endbyte; + struct page **pages = NULL; + u64 phys; + + /* + * File systems which don't support sparse files zero on every + * extend. + */ + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + return 0; + + pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb), + sizeof(struct page *), GFP_NOFS); + if (pages == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; } - status = 0; -bail: - if (!status) - ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters)); - else - ocfs2_extent_map_drop(inode, 0); - mlog_exit(status); - return status; + + ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (numpages == 0) + goto out; + + ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, + handle); + + /* + * Initiate writeout of the pages we zero'd here. We don't + * wait on them - the truncate_inode_pages() call later will + * do that for us. + */ + endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); + ret = do_sync_mapping_range(inode->i_mapping, new_i_size, + endbyte - 1, SYNC_FILE_RANGE_WRITE); + if (ret) + mlog_errno(ret); + +out: + if (pages) + kfree(pages); + + return ret; } /* @@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct ocfs2_truncate_context *tc) { int status, i, credits, tl_sem = 0; - u32 clusters_to_del, target_i_clusters; - u64 last_eb = 0; - struct ocfs2_dinode *fe; - struct ocfs2_extent_block *eb; + u32 clusters_to_del, new_highest_cpos, range; struct ocfs2_extent_list *el; - struct buffer_head *last_eb_bh; handle_t *handle = NULL; struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_path *path = NULL; mlog_entry_void(); down_write(&OCFS2_I(inode)->ip_alloc_sem); - target_i_clusters = ocfs2_clusters_for_bytes(osb->sb, + new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - last_eb_bh = tc->tc_last_eb_bh; - tc->tc_last_eb_bh = NULL; + path = ocfs2_new_inode_path(fe_bh); + if (!path) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } - fe = (struct ocfs2_dinode *) fe_bh->b_data; + ocfs2_extent_map_trunc(inode, new_highest_cpos); - if (fe->id2.i_list.l_tree_depth) { - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - el = &eb->h_list; - } else - el = &fe->id2.i_list; - last_eb = le64_to_cpu(fe->i_last_eb_blk); start: - mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, " - "last_eb = %llu, fe->i_last_eb_blk = %llu, " - "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n", - le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb, - (unsigned long long)le64_to_cpu(fe->i_last_eb_blk), - le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh); - - if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) { - mlog(0, "last_eb changed!\n"); - BUG_ON(!fe->id2.i_list.l_tree_depth); - last_eb = le64_to_cpu(fe->i_last_eb_blk); - /* i_last_eb_blk may have changed, read it if - * necessary. We don't have to worry about the - * truncate to zero case here (where there becomes no - * last_eb) because we never loop back after our work - * is done. */ - if (last_eb_bh) { - brelse(last_eb_bh); - last_eb_bh = NULL; - } + /* + * Check that we still have allocation to delete. + */ + if (OCFS2_I(inode)->ip_clusters == 0) { + status = 0; + goto bail; + } - status = ocfs2_read_block(osb, last_eb, - &last_eb_bh, OCFS2_BH_CACHED, - inode); - if (status < 0) { - mlog_errno(status); - goto bail; - } - eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } - el = &(eb->h_list); + /* + * Truncate always works against the rightmost tree branch. + */ + status = ocfs2_find_path(inode, path, UINT_MAX); + if (status) { + mlog_errno(status); + goto bail; + } + + mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", + OCFS2_I(inode)->ip_clusters, path->p_tree_depth); + + /* + * By now, el will point to the extent list on the bottom most + * portion of this tree. Only the tail record is considered in + * each pass. + * + * We handle the following cases, in order: + * - empty extent: delete the remaining branch + * - remove the entire record + * - remove a partial record + * - no record needs to be removed (truncate has completed) + */ + el = path_leaf_el(path); + if (le16_to_cpu(el->l_next_free_rec) == 0) { + ocfs2_error(inode->i_sb, + "Inode %llu has empty extent block at %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)path_leaf_bh(path)->b_blocknr); + status = -EROFS; + goto bail; } - /* by now, el will point to the extent list on the bottom most - * portion of this tree. */ i = le16_to_cpu(el->l_next_free_rec) - 1; - if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters) - clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters); - else - clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) + + range = le32_to_cpu(el->l_recs[i].e_cpos) + + ocfs2_rec_clusters(el, &el->l_recs[i]); + if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { + clusters_to_del = 0; + } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { + clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); + } else if (range > new_highest_cpos) { + clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + le32_to_cpu(el->l_recs[i].e_cpos)) - - target_i_clusters; + new_highest_cpos; + } else { + status = 0; + goto bail; + } - mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del); + mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", + clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); + + BUG_ON(clusters_to_del == 0); mutex_lock(&tl_inode->i_mutex); tl_sem = 1; @@ -1861,7 +3722,8 @@ start: } credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, - fe, el); + (struct ocfs2_dinode *)fe_bh->b_data, + el); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -1870,13 +3732,8 @@ start: goto bail; } - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); - if (status < 0) - mlog_errno(status); - - status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, - last_eb_bh, handle, tc); + status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, + tc, path); if (status < 0) { mlog_errno(status); goto bail; @@ -1888,9 +3745,14 @@ start: ocfs2_commit_trans(osb, handle); handle = NULL; - BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters); - if (le32_to_cpu(fe->i_clusters) > target_i_clusters) - goto start; + ocfs2_reinit_path(path, 1); + + /* + * The check above will catch the case where we've truncated + * away all allocation. + */ + goto start; + bail: up_write(&OCFS2_I(inode)->ip_alloc_sem); @@ -1902,8 +3764,7 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); - if (last_eb_bh) - brelse(last_eb_bh); + ocfs2_free_path(path); /* This will drop the ext_alloc cluster lock for us */ ocfs2_free_truncate_context(tc); @@ -1912,7 +3773,6 @@ bail: return status; } - /* * Expects the inode to already be locked. This will figure out which * inodes need to be locked and will put them on the returned truncate @@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, struct buffer_head *fe_bh, struct ocfs2_truncate_context **tc) { - int status, metadata_delete; + int status, metadata_delete, i; unsigned int new_i_clusters; struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; @@ -1944,21 +3804,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, "%llu\n", fe->i_clusters, new_i_clusters, (unsigned long long)fe->i_size); - if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) { - ocfs2_error(inode->i_sb, "Dinode %llu has cluster count " - "%u and size %llu whereas struct inode has " - "cluster count %u and size %llu which caused an " - "invalid truncate to %u clusters.", - (unsigned long long)le64_to_cpu(fe->i_blkno), - le32_to_cpu(fe->i_clusters), - (unsigned long long)le64_to_cpu(fe->i_size), - OCFS2_I(inode)->ip_clusters, i_size_read(inode), - new_i_clusters); - mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres); - status = -EIO; - goto bail; - } - *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); if (!(*tc)) { status = -ENOMEM; @@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, goto bail; } el = &(eb->h_list); - if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters) + + i = 0; + if (ocfs2_is_empty_extent(&el->l_recs[0])) + i = 1; + /* + * XXX: Should we check that next_free_rec contains + * the extent? + */ + if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters) metadata_delete = 1; } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 0b82e8044325..fbcb5934a081 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, struct buffer_head *fe_bh, - u64 blkno, + u32 cpos, + u64 start_blk, u32 new_clusters, struct ocfs2_alloc_context *meta_ac); int ocfs2_num_free_extents(struct ocfs2_super *osb, @@ -70,6 +71,8 @@ struct ocfs2_truncate_context { struct buffer_head *tc_last_eb_bh; }; +int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, + u64 new_i_size); int ocfs2_prepare_truncate(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *fe_bh, @@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, struct buffer_head *fe_bh, struct ocfs2_truncate_context *tc); +int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, + u32 cpos, struct buffer_head **leaf_bh); + +/* + * Helper function to look at the # of clusters in an extent record. + */ +static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el, + struct ocfs2_extent_rec *rec) +{ + /* + * Cluster count in extent records is slightly different + * between interior nodes and leaf nodes. This is to support + * unwritten extents which need a flags field in leaf node + * records, thus shrinking the available space for a clusters + * field. + */ + if (el->l_tree_depth) + return le32_to_cpu(rec->e_int_clusters); + else + return le16_to_cpu(rec->e_leaf_clusters); +} + #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 93628b02ef5d..56963e6c46c0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -24,6 +24,8 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <asm/byteorder.h> +#include <linux/swap.h> +#include <linux/pipe_fs_i.h> #define MLOG_MASK_PREFIX ML_FILE_IO #include <cluster/masklog.h> @@ -37,6 +39,7 @@ #include "file.h" #include "inode.h" #include "journal.h" +#include "suballoc.h" #include "super.h" #include "symlink.h" @@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int err = 0; + unsigned int ext_flags; u64 p_blkno, past_eof; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, (unsigned long long)iblock, bh_result, create); @@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, goto bail; } - /* this can happen if another node truncs after our extend! */ - spin_lock(&OCFS2_I(inode)->ip_lock); - if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb, - OCFS2_I(inode)->ip_clusters)) - err = -EIO; - spin_unlock(&OCFS2_I(inode)->ip_lock); - if (err) - goto bail; - - err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, - NULL); + err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, + &ext_flags); if (err) { mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " "%llu, NULL)\n", err, inode, (unsigned long long)iblock, @@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, goto bail; } - map_bh(bh_result, inode->i_sb, p_blkno); - - if (bh_result->b_blocknr == 0) { - err = -EIO; - mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n", - (unsigned long long)iblock, - (unsigned long long)p_blkno, - (unsigned long long)OCFS2_I(inode)->ip_blkno); - } + /* + * ocfs2 never allocates in this function - the only time we + * need to use BH_New is when we're extending i_size on a file + * system which doesn't support holes, in which case BH_New + * allows block_prepare_write() to zero. + */ + mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), + "ino %lu, iblock %llu\n", inode->i_ino, + (unsigned long long)iblock); + + /* Treat the unwritten extent as a hole for zeroing purposes. */ + if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + map_bh(bh_result, inode->i_sb, p_blkno); + + if (!ocfs2_sparse_alloc(osb)) { + if (p_blkno == 0) { + err = -EIO; + mlog(ML_ERROR, + "iblock = %llu p_blkno = %llu blkno=(%llu)\n", + (unsigned long long)iblock, + (unsigned long long)p_blkno, + (unsigned long long)OCFS2_I(inode)->ip_blkno); + mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); + dump_stack(); + } - past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, - (unsigned long long)past_eof); + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, + (unsigned long long)past_eof); - if (create && (iblock >= past_eof)) - set_buffer_new(bh_result); + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + } bail: if (err < 0) @@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) return ret; } -/* This can also be called from ocfs2_write_zero_page() which has done - * it's own cluster locking. */ +/* + * This is called from ocfs2_write_zero_page() which has handled it's + * own cluster locking and has ensured allocation exists for those + * blocks to be written. + */ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, unsigned from, unsigned to) { @@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, return ret; } -/* - * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called - * from loopback. It must be able to perform its own locking around - * ocfs2_get_block(). - */ -static int ocfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - struct inode *inode = page->mapping->host; - int ret; - - mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); - - ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); - if (ret != 0) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_prepare_write_nolock(inode, page, from, to); - - ocfs2_meta_unlock(inode, 0); -out: - mlog_exit(ret); - return ret; -} - /* Taken from ext3. We don't necessarily need the full blown * functionality yet, but IMHO it's better to cut and paste the whole * thing so we can avoid introducing our own bugs (and easily pick up * their fixes when they happen) --Mark */ -static int walk_page_buffers( handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)( handle_t *handle, - struct buffer_head *bh)) +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -388,95 +377,6 @@ out: return handle; } -static int ocfs2_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - int ret; - struct buffer_head *di_bh = NULL; - struct inode *inode = page->mapping->host; - handle_t *handle = NULL; - struct ocfs2_dinode *di; - - mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); - - /* NOTE: ocfs2_file_aio_write has ensured that it's safe for - * us to continue here without rechecking the I/O against - * changed inode values. - * - * 1) We're currently holding the inode alloc lock, so no - * nodes can change it underneath us. - * - * 2) We've had to take the metadata lock at least once - * already to check for extending writes, suid removal, etc. - * The meta data update code then ensures that we don't get a - * stale inode allocation image (i_size, i_clusters, etc). - */ - - ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page); - if (ret != 0) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_data_lock_with_page(inode, 1, page); - if (ret != 0) { - mlog_errno(ret); - goto out_unlock_meta; - } - - handle = ocfs2_start_walk_page_trans(inode, page, from, to); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_unlock_data; - } - - /* Mark our buffer early. We'd rather catch this error up here - * as opposed to after a successful commit_write which would - * require us to set back inode->i_size. */ - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } - - /* might update i_size */ - ret = generic_commit_write(file, page, from, to); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } - - di = (struct ocfs2_dinode *)di_bh->b_data; - - /* ocfs2_mark_inode_dirty() is too heavy to use here. */ - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - - inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); - di->i_size = cpu_to_le64((u64)i_size_read(inode)); - - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } - -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); -out_unlock_data: - ocfs2_data_unlock(inode, 1); -out_unlock_meta: - ocfs2_meta_unlock(inode, 1); -out: - if (di_bh) - brelse(di_bh); - - mlog_exit(ret); - return ret; -} - static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) { sector_t status; @@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) down_read(&OCFS2_I(inode)->ip_alloc_sem); } - err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, - NULL); + err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); if (!INODE_JOURNAL(inode)) { up_read(&OCFS2_I(inode)->ip_alloc_sem); @@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret; - u64 p_blkno, inode_blocks; - int contig_blocks; + u64 p_blkno, inode_blocks, contig_blocks; + unsigned int ext_flags; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; @@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, * nicely aligned and of the right size, so there's no need * for us to check any of that. */ - spin_lock(&OCFS2_I(inode)->ip_lock); - inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb, - OCFS2_I(inode)->ip_clusters); - - /* - * For a read which begins past the end of file, we return a hole. - */ - if (!create && (iblock >= inode_blocks)) { - spin_unlock(&OCFS2_I(inode)->ip_lock); - ret = 0; - goto bail; - } + inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); /* * Any write past EOF is not allowed because we'd be extending. */ if (create && (iblock + max_blocks) > inode_blocks) { - spin_unlock(&OCFS2_I(inode)->ip_lock); ret = -EIO; goto bail; } - spin_unlock(&OCFS2_I(inode)->ip_lock); /* This figures out the size of the next contiguous block, and * our logical offset */ - ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, - &contig_blocks); + ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, + &contig_blocks, &ext_flags); if (ret) { mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); @@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } - map_bh(bh_result, inode->i_sb, p_blkno); + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { + ocfs2_error(inode->i_sb, + "Inode %llu has a hole at block %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)iblock); + ret = -EROFS; + goto bail; + } + + /* + * get_more_blocks() expects us to describe a hole by clearing + * the mapped bit on bh_result(). + * + * Consider an unwritten extent as a hole. + */ + if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) + map_bh(bh_result, inode->i_sb, p_blkno); + else { + /* + * ocfs2_prepare_inode_for_write() should have caught + * the case where we'd be filling a hole and triggered + * a buffered write instead. + */ + if (create) { + ret = -EIO; + mlog_errno(ret); + goto bail; + } + + clear_buffer_mapped(bh_result); + } /* make sure we don't map more than max_blocks blocks here as that's all the kernel will handle at this point. */ @@ -606,12 +522,38 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, void *private) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; + int level; /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + ocfs2_iocb_clear_rw_locked(iocb); - up_read(&inode->i_alloc_sem); - ocfs2_rw_unlock(inode, 0); + + level = ocfs2_iocb_rw_locked_level(iocb); + if (!level) + up_read(&inode->i_alloc_sem); + ocfs2_rw_unlock(inode, level); +} + +/* + * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen + * from ext3. PageChecked() bits have been removed as OCFS2 does not + * do journalled data. + */ +static void ocfs2_invalidatepage(struct page *page, unsigned long offset) +{ + journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; + + journal_invalidatepage(journal, page, offset); +} + +static int ocfs2_releasepage(struct page *page, gfp_t wait) +{ + journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; + + if (!page_has_buffers(page)) + return 0; + return journal_try_to_free_buffers(journal, page, wait); } static ssize_t ocfs2_direct_IO(int rw, @@ -626,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw, mlog_entry_void(); - /* - * We get PR data locks even for O_DIRECT. This allows - * concurrent O_DIRECT I/O but doesn't let O_DIRECT with - * extending and buffered zeroing writes race. If they did - * race then the buffered zeroing could be written back after - * the O_DIRECT I/O. It's one thing to tell people not to mix - * buffered and O_DIRECT writes, but expecting them to - * understand that file extension is also an implicit buffered - * write is too much. By getting the PR we force writeback of - * the buffered zeroing before proceeding. - */ - ret = ocfs2_data_lock(inode, 0); - if (ret < 0) { - mlog_errno(ret); - goto out; + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + /* + * We get PR data locks even for O_DIRECT. This + * allows concurrent O_DIRECT I/O but doesn't let + * O_DIRECT with extending and buffered zeroing writes + * race. If they did race then the buffered zeroing + * could be written back after the O_DIRECT I/O. It's + * one thing to tell people not to mix buffered and + * O_DIRECT writes, but expecting them to understand + * that file extension is also an implicit buffered + * write is too much. By getting the PR we force + * writeback of the buffered zeroing before + * proceeding. + */ + ret = ocfs2_data_lock(inode, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + ocfs2_data_unlock(inode, 0); } - ocfs2_data_unlock(inode, 0); ret = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, @@ -654,12 +600,719 @@ out: return ret; } +static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, + u32 cpos, + unsigned int *start, + unsigned int *end) +{ + unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { + unsigned int cpp; + + cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); + + cluster_start = cpos % cpp; + cluster_start = cluster_start << osb->s_clustersize_bits; + + cluster_end = cluster_start + osb->s_clustersize; + } + + BUG_ON(cluster_start > PAGE_SIZE); + BUG_ON(cluster_end > PAGE_SIZE); + + if (start) + *start = cluster_start; + if (end) + *end = cluster_end; +} + +/* + * 'from' and 'to' are the region in the page to avoid zeroing. + * + * If pagesize > clustersize, this function will avoid zeroing outside + * of the cluster boundary. + * + * from == to == 0 is code for "zero the entire cluster region" + */ +static void ocfs2_clear_page_regions(struct page *page, + struct ocfs2_super *osb, u32 cpos, + unsigned from, unsigned to) +{ + void *kaddr; + unsigned int cluster_start, cluster_end; + + ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); + + kaddr = kmap_atomic(page, KM_USER0); + + if (from || to) { + if (from > cluster_start) + memset(kaddr + cluster_start, 0, from - cluster_start); + if (to < cluster_end) + memset(kaddr + to, 0, cluster_end - to); + } else { + memset(kaddr + cluster_start, 0, cluster_end - cluster_start); + } + + kunmap_atomic(kaddr, KM_USER0); +} + +/* + * Some of this taken from block_prepare_write(). We already have our + * mapping by now though, and the entire write will be allocating or + * it won't, so not much need to use BH_New. + * + * This will also skip zeroing, which is handled externally. + */ +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new) +{ + int ret = 0; + struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; + unsigned int block_end, block_start; + unsigned int bsize = 1 << inode->i_blkbits; + + if (!page_has_buffers(page)) + create_empty_buffers(page, bsize, 0); + + head = page_buffers(page); + for (bh = head, block_start = 0; bh != head || !block_start; + bh = bh->b_this_page, block_start += bsize) { + block_end = block_start + bsize; + + /* + * Ignore blocks outside of our i/o range - + * they may belong to unallocated clusters. + */ + if (block_start >= to || block_end <= from) { + if (PageUptodate(page)) + set_buffer_uptodate(bh); + continue; + } + + /* + * For an allocating write with cluster size >= page + * size, we always write the entire page. + */ + + if (buffer_new(bh)) + clear_buffer_new(bh); + + if (!buffer_mapped(bh)) { + map_bh(bh, inode->i_sb, *p_blkno); + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + + *p_blkno = *p_blkno + 1; + } + + /* + * If we issued read requests - let them complete. + */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + ret = -EIO; + } + + if (ret == 0 || !new) + return ret; + + /* + * If we get -EIO above, zero out any newly allocated blocks + * to avoid exposing stale data. + */ + bh = head; + block_start = 0; + do { + void *kaddr; + + block_end = block_start + bsize; + if (block_end <= from) + goto next_bh; + if (block_start >= to) + break; + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+block_start, 0, bh->b_size); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + +next_bh: + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + + return ret; +} + +/* + * This will copy user data from the buffer page in the splice + * context. + * + * For now, we ignore SPLICE_F_MOVE as that would require some extra + * communication out all the way to ocfs2_write(). + */ +int ocfs2_map_and_write_splice_data(struct inode *inode, + struct ocfs2_write_ctxt *wc, u64 *p_blkno, + unsigned int *ret_from, unsigned int *ret_to) +{ + int ret; + unsigned int to, from, cluster_start, cluster_end; + char *src, *dst; + struct ocfs2_splice_write_priv *sp = wc->w_private; + struct pipe_buffer *buf = sp->s_buf; + unsigned long bytes, src_from; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, + &cluster_end); + + from = sp->s_offset; + src_from = sp->s_buf_offset; + bytes = wc->w_count; + + if (wc->w_large_pages) { + /* + * For cluster size < page size, we have to + * calculate pos within the cluster and obey + * the rightmost boundary. + */ + bytes = min(bytes, (unsigned long)(osb->s_clustersize + - (wc->w_pos & (osb->s_clustersize - 1)))); + } + to = from + bytes; + + if (wc->w_this_page_new) + ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, + cluster_start, cluster_end, 1); + else + ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, + from, to, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > osb->s_clustersize); + BUG_ON(to > osb->s_clustersize); + + src = buf->ops->map(sp->s_pipe, buf, 1); + dst = kmap_atomic(wc->w_this_page, KM_USER1); + memcpy(dst + from, src + src_from, bytes); + kunmap_atomic(wc->w_this_page, KM_USER1); + buf->ops->unmap(sp->s_pipe, buf, src); + + wc->w_finished_copy = 1; + + *ret_from = from; + *ret_to = to; +out: + + return bytes ? (unsigned int)bytes : ret; +} + +/* + * This will copy user data from the iovec in the buffered write + * context. + */ +int ocfs2_map_and_write_user_data(struct inode *inode, + struct ocfs2_write_ctxt *wc, u64 *p_blkno, + unsigned int *ret_from, unsigned int *ret_to) +{ + int ret; + unsigned int to, from, cluster_start, cluster_end; + unsigned long bytes, src_from; + char *dst; + struct ocfs2_buffered_write_priv *bp = wc->w_private; + const struct iovec *cur_iov = bp->b_cur_iov; + char __user *buf; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, + &cluster_end); + + buf = cur_iov->iov_base + bp->b_cur_off; + src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; + + from = wc->w_pos & (PAGE_CACHE_SIZE - 1); + + /* + * This is a lot of comparisons, but it reads quite + * easily, which is important here. + */ + /* Stay within the src page */ + bytes = PAGE_SIZE - src_from; + /* Stay within the vector */ + bytes = min(bytes, + (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); + /* Stay within count */ + bytes = min(bytes, (unsigned long)wc->w_count); + /* + * For clustersize > page size, just stay within + * target page, otherwise we have to calculate pos + * within the cluster and obey the rightmost + * boundary. + */ + if (wc->w_large_pages) { + /* + * For cluster size < page size, we have to + * calculate pos within the cluster and obey + * the rightmost boundary. + */ + bytes = min(bytes, (unsigned long)(osb->s_clustersize + - (wc->w_pos & (osb->s_clustersize - 1)))); + } else { + /* + * cluster size > page size is the most common + * case - we just stay within the target page + * boundary. + */ + bytes = min(bytes, PAGE_CACHE_SIZE - from); + } + + to = from + bytes; + + if (wc->w_this_page_new) + ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, + cluster_start, cluster_end, 1); + else + ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, + from, to, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > osb->s_clustersize); + BUG_ON(to > osb->s_clustersize); + + dst = kmap(wc->w_this_page); + memcpy(dst + from, bp->b_src_buf + src_from, bytes); + kunmap(wc->w_this_page); + + /* + * XXX: This is slow, but simple. The caller of + * ocfs2_buffered_write_cluster() is responsible for + * passing through the iovecs, so it's difficult to + * predict what our next step is in here after our + * initial write. A future version should be pushing + * that iovec manipulation further down. + * + * By setting this, we indicate that a copy from user + * data was done, and subsequent calls for this + * cluster will skip copying more data. + */ + wc->w_finished_copy = 1; + + *ret_from = from; + *ret_to = to; +out: + + return bytes ? (unsigned int)bytes : ret; +} + +/* + * Map, fill and write a page to disk. + * + * The work of copying data is done via callback. Newly allocated + * pages which don't take user data will be zero'd (set 'new' to + * indicate an allocating write) + * + * Returns a negative error code or the number of bytes copied into + * the page. + */ +int ocfs2_write_data_page(struct inode *inode, handle_t *handle, + u64 *p_blkno, struct page *page, + struct ocfs2_write_ctxt *wc, int new) +{ + int ret, copied = 0; + unsigned int from = 0, to = 0; + unsigned int cluster_start, cluster_end; + unsigned int zero_from = 0, zero_to = 0; + + ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, + &cluster_start, &cluster_end); + + if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index + && !wc->w_finished_copy) { + + wc->w_this_page = page; + wc->w_this_page_new = new; + ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + copied = ret; + + zero_from = from; + zero_to = to; + if (new) { + from = cluster_start; + to = cluster_end; + } + } else { + /* + * If we haven't allocated the new page yet, we + * shouldn't be writing it out without copying user + * data. This is likely a math error from the caller. + */ + BUG_ON(!new); + + from = cluster_start; + to = cluster_end; + + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, cluster_end, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* + * Parts of newly allocated pages need to be zero'd. + * + * Above, we have also rewritten 'to' and 'from' - as far as + * the rest of the function is concerned, the entire cluster + * range inside of a page needs to be written. + * + * We can skip this if the page is up to date - it's already + * been zero'd from being read in as a hole. + */ + if (new && !PageUptodate(page)) + ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), + wc->w_cpos, zero_from, zero_to); + + flush_dcache_page(page); + + if (ocfs2_should_order_data(inode)) { + ret = walk_page_buffers(handle, + page_buffers(page), + from, to, NULL, + ocfs2_journal_dirty_data); + if (ret < 0) + mlog_errno(ret); + } + + /* + * We don't use generic_commit_write() because we need to + * handle our own i_size update. + */ + ret = block_commit_write(page, from, to); + if (ret) + mlog_errno(ret); +out: + + return copied ? copied : ret; +} + +/* + * Do the actual write of some data into an inode. Optionally allocate + * in order to fulfill the write. + * + * cpos is the logical cluster offset within the file to write at + * + * 'phys' is the physical mapping of that offset. a 'phys' value of + * zero indicates that allocation is required. In this case, data_ac + * and meta_ac should be valid (meta_ac can be null if metadata + * allocation isn't required). + */ +static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, + struct buffer_head *di_bh, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc) +{ + int ret, i, numpages = 1, new; + unsigned int copied = 0; + u32 tmp_pos; + u64 v_blkno, p_blkno; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned long index, start; + struct page **cpages; + + new = phys == 0 ? 1 : 0; + + /* + * Figure out how many pages we'll be manipulating here. For + * non allocating write, we just change the one + * page. Otherwise, we'll need a whole clusters worth. + */ + if (new) + numpages = ocfs2_pages_per_cluster(inode->i_sb); + + cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); + if (!cpages) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + /* + * Fill our page array first. That way we've grabbed enough so + * that we can zero and flush if we error after adding the + * extent. + */ + if (new) { + start = ocfs2_align_clusters_to_page_index(inode->i_sb, + wc->w_cpos); + v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); + } else { + start = wc->w_pos >> PAGE_CACHE_SHIFT; + v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; + } + + for(i = 0; i < numpages; i++) { + index = start + i; + + cpages[i] = grab_cache_page(mapping, index); + if (!cpages[i]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + } + + if (new) { + /* + * This is safe to call with the page locks - it won't take + * any additional semaphores or cluster locks. + */ + tmp_pos = wc->w_cpos; + ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, di_bh, handle, + data_ac, meta_ac, NULL); + /* + * This shouldn't happen because we must have already + * calculated the correct meta data allocation required. The + * internal tree allocation code should know how to increase + * transaction credits itself. + * + * If need be, we could handle -EAGAIN for a + * RESTART_TRANS here. + */ + mlog_bug_on_msg(ret == -EAGAIN, + "Inode %llu: EAGAIN return during allocation.\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, + NULL); + if (ret < 0) { + + /* + * XXX: Should we go readonly here? + */ + + mlog_errno(ret); + goto out; + } + + BUG_ON(p_blkno == 0); + + for(i = 0; i < numpages; i++) { + ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], + wc, new); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + copied += ret; + } + +out: + for(i = 0; i < numpages; i++) { + unlock_page(cpages[i]); + mark_page_accessed(cpages[i]); + page_cache_release(cpages[i]); + } + kfree(cpages); + + return copied ? copied : ret; +} + +static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, + struct ocfs2_super *osb, loff_t pos, + size_t count, ocfs2_page_writer *cb, + void *cb_priv) +{ + wc->w_count = count; + wc->w_pos = pos; + wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; + wc->w_finished_copy = 0; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + wc->w_large_pages = 1; + else + wc->w_large_pages = 0; + + wc->w_write_data_page = cb; + wc->w_private = cb_priv; +} + +/* + * Write a cluster to an inode. The cluster may not be allocated yet, + * in which case it will be. This only exists for buffered writes - + * O_DIRECT takes a more "traditional" path through the kernel. + * + * The caller is responsible for incrementing pos, written counts, etc + * + * For file systems that don't support sparse files, pre-allocation + * and page zeroing up until cpos should be done prior to this + * function call. + * + * Callers should be holding i_sem, and the rw cluster lock. + * + * Returns the number of user bytes written, or less than zero for + * error. + */ +ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, + size_t count, ocfs2_page_writer *actor, + void *priv) +{ + int ret, credits = OCFS2_INODE_UPDATE_CREDITS; + ssize_t written = 0; + u32 phys; + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle; + struct ocfs2_write_ctxt wc; + + ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); + + ret = ocfs2_meta_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + di = (struct ocfs2_dinode *)di_bh->b_data; + + /* + * Take alloc sem here to prevent concurrent lookups. That way + * the mapping, zeroing and tree manipulation within + * ocfs2_write() will be safe against ->readpage(). This + * should also serve to lock out allocation from a shared + * writeable region. + */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); + if (ret) { + mlog_errno(ret); + goto out_meta; + } + + /* phys == 0 means that allocation is required. */ + if (phys == 0) { + ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto out_meta; + } + + credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); + } + + ret = ocfs2_data_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto out_meta; + } + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_data; + } + + written = ocfs2_write(file, phys, handle, di_bh, data_ac, + meta_ac, &wc); + if (written < 0) { + ret = written; + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + pos += written; + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_data: + ocfs2_data_unlock(inode, 1); + +out_meta: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + ocfs2_meta_unlock(inode, 1); + +out: + brelse(di_bh); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return written ? written : ret; +} + const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, .writepage = ocfs2_writepage, - .prepare_write = ocfs2_prepare_write, - .commit_write = ocfs2_commit_write, .bmap = ocfs2_bmap, .sync_page = block_sync_page, - .direct_IO = ocfs2_direct_IO + .direct_IO = ocfs2_direct_IO, + .invalidatepage = ocfs2_invalidatepage, + .releasepage = ocfs2_releasepage, + .migratepage = buffer_migrate_page, }; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index f446a15eab88..45821d479b5a 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, unsigned from, unsigned to); +int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, + struct inode *inode, unsigned int from, + unsigned int to, int new); + +int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)); + +struct ocfs2_write_ctxt; +typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, + u64 *, unsigned int *, unsigned int *); + +ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, + size_t count, ocfs2_page_writer *actor, + void *priv); + +struct ocfs2_write_ctxt { + size_t w_count; + loff_t w_pos; + u32 w_cpos; + unsigned int w_finished_copy; + + /* This is true if page_size > cluster_size */ + unsigned int w_large_pages; + + /* Filler callback and private data */ + ocfs2_page_writer *w_write_data_page; + void *w_private; + + /* Only valid for the filler callback */ + struct page *w_this_page; + unsigned int w_this_page_new; +}; + +struct ocfs2_buffered_write_priv { + char *b_src_buf; + const struct iovec *b_cur_iov; /* Current iovec */ + size_t b_cur_off; /* Offset in the + * current iovec */ +}; +int ocfs2_map_and_write_user_data(struct inode *inode, + struct ocfs2_write_ctxt *wc, + u64 *p_blkno, + unsigned int *ret_from, + unsigned int *ret_to); + +struct ocfs2_splice_write_priv { + struct splice_desc *s_sd; + struct pipe_buffer *s_buf; + struct pipe_inode_info *s_pipe; + /* Neither offset value is ever larger than one page */ + unsigned int s_offset; + unsigned int s_buf_offset; +}; +int ocfs2_map_and_write_splice_data(struct inode *inode, + struct ocfs2_write_ctxt *wc, + u64 *p_blkno, + unsigned int *ret_from, + unsigned int *ret_to); + /* all ocfs2_dio_end_io()'s fault */ #define ocfs2_iocb_is_rw_locked(iocb) \ test_bit(0, (unsigned long *)&iocb->private) -#define ocfs2_iocb_set_rw_locked(iocb) \ - set_bit(0, (unsigned long *)&iocb->private) +static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) +{ + set_bit(0, (unsigned long *)&iocb->private); + if (level) + set_bit(1, (unsigned long *)&iocb->private); + else + clear_bit(1, (unsigned long *)&iocb->private); +} #define ocfs2_iocb_clear_rw_locked(iocb) \ clear_bit(0, (unsigned long *)&iocb->private) - +#define ocfs2_iocb_rw_locked_level(iocb) \ + test_bit(1, (unsigned long *)&iocb->private) #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 5a9779bb9236..eba282da500e 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1234,6 +1234,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, const char *page, size_t count) { + struct task_struct *hb_task; long fd; int sectsize; char *p = (char *)page; @@ -1319,20 +1320,28 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, */ atomic_set(®->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1); - reg->hr_task = kthread_run(o2hb_thread, reg, "o2hb-%s", - reg->hr_item.ci_name); - if (IS_ERR(reg->hr_task)) { - ret = PTR_ERR(reg->hr_task); + hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", + reg->hr_item.ci_name); + if (IS_ERR(hb_task)) { + ret = PTR_ERR(hb_task); mlog_errno(ret); - reg->hr_task = NULL; goto out; } + spin_lock(&o2hb_live_lock); + reg->hr_task = hb_task; + spin_unlock(&o2hb_live_lock); + ret = wait_event_interruptible(o2hb_steady_queue, atomic_read(®->hr_steady_iterations) == 0); if (ret) { - kthread_stop(reg->hr_task); + spin_lock(&o2hb_live_lock); + hb_task = reg->hr_task; reg->hr_task = NULL; + spin_unlock(&o2hb_live_lock); + + if (hb_task) + kthread_stop(hb_task); goto out; } @@ -1354,10 +1363,17 @@ out: static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, char *page) { - if (!reg->hr_task) + pid_t pid = 0; + + spin_lock(&o2hb_live_lock); + if (reg->hr_task) + pid = reg->hr_task->pid; + spin_unlock(&o2hb_live_lock); + + if (!pid) return 0; - return sprintf(page, "%u\n", reg->hr_task->pid); + return sprintf(page, "%u\n", pid); } struct o2hb_region_attribute { @@ -1495,13 +1511,17 @@ out: static void o2hb_heartbeat_group_drop_item(struct config_group *group, struct config_item *item) { + struct task_struct *hb_task; struct o2hb_region *reg = to_o2hb_region(item); /* stop the thread when the user removes the region dir */ - if (reg->hr_task) { - kthread_stop(reg->hr_task); - reg->hr_task = NULL; - } + spin_lock(&o2hb_live_lock); + hb_task = reg->hr_task; + reg->hr_task = NULL; + spin_unlock(&o2hb_live_lock); + + if (hb_task) + kthread_stop(hb_task); config_item_put(item); } @@ -1682,7 +1702,7 @@ out: } EXPORT_SYMBOL_GPL(o2hb_register_callback); -int o2hb_unregister_callback(struct o2hb_callback_func *hc) +void o2hb_unregister_callback(struct o2hb_callback_func *hc) { BUG_ON(hc->hc_magic != O2HB_CB_MAGIC); @@ -1690,15 +1710,13 @@ int o2hb_unregister_callback(struct o2hb_callback_func *hc) __builtin_return_address(0), hc); if (list_empty(&hc->hc_item)) - return 0; + return; down_write(&o2hb_callback_sem); list_del_init(&hc->hc_item); up_write(&o2hb_callback_sem); - - return 0; } EXPORT_SYMBOL_GPL(o2hb_unregister_callback); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index cac6223206a9..cc6d40b39771 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -70,7 +70,7 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc, void *data, int priority); int o2hb_register_callback(struct o2hb_callback_func *hc); -int o2hb_unregister_callback(struct o2hb_callback_func *hc); +void o2hb_unregister_callback(struct o2hb_callback_func *hc); void o2hb_fill_node_map(unsigned long *map, unsigned bytes); void o2hb_init(void); diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 4705d659fe57..bbacf7da48a4 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -46,6 +46,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/workqueue.h> +#include <linux/reboot.h> #include "heartbeat.h" #include "nodemanager.h" @@ -72,7 +73,9 @@ static void o2quo_fence_self(void) /* panic spins with interrupts enabled. with preempt * threads can still schedule, etc, etc */ o2hb_stop_all_regions(); - panic("ocfs2 is very sorry to be fencing this system by panicing\n"); + + printk("ocfs2 is very sorry to be fencing this system by restarting\n"); + emergency_restart(); } /* Indicate that a timeout occured on a hearbeat region write. The diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1718215fc018..69caf3e12fea 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1638,17 +1638,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, void o2net_unregister_hb_callbacks(void) { - int ret; - - ret = o2hb_unregister_callback(&o2net_hb_up); - if (ret < 0) - mlog(ML_ERROR, "Status return %d unregistering heartbeat up " - "callback!\n", ret); - - ret = o2hb_unregister_callback(&o2net_hb_down); - if (ret < 0) - mlog(ML_ERROR, "Status return %d unregistering heartbeat down " - "callback!\n", ret); + o2hb_unregister_callback(&o2net_hb_up); + o2hb_unregister_callback(&o2net_hb_down); } int o2net_register_hb_callbacks(void) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4dae5df5e467..9606111fe89d 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -38,6 +38,9 @@ * locking semantics of the file system using the protocol. It should * be somewhere else, I'm sure, but right now it isn't. * + * New in version 8: + * - Replace delete inode votes with a cluster lock + * * New in version 7: * - DLM join domain includes the live nodemap * @@ -57,7 +60,7 @@ * - full 64 bit i_size in the metadata lock lvbs * - introduction of "rw" lock and pushing meta/data locking down */ -#define O2NET_PROTOCOL_VERSION 7ULL +#define O2NET_PROTOCOL_VERSION 8ULL struct o2net_handshake { __be64 protocol_version; __be64 connector_id; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 66821e178167..67e6866a2a4f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb, { int status; int extend; - u64 p_blkno; + u64 p_blkno, v_blkno; spin_lock(&OCFS2_I(dir)->ip_lock); extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)); spin_unlock(&OCFS2_I(dir)->ip_lock); if (extend) { - status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1, - parent_fe_bh, handle, + u32 offset = OCFS2_I(dir)->ip_clusters; + + status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, + 1, parent_fe_bh, handle, data_ac, meta_ac, NULL); BUG_ON(status == -EAGAIN); if (status < 0) { @@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb, } } - status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >> - (sb->s_blocksize_bits - 9)), - 1, &p_blkno, NULL); + v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir)); + status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL); if (status < 0) { mlog_errno(status); goto bail; @@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, dir_i_size += dir->i_sb->s_blocksize; i_size_write(dir, dir_i_size); - dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size); + dir->i_blocks = ocfs2_inode_sector_count(dir); status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 6087c4749fee..d836b98dd99a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -138,8 +138,10 @@ static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) { - hlist_del_init(&lockres->hash_node); - dlm_lockres_put(lockres); + if (!hlist_unhashed(&lockres->hash_node)) { + hlist_del_init(&lockres->hash_node); + dlm_lockres_put(lockres); + } } void __dlm_insert_lockres(struct dlm_ctxt *dlm, @@ -428,11 +430,10 @@ redo_bucket: dlm_lockres_put(res); - cond_resched_lock(&dlm->spinlock); - if (dropped) goto redo_bucket; } + cond_resched_lock(&dlm->spinlock); num += n; mlog(0, "%s: touched %d lockreses in bucket %d " "(tot=%d)\n", dlm->name, n, i, num); @@ -655,6 +656,8 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) dlm_kick_thread(dlm, NULL); while (dlm_migrate_all_locks(dlm)) { + /* Give dlm_thread time to purge the lockres' */ + msleep(500); mlog(0, "%s: more migration to do\n", dlm->name); } dlm_mark_domain_leaving(dlm); @@ -1031,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm) { int status = 0, tmpstat, node; struct domain_join_ctxt *ctxt; - enum dlm_query_join_response response; + enum dlm_query_join_response response = JOIN_DISALLOW; mlog_entry("%p", dlm); diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 77e4e6169a0d..6edffca99d98 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2424,6 +2424,57 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) dlm_lockres_put(res); } +/* Checks whether the lockres can be migrated. Returns 0 if yes, < 0 + * if not. If 0, numlocks is set to the number of locks in the lockres. + */ +static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + int *numlocks) +{ + int ret; + int i; + int count = 0; + struct list_head *queue, *iter; + struct dlm_lock *lock; + + assert_spin_locked(&res->spinlock); + + ret = -EINVAL; + if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + mlog(0, "cannot migrate lockres with unknown owner!\n"); + goto leave; + } + + if (res->owner != dlm->node_num) { + mlog(0, "cannot migrate lockres this node doesn't own!\n"); + goto leave; + } + + ret = 0; + queue = &res->granted; + for (i = 0; i < 3; i++) { + list_for_each(iter, queue) { + lock = list_entry(iter, struct dlm_lock, list); + ++count; + if (lock->ml.node == dlm->node_num) { + mlog(0, "found a lock owned by this node still " + "on the %s queue! will not migrate this " + "lockres\n", (i == 0 ? "granted" : + (i == 1 ? "converting" : + "blocked"))); + ret = -ENOTEMPTY; + goto leave; + } + } + queue++; + } + + *numlocks = count; + mlog(0, "migrateable lockres having %d locks\n", *numlocks); + +leave: + return ret; +} /* * DLM_MIGRATE_LOCKRES @@ -2437,14 +2488,12 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle = NULL; struct dlm_master_list_entry *oldmle = NULL; struct dlm_migratable_lockres *mres = NULL; - int ret = -EINVAL; + int ret = 0; const char *name; unsigned int namelen; int mle_added = 0; - struct list_head *queue, *iter; - int i; - struct dlm_lock *lock; - int empty = 1, wake = 0; + int numlocks; + int wake = 0; if (!dlm_grab(dlm)) return -EINVAL; @@ -2458,42 +2507,16 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, * ensure this lockres is a proper candidate for migration */ spin_lock(&res->spinlock); - if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { - mlog(0, "cannot migrate lockres with unknown owner!\n"); - spin_unlock(&res->spinlock); - goto leave; - } - if (res->owner != dlm->node_num) { - mlog(0, "cannot migrate lockres this node doesn't own!\n"); + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); + if (ret < 0) { spin_unlock(&res->spinlock); goto leave; } - mlog(0, "checking queues...\n"); - queue = &res->granted; - for (i=0; i<3; i++) { - list_for_each(iter, queue) { - lock = list_entry (iter, struct dlm_lock, list); - empty = 0; - if (lock->ml.node == dlm->node_num) { - mlog(0, "found a lock owned by this node " - "still on the %s queue! will not " - "migrate this lockres\n", - i==0 ? "granted" : - (i==1 ? "converting" : "blocked")); - spin_unlock(&res->spinlock); - ret = -ENOTEMPTY; - goto leave; - } - } - queue++; - } - mlog(0, "all locks on this lockres are nonlocal. continuing\n"); spin_unlock(&res->spinlock); /* no work to do */ - if (empty) { + if (numlocks == 0) { mlog(0, "no locks were found on this lockres! done!\n"); - ret = 0; goto leave; } @@ -2729,15 +2752,26 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int ret; int lock_dropped = 0; + int numlocks; + spin_lock(&res->spinlock); if (res->owner != dlm->node_num) { if (!__dlm_lockres_unused(res)) { mlog(ML_ERROR, "%s:%.*s: this node is not master, " "trying to free this but locks remain\n", dlm->name, res->lockname.len, res->lockname.name); } + spin_unlock(&res->spinlock); + goto leave; + } + + /* No need to migrate a lockres having no locks */ + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); + if (ret >= 0 && numlocks == 0) { + spin_unlock(&res->spinlock); goto leave; } + spin_unlock(&res->spinlock); /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 6d4a83d50152..c1807a42c49f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) } } while (status != 0); + spin_lock(&dlm_reco_state_lock); switch (ndata->state) { case DLM_RECO_NODE_DATA_INIT: case DLM_RECO_NODE_DATA_FINALIZE_SENT: @@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) ndata->node_num, dead_node); break; } + spin_unlock(&dlm_reco_state_lock); } mlog(0, "done requesting all lock info\n"); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 8ffa0916eb86..2b264c6ba039 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -256,18 +256,14 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm, break; } - mlog(0, "removing lockres %.*s:%p from purgelist\n", - lockres->lockname.len, lockres->lockname.name, lockres); - list_del_init(&lockres->purge); - dlm_lockres_put(lockres); - dlm->purge_count--; + dlm_lockres_get(lockres); /* This may drop and reacquire the dlm spinlock if it * has to do migration. */ - mlog(0, "calling dlm_purge_lockres!\n"); if (dlm_purge_lockres(dlm, lockres)) BUG(); - mlog(0, "DONE calling dlm_purge_lockres!\n"); + + dlm_lockres_put(lockres); /* Avoid adding any scheduling latencies */ cond_resched_lock(&dlm->spinlock); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index e335541727f9..27e43b0c0eae 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = { + .get_osb = ocfs2_get_inode_osb, + .flags = 0, +}; + static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || lockres->l_type == OCFS2_LOCK_TYPE_DATA || - lockres->l_type == OCFS2_LOCK_TYPE_RW; + lockres->l_type == OCFS2_LOCK_TYPE_RW || + lockres->l_type == OCFS2_LOCK_TYPE_OPEN; } static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) @@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, case OCFS2_LOCK_TYPE_DATA: ops = &ocfs2_inode_data_lops; break; + case OCFS2_LOCK_TYPE_OPEN: + ops = &ocfs2_inode_open_lops; + break; default: mlog_bug_on_msg(1, "type: %d\n", type); ops = NULL; /* thanks, gcc */ @@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode) goto bail; } + ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); + if (ret) { + mlog_errno(ret); + goto bail; + } + bail: mlog_exit(ret); return ret; @@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write) mlog_exit_void(); } +/* + * ocfs2_open_lock always get PR mode lock. + */ +int ocfs2_open_lock(struct inode *inode) +{ + int status = 0; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %llu take PRMODE open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + LKM_PRMODE, 0, 0); + if (status < 0) + mlog_errno(status); + +out: + mlog_exit(status); + return status; +} + +int ocfs2_try_open_lock(struct inode *inode, int write) +{ + int status = 0, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(!inode); + + mlog_entry_void(); + + mlog(0, "inode %llu try to take %s open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + goto out; + + lockres = &OCFS2_I(inode)->ip_open_lockres; + + level = write ? LKM_EXMODE : LKM_PRMODE; + + /* + * The file system may already holding a PRMODE/EXMODE open lock. + * Since we pass LKM_NOQUEUE, the request won't block waiting on + * other nodes and the -EAGAIN will indicate to the caller that + * this inode is still in use. + */ + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, + level, LKM_NOQUEUE, 0); + +out: + mlog_exit(status); + return status; +} + +/* + * ocfs2_open_unlock unlock PR and EX mode open locks. + */ +void ocfs2_open_unlock(struct inode *inode) +{ + struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog_entry_void(); + + mlog(0, "inode %llu drop open lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (ocfs2_mount_local(osb)) + goto out; + + if(lockres->l_ro_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + LKM_PRMODE); + if(lockres->l_ex_holders) + ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, + LKM_EXMODE); + +out: + mlog_exit_void(); +} + int ocfs2_data_lock_full(struct inode *inode, int write, int arg_flags) @@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) if (S_ISLNK(inode->i_mode) && !oi->ip_clusters) inode->i_blocks = 0; else - inode->i_blocks = - ocfs2_align_bytes_to_sectors(i_size_read(inode)); + inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_uid = be32_to_cpu(lvb->lvb_iuid); inode->i_gid = be32_to_cpu(lvb->lvb_igid); @@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode, { int status = 0; struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_lock_res *lockres = NULL; + struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres; struct ocfs2_dinode *fe; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry_void(); + if (ocfs2_mount_local(osb)) + goto bail; + spin_lock(&oi->ip_lock); if (oi->ip_flags & OCFS2_INODE_DELETED) { mlog(0, "Orphaned inode %llu was deleted while we " @@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode, } spin_unlock(&oi->ip_lock); - if (!ocfs2_mount_local(osb)) { - lockres = &oi->ip_meta_lockres; - - if (!ocfs2_should_refresh_lock_res(lockres)) - goto bail; - } + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; /* This will discard any caching information we might have had * for the inode metadata. */ ocfs2_metadata_cache_purge(inode); - /* will do nothing for inode types that don't use the extent - * map (directories, bitmap files, etc) */ ocfs2_extent_map_trunc(inode, 0); - if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) { + if (ocfs2_meta_lvb_is_trustable(inode, lockres)) { mlog(0, "Trusting LVB on inode %llu\n", (unsigned long long)oi->ip_blkno); ocfs2_refresh_inode_from_lvb(inode); @@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode, status = 0; bail_refresh: - if (lockres) - ocfs2_complete_lock_res_refresh(lockres, status); + ocfs2_complete_lock_res_refresh(lockres, status); bail: mlog_exit(status); return status; @@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode, wait_event(osb->recovery_event, ocfs2_node_map_is_empty(osb, &osb->recovery_map)); - acquired = 0; lockres = &OCFS2_I(inode)->ip_meta_lockres; level = ex ? LKM_EXMODE : LKM_PRMODE; dlm_flags = 0; @@ -2458,13 +2560,20 @@ int ocfs2_drop_inode_locks(struct inode *inode) * ocfs2_clear_inode has done it for us. */ err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), - &OCFS2_I(inode)->ip_data_lockres); + &OCFS2_I(inode)->ip_open_lockres); if (err < 0) mlog_errno(err); status = err; err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), + &OCFS2_I(inode)->ip_data_lockres); + if (err < 0) + mlog_errno(err); + if (err < 0 && !status) + status = err; + + err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb), &OCFS2_I(inode)->ip_meta_lockres); if (err < 0) mlog_errno(err); diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index c343fca68cf1..59cb566e7983 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode, int write); int ocfs2_rw_lock(struct inode *inode, int write); void ocfs2_rw_unlock(struct inode *inode, int write); +int ocfs2_open_lock(struct inode *inode); +int ocfs2_try_open_lock(struct inode *inode, int write); +void ocfs2_open_unlock(struct inode *inode); int ocfs2_meta_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, int *level); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 80ac69f11d9f..ba2b2ab1c6e4 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -3,8 +3,7 @@ * * extent_map.c * - * In-memory extent map for OCFS2. Man, this code was prettier in - * the library. + * Block/Cluster mapping functions * * Copyright (C) 2004 Oracle. All rights reserved. * @@ -26,1016 +25,528 @@ #include <linux/fs.h> #include <linux/init.h> #include <linux/types.h> -#include <linux/slab.h> -#include <linux/rbtree.h> #define MLOG_MASK_PREFIX ML_EXTENT_MAP #include <cluster/masklog.h> #include "ocfs2.h" +#include "alloc.h" #include "extent_map.h" #include "inode.h" #include "super.h" #include "buffer_head_io.h" - /* - * SUCK SUCK SUCK - * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h - */ - -struct ocfs2_extent_map_entry { - struct rb_node e_node; - int e_tree_depth; - struct ocfs2_extent_rec e_rec; -}; - -struct ocfs2_em_insert_context { - int need_left; - int need_right; - struct ocfs2_extent_map_entry *new_ent; - struct ocfs2_extent_map_entry *old_ent; - struct ocfs2_extent_map_entry *left_ent; - struct ocfs2_extent_map_entry *right_ent; -}; - -static struct kmem_cache *ocfs2_em_ent_cachep = NULL; - - -static struct ocfs2_extent_map_entry * -ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, - u32 cpos, u32 clusters, - struct rb_node ***ret_p, - struct rb_node **ret_parent); -static int ocfs2_extent_map_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth); -static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, - struct ocfs2_extent_map_entry *ent); -static int ocfs2_extent_map_find_leaf(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_list *el); -static int ocfs2_extent_map_lookup_read(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_map_entry **ret_ent); -static int ocfs2_extent_map_try_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth, - struct ocfs2_em_insert_context *ctxt); - -/* returns 1 only if the rec contains all the given clusters -- that is that - * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos + - * clusters) is >= the argument's endpoint */ -static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec, - u32 cpos, u32 clusters) -{ - if (le32_to_cpu(rec->e_cpos) > cpos) - return 0; - if (cpos + clusters > le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) - return 0; - return 1; -} - - -/* - * Find an entry in the tree that intersects the region passed in. - * Note that this will find straddled intervals, it is up to the - * callers to enforce any boundary conditions. - * - * Callers must hold ip_lock. This lookup is not guaranteed to return - * a tree_depth 0 match, and as such can race inserts if the lock - * were not held. + * The extent caching implementation is intentionally trivial. * - * The rb_node garbage lets insertion share the search. Trivial - * callers pass NULL. + * We only cache a small number of extents stored directly on the + * inode, so linear order operations are acceptable. If we ever want + * to increase the size of the extent map, then these algorithms must + * get smarter. */ -static struct ocfs2_extent_map_entry * -ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, - u32 cpos, u32 clusters, - struct rb_node ***ret_p, - struct rb_node **ret_parent) + +void ocfs2_extent_map_init(struct inode *inode) { - struct rb_node **p = &em->em_extents.rb_node; - struct rb_node *parent = NULL; - struct ocfs2_extent_map_entry *ent = NULL; - - while (*p) - { - parent = *p; - ent = rb_entry(parent, struct ocfs2_extent_map_entry, - e_node); - if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) { - p = &(*p)->rb_left; - ent = NULL; - } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) + - le32_to_cpu(ent->e_rec.e_clusters))) { - p = &(*p)->rb_right; - ent = NULL; - } else - break; - } + struct ocfs2_inode_info *oi = OCFS2_I(inode); - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; - return ent; + oi->ip_extent_map.em_num_items = 0; + INIT_LIST_HEAD(&oi->ip_extent_map.em_list); } -/* - * Find the leaf containing the interval we want. While we're on our - * way down the tree, fill in every record we see at any depth, because - * we might want it later. - * - * Note that this code is run without ip_lock. That's because it - * sleeps while reading. If someone is also filling the extent list at - * the same time we are, we might have to restart. - */ -static int ocfs2_extent_map_find_leaf(struct inode *inode, - u32 cpos, u32 clusters, - struct ocfs2_extent_list *el) +static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em, + unsigned int cpos, + struct ocfs2_extent_map_item **ret_emi) { - int i, ret; - struct buffer_head *eb_bh = NULL; - u64 blkno; - u32 rec_end; - struct ocfs2_extent_block *eb; - struct ocfs2_extent_rec *rec; - - /* - * The bh data containing the el cannot change here, because - * we hold alloc_sem. So we can do this without other - * locks. - */ - while (el->l_tree_depth) - { - blkno = 0; - for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - rec = &el->l_recs[i]; - rec_end = (le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)); - - ret = -EBADR; - if (rec_end > OCFS2_I(inode)->ip_clusters) { - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", - i, - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno, - OCFS2_I(inode)->ip_clusters); - goto out_free; - } - - if (rec_end <= cpos) { - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } - continue; - } - if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) { - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } - continue; - } + unsigned int range; + struct ocfs2_extent_map_item *emi; - /* - * We've found a record that matches our - * interval. We don't insert it because we're - * about to traverse it. - */ - - /* Check to see if we're stradling */ - ret = -ESRCH; - if (!ocfs2_extent_rec_contains_clusters(rec, - cpos, - clusters)) { - mlog_errno(ret); - goto out_free; - } + *ret_emi = NULL; - /* - * If we've already found a record, the el has - * two records covering the same interval. - * EEEK! - */ - ret = -EBADR; - if (blkno) { - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n", - cpos, clusters, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)blkno, i, - (unsigned long long)le64_to_cpu(rec->e_blkno)); - goto out_free; - } + list_for_each_entry(emi, &em->em_list, ei_list) { + range = emi->ei_cpos + emi->ei_clusters; - blkno = le64_to_cpu(rec->e_blkno); - } + if (cpos >= emi->ei_cpos && cpos < range) { + list_move(&emi->ei_list, &em->em_list); - /* - * We don't support holes, and we're still up - * in the branches, so we'd better have found someone - */ - ret = -EBADR; - if (!blkno) { - ocfs2_error(inode->i_sb, - "No record found for (cpos = %u, clusters = %u) on inode %llu\n", - cpos, clusters, - (unsigned long long)OCFS2_I(inode)->ip_blkno); - mlog_errno(ret); - goto out_free; - } - - if (eb_bh) { - brelse(eb_bh); - eb_bh = NULL; - } - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - blkno, &eb_bh, OCFS2_BH_CACHED, - inode); - if (ret) { - mlog_errno(ret); - goto out_free; - } - eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EIO; - goto out_free; + *ret_emi = emi; + break; } - el = &eb->h_list; } +} - BUG_ON(el->l_tree_depth); - - for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { - rec = &el->l_recs[i]; - - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > - OCFS2_I(inode)->ip_clusters) { - ret = -EBADR; - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n", - i, - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno, - OCFS2_I(inode)->ip_clusters); - return ret; - } - - ret = ocfs2_extent_map_insert(inode, rec, - le16_to_cpu(el->l_tree_depth)); - if (ret && (ret != -EEXIST)) { - mlog_errno(ret); - goto out_free; - } +static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos, + unsigned int *phys, unsigned int *len, + unsigned int *flags) +{ + unsigned int coff; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map_item *emi; + + spin_lock(&oi->ip_lock); + + __ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi); + if (emi) { + coff = cpos - emi->ei_cpos; + *phys = emi->ei_phys + coff; + if (len) + *len = emi->ei_clusters - coff; + if (flags) + *flags = emi->ei_flags; } - ret = 0; + spin_unlock(&oi->ip_lock); -out_free: - if (eb_bh) - brelse(eb_bh); + if (emi == NULL) + return -ENOENT; - return ret; + return 0; } /* - * This lookup actually will read from disk. It has one invariant: - * It will never re-traverse blocks. This means that all inserts should - * be new regions or more granular regions (both allowed by insert). + * Forget about all clusters equal to or greater than cpos. */ -static int ocfs2_extent_map_lookup_read(struct inode *inode, - u32 cpos, - u32 clusters, - struct ocfs2_extent_map_entry **ret_ent) +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) { - int ret; - u64 blkno; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - struct buffer_head *bh = NULL; - struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di; - struct ocfs2_extent_list *el; - - spin_lock(&OCFS2_I(inode)->ip_lock); - ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); - if (ent) { - if (!ent->e_tree_depth) { - spin_unlock(&OCFS2_I(inode)->ip_lock); - *ret_ent = ent; - return 0; - } - blkno = le64_to_cpu(ent->e_rec.e_blkno); - spin_unlock(&OCFS2_I(inode)->ip_lock); - - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh, - OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - if (bh) - brelse(bh); - return ret; + struct list_head *p, *n; + struct ocfs2_extent_map_item *emi; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + LIST_HEAD(tmp_list); + unsigned int range; + + spin_lock(&oi->ip_lock); + list_for_each_safe(p, n, &em->em_list) { + emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); + + if (emi->ei_cpos >= cpos) { + /* Full truncate of this record. */ + list_move(&emi->ei_list, &tmp_list); + BUG_ON(em->em_num_items == 0); + em->em_num_items--; + continue; } - eb = (struct ocfs2_extent_block *)bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - brelse(bh); - return -EIO; - } - el = &eb->h_list; - } else { - spin_unlock(&OCFS2_I(inode)->ip_lock); - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - if (bh) - brelse(bh); - return ret; + range = emi->ei_cpos + emi->ei_clusters; + if (range > cpos) { + /* Partial truncate */ + emi->ei_clusters = cpos - emi->ei_cpos; } - di = (struct ocfs2_dinode *)bh->b_data; - if (!OCFS2_IS_VALID_DINODE(di)) { - brelse(bh); - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di); - return -EIO; - } - el = &di->id2.i_list; - } - - ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el); - brelse(bh); - if (ret) { - mlog_errno(ret); - return ret; } + spin_unlock(&oi->ip_lock); - ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL); - if (!ent) { - ret = -ESRCH; - mlog_errno(ret); - return ret; + list_for_each_safe(p, n, &tmp_list) { + emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); + list_del(&emi->ei_list); + kfree(emi); } - - /* FIXME: Make sure this isn't a corruption */ - BUG_ON(ent->e_tree_depth); - - *ret_ent = ent; - - return 0; } /* - * Callers must hold ip_lock. This can insert pieces of the tree, - * thus racing lookup if the lock weren't held. + * Is any part of emi2 contained within emi1 */ -static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em, - struct ocfs2_extent_map_entry *ent) +static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1, + struct ocfs2_extent_map_item *emi2) { - struct rb_node **p, *parent; - struct ocfs2_extent_map_entry *old_ent; + unsigned int range1, range2; - old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos), - le32_to_cpu(ent->e_rec.e_clusters), - &p, &parent); - if (old_ent) - return -EEXIST; + /* + * Check if logical start of emi2 is inside emi1 + */ + range1 = emi1->ei_cpos + emi1->ei_clusters; + if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1) + return 1; - rb_link_node(&ent->e_node, parent, p); - rb_insert_color(&ent->e_node, &em->em_extents); + /* + * Check if logical end of emi2 is inside emi1 + */ + range2 = emi2->ei_cpos + emi2->ei_clusters; + if (range2 > emi1->ei_cpos && range2 <= range1) + return 1; return 0; } +static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest, + struct ocfs2_extent_map_item *src) +{ + dest->ei_cpos = src->ei_cpos; + dest->ei_phys = src->ei_phys; + dest->ei_clusters = src->ei_clusters; + dest->ei_flags = src->ei_flags; +} /* - * Simple rule: on any return code other than -EAGAIN, anything left - * in the insert_context will be freed. - * - * Simple rule #2: A return code of -EEXIST from this function or - * its calls to ocfs2_extent_map_insert_entry() signifies that another - * thread beat us to the insert. It is not an actual error, but it - * tells the caller we have no more work to do. + * Try to merge emi with ins. Returns 1 if merge succeeds, zero + * otherwise. */ -static int ocfs2_extent_map_try_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth, - struct ocfs2_em_insert_context *ctxt) +static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi, + struct ocfs2_extent_map_item *ins) { - int ret; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *old_ent; - - ctxt->need_left = 0; - ctxt->need_right = 0; - ctxt->old_ent = NULL; - - spin_lock(&OCFS2_I(inode)->ip_lock); - ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); - if (!ret) { - ctxt->new_ent = NULL; - goto out_unlock; - } - - /* Since insert_entry failed, the map MUST have old_ent */ - old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), - le32_to_cpu(rec->e_clusters), - NULL, NULL); - - BUG_ON(!old_ent); - - if (old_ent->e_tree_depth < tree_depth) { - /* Another thread beat us to the lower tree_depth */ - ret = -EEXIST; - goto out_unlock; - } - - if (old_ent->e_tree_depth == tree_depth) { - /* - * Another thread beat us to this tree_depth. - * Let's make sure we agree with that thread (the - * extent_rec should be identical). - */ - if (!memcmp(rec, &old_ent->e_rec, - sizeof(struct ocfs2_extent_rec))) - ret = 0; - else - /* FIXME: Should this be ESRCH/EBADR??? */ - ret = -EEXIST; - - goto out_unlock; - } - /* - * We do it in this order specifically so that no actual tree - * changes occur until we have all the pieces we need. We - * don't want malloc failures to leave an inconsistent tree. - * Whenever we drop the lock, another process could be - * inserting. Also note that, if another process just beat us - * to an insert, we might not need the same pieces we needed - * the first go round. In the end, the pieces we need will - * be used, and the pieces we don't will be freed. + * Handle contiguousness */ - ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) > - le32_to_cpu(old_ent->e_rec.e_cpos)); - ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) + - le32_to_cpu(old_ent->e_rec.e_clusters)) > - (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters))); - ret = -EAGAIN; - if (ctxt->need_left) { - if (!ctxt->left_ent) - goto out_unlock; - *(ctxt->left_ent) = *old_ent; - ctxt->left_ent->e_rec.e_clusters = - cpu_to_le32(le32_to_cpu(rec->e_cpos) - - le32_to_cpu(ctxt->left_ent->e_rec.e_cpos)); - } - if (ctxt->need_right) { - if (!ctxt->right_ent) - goto out_unlock; - *(ctxt->right_ent) = *old_ent; - ctxt->right_ent->e_rec.e_cpos = - cpu_to_le32(le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)); - ctxt->right_ent->e_rec.e_clusters = - cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) + - le32_to_cpu(old_ent->e_rec.e_clusters)) - - le32_to_cpu(ctxt->right_ent->e_rec.e_cpos)); - } - - rb_erase(&old_ent->e_node, &em->em_extents); - /* Now that he's erased, set him up for deletion */ - ctxt->old_ent = old_ent; - - if (ctxt->need_left) { - ret = ocfs2_extent_map_insert_entry(em, - ctxt->left_ent); - if (ret) - goto out_unlock; - ctxt->left_ent = NULL; + if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) && + ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) && + ins->ei_flags == emi->ei_flags) { + emi->ei_clusters += ins->ei_clusters; + return 1; + } else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys && + (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys && + ins->ei_flags == emi->ei_flags) { + emi->ei_phys = ins->ei_phys; + emi->ei_cpos = ins->ei_cpos; + emi->ei_clusters += ins->ei_clusters; + return 1; } - if (ctxt->need_right) { - ret = ocfs2_extent_map_insert_entry(em, - ctxt->right_ent); - if (ret) - goto out_unlock; - ctxt->right_ent = NULL; + /* + * Overlapping extents - this shouldn't happen unless we've + * split an extent to change it's flags. That is exceedingly + * rare, so there's no sense in trying to optimize it yet. + */ + if (ocfs2_ei_is_contained(emi, ins) || + ocfs2_ei_is_contained(ins, emi)) { + ocfs2_copy_emi_fields(emi, ins); + return 1; } - ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent); - - if (!ret) - ctxt->new_ent = NULL; - -out_unlock: - spin_unlock(&OCFS2_I(inode)->ip_lock); - - return ret; + /* No merge was possible. */ + return 0; } - -static int ocfs2_extent_map_insert(struct inode *inode, - struct ocfs2_extent_rec *rec, - int tree_depth) +/* + * In order to reduce complexity on the caller, this insert function + * is intentionally liberal in what it will accept. + * + * The only rule is that the truncate call *must* be used whenever + * records have been deleted. This avoids inserting overlapping + * records with different physical mappings. + */ +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec) { - int ret; - struct ocfs2_em_insert_context ctxt = {0, }; - - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > - OCFS2_I(inode)->ip_map.em_clusters) { - ret = -EBADR; - mlog_errno(ret); - return ret; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_extent_map *em = &oi->ip_extent_map; + struct ocfs2_extent_map_item *emi, *new_emi = NULL; + struct ocfs2_extent_map_item ins; + + ins.ei_cpos = le32_to_cpu(rec->e_cpos); + ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters); + ins.ei_flags = rec->e_flags; + +search: + spin_lock(&oi->ip_lock); + + list_for_each_entry(emi, &em->em_list, ei_list) { + if (ocfs2_try_to_merge_extent_map(emi, &ins)) { + list_move(&emi->ei_list, &em->em_list); + spin_unlock(&oi->ip_lock); + goto out; + } } - /* Zero e_clusters means a truncated tail record. It better be EOF */ - if (!rec->e_clusters) { - if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) != - OCFS2_I(inode)->ip_map.em_clusters) { - ret = -EBADR; - mlog_errno(ret); - ocfs2_error(inode->i_sb, - "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n", - (unsigned long long)le64_to_cpu(rec->e_blkno), - (unsigned long long)OCFS2_I(inode)->ip_blkno); - return ret; - } + /* + * No item could be merged. + * + * Either allocate and add a new item, or overwrite the last recently + * inserted. + */ - /* Ignore the truncated tail */ - return 0; - } + if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) { + if (new_emi == NULL) { + spin_unlock(&oi->ip_lock); - ret = -ENOMEM; - ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.new_ent) { - mlog_errno(ret); - return ret; - } + new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS); + if (new_emi == NULL) + goto out; - ctxt.new_ent->e_rec = *rec; - ctxt.new_ent->e_tree_depth = tree_depth; - - do { - ret = -ENOMEM; - if (ctxt.need_left && !ctxt.left_ent) { - ctxt.left_ent = - kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.left_ent) - break; - } - if (ctxt.need_right && !ctxt.right_ent) { - ctxt.right_ent = - kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_NOFS); - if (!ctxt.right_ent) - break; + goto search; } - ret = ocfs2_extent_map_try_insert(inode, rec, - tree_depth, &ctxt); - } while (ret == -EAGAIN); - - if ((ret < 0) && (ret != -EEXIST)) - mlog_errno(ret); + ocfs2_copy_emi_fields(new_emi, &ins); + list_add(&new_emi->ei_list, &em->em_list); + em->em_num_items++; + new_emi = NULL; + } else { + BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0); + emi = list_entry(em->em_list.prev, + struct ocfs2_extent_map_item, ei_list); + list_move(&emi->ei_list, &em->em_list); + ocfs2_copy_emi_fields(emi, &ins); + } - if (ctxt.left_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent); - if (ctxt.right_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent); - if (ctxt.old_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent); - if (ctxt.new_ent) - kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent); + spin_unlock(&oi->ip_lock); - return ret; +out: + if (new_emi) + kfree(new_emi); } /* - * Append this record to the tail of the extent map. It must be - * tree_depth 0. The record might be an extension of an existing - * record, and as such that needs to be handled. eg: - * - * Existing record in the extent map: - * - * cpos = 10, len = 10 - * |---------| - * - * New Record: - * - * cpos = 10, len = 20 - * |------------------| - * - * The passed record is the new on-disk record. The new_clusters value - * is how many clusters were added to the file. If the append is a - * contiguous append, the new_clusters has been added to - * rec->e_clusters. If the append is an entirely new extent, then - * rec->e_clusters is == new_clusters. + * Return the 1st index within el which contains an extent start + * larger than v_cluster. */ -int ocfs2_extent_map_append(struct inode *inode, - struct ocfs2_extent_rec *rec, - u32 new_clusters) +static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el, + u32 v_cluster) { - int ret; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - struct ocfs2_extent_rec *old; - - BUG_ON(!new_clusters); - BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters); + int i; + struct ocfs2_extent_rec *rec; - if (em->em_clusters < OCFS2_I(inode)->ip_clusters) { - /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; - mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) != - (em->em_clusters + new_clusters), - "Inode %llu:\n" - "rec->e_cpos = %u + rec->e_clusters = %u = %u\n" - "em->em_clusters = %u + new_clusters = %u = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters), - le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters), - em->em_clusters, new_clusters, - em->em_clusters + new_clusters); - - em->em_clusters += new_clusters; - - ret = -ENOENT; - if (le32_to_cpu(rec->e_clusters) > new_clusters) { - /* This is a contiguous append */ - ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1, - NULL, NULL); - if (ent) { - old = &ent->e_rec; - BUG_ON((le32_to_cpu(rec->e_cpos) + - le32_to_cpu(rec->e_clusters)) != - (le32_to_cpu(old->e_cpos) + - le32_to_cpu(old->e_clusters) + - new_clusters)); - if (ent->e_tree_depth == 0) { - BUG_ON(le32_to_cpu(old->e_cpos) != - le32_to_cpu(rec->e_cpos)); - BUG_ON(le64_to_cpu(old->e_blkno) != - le64_to_cpu(rec->e_blkno)); - ret = 0; - } - /* - * Let non-leafs fall through as -ENOENT to - * force insertion of the new leaf. - */ - le32_add_cpu(&old->e_clusters, new_clusters); - } + if (v_cluster < le32_to_cpu(rec->e_cpos)) + break; } - if (ret == -ENOENT) - ret = ocfs2_extent_map_insert(inode, rec, 0); - if (ret < 0) - mlog_errno(ret); - return ret; + return i; } -#if 0 -/* Code here is included but defined out as it completes the extent - * map api and may be used in the future. */ - /* - * Look up the record containing this cluster offset. This record is - * part of the extent map. Do not free it. Any changes you make to - * it will reflect in the extent map. So, if your last extent - * is (cpos = 10, clusters = 10) and you truncate the file by 5 - * clusters, you can do: + * Figure out the size of a hole which starts at v_cluster within the given + * extent list. * - * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec); - * rec->e_clusters -= 5; + * If there is no more allocation past v_cluster, we return the maximum + * cluster size minus v_cluster. * - * The lookup does not read from disk. If the map isn't filled in for - * an entry, you won't find it. - * - * Also note that the returned record is valid until alloc_sem is - * dropped. After that, truncate and extend can happen. Caveat Emptor. + * If we have in-inode extents, then el points to the dinode list and + * eb_bh is NULL. Otherwise, eb_bh should point to the extent block + * containing el. */ -int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos, - struct ocfs2_extent_rec **rec, - int *tree_depth) +static int ocfs2_figure_hole_clusters(struct inode *inode, + struct ocfs2_extent_list *el, + struct buffer_head *eb_bh, + u32 v_cluster, + u32 *num_clusters) { - int ret = -ENOENT; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; + int ret, i; + struct buffer_head *next_eb_bh = NULL; + struct ocfs2_extent_block *eb, *next_eb; - *rec = NULL; + i = ocfs2_search_for_hole_index(el, v_cluster); - if (cpos >= OCFS2_I(inode)->ip_clusters) - return -EINVAL; + if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) { + eb = (struct ocfs2_extent_block *)eb_bh->b_data; - if (cpos >= em->em_clusters) { /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters + * Check the next leaf for any extents. */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters ; - } - - ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1, - NULL, NULL); - if (ent) { - *rec = &ent->e_rec; - if (tree_depth) - *tree_depth = ent->e_tree_depth; - ret = 0; - } + if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) + goto no_more_extents; - return ret; -} + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + le64_to_cpu(eb->h_next_leaf_blk), + &next_eb_bh, OCFS2_BH_CACHED, inode); + if (ret) { + mlog_errno(ret); + goto out; + } + next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; -int ocfs2_extent_map_get_clusters(struct inode *inode, - u32 v_cpos, int count, - u32 *p_cpos, int *ret_count) -{ - int ret; - u32 coff, ccount; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent = NULL; + if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) { + ret = -EROFS; + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb); + goto out; + } - *p_cpos = ccount = 0; + el = &next_eb->h_list; - if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters) - return -EINVAL; + i = ocfs2_search_for_hole_index(el, v_cluster); + } - if ((v_cpos + count) > em->em_clusters) { +no_more_extents: + if (i == le16_to_cpu(el->l_next_free_rec)) { /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters + * We're at the end of our existing allocation. Just + * return the maximum number of clusters we could + * possibly allocate. */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; + *num_clusters = UINT_MAX - v_cluster; + } else { + *num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster; } + ret = 0; +out: + brelse(next_eb_bh); + return ret; +} - ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent); - if (ret) - return ret; +/* + * Return the index of the extent record which contains cluster #v_cluster. + * -1 is returned if it was not found. + * + * Should work fine on interior and exterior nodes. + */ +static int ocfs2_search_extent_list(struct ocfs2_extent_list *el, + u32 v_cluster) +{ + int ret = -1; + int i; + struct ocfs2_extent_rec *rec; + u32 rec_end, rec_start, clusters; - if (ent) { - /* We should never find ourselves straddling an interval */ - if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec, - v_cpos, - count)) - return -ESRCH; + for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { + rec = &el->l_recs[i]; - coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos); - *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb, - le64_to_cpu(ent->e_rec.e_blkno)) + - coff; + rec_start = le32_to_cpu(rec->e_cpos); + clusters = ocfs2_rec_clusters(el, rec); - if (ret_count) - *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff; + rec_end = rec_start + clusters; - return 0; + if (v_cluster >= rec_start && v_cluster < rec_end) { + ret = i; + break; + } } - - return -ENOENT; + return ret; } -#endif /* 0 */ - -int ocfs2_extent_map_get_blocks(struct inode *inode, - u64 v_blkno, int count, - u64 *p_blkno, int *ret_count) +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + unsigned int *extent_flags) { - int ret; - u64 boff; - u32 cpos, clusters; - int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); - struct ocfs2_extent_map_entry *ent = NULL; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; + int ret, i; + unsigned int flags = 0; + struct buffer_head *di_bh = NULL; + struct buffer_head *eb_bh = NULL; + struct ocfs2_dinode *di; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; + u32 coff; - *p_blkno = 0; - - cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); - clusters = ocfs2_blocks_to_clusters(inode->i_sb, - (u64)count + bpc - 1); - if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) { - ret = -EINVAL; - mlog_errno(ret); - return ret; - } - - if ((cpos + clusters) > em->em_clusters) { - /* - * Size changed underneath us on disk. Drop any - * straddling records and update our idea of - * i_clusters - */ - ocfs2_extent_map_drop(inode, em->em_clusters - 1); - em->em_clusters = OCFS2_I(inode)->ip_clusters; - } + ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, + num_clusters, extent_flags); + if (ret == 0) + goto out; - ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent); + ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, + &di_bh, OCFS2_BH_CACHED, inode); if (ret) { mlog_errno(ret); - return ret; + goto out; } - if (ent) - { - rec = &ent->e_rec; + di = (struct ocfs2_dinode *) di_bh->b_data; + el = &di->id2.i_list; - /* We should never find ourselves straddling an interval */ - if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) { - ret = -ESRCH; + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + if (ret) { mlog_errno(ret); - return ret; + goto out; } - boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos - - le32_to_cpu(rec->e_cpos)); - boff += (v_blkno & (u64)(bpc - 1)); - *p_blkno = le64_to_cpu(rec->e_blkno) + boff; + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; - if (ret_count) { - *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, - le32_to_cpu(rec->e_clusters)) - boff; + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; } - - return 0; } - return -ENOENT; -} - -int ocfs2_extent_map_init(struct inode *inode) -{ - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - - em->em_extents = RB_ROOT; - em->em_clusters = 0; - - return 0; -} - -/* Needs the lock */ -static void __ocfs2_extent_map_drop(struct inode *inode, - u32 new_clusters, - struct rb_node **free_head, - struct ocfs2_extent_map_entry **tail_ent) -{ - struct rb_node *node, *next; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + /* + * A hole was found. Return some canned values that + * callers can key on. If asked for, num_clusters will + * be populated with the size of the hole. + */ + *p_cluster = 0; + if (num_clusters) { + ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, + v_cluster, + num_clusters); + if (ret) { + mlog_errno(ret); + goto out; + } + } + } else { + rec = &el->l_recs[i]; - *free_head = NULL; + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); - ent = NULL; - node = rb_last(&em->em_extents); - while (node) - { - next = rb_prev(node); + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0)", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } - ent = rb_entry(node, struct ocfs2_extent_map_entry, - e_node); - if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters) - break; + coff = v_cluster - le32_to_cpu(rec->e_cpos); - rb_erase(&ent->e_node, &em->em_extents); + *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; - node->rb_right = *free_head; - *free_head = node; + if (num_clusters) + *num_clusters = ocfs2_rec_clusters(el, rec) - coff; - ent = NULL; - node = next; - } + flags = rec->e_flags; - /* Do we have an entry straddling new_clusters? */ - if (tail_ent) { - if (ent && - ((le32_to_cpu(ent->e_rec.e_cpos) + - le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters)) - *tail_ent = ent; - else - *tail_ent = NULL; + ocfs2_extent_map_insert_rec(inode, rec); } -} - -static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head) -{ - struct rb_node *node; - struct ocfs2_extent_map_entry *ent; - while (free_head) { - node = free_head; - free_head = node->rb_right; + if (extent_flags) + *extent_flags = flags; - ent = rb_entry(node, struct ocfs2_extent_map_entry, - e_node); - kmem_cache_free(ocfs2_em_ent_cachep, ent); - } +out: + brelse(di_bh); + brelse(eb_bh); + return ret; } /* - * Remove all entries past new_clusters, inclusive of an entry that - * contains new_clusters. This is effectively a cache forget. - * - * If you want to also clip the last extent by some number of clusters, - * you need to call ocfs2_extent_map_trunc(). - * This code does not check or modify ip_clusters. + * This expects alloc_sem to be held. The allocation cannot change at + * all while the map is in the process of being updated. */ -int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters) +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags) { - struct rb_node *free_head = NULL; - struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map; - struct ocfs2_extent_map_entry *ent; - - spin_lock(&OCFS2_I(inode)->ip_lock); + int ret; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 cpos, num_clusters, p_cluster; + u64 boff = 0; - __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); + cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno); - if (ent) { - rb_erase(&ent->e_node, &em->em_extents); - ent->e_node.rb_right = free_head; - free_head = &ent->e_node; + ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, + extent_flags); + if (ret) { + mlog_errno(ret); + goto out; } - spin_unlock(&OCFS2_I(inode)->ip_lock); - - if (free_head) - __ocfs2_extent_map_drop_cleanup(free_head); - - return 0; -} - -/* - * Remove all entries past new_clusters and also clip any extent - * straddling new_clusters, if there is one. This does not check - * or modify ip_clusters - */ -int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters) -{ - struct rb_node *free_head = NULL; - struct ocfs2_extent_map_entry *ent = NULL; - - spin_lock(&OCFS2_I(inode)->ip_lock); - - __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent); - - if (ent) - ent->e_rec.e_clusters = cpu_to_le32(new_clusters - - le32_to_cpu(ent->e_rec.e_cpos)); - - OCFS2_I(inode)->ip_map.em_clusters = new_clusters; - - spin_unlock(&OCFS2_I(inode)->ip_lock); - - if (free_head) - __ocfs2_extent_map_drop_cleanup(free_head); - - return 0; -} + /* + * p_cluster == 0 indicates a hole. + */ + if (p_cluster) { + boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + boff += (v_blkno & (u64)(bpc - 1)); + } -int __init init_ocfs2_extent_maps(void) -{ - ocfs2_em_ent_cachep = - kmem_cache_create("ocfs2_em_ent", - sizeof(struct ocfs2_extent_map_entry), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - if (!ocfs2_em_ent_cachep) - return -ENOMEM; + *p_blkno = boff; - return 0; -} + if (ret_count) { + *ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); + *ret_count -= v_blkno & (u64)(bpc - 1); + } -void exit_ocfs2_extent_maps(void) -{ - kmem_cache_destroy(ocfs2_em_ent_cachep); +out: + return ret; } diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index fa3745efa886..de91e3e41a22 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -25,22 +25,29 @@ #ifndef _EXTENT_MAP_H #define _EXTENT_MAP_H -int init_ocfs2_extent_maps(void); -void exit_ocfs2_extent_maps(void); +struct ocfs2_extent_map_item { + unsigned int ei_cpos; + unsigned int ei_phys; + unsigned int ei_clusters; + unsigned int ei_flags; -/* - * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem - * to be held. The allocation cannot change at all while the map is - * in the process of being updated. - */ -int ocfs2_extent_map_init(struct inode *inode); -int ocfs2_extent_map_append(struct inode *inode, - struct ocfs2_extent_rec *rec, - u32 new_clusters); -int ocfs2_extent_map_get_blocks(struct inode *inode, - u64 v_blkno, int count, - u64 *p_blkno, int *ret_count); -int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters); -int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters); + struct list_head ei_list; +}; + +#define OCFS2_MAX_EXTENT_MAP_ITEMS 3 +struct ocfs2_extent_map { + unsigned int em_num_items; + struct list_head em_list; +}; + +void ocfs2_extent_map_init(struct inode *inode); +void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster); +void ocfs2_extent_map_insert_rec(struct inode *inode, + struct ocfs2_extent_rec *rec); + +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, + u32 *num_clusters, unsigned int *extent_flags); +int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, + u64 *ret_count, unsigned int *extent_flags); #endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f2cd3bf9efb2..520a2a6d7670 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -33,6 +33,7 @@ #include <linux/sched.h> #include <linux/pipe_fs_i.h> #include <linux/mount.h> +#include <linux/writeback.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle, mlog_entry_void(); i_size_write(inode, new_i_size); - inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); + inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_ctime = inode->i_mtime = CURRENT_TIME; status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); @@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, { int status; handle_t *handle; + struct ocfs2_dinode *di; mlog_entry_void(); @@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, goto out; } - status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + + /* + * Do this before setting i_size. + */ + status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); + if (status) { + mlog_errno(status); + goto out_commit; + } + + i_size_write(inode, new_i_size); + inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + + di = (struct ocfs2_dinode *) fe_bh->b_data; + di->i_size = cpu_to_le64(new_i_size); + di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + status = ocfs2_journal_dirty(handle, fe_bh); if (status < 0) mlog_errno(status); +out_commit: ocfs2_commit_trans(osb, handle); out: + mlog_exit(status); return status; } @@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode, mlog_errno(status); goto bail; } - ocfs2_data_unlock(inode, 1); - - if (le32_to_cpu(fe->i_clusters) == - ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { - mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", - fe->i_clusters); - /* No allocation change is required, so lets fast path - * this truncate. */ - status = ocfs2_simple_size_update(inode, di_bh, new_i_size); - if (status < 0) - mlog_errno(status); - goto bail; - } /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the @@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode, status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); if (status < 0) { mlog_errno(status); - goto bail; + goto bail_unlock_data; } status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); if (status < 0) { mlog_errno(status); - goto bail; + goto bail_unlock_data; } status = ocfs2_commit_truncate(osb, inode, di_bh, tc); if (status < 0) { mlog_errno(status); - goto bail; + goto bail_unlock_data; } /* TODO: orphan dir cleanup here. */ +bail_unlock_data: + ocfs2_data_unlock(inode, 1); + bail: mlog_exit(status); @@ -397,6 +416,7 @@ bail: */ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, struct inode *inode, + u32 *logical_offset, u32 clusters_to_add, struct buffer_head *fe_bh, handle_t *handle, @@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, block = ocfs2_clusters_to_blocks(osb->sb, bit_off); mlog(0, "Allocating %u clusters at block %u for inode %llu\n", num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, - num_bits, meta_ac); + status = ocfs2_insert_extent(osb, handle, inode, fe_bh, + *logical_offset, block, num_bits, + meta_ac); if (status < 0) { mlog_errno(status); goto leave; } - le32_add_cpu(&fe->i_clusters, num_bits); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); - status = ocfs2_journal_dirty(handle, fe_bh); if (status < 0) { mlog_errno(status); @@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, } clusters_to_add -= num_bits; + *logical_offset += num_bits; if (clusters_to_add) { mlog(0, "need to alloc once more, clusters = %u, wanted = " @@ -494,14 +511,87 @@ leave: return status; } +/* + * For a given allocation, determine which allocators will need to be + * accessed, and lock them, reserving the appropriate number of bits. + * + * Called from ocfs2_extend_allocation() for file systems which don't + * support holes, and from ocfs2_write() for file systems which + * understand sparse inodes. + */ +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, + u32 clusters_to_add, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac) +{ + int ret, num_free_extents; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *meta_ac = NULL; + *data_ac = NULL; + + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " + "clusters_to_add = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), + le32_to_cpu(di->i_clusters), clusters_to_add); + + num_free_extents = ocfs2_num_free_extents(osb, inode, di); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + /* + * Sparse allocation file systems need to be more conservative + * with reserving room for expansion - the actual allocation + * happens while we've got a journal handle open so re-taking + * a cluster lock (because we ran out of room for another + * extent) will violate ordering rules. + * + * Most of the time we'll only be seeing this 1 cluster at a time + * anyway. + */ + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { + ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + + /* + * We cannot have an error and a non null *data_ac. + */ + } + + return ret; +} + static int ocfs2_extend_allocation(struct inode *inode, u32 clusters_to_add) { int status = 0; int restart_func = 0; int drop_alloc_sem = 0; - int credits, num_free_extents; - u32 prev_clusters; + int credits; + u32 prev_clusters, logical_start; struct buffer_head *bh = NULL; struct ocfs2_dinode *fe = NULL; handle_t *handle = NULL; @@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode, mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); + /* + * This function only exists for file systems which don't + * support holes. + */ + BUG_ON(ocfs2_sparse_alloc(osb)); + status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, OCFS2_BH_CACHED, inode); if (status < 0) { @@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode, goto leave; } + logical_start = OCFS2_I(inode)->ip_clusters; + restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " - "clusters_to_add = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), - fe->i_clusters, clusters_to_add); - - num_free_extents = ocfs2_num_free_extents(osb, - inode, - fe); - if (num_free_extents < 0) { - status = num_free_extents; - mlog_errno(status); - goto leave; - } - - if (!num_free_extents) { - status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto leave; - } - } - - status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto leave; - } - /* blocks peope in read/write from reading our allocation * until we're done changing it. We depend on i_mutex to block * other extend/truncate calls while we're here. Ordering wrt @@ -566,6 +634,13 @@ restart_all: down_write(&OCFS2_I(inode)->ip_alloc_sem); drop_alloc_sem = 1; + status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, + &meta_ac); + if (status) { + mlog_errno(status); + goto leave; + } + credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -590,6 +665,7 @@ restarted_transaction: status = ocfs2_do_extend_allocation(osb, inode, + &logical_start, clusters_to_add, bh, handle, @@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode, size_t tail_to_skip) { int ret = 0; - u32 clusters_to_add; + u32 clusters_to_add = 0; BUG_ON(!tail_to_skip && !di_bh); @@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode, goto out; BUG_ON(new_i_size < i_size_read(inode)); + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + BUG_ON(tail_to_skip != 0); + goto out_update_size; + } + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - OCFS2_I(inode)->ip_clusters; @@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode, goto out_unlock; } +out_update_size: if (!tail_to_skip) { /* We're being called from ocfs2_setattr() which wants * us to update i_size */ @@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode, } out_unlock: - ocfs2_data_unlock(inode, 1); + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ocfs2_data_unlock(inode, 1); out: return ret; @@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) ret = ocfs2_meta_lock(inode, NULL, 0); if (ret) { - mlog_errno(ret); + if (ret != -ENOENT) + mlog_errno(ret); goto out; } @@ -1035,10 +1119,49 @@ out: return ret; } +/* + * Will look for holes and unwritten extents in the range starting at + * pos for count bytes (inclusive). + */ +static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, - int appending) + int appending, + int *direct_io) { int ret = 0, meta_level = appending; struct inode *inode = dentry->d_inode; @@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, } else { saved_pos = *ppos; } + + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + loff_t end = saved_pos + count; + + /* + * Skip the O_DIRECT checks if we don't need + * them. + */ + if (!direct_io || !(*direct_io)) + break; + + /* + * Allowing concurrent direct writes means + * i_size changes wouldn't be synchronized, so + * one node could wind up truncating another + * nodes writes. + */ + if (end > i_size_read(inode)) { + *direct_io = 0; + break; + } + + /* + * We don't fill holes during direct io, so + * check for them here. If any are found, the + * caller will have to retake some cluster + * locks and initiate the io as buffered. + */ + ret = ocfs2_check_range_for_holes(inode, saved_pos, + count); + if (ret == 1) { + *direct_io = 0; + ret = 0; + } else if (ret < 0) + mlog_errno(ret); + break; + } + + /* + * The rest of this loop is concerned with legacy file + * systems which don't support sparse files. + */ + newsize = count + saved_pos; mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", @@ -1141,55 +1307,264 @@ out: return ret; } +static inline void +ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) +{ + const struct iovec *iov = *iovp; + size_t base = *basep; + + do { + int copy = min(bytes, iov->iov_len - base); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } while (bytes); + *iovp = iov; + *basep = base; +} + +static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, + const struct iovec *cur_iov, + size_t iov_offset) +{ + int ret; + char *buf; + struct page *src_page = NULL; + + buf = cur_iov->iov_base + iov_offset; + + if (!segment_eq(get_fs(), KERNEL_DS)) { + /* + * Pull in the user page. We want to do this outside + * of the meta data locks in order to preserve locking + * order in case of page fault. + */ + ret = get_user_pages(current, current->mm, + (unsigned long)buf & PAGE_CACHE_MASK, 1, + 0, 0, &src_page, NULL); + if (ret == 1) + bp->b_src_buf = kmap(src_page); + else + src_page = ERR_PTR(-EFAULT); + } else { + bp->b_src_buf = buf; + } + + return src_page; +} + +static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, + struct page *page) +{ + if (page) { + kunmap(page); + page_cache_release(page); + } +} + +static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, + const struct iovec *iov, + unsigned long nr_segs, + size_t count, + ssize_t o_direct_written) +{ + int ret = 0; + ssize_t copied, total = 0; + size_t iov_offset = 0; + const struct iovec *cur_iov = iov; + struct ocfs2_buffered_write_priv bp; + struct page *page; + + /* + * handle partial DIO write. Adjust cur_iov if needed. + */ + ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); + + do { + bp.b_cur_off = iov_offset; + bp.b_cur_iov = cur_iov; + + page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); + if (IS_ERR(page)) { + ret = PTR_ERR(page); + goto out; + } + + copied = ocfs2_buffered_write_cluster(file, *ppos, count, + ocfs2_map_and_write_user_data, + &bp); + + ocfs2_put_write_source(&bp, page); + + if (copied < 0) { + mlog_errno(copied); + ret = copied; + goto out; + } + + total += copied; + *ppos = *ppos + copied; + count -= copied; + + ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); + } while(count); + +out: + return total ? total : ret; +} + +static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted, + unsigned long *nr_segs) +{ + size_t ocount; /* original count */ + unsigned long seg; + + ocount = 0; + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + *nr_segs = seg; + ocount -= iv->iov_len; /* This segment is no good */ + break; + } + + *counted = ocount; + return 0; +} + static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - int ret, rw_level, have_alloc_sem = 0; - struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_path.dentry->d_inode; - int appending = filp->f_flags & O_APPEND ? 1 : 0; - - mlog_entry("(0x%p, %u, '%.*s')\n", filp, + int ret, direct_io, appending, rw_level, have_alloc_sem = 0; + int can_do_direct, sync = 0; + ssize_t written = 0; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ + loff_t *ppos = &iocb->ki_pos; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_path.dentry->d_inode; + + mlog_entry("(0x%p, %u, '%.*s')\n", file, (unsigned int)nr_segs, - filp->f_path.dentry->d_name.len, - filp->f_path.dentry->d_name.name); + file->f_path.dentry->d_name.len, + file->f_path.dentry->d_name.name); - /* happy write of zero bytes */ if (iocb->ki_left == 0) return 0; + ret = ocfs2_check_iovec(iov, &ocount, &nr_segs); + if (ret) + return ret; + + count = ocount; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + appending = file->f_flags & O_APPEND ? 1 : 0; + direct_io = file->f_flags & O_DIRECT ? 1 : 0; + mutex_lock(&inode->i_mutex); + +relock: /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ - if (filp->f_flags & O_DIRECT) { - have_alloc_sem = 1; + if (direct_io) { down_read(&inode->i_alloc_sem); + have_alloc_sem = 1; } /* concurrent O_DIRECT writes are allowed */ - rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; + rw_level = !direct_io; ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { - rw_level = -1; mlog_errno(ret); - goto out; + goto out_sems; } - ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, - iocb->ki_left, appending); + can_do_direct = direct_io; + ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, + iocb->ki_left, appending, + &can_do_direct); if (ret < 0) { mlog_errno(ret); goto out; } - /* communicate with ocfs2_dio_end_io */ - ocfs2_iocb_set_rw_locked(iocb); + /* + * We can't complete the direct I/O as requested, fall back to + * buffered I/O. + */ + if (direct_io && !can_do_direct) { + ocfs2_rw_unlock(inode, rw_level); + up_read(&inode->i_alloc_sem); + + have_alloc_sem = 0; + rw_level = -1; - ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); + direct_io = 0; + sync = 1; + goto relock; + } + + if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) + sync = 1; + + /* + * XXX: Is it ok to execute these checks a second time? + */ + ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); + if (ret) + goto out; + + /* + * Set pos so that sync_page_range_nolock() below understands + * where to start from. We might've moved it around via the + * calls above. The range we want to actually sync starts from + * *ppos here. + * + */ + pos = *ppos; + + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_rw_locked(iocb, rw_level); + + if (direct_io) { + written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, + ppos, count, ocount); + if (written < 0) { + ret = written; + goto out_dio; + } + } else { + written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, + count, written); + if (written < 0) { + ret = written; + if (ret != -EFAULT || ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } +out_dio: /* buffered aio wouldn't have proper lock coverage today */ - BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io @@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, } out: + if (rw_level != -1) + ocfs2_rw_unlock(inode, rw_level); + +out_sems: if (have_alloc_sem) up_read(&inode->i_alloc_sem); - if (rw_level != -1) - ocfs2_rw_unlock(inode, rw_level); + + if (written > 0 && sync) { + ssize_t err; + + err = sync_page_range_nolock(inode, file->f_mapping, pos, count); + if (err < 0) + written = err; + } + mutex_unlock(&inode->i_mutex); mlog_exit(ret); + return written ? written : ret; +} + +static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, + struct splice_desc *sd) +{ + int ret, count, total = 0; + ssize_t copied = 0; + struct ocfs2_splice_write_priv sp; + + ret = buf->ops->pin(pipe, buf); + if (ret) + goto out; + + sp.s_sd = sd; + sp.s_buf = buf; + sp.s_pipe = pipe; + sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; + sp.s_buf_offset = buf->offset; + + count = sd->len; + if (count + sp.s_offset > PAGE_CACHE_SIZE) + count = PAGE_CACHE_SIZE - sp.s_offset; + + do { + /* + * splice wants us to copy up to one page at a + * time. For pagesize > cluster size, this means we + * might enter ocfs2_buffered_write_cluster() more + * than once, so keep track of our progress here. + */ + copied = ocfs2_buffered_write_cluster(sd->file, + (loff_t)sd->pos + total, + count, + ocfs2_map_and_write_splice_data, + &sp); + if (copied < 0) { + mlog_errno(copied); + ret = copied; + goto out; + } + + count -= copied; + sp.s_offset += copied; + sp.s_buf_offset += copied; + total += copied; + } while (count); + + ret = 0; +out: + + return total ? total : ret; +} + +static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, + loff_t *ppos, + size_t len, + unsigned int flags) +{ + int ret, err; + struct address_space *mapping = out->f_mapping; + struct inode *inode = mapping->host; + + ret = __splice_from_pipe(pipe, out, ppos, len, flags, + ocfs2_splice_write_actor); + if (ret > 0) { + *ppos += ret; + + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + if (err) + ret = err; + } + } + return ret; } @@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, goto out; } - ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); + ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, + NULL); if (ret < 0) { mlog_errno(ret); goto out_unlock; } /* ok, we're done with i_size and alloc work */ - ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); + ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); out_unlock: ocfs2_rw_unlock(inode, 1); @@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } rw_level = 0; /* communicate with ocfs2_dio_end_io */ - ocfs2_iocb_set_rw_locked(iocb); + ocfs2_iocb_set_rw_locked(iocb, rw_level); } /* diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index cc973f01f6ce..2c4460fced52 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted { }; int ocfs2_do_extend_allocation(struct ocfs2_super *osb, struct inode *inode, + u32 *cluster_start, u32 clusters_to_add, struct buffer_head *fe_bh, handle_t *handle, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason); +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, + u32 clusters_to_add, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 8fc52d6d0ce7..b25ef63781ba 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -164,8 +164,10 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb) } status = o2hb_register_callback(&osb->osb_hb_up); - if (status < 0) + if (status < 0) { mlog_errno(status); + o2hb_unregister_callback(&osb->osb_hb_down); + } bail: return status; @@ -173,18 +175,11 @@ bail: void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb) { - int status; - if (ocfs2_mount_local(osb)) return; - status = o2hb_unregister_callback(&osb->osb_hb_down); - if (status < 0) - mlog_errno(status); - - status = o2hb_unregister_callback(&osb->osb_hb_up); - if (status < 0) - mlog_errno(status); + o2hb_unregister_callback(&osb->osb_hb_down); + o2hb_unregister_callback(&osb->osb_hb_up); } void ocfs2_stop_heartbeat(struct ocfs2_super *osb) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 28ab56f2b98c..21a605079c62 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode) inode->i_flags |= S_DIRSYNC; } -struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, - u64 blkno, - int delete_vote) -{ - struct ocfs2_find_inode_args args; - - /* ocfs2_ilookup_for_vote should *only* be called from the - * vote thread */ - BUG_ON(current != osb->vote_task); - - args.fi_blkno = blkno; - args.fi_flags = OCFS2_FI_FLAG_NOWAIT; - if (delete_vote) - args.fi_flags |= OCFS2_FI_FLAG_DELETE; - args.fi_ino = ino_from_blkno(osb->sb, blkno); - return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); -} - struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags) { struct inode *inode = NULL; @@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque) if (oi->ip_blkno != args->fi_blkno) goto bail; - /* OCFS2_FI_FLAG_NOWAIT is *only* set from - * ocfs2_ilookup_for_vote which won't create an inode for one - * that isn't found. The vote thread which doesn't want to get - * an inode which is in the process of going away - otherwise - * the call to __wait_on_freeing_inode in find_inode_fast will - * cause it to deadlock on an inode which may be waiting on a - * vote (or lock release) in delete_inode */ - if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) && - (inode->i_state & (I_FREEING|I_CLEAR))) { - /* As stated above, we're not going to return an - * inode. In the case of a delete vote, the voting - * code is going to signal the other node to go - * ahead. Mark that state here, so this freeing inode - * has the state when it gets to delete_inode. */ - if (args->fi_flags & OCFS2_FI_FLAG_DELETE) { - spin_lock(&oi->ip_lock); - ocfs2_mark_inode_remotely_deleted(inode); - spin_unlock(&oi->ip_lock); - } - goto bail; - } - ret = 1; bail: mlog_exit(ret); @@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, goto bail; } + OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); + OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); + inode->i_version = 1; inode->i_generation = le32_to_cpu(fe->i_generation); inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); @@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, if (S_ISLNK(inode->i_mode) && !fe->i_clusters) inode->i_blocks = 0; else - inode->i_blocks = - ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size)); + inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_mapping->a_ops = &ocfs2_aops; inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); @@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)fe->i_blkno); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; - OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); - inode->i_nlink = le16_to_cpu(fe->i_links_count); if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) @@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, OCFS2_LOCK_TYPE_META, 0, inode); + + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, 0, inode); } ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, @@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, * cluster lock before trusting anything anyway. */ can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) - && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK) + && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) && !ocfs2_mount_local(osb); /* @@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode, OCFS2_LOCK_TYPE_META, generation, inode); + ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, + OCFS2_LOCK_TYPE_OPEN, + 0, inode); + if (can_lock) { + status = ocfs2_open_lock(inode); + if (status) { + make_bad_inode(inode); + mlog_errno(status); + return status; + } status = ocfs2_meta_lock(inode, NULL, 0); if (status) { make_bad_inode(inode); @@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode, } } + if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { + status = ocfs2_try_open_lock(inode, 0); + if (status) { + make_bad_inode(inode); + return status; + } + } + status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, can_lock ? inode : NULL); if (status < 0) { @@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, struct buffer_head *fe_bh) { int status = 0; - handle_t *handle = NULL; struct ocfs2_truncate_context *tc = NULL; struct ocfs2_dinode *fe; + handle_t *handle = NULL; mlog_entry_void(); fe = (struct ocfs2_dinode *) fe_bh->b_data; - /* zero allocation, zero truncate :) */ - if (!fe->i_clusters) - goto bail; + if (fe->i_clusters) { + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - handle = NULL; - mlog_errno(status); - goto bail; - } + status = ocfs2_journal_access(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } - status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); - if (status < 0) { - mlog_errno(status); - goto bail; - } + i_size_write(inode, 0); - ocfs2_commit_trans(osb, handle); - handle = NULL; + status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); + if (status < 0) { + mlog_errno(status); + goto out; + } - status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); - if (status < 0) { - mlog_errno(status); - goto bail; - } + ocfs2_commit_trans(osb, handle); + handle = NULL; - status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); - if (status < 0) { - mlog_errno(status); - goto bail; + status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); + if (status < 0) { + mlog_errno(status); + goto out; + } } -bail: + +out: if (handle) ocfs2_commit_trans(osb, handle); - mlog_exit(status); return status; } @@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode, struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di; - /* We've already voted on this so it should be readonly - no - * spinlock needed. */ - orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + di = (struct ocfs2_dinode *) di_bh->b_data; + orphaned_slot = le16_to_cpu(di->i_orphaned_slot); status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); if (status) @@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode, goto bail; } - status = ocfs2_request_delete_vote(inode); - /* -EBUSY means that other nodes are still using the - * inode. We're done here though, so avoid doing anything on - * disk and let them worry about deleting it. */ - if (status == -EBUSY) { + /* + * This is how ocfs2 determines whether an inode is still live + * within the cluster. Every node takes a shared read lock on + * the inode open lock in ocfs2_read_locked_inode(). When we + * get to ->delete_inode(), each node tries to convert it's + * lock to an exclusive. Trylocks are serialized by the inode + * meta data lock. If the upconvert suceeds, we know the inode + * is no longer live and can be deleted. + * + * Though we call this with the meta data lock held, the + * trylock keeps us from ABBA deadlock. + */ + status = ocfs2_try_open_lock(inode, 1); + if (status == -EAGAIN) { status = 0; mlog(0, "Skipping delete of %llu because it is in use on" "other nodes\n", (unsigned long long)oi->ip_blkno); @@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode, goto bail; } - spin_lock(&oi->ip_lock); - if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { - /* Nobody knew which slot this inode was orphaned - * into. This may happen during node death and - * recovery knows how to clean it up so we can safely - * ignore this inode for now on. */ - mlog(0, "Nobody knew where inode %llu was orphaned!\n", - (unsigned long long)oi->ip_blkno); - } else { - *wipe = 1; - - mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n", - (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot); - } - spin_unlock(&oi->ip_lock); + *wipe = 1; + mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n", + (unsigned long long)oi->ip_blkno, + le16_to_cpu(di->i_orphaned_slot)); bail: return status; @@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode) mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, "Inode=%lu\n", inode->i_ino); + /* For remove delete_inode vote, we hold open lock before, + * now it is time to unlock PR and EX open locks. */ + ocfs2_open_unlock(inode); + /* Do these before all the other work so that we don't bounce * the vote thread while waiting to destroy the locks. */ ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); + ocfs2_mark_lockres_freeing(&oi->ip_open_lockres); /* We very well may get a clear_inode before all an inodes * metadata has hit disk. Of course, we can't drop any cluster @@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode) "Clear inode of %llu, inode has io markers\n", (unsigned long long)oi->ip_blkno); - ocfs2_extent_map_drop(inode, 0); - ocfs2_extent_map_init(inode); + ocfs2_extent_map_trunc(inode, 0); status = ocfs2_drop_inode_locks(inode); if (status < 0) @@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode) ocfs2_lock_res_free(&oi->ip_rw_lockres); ocfs2_lock_res_free(&oi->ip_meta_lockres); ocfs2_lock_res_free(&oi->ip_data_lockres); + ocfs2_lock_res_free(&oi->ip_open_lockres); ocfs2_metadata_cache_purge(inode); @@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode) mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); - /* Testing ip_orphaned_slot here wouldn't work because we may - * not have gotten a delete_inode vote from any other nodes - * yet. */ if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) generic_delete_inode(inode); else @@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode, return NULL; } - tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, - &p_blkno, NULL); + tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, + NULL); if (tmperr < 0) { mlog_errno(tmperr); goto fail; @@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode, if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) inode->i_blocks = 0; else - inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); + inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 1a7dd2945b34..03ae075869ee 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -26,6 +26,8 @@ #ifndef OCFS2_INODE_H #define OCFS2_INODE_H +#include "extent_map.h" + /* OCFS2 Inode Private Data */ struct ocfs2_inode_info { @@ -34,6 +36,7 @@ struct ocfs2_inode_info struct ocfs2_lock_res ip_rw_lockres; struct ocfs2_lock_res ip_meta_lockres; struct ocfs2_lock_res ip_data_lockres; + struct ocfs2_lock_res ip_open_lockres; /* protects allocation changes on this inode. */ struct rw_semaphore ip_alloc_sem; @@ -42,9 +45,7 @@ struct ocfs2_inode_info spinlock_t ip_lock; u32 ip_open_count; u32 ip_clusters; - struct ocfs2_extent_map ip_map; struct list_head ip_io_markers; - int ip_orphaned_slot; struct mutex ip_io_mutex; @@ -64,6 +65,8 @@ struct ocfs2_inode_info struct ocfs2_caching_info ip_metadata_cache; + struct ocfs2_extent_map ip_extent_map; + struct inode vfs_inode; }; @@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode); void ocfs2_drop_inode(struct inode *inode); /* Flags for ocfs2_iget() */ -#define OCFS2_FI_FLAG_NOWAIT 0x1 -#define OCFS2_FI_FLAG_DELETE 0x2 -#define OCFS2_FI_FLAG_SYSFILE 0x4 -#define OCFS2_FI_FLAG_NOLOCK 0x8 +#define OCFS2_FI_FLAG_SYSFILE 0x4 +#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags); -struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, - u64 blkno, - int delete_vote); int ocfs2_inode_init_private(struct inode *inode); int ocfs2_inode_revalidate(struct dentry *dentry); int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, @@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); void ocfs2_set_inode_flags(struct inode *inode); +static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) +{ + int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; + + return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); +} + #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 825cb0ae1b4c..5a8a90d1c787 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -649,29 +649,20 @@ bail: static int ocfs2_force_read_journal(struct inode *inode) { int status = 0; - int i, p_blocks; - u64 v_blkno, p_blkno; -#define CONCURRENT_JOURNAL_FILL 32 + int i; + u64 v_blkno, p_blkno, p_blocks, num_blocks; +#define CONCURRENT_JOURNAL_FILL 32ULL struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; mlog_entry_void(); - BUG_ON(inode->i_blocks != - ocfs2_align_bytes_to_sectors(i_size_read(inode))); - memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); - mlog(0, "Force reading %llu blocks\n", - (unsigned long long)(inode->i_blocks >> - (inode->i_sb->s_blocksize_bits - 9))); - + num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); v_blkno = 0; - while (v_blkno < - (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) { - + while (v_blkno < num_blocks) { status = ocfs2_extent_map_get_blocks(inode, v_blkno, - 1, &p_blkno, - &p_blocks); + &p_blkno, &p_blocks, NULL); if (status < 0) { mlog_errno(status); goto bail; @@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, continue; iter = ocfs2_iget(osb, le64_to_cpu(de->inode), - OCFS2_FI_FLAG_NOLOCK); + OCFS2_FI_FLAG_ORPHAN_RECOVERY); if (IS_ERR(iter)) continue; @@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, /* Set the proper information to get us going into * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; - oi->ip_orphaned_slot = slot; spin_unlock(&oi->ip_lock); iput(inode); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d026b4f27757..3db5de4506da 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, /* We may be deleting metadata blocks, so metadata alloc dinode + one desc. block for each possible delete. */ if (tree_depth && next_free == 1 && - le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del) + ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) credits += 1 + tree_depth; /* update to the truncate log. */ diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 51b020447683..af01158b39f5 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) int ret = 0, lock_level = 0; struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); - /* We don't want to support shared writable mappings yet. */ - if (!ocfs2_mount_local(osb) && + /* + * Only support shared writeable mmap for local mounts which + * don't know about holes. + */ + if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f7fa52bb3f6b..2bcf353fd7c5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0); if (IS_ERR(inode)) { - mlog(ML_ERROR, "Unable to create inode %llu\n", - (unsigned long long)blkno); ret = ERR_PTR(-EACCES); goto bail_unlock; } @@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, * unlink. */ spin_lock(&oi->ip_lock); oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED; - oi->ip_orphaned_slot = OCFS2_INVALID_SLOT; spin_unlock(&oi->ip_lock); bail_add: @@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb, i_size_write(inode, inode->i_sb->s_blocksize); inode->i_nlink = 2; - inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize); + inode->i_blocks = ocfs2_inode_sector_count(inode); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { mlog_errno(status); @@ -1098,7 +1095,7 @@ static int ocfs2_rename(struct inode *old_dir, BUG(); } - /* Assume a directory heirarchy thusly: + /* Assume a directory hierarchy thusly: * a/b/c * a/d * a,b,c, and d are all directories. @@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, struct buffer_head **bhs = NULL; const char *c; struct super_block *sb = osb->sb; - u64 p_blkno; - int p_blocks; + u64 p_blkno, p_blocks; int virtual, blocks, status, i, bytes_left; bytes_left = i_size_read(inode) + 1; @@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, goto bail; } - status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno, - &p_blocks); + status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks, + NULL); if (status < 0) { mlog_errno(status); goto bail; @@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir, inode->i_rdev = 0; newsize = l - 1; if (l > ocfs2_fast_symlink_chars(sb)) { + u32 offset = 0; + inode->i_op = &ocfs2_symlink_inode_operations; - status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh, + status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, + new_fe_bh, handle, data_ac, NULL, NULL); if (status < 0) { @@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } i_size_write(inode, newsize); - inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize); + inode->i_blocks = ocfs2_inode_sector_count(inode); } else { inode->i_op = &ocfs2_fast_symlink_inode_operations; memcpy((char *) fe->id2.i_symlink, symname, l); @@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, /* Record which orphan dir our inode now resides * in. delete_inode will use this to determine which orphan * dir to lock. */ - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num; - spin_unlock(&OCFS2_I(inode)->ip_lock); + fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); mlog(0, "Inode %llu orphaned in slot %d\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index db8e77cd35d3..82cc92dcf8a6 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -46,11 +46,6 @@ #include "endian.h" #include "ocfs2_lockid.h" -struct ocfs2_extent_map { - u32 em_clusters; - struct rb_root em_extents; -}; - /* Most user visible OCFS2 inodes will have very few pieces of * metadata, but larger files (including bitmaps, etc) must be taken * into account when designing an access scheme. We allow a small @@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode) return 1; } +static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock @@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes) return (unsigned long)((bytes + 511) >> 9); } +static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb, + unsigned long pg_index) +{ + u32 clusters = pg_index; + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + + if (unlikely(PAGE_CACHE_SHIFT > cbits)) + clusters = pg_index << (PAGE_CACHE_SHIFT - cbits); + else if (PAGE_CACHE_SHIFT < cbits) + clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT); + + return clusters; +} + +/* + * Find the 1st page index which covers the given clusters. + */ +static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb, + u32 clusters) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned long index = clusters; + + if (PAGE_CACHE_SHIFT > cbits) { + index = clusters >> (PAGE_CACHE_SHIFT - cbits); + } else if (PAGE_CACHE_SHIFT < cbits) { + index = clusters << (cbits - PAGE_CACHE_SHIFT); + } + + return index; +} + +static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) +{ + unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits; + unsigned int pages_per_cluster = 1; + + if (PAGE_CACHE_SHIFT < cbits) + pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT); + + return pages_per_cluster; +} + #define ocfs2_set_bit ext2_set_bit #define ocfs2_clear_bit ext2_clear_bit #define ocfs2_test_bit ext2_test_bit diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e61e218f5e0b..71306479c68f 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -86,7 +86,8 @@ OCFS2_SB(sb)->s_feature_incompat &= ~(mask) #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB -#define OCFS2_FEATURE_INCOMPAT_SUPP OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT +#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ + | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC) #define OCFS2_FEATURE_RO_COMPAT_SUPP 0 /* @@ -155,6 +156,12 @@ #define OCFS2_FL_MODIFIABLE (0x000100FF) /* User modifiable flags */ /* + * Extent record flags (e_node.leaf.flags) + */ +#define OCFS2_EXT_UNWRITTEN (0x01) /* Extent is allocated but + * unwritten */ + +/* * ioctl commands */ #define OCFS2_IOC_GETFLAGS _IOR('f', 1, long) @@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { /* * On disk extent record for OCFS2 * It describes a range of clusters on disk. + * + * Length fields are divided into interior and leaf node versions. + * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes. */ struct ocfs2_extent_rec { /*00*/ __le32 e_cpos; /* Offset into the file, in clusters */ - __le32 e_clusters; /* Clusters covered by this extent */ + union { + __le32 e_int_clusters; /* Clusters covered by all children */ + struct { + __le16 e_leaf_clusters; /* Clusters covered by this + extent */ + __u8 e_reserved1; + __u8 e_flags; /* Extent flags */ + }; + }; __le64 e_blkno; /* Physical disk offset, in blocks */ /*10*/ }; @@ -311,7 +329,10 @@ struct ocfs2_extent_list { /*00*/ __le16 l_tree_depth; /* Extent tree depth from this point. 0 means data extents hang directly off this - header (a leaf) */ + header (a leaf) + NOTE: The high 8 bits cannot be + used - tree_depth is never that big. + */ __le16 l_count; /* Number of extent records */ __le16 l_next_free_rec; /* Next unused extent slot */ __le16 l_reserved1; @@ -446,7 +467,9 @@ struct ocfs2_dinode { __le32 i_ctime_nsec; __le32 i_mtime_nsec; __le32 i_attr; - __le32 i_reserved1; + __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL + was set in i_flags */ + __le16 i_reserved1; /*70*/ __le64 i_reserved2[8]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 4d5d5655c185..4ca02b1c38ac 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -44,6 +44,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_RENAME, OCFS2_LOCK_TYPE_RW, OCFS2_LOCK_TYPE_DENTRY, + OCFS2_LOCK_TYPE_OPEN, OCFS2_NUM_LOCK_TYPES }; @@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_DENTRY: c = 'N'; break; + case OCFS2_LOCK_TYPE_OPEN: + c = 'O'; + break; default: c = '\0'; } @@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = { * important job it does, anyway. */ [OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", + [OCFS2_LOCK_TYPE_OPEN] = "Open", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 2d3ac32cb74e..d921a28329dc 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) goto bail; } - status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL); + status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 6dbb11762759..0da655ae5d6f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, le32_to_cpu(fe->i_clusters))); spin_unlock(&OCFS2_I(alloc_inode)->ip_lock); i_size_write(alloc_inode, le64_to_cpu(fe->i_size)); - alloc_inode->i_blocks = - ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode)); + alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode); status = 0; bail: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 6534f92424dd..5c9e8243691f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -806,9 +806,6 @@ static int __init ocfs2_init(void) ocfs2_print_version(); - if (init_ocfs2_extent_maps()) - return -ENOMEM; - status = init_ocfs2_uptodate_cache(); if (status < 0) { mlog_errno(status); @@ -837,7 +834,6 @@ leave: if (status < 0) { ocfs2_free_mem_caches(); exit_ocfs2_uptodate_cache(); - exit_ocfs2_extent_maps(); } mlog_exit(status); @@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void) unregister_filesystem(&ocfs2_fs_type); - exit_ocfs2_extent_maps(); - exit_ocfs2_uptodate_cache(); mlog_exit_void(); @@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data, ocfs2_lock_res_init_once(&oi->ip_rw_lockres); ocfs2_lock_res_init_once(&oi->ip_meta_lockres); ocfs2_lock_res_init_once(&oi->ip_data_lockres); + ocfs2_lock_res_init_once(&oi->ip_open_lockres); ocfs2_metadata_cache_init(&oi->vfs_inode); diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index f30e63b9910c..4f82a2f0efef 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -63,17 +63,10 @@ struct ocfs2_msg_hdr __be32 h_node_num; /* node sending this particular message. */ }; -/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this - * for the network. */ -#define OCFS2_VOTE_FILENAME_LEN 256 struct ocfs2_vote_msg { struct ocfs2_msg_hdr v_hdr; - union { - __be32 v_generic1; - __be32 v_orphaned_slot; /* Used during delete votes */ - __be32 v_nlink; /* Used during unlink votes */ - } md1; /* Message type dependant 1 */ + __be32 v_reserved1; }; /* Responses are given these values to maintain backwards @@ -86,7 +79,6 @@ struct ocfs2_response_msg { struct ocfs2_msg_hdr r_hdr; __be32 r_response; - __be32 r_orphaned_slot; }; struct ocfs2_vote_work { @@ -96,7 +88,6 @@ struct ocfs2_vote_work { enum ocfs2_vote_request { OCFS2_VOTE_REQ_INVALID = 0, - OCFS2_VOTE_REQ_DELETE, OCFS2_VOTE_REQ_MOUNT, OCFS2_VOTE_REQ_UMOUNT, OCFS2_VOTE_REQ_LAST @@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb, ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num); } -void ocfs2_mark_inode_remotely_deleted(struct inode *inode) -{ - struct ocfs2_inode_info *oi = OCFS2_I(inode); - - assert_spin_locked(&oi->ip_lock); - /* We set the SKIP_DELETE flag on the inode so we don't try to - * delete it in delete_inode ourselves, thus avoiding - * unecessary lock pinging. If the other node failed to wipe - * the inode as a result of a crash, then recovery will pick - * up the slack. */ - oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE; -} - -static int ocfs2_process_delete_request(struct inode *inode, - int *orphaned_slot) -{ - int response = OCFS2_RESPONSE_BUSY; - - mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n", - inode->i_ino, inode->i_nlink, *orphaned_slot); - - spin_lock(&OCFS2_I(inode)->ip_lock); - - /* Whatever our vote response is, we want to make sure that - * the orphaned slot is recorded properly on this node *and* - * on the requesting node. Technically, if the requesting node - * did not know which slot the inode is orphaned in but we - * respond with BUSY he doesn't actually need the orphaned - * slot, but it doesn't hurt to do it here anyway. */ - if ((*orphaned_slot) != OCFS2_INVALID_SLOT) { - mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != - OCFS2_INVALID_SLOT && - OCFS2_I(inode)->ip_orphaned_slot != - (*orphaned_slot), - "Inode %llu: This node thinks it's " - "orphaned in slot %d, messaged it's in %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - OCFS2_I(inode)->ip_orphaned_slot, - *orphaned_slot); - - mlog(0, "Setting orphaned slot for inode %llu to %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - *orphaned_slot); - - OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot; - } else { - mlog(0, "Sending back orphaned slot %d for inode %llu\n", - OCFS2_I(inode)->ip_orphaned_slot, - (unsigned long long)OCFS2_I(inode)->ip_blkno); - - *orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; - } - - /* vote no if the file is still open. */ - if (OCFS2_I(inode)->ip_open_count) { - mlog(0, "open count = %u\n", - OCFS2_I(inode)->ip_open_count); - spin_unlock(&OCFS2_I(inode)->ip_lock); - goto done; - } - spin_unlock(&OCFS2_I(inode)->ip_lock); - - /* directories are a bit ugly... What if someone is sitting in - * it? We want to make sure the inode is removed completely as - * a result of the iput in process_vote. */ - if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) { - mlog(0, "i_count = %u\n", atomic_read(&inode->i_count)); - goto done; - } - - if (filemap_fdatawrite(inode->i_mapping)) { - mlog(ML_ERROR, "Could not sync inode %llu for delete!\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - goto done; - } - sync_mapping_buffers(inode->i_mapping); - truncate_inode_pages(inode->i_mapping, 0); - ocfs2_extent_map_trunc(inode, 0); - - spin_lock(&OCFS2_I(inode)->ip_lock); - /* double check open count - someone might have raced this - * thread into ocfs2_file_open while we were writing out - * data. If we're to allow a wipe of this inode now, we *must* - * hold the spinlock until we've marked it. */ - if (OCFS2_I(inode)->ip_open_count) { - mlog(0, "Raced to wipe! open count = %u\n", - OCFS2_I(inode)->ip_open_count); - spin_unlock(&OCFS2_I(inode)->ip_lock); - goto done; - } - - /* Mark the inode as being wiped from disk. */ - ocfs2_mark_inode_remotely_deleted(inode); - spin_unlock(&OCFS2_I(inode)->ip_lock); - - /* Not sure this is necessary anymore. */ - d_prune_aliases(inode); - - /* If we get here, then we're voting 'yes', so commit the - * delete on our side. */ - response = OCFS2_RESPONSE_OK; -done: - return response; -} - static void ocfs2_process_vote(struct ocfs2_super *osb, struct ocfs2_vote_msg *msg) { int net_status, vote_response; - int orphaned_slot = 0; - unsigned int node_num, generation; + unsigned int node_num; u64 blkno; enum ocfs2_vote_request request; - struct inode *inode = NULL; struct ocfs2_msg_hdr *hdr = &msg->v_hdr; struct ocfs2_response_msg response; /* decode the network mumbo jumbo into local variables. */ request = be32_to_cpu(hdr->h_request); blkno = be64_to_cpu(hdr->h_blkno); - generation = be32_to_cpu(hdr->h_generation); node_num = be32_to_cpu(hdr->h_node_num); - if (request == OCFS2_VOTE_REQ_DELETE) - orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot); - mlog(0, "processing vote: request = %u, blkno = %llu, " - "generation = %u, node_num = %u, priv1 = %u\n", request, - (unsigned long long)blkno, generation, node_num, - be32_to_cpu(msg->md1.v_generic1)); + mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n", + request, (unsigned long long)blkno, node_num); if (!ocfs2_is_valid_vote_request(request)) { mlog(ML_ERROR, "Invalid vote request %d from node %u\n", @@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb, break; } - /* We cannot process the remaining message types before we're - * fully mounted. It's perfectly safe however to send a 'yes' - * response as we can't possibly have any of the state they're - * asking us to modify yet. */ - if (atomic_read(&osb->vol_state) == VOLUME_INIT) - goto respond; - - /* If we get here, then the request is against an inode. */ - inode = ocfs2_ilookup_for_vote(osb, blkno, - request == OCFS2_VOTE_REQ_DELETE); - - /* Not finding the inode is perfectly valid - it means we're - * not interested in what the other node is about to do to it - * so in those cases we automatically respond with an - * affirmative. Cluster locking ensures that we won't race - * interest in the inode with this vote request. */ - if (!inode) - goto respond; - - /* Check generation values. It's possible for us to get a - * request against a stale inode. If so then we proceed as if - * we had not found an inode in the first place. */ - if (inode->i_generation != generation) { - mlog(0, "generation passed %u != inode generation = %u, " - "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, " - "message type = %u\n", generation, inode->i_generation, - OCFS2_I(inode)->ip_flags, - (unsigned long long)OCFS2_I(inode)->ip_blkno, - (unsigned long long)blkno, atomic_read(&inode->i_count), - request); - iput(inode); - inode = NULL; - goto respond; - } - - switch (request) { - case OCFS2_VOTE_REQ_DELETE: - vote_response = ocfs2_process_delete_request(inode, - &orphaned_slot); - break; - default: - mlog(ML_ERROR, "node %u, invalid request: %u\n", - node_num, request); - vote_response = OCFS2_RESPONSE_BAD_MSG; - } - respond: /* Response struture is small so we just put it on the stack * and stuff it inline. */ @@ -357,7 +190,6 @@ respond: response.r_hdr.h_generation = hdr->h_generation; response.r_hdr.h_node_num = cpu_to_be32(osb->node_num); response.r_response = cpu_to_be32(vote_response); - response.r_orphaned_slot = cpu_to_be32(orphaned_slot); net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE, osb->net_key, @@ -373,9 +205,6 @@ respond: && net_status != -ENOTCONN) mlog(ML_ERROR, "message to node %u fails with error %d!\n", node_num, net_status); - - if (inode) - iput(inode); } static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb) @@ -634,8 +463,7 @@ bail: static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, u64 blkno, unsigned int generation, - enum ocfs2_vote_request type, - u32 priv) + enum ocfs2_vote_request type) { struct ocfs2_vote_msg *request; struct ocfs2_msg_hdr *hdr; @@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, hdr->h_request = cpu_to_be32(type); hdr->h_blkno = cpu_to_be64(blkno); hdr->h_generation = cpu_to_be32(generation); - - request->md1.v_generic1 = cpu_to_be32(priv); } return request; @@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb, struct ocfs2_vote_msg *request, struct ocfs2_net_response_cb *callback) { - int status, response; + int status, response = -EBUSY; unsigned int response_id; struct ocfs2_msg_hdr *hdr; @@ -686,109 +512,12 @@ bail: return status; } -static int ocfs2_request_vote(struct inode *inode, - struct ocfs2_vote_msg *request, - struct ocfs2_net_response_cb *callback) -{ - int status; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - if (ocfs2_inode_is_new(inode)) - return 0; - - status = -EAGAIN; - while (status == -EAGAIN) { - if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) && - signal_pending(current)) - return -ERESTARTSYS; - - status = ocfs2_super_lock(osb, 0); - if (status < 0) { - mlog_errno(status); - break; - } - - status = 0; - if (!ocfs2_node_map_is_only(osb, &osb->mounted_map, - osb->node_num)) - status = ocfs2_do_request_vote(osb, request, callback); - - ocfs2_super_unlock(osb, 0); - } - return status; -} - -static void ocfs2_delete_response_cb(void *priv, - struct ocfs2_response_msg *resp) -{ - int orphaned_slot, node; - struct inode *inode = priv; - - orphaned_slot = be32_to_cpu(resp->r_orphaned_slot); - node = be32_to_cpu(resp->r_hdr.h_node_num); - mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n", - node, (unsigned long long)OCFS2_I(inode)->ip_blkno, - orphaned_slot); - - /* The other node may not actually know which slot the inode - * is orphaned in. */ - if (orphaned_slot == OCFS2_INVALID_SLOT) - return; - - /* Ok, the responding node knows which slot this inode is - * orphaned in. We verify that the information is correct and - * then record this in the inode. ocfs2_delete_inode will use - * this information to determine which lock to take. */ - spin_lock(&OCFS2_I(inode)->ip_lock); - mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot && - OCFS2_I(inode)->ip_orphaned_slot - != OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's " - "orphaned in slot %d, we think it's in %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - be32_to_cpu(resp->r_hdr.h_node_num), - orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot); - - OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot; - spin_unlock(&OCFS2_I(inode)->ip_lock); -} - -int ocfs2_request_delete_vote(struct inode *inode) -{ - int orphaned_slot, status; - struct ocfs2_net_response_cb delete_cb; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_vote_msg *request; - - spin_lock(&OCFS2_I(inode)->ip_lock); - orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; - spin_unlock(&OCFS2_I(inode)->ip_lock); - - delete_cb.rc_cb = ocfs2_delete_response_cb; - delete_cb.rc_priv = inode; - - mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot); - - status = -ENOMEM; - request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno, - inode->i_generation, - OCFS2_VOTE_REQ_DELETE, orphaned_slot); - if (request) { - status = ocfs2_request_vote(inode, request, &delete_cb); - - kfree(request); - } - - return status; -} - int ocfs2_request_mount_vote(struct ocfs2_super *osb) { int status; struct ocfs2_vote_msg *request = NULL; - request = ocfs2_new_vote_request(osb, 0ULL, 0, - OCFS2_VOTE_REQ_MOUNT, 0); + request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT); if (!request) { status = -ENOMEM; goto bail; @@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb) int status; struct ocfs2_vote_msg *request = NULL; - request = ocfs2_new_vote_request(osb, 0ULL, 0, - OCFS2_VOTE_REQ_UMOUNT, 0); + request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT); if (!request) { status = -ENOMEM; goto bail; @@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg, be32_to_cpu(work->w_msg.v_hdr.h_generation)); mlog(0, "h_node_num = %u\n", be32_to_cpu(work->w_msg.v_hdr.h_node_num)); - mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1)); spin_lock(&osb->vote_task_lock); list_add_tail(&work->w_list, &osb->vote_list); diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h index 53ebc1c69e56..9ea46f62de31 100644 --- a/fs/ocfs2/vote.h +++ b/fs/ocfs2/vote.h @@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb) wake_up(&osb->vote_event); } -int ocfs2_request_delete_vote(struct inode *inode); int ocfs2_request_mount_vote(struct ocfs2_super *osb); int ocfs2_request_umount_vote(struct ocfs2_super *osb); int ocfs2_register_net_handlers(struct ocfs2_super *osb); void ocfs2_unregister_net_handlers(struct ocfs2_super *osb); -void ocfs2_mark_inode_remotely_deleted(struct inode *inode); - void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb, int node_num); #endif |