/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * extent_map.c
 *
 * In-memory extent map for OCFS2.  Man, this code was prettier in
 * the library.
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License, version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/rbtree.h>

#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "extent_map.h"
#include "inode.h"
#include "super.h"

#include "buffer_head_io.h"

/*
 * SUCK SUCK SUCK
 * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
 */

struct ocfs2_extent_map_entry {
        struct rb_node e_node;
        int e_tree_depth;
        struct ocfs2_extent_rec e_rec;
};

struct ocfs2_em_insert_context {
        int need_left;
        int need_right;
        struct ocfs2_extent_map_entry *new_ent;
        struct ocfs2_extent_map_entry *old_ent;
        struct ocfs2_extent_map_entry *left_ent;
        struct ocfs2_extent_map_entry *right_ent;
};

static kmem_cache_t *ocfs2_em_ent_cachep = NULL;

static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
                        u32 cpos, u32 clusters,
                        struct rb_node ***ret_p,
                        struct rb_node **ret_parent);
static int ocfs2_extent_map_insert(struct inode *inode,
                                   struct ocfs2_extent_rec *rec,
                                   int tree_depth);
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
                                         struct ocfs2_extent_map_entry *ent);
static int ocfs2_extent_map_find_leaf(struct inode *inode,
                                      u32 cpos, u32 clusters,
                                      struct ocfs2_extent_list *el);
static int ocfs2_extent_map_lookup_read(struct inode *inode,
                                        u32 cpos, u32 clusters,
                                        struct ocfs2_extent_map_entry **ret_ent);
static int ocfs2_extent_map_try_insert(struct inode *inode,
                                       struct ocfs2_extent_rec *rec,
                                       int tree_depth,
                                       struct ocfs2_em_insert_context *ctxt);

/*
 * Returns 1 only if the rec contains all the given clusters -- that is,
 * rec's e_cpos is <= the cluster cpos and the rec endpoint (e_cpos +
 * e_clusters) is >= the argument's endpoint.
 */
static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
                                              u32 cpos, u32 clusters)
{
        if (le32_to_cpu(rec->e_cpos) > cpos)
                return 0;
        if (cpos + clusters > le32_to_cpu(rec->e_cpos) +
                              le32_to_cpu(rec->e_clusters))
                return 0;
        return 1;
}

/*
 * Find an entry in the tree that intersects the region passed in.
 * Note that this will find straddled intervals; it is up to the
 * callers to enforce any boundary conditions.
 *
 * Callers must hold ip_lock.  This lookup is not guaranteed to return
 * a tree_depth 0 match, and as such can race inserts if the lock
 * were not held.
 *
 * The rb_node garbage lets insertion share the search.  Trivial
 * callers pass NULL.
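 *
 * For example, a lookup of (cpos = 5, clusters = 10) will happily
 * return an entry covering (cpos = 0, clusters = 20), because the two
 * ranges intersect.  Callers that require full containment check the
 * result with ocfs2_extent_rec_contains_clusters().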
 */
static struct ocfs2_extent_map_entry *
ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
                        u32 cpos, u32 clusters,
                        struct rb_node ***ret_p,
                        struct rb_node **ret_parent)
{
        struct rb_node **p = &em->em_extents.rb_node;
        struct rb_node *parent = NULL;
        struct ocfs2_extent_map_entry *ent = NULL;

        while (*p) {
                parent = *p;
                ent = rb_entry(parent, struct ocfs2_extent_map_entry,
                               e_node);
                if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
                        p = &(*p)->rb_left;
                        ent = NULL;
                } else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
                                    le32_to_cpu(ent->e_rec.e_clusters))) {
                        p = &(*p)->rb_right;
                        ent = NULL;
                } else
                        break;
        }

        if (ret_p != NULL)
                *ret_p = p;
        if (ret_parent != NULL)
                *ret_parent = parent;
        return ent;
}

/*
 * Find the leaf containing the interval we want.  While we're on our
 * way down the tree, fill in every record we see at any depth, because
 * we might want it later.
 *
 * Note that this code is run without ip_lock.  That's because it
 * sleeps while reading.  If someone is also filling the extent list at
 * the same time we are, we might have to restart.
 */
static int ocfs2_extent_map_find_leaf(struct inode *inode,
                                      u32 cpos, u32 clusters,
                                      struct ocfs2_extent_list *el)
{
        int i, ret;
        struct buffer_head *eb_bh = NULL;
        u64 blkno;
        u32 rec_end;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_rec *rec;

        /*
         * The bh data containing the el cannot change here, because
         * we hold alloc_sem.  So we can do this without other
         * locks.
         */
        while (el->l_tree_depth) {
                blkno = 0;
                for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
                        rec = &el->l_recs[i];
                        rec_end = (le32_to_cpu(rec->e_cpos) +
                                   le32_to_cpu(rec->e_clusters));

                        ret = -EBADR;
                        if (rec_end > OCFS2_I(inode)->ip_clusters) {
                                mlog_errno(ret);
                                ocfs2_error(inode->i_sb,
                                            "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
                                            i,
                                            (unsigned long long)le64_to_cpu(rec->e_blkno),
                                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
                                            OCFS2_I(inode)->ip_clusters);
                                goto out_free;
                        }

                        if (rec_end <= cpos) {
                                ret = ocfs2_extent_map_insert(inode, rec,
                                                le16_to_cpu(el->l_tree_depth));
                                if (ret && (ret != -EEXIST)) {
                                        mlog_errno(ret);
                                        goto out_free;
                                }
                                continue;
                        }
                        if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
                                ret = ocfs2_extent_map_insert(inode, rec,
                                                le16_to_cpu(el->l_tree_depth));
                                if (ret && (ret != -EEXIST)) {
                                        mlog_errno(ret);
                                        goto out_free;
                                }
                                continue;
                        }

                        /*
                         * We've found a record that matches our
                         * interval.  We don't insert it because we're
                         * about to traverse it.
                         */

                        /* Check to see if we're straddling */
                        ret = -ESRCH;
                        if (!ocfs2_extent_rec_contains_clusters(rec,
                                                                cpos,
                                                                clusters)) {
                                mlog_errno(ret);
                                goto out_free;
                        }

                        /*
                         * If we've already found a record, the el has
                         * two records covering the same interval.
                         * EEEK!
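                         * Two branch records claiming the same cluster
                         * range mean the on-disk tree is corrupt, so we
                         * report it via ocfs2_error() and fail with
                         * -EBADR below.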
                         */
                        ret = -EBADR;
                        if (blkno) {
                                mlog_errno(ret);
                                ocfs2_error(inode->i_sb,
                                            "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
                                            cpos, clusters,
                                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
                                            (unsigned long long)blkno, i,
                                            (unsigned long long)le64_to_cpu(rec->e_blkno));
                                goto out_free;
                        }

                        blkno = le64_to_cpu(rec->e_blkno);
                }

                /*
                 * We don't support holes, and we're still up
                 * in the branches, so we'd better have found something.
                 */
                ret = -EBADR;
                if (!blkno) {
                        ocfs2_error(inode->i_sb,
                                    "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
                                    cpos, clusters,
                                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
                        mlog_errno(ret);
                        goto out_free;
                }

                if (eb_bh) {
                        brelse(eb_bh);
                        eb_bh = NULL;
                }
                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &eb_bh,
                                       OCFS2_BH_CACHED, inode);
                if (ret) {
                        mlog_errno(ret);
                        goto out_free;
                }
                eb = (struct ocfs2_extent_block *)eb_bh->b_data;
                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
                        ret = -EIO;
                        goto out_free;
                }
                el = &eb->h_list;
        }
        BUG_ON(el->l_tree_depth);

        for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
                rec = &el->l_recs[i];

                if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
                    OCFS2_I(inode)->ip_clusters) {
                        ret = -EBADR;
                        mlog_errno(ret);
                        ocfs2_error(inode->i_sb,
                                    "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
                                    i,
                                    (unsigned long long)le64_to_cpu(rec->e_blkno),
                                    (unsigned long long)OCFS2_I(inode)->ip_blkno,
                                    OCFS2_I(inode)->ip_clusters);
                        goto out_free;
                }

                ret = ocfs2_extent_map_insert(inode, rec,
                                              le16_to_cpu(el->l_tree_depth));
                if (ret && (ret != -EEXIST)) {
                        mlog_errno(ret);
                        goto out_free;
                }
        }

        ret = 0;

out_free:
        if (eb_bh)
                brelse(eb_bh);

        return ret;
}

/*
 * This lookup actually will read from disk.  It has one invariant:
 * it will never re-traverse blocks.  This means that all inserts should
 * be new regions or more granular regions (both allowed by insert).
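 *
 * The flow below is: check the in-memory map under ip_lock; a leaf
 * (tree_depth 0) hit is returned directly, while a branch hit tells us
 * which extent block to read.  If nothing is cached we start from the
 * dinode's own extent list.  Either way, the map is then filled via
 * ocfs2_extent_map_find_leaf() and the range is looked up again.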
 */
static int ocfs2_extent_map_lookup_read(struct inode *inode,
                                        u32 cpos, u32 clusters,
                                        struct ocfs2_extent_map_entry **ret_ent)
{
        int ret;
        u64 blkno;
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
        struct ocfs2_extent_map_entry *ent;
        struct buffer_head *bh = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_dinode *di;
        struct ocfs2_extent_list *el;

        spin_lock(&OCFS2_I(inode)->ip_lock);
        ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
        if (ent) {
                if (!ent->e_tree_depth) {
                        spin_unlock(&OCFS2_I(inode)->ip_lock);
                        *ret_ent = ent;
                        return 0;
                }
                blkno = le64_to_cpu(ent->e_rec.e_blkno);
                spin_unlock(&OCFS2_I(inode)->ip_lock);

                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
                                       OCFS2_BH_CACHED, inode);
                if (ret) {
                        mlog_errno(ret);
                        if (bh)
                                brelse(bh);
                        return ret;
                }
                eb = (struct ocfs2_extent_block *)bh->b_data;
                if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
                        OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
                        brelse(bh);
                        return -EIO;
                }
                el = &eb->h_list;
        } else {
                spin_unlock(&OCFS2_I(inode)->ip_lock);

                ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
                                       OCFS2_I(inode)->ip_blkno, &bh,
                                       OCFS2_BH_CACHED, inode);
                if (ret) {
                        mlog_errno(ret);
                        if (bh)
                                brelse(bh);
                        return ret;
                }
                di = (struct ocfs2_dinode *)bh->b_data;
                if (!OCFS2_IS_VALID_DINODE(di)) {
                        brelse(bh);
                        OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
                        return -EIO;
                }
                el = &di->id2.i_list;
        }

        ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
        brelse(bh);
        if (ret) {
                mlog_errno(ret);
                return ret;
        }

        ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
        if (!ent) {
                ret = -ESRCH;
                mlog_errno(ret);
                return ret;
        }

        /* FIXME: Make sure this isn't a corruption */
        BUG_ON(ent->e_tree_depth);

        *ret_ent = ent;

        return 0;
}

/*
 * Callers must hold ip_lock.  This can insert pieces of the tree,
 * thus racing lookup if the lock weren't held.
 */
static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
                                         struct ocfs2_extent_map_entry *ent)
{
        struct rb_node **p, *parent;
        struct ocfs2_extent_map_entry *old_ent;

        old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
                                          le32_to_cpu(ent->e_rec.e_clusters),
                                          &p, &parent);
        if (old_ent)
                return -EEXIST;

        rb_link_node(&ent->e_node, parent, p);
        rb_insert_color(&ent->e_node, &em->em_extents);

        return 0;
}

/*
 * Simple rule: on any return code other than -EAGAIN, anything left
 * in the insert_context will be freed.
 *
 * Simple rule #2: A return code of -EEXIST from this function or
 * its calls to ocfs2_extent_map_insert_entry() signifies that another
 * thread beat us to the insert.  It is not an actual error, but it
 * tells the caller we have no more work to do.
 */
static int ocfs2_extent_map_try_insert(struct inode *inode,
                                       struct ocfs2_extent_rec *rec,
                                       int tree_depth,
                                       struct ocfs2_em_insert_context *ctxt)
{
        int ret;
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
        struct ocfs2_extent_map_entry *old_ent;

        ctxt->need_left = 0;
        ctxt->need_right = 0;
        ctxt->old_ent = NULL;

        spin_lock(&OCFS2_I(inode)->ip_lock);
        ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
        if (!ret) {
                ctxt->new_ent = NULL;
                goto out_unlock;
        }

        /* Since insert_entry failed, the map MUST have old_ent */
        old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
                                          le32_to_cpu(rec->e_clusters),
                                          NULL, NULL);

        BUG_ON(!old_ent);

        if (old_ent->e_tree_depth < tree_depth) {
                /* Another thread beat us to the lower tree_depth */
                ret = -EEXIST;
                goto out_unlock;
        }

        if (old_ent->e_tree_depth == tree_depth) {
                /*
                 * Another thread beat us to this tree_depth.
                 * Let's make sure we agree with that thread (the
                 * extent_rec should be identical).
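                 * If the records do not match, we and the other thread
                 * disagree about the on-disk contents; for now that
                 * case is reported as -EEXIST as well (see the FIXME
                 * below).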
                 */
                if (!memcmp(rec, &old_ent->e_rec,
                            sizeof(struct ocfs2_extent_rec)))
                        ret = 0;
                else
                        /* FIXME: Should this be ESRCH/EBADR??? */
                        ret = -EEXIST;

                goto out_unlock;
        }

        /*
         * We do it in this order specifically so that no actual tree
         * changes occur until we have all the pieces we need.  We
         * don't want malloc failures to leave an inconsistent tree.
         * Whenever we drop the lock, another process could be
         * inserting.  Also note that, if another process just beat us
         * to an insert, we might not need the same pieces we needed
         * the first go round.  In the end, the pieces we need will
         * be used, and the pieces we don't will be freed.
         */
        ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
                             le32_to_cpu(old_ent->e_rec.e_cpos));
        ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
                               le32_to_cpu(old_ent->e_rec.e_clusters)) >
                              (le32_to_cpu(rec->e_cpos) +
                               le32_to_cpu(rec->e_clusters)));
        ret = -EAGAIN;
        if (ctxt->need_left) {
                if (!ctxt->left_ent)
                        goto out_unlock;
                *(ctxt->left_ent) = *old_ent;
                ctxt->left_ent->e_rec.e_clusters =
                        cpu_to_le32(le32_to_cpu(rec->e_cpos) -
                                    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
        }
        if (ctxt->need_right) {
                if (!ctxt->right_ent)
                        goto out_unlock;
                *(ctxt->right_ent) = *old_ent;
                ctxt->right_ent->e_rec.e_cpos =
                        cpu_to_le32(le32_to_cpu(rec->e_cpos) +
                                    le32_to_cpu(rec->e_clusters));
                ctxt->right_ent->e_rec.e_clusters =
                        cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
                                     le32_to_cpu(old_ent->e_rec.e_clusters)) -
                                    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
        }

        rb_erase(&old_ent->e_node, &em->em_extents);
        /* Now that he's erased, set him up for deletion */
        ctxt->old_ent = old_ent;

        if (ctxt->need_left) {
                ret = ocfs2_extent_map_insert_entry(em,
                                                    ctxt->left_ent);
                if (ret)
                        goto out_unlock;
                ctxt->left_ent = NULL;
        }

        if (ctxt->need_right) {
                ret = ocfs2_extent_map_insert_entry(em,
                                                    ctxt->right_ent);
                if (ret)
                        goto out_unlock;
                ctxt->right_ent = NULL;
        }

        ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);

        if (!ret)
                ctxt->new_ent = NULL;

out_unlock:
        spin_unlock(&OCFS2_I(inode)->ip_lock);

        return ret;
}

static int ocfs2_extent_map_insert(struct inode *inode,
                                   struct ocfs2_extent_rec *rec,
                                   int tree_depth)
{
        int ret;
        struct ocfs2_em_insert_context ctxt = {0, };

        if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
            OCFS2_I(inode)->ip_map.em_clusters) {
                ret = -EBADR;
                mlog_errno(ret);
                return ret;
        }

        /* Zero e_clusters means a truncated tail record.
         * It better be EOF */
        if (!rec->e_clusters) {
                if ((le32_to_cpu(rec->e_cpos) +
                     le32_to_cpu(rec->e_clusters)) !=
                    OCFS2_I(inode)->ip_map.em_clusters) {
                        ret = -EBADR;
                        mlog_errno(ret);
                        ocfs2_error(inode->i_sb,
                                    "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
                                    (unsigned long long)le64_to_cpu(rec->e_blkno),
                                    (unsigned long long)OCFS2_I(inode)->ip_blkno);
                        return ret;
                }

                /* Ignore the truncated tail */
                return 0;
        }

        ret = -ENOMEM;
        ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
                                        GFP_NOFS);
        if (!ctxt.new_ent) {
                mlog_errno(ret);
                return ret;
        }

        ctxt.new_ent->e_rec = *rec;
        ctxt.new_ent->e_tree_depth = tree_depth;

        do {
                ret = -ENOMEM;
                if (ctxt.need_left && !ctxt.left_ent) {
                        ctxt.left_ent =
                                kmem_cache_alloc(ocfs2_em_ent_cachep,
                                                 GFP_NOFS);
                        if (!ctxt.left_ent)
                                break;
                }
                if (ctxt.need_right && !ctxt.right_ent) {
                        ctxt.right_ent =
                                kmem_cache_alloc(ocfs2_em_ent_cachep,
                                                 GFP_NOFS);
                        if (!ctxt.right_ent)
                                break;
                }

                ret = ocfs2_extent_map_try_insert(inode, rec,
                                                  tree_depth, &ctxt);
        } while (ret == -EAGAIN);

        if ((ret < 0) && (ret != -EEXIST))
                mlog_errno(ret);

        if (ctxt.left_ent)
                kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
        if (ctxt.right_ent)
                kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
        if (ctxt.old_ent)
                kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
        if (ctxt.new_ent)
                kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);

        return ret;
}

/*
 * Append this record to the tail of the extent map.  It must be
 * tree_depth 0.  The record might be an extension of an existing
 * record, and as such that needs to be handled.  eg:
 *
 * Existing record in the extent map:
 *
 *      cpos = 10, len = 10
 *      |---------|
 *
 * New Record:
 *
 *      cpos = 10, len = 20
 *      |------------------|
 *
 * The passed record is the new on-disk record.  The new_clusters value
 * is how many clusters were added to the file.  If the append is a
 * contiguous append, the new_clusters has been added to
 * rec->e_clusters.  If the append is an entirely new extent, then
 * rec->e_clusters is == new_clusters.
 */
int ocfs2_extent_map_append(struct inode *inode,
                            struct ocfs2_extent_rec *rec,
                            u32 new_clusters)
{
        int ret;
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
        struct ocfs2_extent_map_entry *ent;
        struct ocfs2_extent_rec *old;

        BUG_ON(!new_clusters);
        BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);

        if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
                /*
                 * Size changed underneath us on disk.
                 * Drop any straddling records and update our idea of
                 * i_clusters.
                 */
                ocfs2_extent_map_drop(inode, em->em_clusters - 1);
                em->em_clusters = OCFS2_I(inode)->ip_clusters;
        }

        mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
                         le32_to_cpu(rec->e_clusters)) !=
                        (em->em_clusters + new_clusters),
                        "Inode %llu:\n"
                        "rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
                        "em->em_clusters = %u + new_clusters = %u = %u\n",
                        (unsigned long long)OCFS2_I(inode)->ip_blkno,
                        le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
                        le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
                        em->em_clusters, new_clusters,
                        em->em_clusters + new_clusters);

        em->em_clusters += new_clusters;

        ret = -ENOENT;
        if (le32_to_cpu(rec->e_clusters) > new_clusters) {
                /* This is a contiguous append */
                ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
                                              NULL, NULL);
                if (ent) {
                        old = &ent->e_rec;
                        BUG_ON((le32_to_cpu(rec->e_cpos) +
                                le32_to_cpu(rec->e_clusters)) !=
                               (le32_to_cpu(old->e_cpos) +
                                le32_to_cpu(old->e_clusters) +
                                new_clusters));
                        if (ent->e_tree_depth == 0) {
                                BUG_ON(le32_to_cpu(old->e_cpos) !=
                                       le32_to_cpu(rec->e_cpos));
                                BUG_ON(le64_to_cpu(old->e_blkno) !=
                                       le64_to_cpu(rec->e_blkno));
                                ret = 0;
                        }
                        /*
                         * Let non-leafs fall through as -ENOENT to
                         * force insertion of the new leaf.
                         */
                        le32_add_cpu(&old->e_clusters, new_clusters);
                }
        }

        if (ret == -ENOENT)
                ret = ocfs2_extent_map_insert(inode, rec, 0);
        if (ret < 0)
                mlog_errno(ret);
        return ret;
}

#if 0
/* Code here is included but defined out as it completes the extent
 * map API and may be used in the future. */

/*
 * Look up the record containing this cluster offset.  This record is
 * part of the extent map.  Do not free it.  Any changes you make to
 * it will be reflected in the extent map.  So, if your last extent
 * is (cpos = 10, clusters = 10) and you truncate the file by 5
 * clusters, you can do:
 *
 * ret = ocfs2_extent_map_get_rec(inode, orig_size - 5, &rec, NULL);
 * rec->e_clusters -= 5;
 *
 * The lookup does not read from disk.  If the map isn't filled in for
 * an entry, you won't find it.
 *
 * Also note that the returned record is valid until alloc_sem is
 * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
 */
int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
                             struct ocfs2_extent_rec **rec,
                             int *tree_depth)
{
        int ret = -ENOENT;
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
        struct ocfs2_extent_map_entry *ent;

        *rec = NULL;

        if (cpos >= OCFS2_I(inode)->ip_clusters)
                return -EINVAL;

        if (cpos >= em->em_clusters) {
                /*
                 * Size changed underneath us on disk.  Drop any
                 * straddling records and update our idea of
                 * i_clusters.
                 */
                ocfs2_extent_map_drop(inode, em->em_clusters - 1);
                em->em_clusters = OCFS2_I(inode)->ip_clusters;
        }

        ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
                                      NULL, NULL);

        if (ent) {
                *rec = &ent->e_rec;
                if (tree_depth)
                        *tree_depth = ent->e_tree_depth;
                ret = 0;
        }

        return ret;
}

int ocfs2_extent_map_get_clusters(struct inode *inode,
                                  u32 v_cpos, int count,
                                  u32 *p_cpos, int *ret_count)
{
        int ret;
        u32 coff, ccount;
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
        struct ocfs2_extent_map_entry *ent = NULL;

        *p_cpos = ccount = 0;

        if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
                return -EINVAL;

        if ((v_cpos + count) > em->em_clusters) {
                /*
                 * Size changed underneath us on disk.
                 * Drop any straddling records and update our idea of
                 * i_clusters.
                 */
                ocfs2_extent_map_drop(inode, em->em_clusters - 1);
                em->em_clusters = OCFS2_I(inode)->ip_clusters;
        }

        ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
        if (ret)
                return ret;

        if (ent) {
                /* We should never find ourselves straddling an interval */
                if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
                                                        v_cpos, count))
                        return -ESRCH;

                coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
                *p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
                                le64_to_cpu(ent->e_rec.e_blkno)) +
                          coff;

                if (ret_count)
                        *ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;

                return 0;
        }

        return -ENOENT;
}

#endif  /* 0 */

int ocfs2_extent_map_get_blocks(struct inode *inode,
                                u64 v_blkno, int count,
                                u64 *p_blkno, int *ret_count)
{
        int ret;
        u64 boff;
        u32 cpos, clusters;
        int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
        struct ocfs2_extent_map_entry *ent = NULL;
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
        struct ocfs2_extent_rec *rec;

        *p_blkno = 0;

        cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
        clusters = ocfs2_blocks_to_clusters(inode->i_sb,
                                            (u64)count + bpc - 1);
        if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
                ret = -EINVAL;
                mlog_errno(ret);
                return ret;
        }

        if ((cpos + clusters) > em->em_clusters) {
                /*
                 * Size changed underneath us on disk.  Drop any
                 * straddling records and update our idea of
                 * i_clusters.
                 */
                ocfs2_extent_map_drop(inode, em->em_clusters - 1);
                em->em_clusters = OCFS2_I(inode)->ip_clusters;
        }

        ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
        if (ret) {
                mlog_errno(ret);
                return ret;
        }

        if (ent) {
                rec = &ent->e_rec;

                /* We should never find ourselves straddling an interval */
                if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
                        ret = -ESRCH;
                        mlog_errno(ret);
                        return ret;
                }

                boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
                                                le32_to_cpu(rec->e_cpos));
                boff += (v_blkno & (u64)(bpc - 1));
                *p_blkno = le64_to_cpu(rec->e_blkno) + boff;

                if (ret_count) {
                        *ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
                                        le32_to_cpu(rec->e_clusters)) - boff;
                }

                return 0;
        }

        return -ENOENT;
}

int ocfs2_extent_map_init(struct inode *inode)
{
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;

        em->em_extents = RB_ROOT;
        em->em_clusters = 0;

        return 0;
}

/* Needs the lock */
static void __ocfs2_extent_map_drop(struct inode *inode,
                                    u32 new_clusters,
                                    struct rb_node **free_head,
                                    struct ocfs2_extent_map_entry **tail_ent)
{
        struct rb_node *node, *next;
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
        struct ocfs2_extent_map_entry *ent;

        *free_head = NULL;

        ent = NULL;
        node = rb_last(&em->em_extents);
        while (node) {
                next = rb_prev(node);

                ent = rb_entry(node, struct ocfs2_extent_map_entry,
                               e_node);
                if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
                        break;

                rb_erase(&ent->e_node, &em->em_extents);

                node->rb_right = *free_head;
                *free_head = node;

                ent = NULL;
                node = next;
        }

        /* Do we have an entry straddling new_clusters? */
        if (tail_ent) {
                if (ent &&
                    ((le32_to_cpu(ent->e_rec.e_cpos) +
                      le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
                        *tail_ent = ent;
                else
                        *tail_ent = NULL;
        }
}

static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
{
        struct rb_node *node;
        struct ocfs2_extent_map_entry *ent;

        while (free_head) {
                node = free_head;
                free_head = node->rb_right;

                ent = rb_entry(node, struct ocfs2_extent_map_entry,
                               e_node);
                kmem_cache_free(ocfs2_em_ent_cachep, ent);
        }
}

/*
 * Remove all entries past new_clusters, inclusive of an entry that
 * contains new_clusters.  This is effectively a cache forget.
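 *
 * For example, with cached entries covering clusters [0, 10) and
 * [10, 20), a drop at new_clusters = 15 removes the [10, 20) entry
 * entirely, while ocfs2_extent_map_trunc() would instead clip it to
 * [10, 15).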
 *
 * If you want to also clip the last extent by some number of clusters,
 * you need to call ocfs2_extent_map_trunc().
 * This code does not check or modify ip_clusters.
 */
int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
{
        struct rb_node *free_head = NULL;
        struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
        struct ocfs2_extent_map_entry *ent;

        spin_lock(&OCFS2_I(inode)->ip_lock);

        __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);

        if (ent) {
                rb_erase(&ent->e_node, &em->em_extents);
                ent->e_node.rb_right = free_head;
                free_head = &ent->e_node;
        }

        spin_unlock(&OCFS2_I(inode)->ip_lock);

        if (free_head)
                __ocfs2_extent_map_drop_cleanup(free_head);

        return 0;
}

/*
 * Remove all entries past new_clusters and also clip any extent
 * straddling new_clusters, if there is one.  This does not check
 * or modify ip_clusters.
 */
int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
{
        struct rb_node *free_head = NULL;
        struct ocfs2_extent_map_entry *ent = NULL;

        spin_lock(&OCFS2_I(inode)->ip_lock);

        __ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);

        if (ent)
                ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
                                        le32_to_cpu(ent->e_rec.e_cpos));

        OCFS2_I(inode)->ip_map.em_clusters = new_clusters;

        spin_unlock(&OCFS2_I(inode)->ip_lock);

        if (free_head)
                __ocfs2_extent_map_drop_cleanup(free_head);

        return 0;
}

int __init init_ocfs2_extent_maps(void)
{
        ocfs2_em_ent_cachep =
                kmem_cache_create("ocfs2_em_ent",
                                  sizeof(struct ocfs2_extent_map_entry),
                                  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!ocfs2_em_ent_cachep)
                return -ENOMEM;

        return 0;
}

void exit_ocfs2_extent_maps(void)
{
        kmem_cache_destroy(ocfs2_em_ent_cachep);
}