From 9e69296999362c4e4b2821b64389b47e86e4821b Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Sat, 30 Jul 2011 20:52:41 -0400 Subject: pnfsblock: basic extent code Adds structures and basic create/delete code for extents. Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Zhang Jingwang Signed-off-by: Benny Halevy Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/extents.c | 89 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 fs/nfs/blocklayout/extents.c (limited to 'fs/nfs/blocklayout/extents.c') diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c new file mode 100644 index 000000000000..d0ca7604d33e --- /dev/null +++ b/fs/nfs/blocklayout/extents.c @@ -0,0 +1,89 @@ +/* + * linux/fs/nfs/blocklayout/blocklayout.h + * + * Module for the NFSv4.1 pNFS block layout driver. + * + * Copyright (c) 2006 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Fred Isaman + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. 
the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include "blocklayout.h" +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void print_bl_extent(struct pnfs_block_extent *be) +{ + dprintk("PRINT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); + dprintk(" be_length %llu\n", (u64)be->be_length); + dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); + dprintk(" be_state %d\n", be->be_state); + } +} + +static void +destroy_extent(struct kref *kref) +{ + struct pnfs_block_extent *be; + + be = container_of(kref, struct pnfs_block_extent, be_refcnt); + dprintk("%s be=%p\n", __func__, be); + kfree(be); +} + +void +bl_put_extent(struct pnfs_block_extent *be) +{ + if (be) { + dprintk("%s enter %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_put(&be->be_refcnt, destroy_extent); + } +} + +struct pnfs_block_extent *bl_alloc_extent(void) +{ + struct pnfs_block_extent *be; + + be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); + if (!be) + return NULL; + INIT_LIST_HEAD(&be->be_node); + kref_init(&be->be_refcnt); + be->be_inval = NULL; + return be; +} + +static void print_elist(struct list_head *list) +{ + struct pnfs_block_extent *be; + dprintk("****************\n"); + dprintk("Extent list looks like:\n"); + list_for_each_entry(be, list, be_node) { + print_bl_extent(be); + } + dprintk("****************\n"); +} -- cgit v1.2.3 From 03341d2cc91c700fc38883e572043a6a8f17dd5c Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Sat, 30 Jul 2011 20:52:45 -0400 Subject: pnfsblock: merge extents Replace a stub, so that extents underlying the layouts are properly added, merged, or ignored as necessary. 
Signed-off-by: Fred Isaman [pnfsblock: delete the new node before put it] Signed-off-by: Mingyang Guo Signed-off-by: Benny Halevy Signed-off-by: Peng Tao Signed-off-by: Benny Halevy Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.h | 13 +++++ fs/nfs/blocklayout/extents.c | 106 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) (limited to 'fs/nfs/blocklayout/extents.c') diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 3e05b08d5347..581d8f47a723 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -80,6 +80,14 @@ enum extentclass4 { EXTENT_LISTS = 2, }; +static inline int bl_choose_list(enum exstate4 state) +{ + if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) + return RO_EXTENT; + else + return RW_EXTENT; +} + struct pnfs_block_layout { struct pnfs_layout_hdr bl_layout; struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ @@ -137,5 +145,10 @@ int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, /* blocklayoutdm.c */ void bl_free_block_dev(struct pnfs_block_dev *bdev); +/* extents.c */ void bl_put_extent(struct pnfs_block_extent *be); +struct pnfs_block_extent *bl_alloc_extent(void); +int bl_add_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new); + #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index d0ca7604d33e..ee4891f32492 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -87,3 +87,109 @@ static void print_elist(struct list_head *list) } dprintk("****************\n"); } + +static inline int +extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) +{ + /* Note this assumes new->be_f_offset >= old->be_f_offset */ + return (new->be_state == old->be_state) && + ((new->be_state == PNFS_BLOCK_NONE_DATA) || + ((new->be_v_offset - old->be_v_offset == 
+ new->be_f_offset - old->be_f_offset) && + new->be_mdev == old->be_mdev)); +} + +/* Adds new to appropriate list in bl, modifying new and removing existing + * extents as appropriate to deal with overlaps. + * + * See bl_find_get_extent for list constraints. + * + * Refcount on new is already set. If end up not using it, or error out, + * need to put the reference. + * + * bl->bl_ext_lock is held by caller. + */ +int +bl_add_merge_extent(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new) +{ + struct pnfs_block_extent *be, *tmp; + sector_t end = new->be_f_offset + new->be_length; + struct list_head *list; + + dprintk("%s enter with be=%p\n", __func__, new); + print_bl_extent(new); + list = &bl->bl_extents[bl_choose_list(new->be_state)]; + print_elist(list); + + /* Scan for proper place to insert, extending new to the left + * as much as possible. + */ + list_for_each_entry_safe(be, tmp, list, be_node) { + if (new->be_f_offset < be->be_f_offset) + break; + if (end <= be->be_f_offset + be->be_length) { + /* new is a subset of existing be*/ + if (extents_consistent(be, new)) { + dprintk("%s: new is subset, ignoring\n", + __func__); + bl_put_extent(new); + return 0; + } else + goto out_err; + } else if (new->be_f_offset <= + be->be_f_offset + be->be_length) { + /* new overlaps or abuts existing be */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + new->be_length += new->be_f_offset - + be->be_f_offset; + new->be_f_offset = be->be_f_offset; + new->be_v_offset = be->be_v_offset; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else if (new->be_f_offset != + be->be_f_offset + be->be_length) + goto out_err; + } + } + /* Note that if we never hit the above break, be will not point to a + * valid extent. However, in that case &be->be_node==list. 
+ */ + list_add_tail(&new->be_node, &be->be_node); + dprintk("%s: inserting new\n", __func__); + print_elist(list); + /* Scan forward for overlaps. If we find any, extend new and + * remove the overlapped extent. + */ + be = list_prepare_entry(new, list, be_node); + list_for_each_entry_safe_continue(be, tmp, list, be_node) { + if (end < be->be_f_offset) + break; + /* new overlaps or abuts existing be */ + if (extents_consistent(be, new)) { + if (end < be->be_f_offset + be->be_length) { + /* extend new to fully cover be */ + end = be->be_f_offset + be->be_length; + new->be_length = end - new->be_f_offset; + } + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else if (end != be->be_f_offset) { + list_del(&new->be_node); + goto out_err; + } + } + dprintk("%s: after merging\n", __func__); + print_elist(list); + /* FIXME - The per-list consistency checks have all been done, + * should now check cross-list consistency. + */ + return 0; + + out_err: + bl_put_extent(new); + return -EIO; +} -- cgit v1.2.3 From 6d742ba538f98164f3c5e05cdcadb4ec6ddf504f Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Sat, 30 Jul 2011 20:52:48 -0400 Subject: pnfsblock: bl_find_get_extent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement bl_find_get_extent(), one of the core extent manipulation routines. 
[pnfsblock: Lookup list entry of layouts and tags in reverse order] Signed-off-by: Zhang Jingwang Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Jim Rees pnfsblock: fix print format warnings for sector_t and size_t gcc spews warnings about these on x86_64, e.g.: fs/nfs/blocklayout/blocklayout.c:74: warning: format ‘%Lu’ expects type ‘long long unsigned int’, but argument 2 has type ‘sector_t’ fs/nfs/blocklayout/blocklayout.c:388: warning: format ‘%d’ expects type ‘int’, but argument 5 has type ‘size_t’ Signed-off-by: Benny Halevy Signed-off-by: Benny Halevy Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.h | 3 +++ fs/nfs/blocklayout/extents.c | 47 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'fs/nfs/blocklayout/extents.c') diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index d645880f61a0..3e1b5fc152d7 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -153,6 +153,9 @@ int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, void bl_free_block_dev(struct pnfs_block_dev *bdev); /* extents.c */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read); void bl_put_extent(struct pnfs_block_extent *be); struct pnfs_block_extent *bl_alloc_extent(void); int bl_add_merge_extent(struct pnfs_block_layout *bl, diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index ee4891f32492..8fa93e23cb24 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -193,3 +193,50 @@ bl_add_merge_extent(struct pnfs_block_layout *bl, bl_put_extent(new); return -EIO; } + +/* Returns extent, or NULL. If a second READ extent exists, it is returned + * in cow_read, if given. + * + * The extents are kept in two seperate ordered lists, one for READ and NONE, + * one for READWRITE and INVALID. 
Within each list, we assume: + * 1. Extents are ordered by file offset. + * 2. For any given isect, there is at most one extent that matches. + */ +struct pnfs_block_extent * +bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent **cow_read) +{ + struct pnfs_block_extent *be, *cow, *ret; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + cow = ret = NULL; + spin_lock(&bl->bl_ext_lock); + for (i = 0; i < EXTENT_LISTS; i++) { + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + if (!ret) + ret = be; + else if (be->be_state != PNFS_BLOCK_READ_DATA) + bl_put_extent(be); + else + cow = be; + break; + } + } + if (ret && + (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) + break; + } + spin_unlock(&bl->bl_ext_lock); + if (cow_read) + *cow_read = cow; + print_bl_extent(ret); + return ret; +} -- cgit v1.2.3 From c1c2a4cd352269f1fb585b4a5c63abe24dd946c6 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Sat, 30 Jul 2011 20:52:49 -0400 Subject: pnfsblock: add extent manipulation functions Adds working implementations of various support functions to handle INVAL extents, needed by writes, such as bl_mark_sectors_init and bl_is_sector_init. 
[pnfsblock: fix 64-bit compiler warnings for extent manipulation] Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Benny Halevy [Implement release_inval_marks] Signed-off-by: Zhang Jingwang Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 7 +- fs/nfs/blocklayout/blocklayout.h | 30 ++++- fs/nfs/blocklayout/extents.c | 253 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 287 insertions(+), 3 deletions(-) (limited to 'fs/nfs/blocklayout/extents.c') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 6cd7f4f3acdb..8c29a189f09b 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -78,10 +78,15 @@ release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) spin_unlock(&bl->bl_ext_lock); } -/* STUB */ static void release_inval_marks(struct pnfs_inval_markings *marks) { + struct pnfs_inval_tracking *pos, *temp; + + list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { + list_del(&pos->it_link); + kfree(pos); + } return; } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 3e1b5fc152d7..fcf47b55b5ce 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -38,6 +38,9 @@ #include "../pnfs.h" +#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT) +#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) + struct block_mount_id { spinlock_t bm_lock; /* protects list */ struct list_head bm_devlist; /* holds pnfs_block_dev */ @@ -56,8 +59,23 @@ enum exstate4 { PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ }; +#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ + +struct my_tree { + sector_t mtt_step_size; /* Internal sector alignment */ + struct list_head mtt_stub; /* Should be a radix tree */ +}; + struct pnfs_inval_markings { - /* STUB */ + spinlock_t im_lock; + struct my_tree im_tree; /* 
Sectors that need LAYOUTCOMMIT */ + sector_t im_block_size; /* Server blocksize in sectors */ +}; + +struct pnfs_inval_tracking { + struct list_head it_link; + int it_sector; + int it_tags; }; /* sector_t fields are all in 512-byte sectors */ @@ -76,7 +94,11 @@ struct pnfs_block_extent { static inline void BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) { - /* STUB */ + spin_lock_init(&marks->im_lock); + INIT_LIST_HEAD(&marks->im_tree.mtt_stub); + marks->im_block_size = blocksize; + marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, + blocksize); } enum extentclass4 { @@ -156,8 +178,12 @@ void bl_free_block_dev(struct pnfs_block_dev *bdev); struct pnfs_block_extent * bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, struct pnfs_block_extent **cow_read); +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length, + sector_t **pages); void bl_put_extent(struct pnfs_block_extent *be); struct pnfs_block_extent *bl_alloc_extent(void); +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); int bl_add_merge_extent(struct pnfs_block_layout *bl, struct pnfs_block_extent *new); diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 8fa93e23cb24..473faee9cdef 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -33,6 +33,259 @@ #include "blocklayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD +/* Bit numbers */ +#define EXTENT_INITIALIZED 0 +#define EXTENT_WRITTEN 1 +#define EXTENT_IN_COMMIT 2 +#define INTERNAL_EXISTS MY_MAX_TAGS +#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) + +/* Returns largest t<=s s.t. 
t%base==0 */ +static inline sector_t normalize(sector_t s, int base) +{ + sector_t tmp = s; /* Since do_div modifies its argument */ + return s - do_div(tmp, base); +} + +static inline sector_t normalize_up(sector_t s, int base) +{ + return normalize(s + base - 1, base); +} + +/* Complete stub using list while determine API wanted */ + +/* Returns tags, or negative */ +static int32_t _find_entry(struct my_tree *tree, u64 s) +{ + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu) enter\n", __func__, s); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) + return pos->it_tags & INTERNAL_MASK; + else + break; + } + return -ENOENT; +} + +static inline +int _has_tag(struct my_tree *tree, u64 s, int32_t tag) +{ + int32_t tags; + + dprintk("%s(%llu, %i) enter\n", __func__, s, tag); + s = normalize(s, tree->mtt_step_size); + tags = _find_entry(tree, s); + if ((tags < 0) || !(tags & (1 << tag))) + return 0; + else + return 1; +} + +/* Creates entry with tag, or if entry already exists, unions tag to it. + * If storage is not NULL, newly created entry will use it. + * Returns number of entries added, or negative on error. 
+ */ +static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, + struct pnfs_inval_tracking *storage) +{ + int found = 0; + struct pnfs_inval_tracking *pos; + + dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector > s) + continue; + else if (pos->it_sector == s) { + found = 1; + break; + } else + break; + } + if (found) { + pos->it_tags |= (1 << tag); + return 0; + } else { + struct pnfs_inval_tracking *new; + if (storage) + new = storage; + else { + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return -ENOMEM; + } + new->it_sector = s; + new->it_tags = (1 << tag); + list_add(&new->it_link, &pos->it_link); + return 1; + } +} + +/* XXXX Really want option to not create */ +/* Over range, unions tag with existing entries, else creates entry with tag */ +static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) +{ + u64 i; + + dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); + for (i = normalize(s, tree->mtt_step_size); i < s + length; + i += tree->mtt_step_size) + if (_add_entry(tree, i, tag, NULL)) + return -ENOMEM; + return 0; +} + +/* Ensure that future operations on given range of tree will not malloc */ +static int _preload_range(struct my_tree *tree, u64 offset, u64 length) +{ + u64 start, end, s; + int count, i, used = 0, status = -ENOMEM; + struct pnfs_inval_tracking **storage; + + dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); + start = normalize(offset, tree->mtt_step_size); + end = normalize_up(offset + length, tree->mtt_step_size); + count = (int)(end - start) / (int)tree->mtt_step_size; + + /* Pre-malloc what memory we might need */ + storage = kmalloc(sizeof(*storage) * count, GFP_NOFS); + if (!storage) + return -ENOMEM; + for (i = 0; i < count; i++) { + storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), + GFP_NOFS); + if (!storage[i]) + goto out_cleanup; + } + + /* Now need lock - HOW??? 
*/ + + for (s = start; s < end; s += tree->mtt_step_size) + used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); + + /* Unlock - HOW??? */ + status = 0; + + out_cleanup: + for (i = used; i < count; i++) { + if (!storage[i]) + break; + kfree(storage[i]); + } + kfree(storage); + return status; +} + +static void set_needs_init(sector_t *array, sector_t offset) +{ + sector_t *p = array; + + dprintk("%s enter\n", __func__); + if (!p) + return; + while (*p < offset) + p++; + if (*p == offset) + return; + else if (*p == ~0) { + *p++ = offset; + *p = ~0; + return; + } else { + sector_t *save = p; + dprintk("%s Adding %llu\n", __func__, (u64)offset); + while (*p != ~0) + p++; + p++; + memmove(save + 1, save, (char *)p - (char *)save); + *save = offset; + return; + } +} + +/* We are relying on page lock to serialize this */ +int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) +{ + int rv; + + spin_lock(&marks->im_lock); + rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); + spin_unlock(&marks->im_lock); + return rv; +} + +/* Marks sectors in [offset, offset+length) as having been initialized. + * All lengths are step-aligned, where step is min(pagesize, blocksize). + * Notes where partial block is initialized, and helps prepare it for + * complete initialization later. 
+ */ +/* Currently assumes offset is page-aligned */ +int bl_mark_sectors_init(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length, + sector_t **pages) +{ + sector_t s, start, end; + sector_t *array = NULL; /* Pages to mark */ + + dprintk("%s(offset=%llu,len=%llu) enter\n", + __func__, (u64)offset, (u64)length); + s = max((sector_t) 3, + 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS))); + dprintk("%s set max=%llu\n", __func__, (u64)s); + if (pages) { + array = kmalloc(s * sizeof(sector_t), GFP_NOFS); + if (!array) + goto outerr; + array[0] = ~0; + } + + start = normalize(offset, marks->im_block_size); + end = normalize_up(offset + length, marks->im_block_size); + if (_preload_range(&marks->im_tree, start, end - start)) + goto outerr; + + spin_lock(&marks->im_lock); + + for (s = normalize_up(start, PAGE_CACHE_SECTORS); + s < offset; s += PAGE_CACHE_SECTORS) { + dprintk("%s pre-area pages\n", __func__); + /* Portion of used block is not initialized */ + if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) + set_needs_init(array, s); + } + if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) + goto out_unlock; + for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); + s < end; s += PAGE_CACHE_SECTORS) { + dprintk("%s post-area pages\n", __func__); + if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED)) + set_needs_init(array, s); + } + + spin_unlock(&marks->im_lock); + + if (pages) { + if (array[0] == ~0) { + kfree(array); + *pages = NULL; + } else + *pages = array; + } + return 0; + + out_unlock: + spin_unlock(&marks->im_lock); + outerr: + if (pages) { + kfree(array); + *pages = NULL; + } + return -ENOMEM; +} + static void print_bl_extent(struct pnfs_block_extent *be) { dprintk("PRINT EXTENT extent %p\n", be); -- cgit v1.2.3 From 9f3770422c771da32c1d14e650c695eec27dbd1d Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Sat, 30 Jul 2011 20:52:50 -0400 Subject: pnfsblock: merge rw extents Signed-off-by: Fred Isaman 
Signed-off-by: Benny Halevy Signed-off-by: Benny Halevy Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/extents.c | 47 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'fs/nfs/blocklayout/extents.c') diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 473faee9cdef..292aadfd4d46 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -493,3 +493,50 @@ bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, print_bl_extent(ret); return ret; } + +/* Helper function to set_to_rw that initialize a new extent */ +static void +_prep_new_extent(struct pnfs_block_extent *new, + struct pnfs_block_extent *orig, + sector_t offset, sector_t length, int state) +{ + kref_init(&new->be_refcnt); + /* don't need to INIT_LIST_HEAD(&new->be_node) */ + memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); + new->be_mdev = orig->be_mdev; + new->be_f_offset = offset; + new->be_length = length; + new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; + new->be_state = state; + new->be_inval = orig->be_inval; +} + +/* Tries to merge be with extent in front of it in list. + * Frees storage if not used. 
+ */ +static struct pnfs_block_extent * +_front_merge(struct pnfs_block_extent *be, struct list_head *head, + struct pnfs_block_extent *storage) +{ + struct pnfs_block_extent *prev; + + if (!storage) + goto no_merge; + if (&be->be_node == head || be->be_node.prev == head) + goto no_merge; + prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); + if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || + !extents_consistent(prev, be)) + goto no_merge; + _prep_new_extent(storage, prev, prev->be_f_offset, + prev->be_length + be->be_length, prev->be_state); + list_replace(&prev->be_node, &storage->be_node); + bl_put_extent(prev); + list_del(&be->be_node); + bl_put_extent(be); + return storage; + + no_merge: + kfree(storage); + return be; +} -- cgit v1.2.3 From 90ace12ac42f65d1f077c5ef5ec2efafdcac338f Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Sat, 30 Jul 2011 20:52:51 -0400 Subject: pnfsblock: encode_layoutcommit In blocklayout driver. There are two things happening while layoutcommit/cleanup. 1. the modified extents are encoded. 2. On cleanup the extents are put back on the layout rw extents list, for reads. In the new system where actual xdr encoding is done in encode_layoutcommit() directly into xdr buffer, these are the new commit stages: 1. On setup_layoutcommit, the range is adjusted as before and a structure is allocated for communication with bl_encode_layoutcommit && bl_cleanup_layoutcommit (Generic layer provides a void-star to hang it on) 2. bl_encode_layoutcommit is called to do the actual encoding directly into xdr. The commit-extent-list is not freed and is stored on above structure. FIXME: The code is not yet converted to the new XDR cleanup 3. On cleanup the commit-extent-list is put back by a call to set_to_rw() as before, but with no need for XDR decoding of the list as before. And the commit-extent-list is freed. Finally allocated structure is freed. 
[rm inode and pnfs_layout_hdr args from cleanup_layoutcommit()] [pnfsblock: get rid of deprecated xdr macros] Signed-off-by: Jim Rees Signed-off-by: Peng Tao Signed-off-by: Fred Isaman [blocklayout: encode_layoutcommit implementation] Signed-off-by: Boaz Harrosh [pnfsblock: fix bug setting up layoutcommit.] Signed-off-by: Tao Guo [pnfsblock: prevent commit list corruption] [pnfsblock: fix layoutcommit with an empty opaque] Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Benny Halevy Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 2 + fs/nfs/blocklayout/blocklayout.h | 12 +++ fs/nfs/blocklayout/extents.c | 176 +++++++++++++++++++++++++++++---------- 3 files changed, 146 insertions(+), 44 deletions(-) (limited to 'fs/nfs/blocklayout/extents.c') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 8c29a189f09b..d096835cfd6b 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -155,6 +155,8 @@ static void bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *arg) { + dprintk("%s enter\n", __func__); + encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); } static void diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index fcf47b55b5ce..3caaefce85a5 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -91,6 +91,15 @@ struct pnfs_block_extent { struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ }; +/* Shortened extent used by LAYOUTCOMMIT */ +struct pnfs_block_short_extent { + struct list_head bse_node; + struct nfs4_deviceid bse_devid; + struct block_device *bse_mdev; + sector_t bse_f_offset; /* the starting offset in the file */ + sector_t bse_length; /* the size of the extent */ +}; + static inline void BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) { @@ 
-184,6 +193,9 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, void bl_put_extent(struct pnfs_block_extent *be); struct pnfs_block_extent *bl_alloc_extent(void); int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); +int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg); int bl_add_merge_extent(struct pnfs_block_layout *bl, struct pnfs_block_extent *new); diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 292aadfd4d46..84bf24087720 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -286,6 +286,49 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks, return -ENOMEM; } +/* Marks sectors in [offset, offset+length) as having been written to disk. + * All lengths should be block aligned. + */ +static int mark_written_sectors(struct pnfs_inval_markings *marks, + sector_t offset, sector_t length) +{ + int status; + + dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, + (u64)offset, (u64)length); + spin_lock(&marks->im_lock); + status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); + spin_unlock(&marks->im_lock); + return status; +} + +static void print_short_extent(struct pnfs_block_short_extent *be) +{ + dprintk("PRINT SHORT EXTENT extent %p\n", be); + if (be) { + dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); + dprintk(" be_length %llu\n", (u64)be->bse_length); + } +} + +static void print_clist(struct list_head *list, unsigned int count) +{ + struct pnfs_block_short_extent *be; + unsigned int i = 0; + + ifdebug(FACILITY) { + printk(KERN_DEBUG "****************\n"); + printk(KERN_DEBUG "Extent list looks like:\n"); + list_for_each_entry(be, list, bse_node) { + i++; + print_short_extent(be); + } + if (i != count) + printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); + printk(KERN_DEBUG "****************\n"); + } +} + static void print_bl_extent(struct 
pnfs_block_extent *be) { dprintk("PRINT EXTENT extent %p\n", be); @@ -378,65 +421,67 @@ bl_add_merge_extent(struct pnfs_block_layout *bl, /* Scan for proper place to insert, extending new to the left * as much as possible. */ - list_for_each_entry_safe(be, tmp, list, be_node) { - if (new->be_f_offset < be->be_f_offset) + list_for_each_entry_safe_reverse(be, tmp, list, be_node) { + if (new->be_f_offset >= be->be_f_offset + be->be_length) break; - if (end <= be->be_f_offset + be->be_length) { - /* new is a subset of existing be*/ + if (new->be_f_offset >= be->be_f_offset) { + if (end <= be->be_f_offset + be->be_length) { + /* new is a subset of existing be*/ + if (extents_consistent(be, new)) { + dprintk("%s: new is subset, ignoring\n", + __func__); + bl_put_extent(new); + return 0; + } else { + goto out_err; + } + } else { + /* |<-- be -->| + * |<-- new -->| */ + if (extents_consistent(be, new)) { + /* extend new to fully replace be */ + new->be_length += new->be_f_offset - + be->be_f_offset; + new->be_f_offset = be->be_f_offset; + new->be_v_offset = be->be_v_offset; + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { + goto out_err; + } + } + } else if (end >= be->be_f_offset + be->be_length) { + /* new extent overlap existing be */ if (extents_consistent(be, new)) { - dprintk("%s: new is subset, ignoring\n", - __func__); - bl_put_extent(new); - return 0; - } else + /* extend new to fully replace be */ + dprintk("%s: removing %p\n", __func__, be); + list_del(&be->be_node); + bl_put_extent(be); + } else { goto out_err; - } else if (new->be_f_offset <= - be->be_f_offset + be->be_length) { - /* new overlaps or abuts existing be */ - if (extents_consistent(be, new)) { + } + } else if (end > be->be_f_offset) { + /* |<-- be -->| + *|<-- new -->| */ + if (extents_consistent(new, be)) { /* extend new to fully replace be */ - new->be_length += new->be_f_offset - - be->be_f_offset; - new->be_f_offset = be->be_f_offset; - 
new->be_v_offset = be->be_v_offset; + new->be_length += be->be_f_offset + be->be_length - + new->be_f_offset - new->be_length; dprintk("%s: removing %p\n", __func__, be); list_del(&be->be_node); bl_put_extent(be); - } else if (new->be_f_offset != - be->be_f_offset + be->be_length) + } else { goto out_err; + } } } /* Note that if we never hit the above break, be will not point to a * valid extent. However, in that case &be->be_node==list. */ - list_add_tail(&new->be_node, &be->be_node); + list_add(&new->be_node, &be->be_node); dprintk("%s: inserting new\n", __func__); print_elist(list); - /* Scan forward for overlaps. If we find any, extend new and - * remove the overlapped extent. - */ - be = list_prepare_entry(new, list, be_node); - list_for_each_entry_safe_continue(be, tmp, list, be_node) { - if (end < be->be_f_offset) - break; - /* new overlaps or abuts existing be */ - if (extents_consistent(be, new)) { - if (end < be->be_f_offset + be->be_length) { - /* extend new to fully cover be */ - end = be->be_f_offset + be->be_length; - new->be_length = end - new->be_f_offset; - } - dprintk("%s: removing %p\n", __func__, be); - list_del(&be->be_node); - bl_put_extent(be); - } else if (end != be->be_f_offset) { - list_del(&new->be_node); - goto out_err; - } - } - dprintk("%s: after merging\n", __func__); - print_elist(list); /* FIXME - The per-list consistency checks have all been done, * should now check cross-list consistency. */ @@ -494,6 +539,49 @@ bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, return ret; } +int +encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + struct xdr_stream *xdr, + const struct nfs4_layoutcommit_args *arg) +{ + struct pnfs_block_short_extent *lce, *save; + unsigned int count = 0; + __be32 *p, *xdr_start; + + dprintk("%s enter\n", __func__); + /* BUG - creation of bl_commit is buggy - need to wait for + * entire block to be marked WRITTEN before it can be added. 
+ */ + spin_lock(&bl->bl_ext_lock); + /* Want to adjust for possible truncate */ + /* We now want to adjust argument range */ + + /* XDR encode the ranges found */ + xdr_start = xdr_reserve_space(xdr, 8); + if (!xdr_start) + goto out; + list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { + p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); + if (!p) + break; + p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); + p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); + p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); + p = xdr_encode_hyper(p, 0LL); + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); + list_del(&lce->bse_node); + list_add_tail(&lce->bse_node, &bl->bl_committing); + bl->bl_count--; + count++; + } + xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); + xdr_start[1] = cpu_to_be32(count); +out: + spin_unlock(&bl->bl_ext_lock); + dprintk("%s found %i ranges\n", __func__, count); + return 0; +} + /* Helper function to set_to_rw that initialize a new extent */ static void _prep_new_extent(struct pnfs_block_extent *new, -- cgit v1.2.3 From b2be7811dd94816f3df76708c8eb7f55bf7289e2 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Sat, 30 Jul 2011 20:52:52 -0400 Subject: pnfsblock: cleanup_layoutcommit In blocklayout driver. There are two things happening while layoutcommit/cleanup. 1. the modified extents are encoded. 2. On cleanup the extents are put back on the layout rw extents list, for reads. In the new system where actual xdr encoding is done in encode_layoutcommit() directly into xdr buffer, these are the new commit stages: 1. On setup_layoutcommit, the range is adjusted as before and a structure is allocated for communication with bl_encode_layoutcommit && bl_cleanup_layoutcommit (Generic layer provides a void-star to hang it on) 2. bl_encode_layoutcommit is called to do the actual encoding directly into xdr. The commit-extent-list is not freed and is stored on above structure. 
FIXME: The code is not yet converted to the new XDR cleanup 3. On cleanup the commit-extent-list is put back by a call to set_to_rw() as before, but with no need for XDR decoding of the list as before. And the commit-extent-list is freed. Finally allocated structure is freed. [rm inode and pnfs_layout_hdr args from cleanup_layoutcommit()] Signed-off-by: Jim Rees [pnfsblock: introduce bl_committing list] Signed-off-by: Peng Tao [pnfsblock: SQUASHME: adjust to API change] Signed-off-by: Fred Isaman [blocklayout: encode_layoutcommit implementation] Signed-off-by: Boaz Harrosh [pnfsblock: fix bug setting up layoutcommit.] Signed-off-by: Tao Guo [pnfsblock: cleanup_layoutcommit wants a status parameter] Signed-off-by: Boaz Harrosh Signed-off-by: Benny Halevy Signed-off-by: Benny Halevy Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 4 + fs/nfs/blocklayout/blocklayout.h | 3 + fs/nfs/blocklayout/extents.c | 210 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 217 insertions(+) (limited to 'fs/nfs/blocklayout/extents.c') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d096835cfd6b..6c1bafb8920b 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -162,6 +162,10 @@ bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, static void bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) { + struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; + + dprintk("%s enter\n", __func__); + clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); } static void free_blk_mountid(struct block_mount_id *mid) diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 3caaefce85a5..6a703b79c33d 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -196,6 +196,9 @@ int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); int 
encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *arg); +void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status); int bl_add_merge_extent(struct pnfs_block_layout *bl, struct pnfs_block_extent *new); diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 84bf24087720..7521940dcca5 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -329,6 +329,73 @@ static void print_clist(struct list_head *list, unsigned int count) } } +/* Note: In theory, we should do more checking that devid's match between + * old and new, but if they don't, the lists are too corrupt to salvage anyway. + */ +/* Note this is very similar to bl_add_merge_extent */ +static void add_to_commitlist(struct pnfs_block_layout *bl, + struct pnfs_block_short_extent *new) +{ + struct list_head *clist = &bl->bl_commit; + struct pnfs_block_short_extent *old, *save; + sector_t end = new->bse_f_offset + new->bse_length; + + dprintk("%s enter\n", __func__); + print_short_extent(new); + print_clist(clist, bl->bl_count); + bl->bl_count++; + /* Scan for proper place to insert, extending new to the left + * as much as possible. + */ + list_for_each_entry_safe(old, save, clist, bse_node) { + if (new->bse_f_offset < old->bse_f_offset) + break; + if (end <= old->bse_f_offset + old->bse_length) { + /* Range is already in list */ + bl->bl_count--; + kfree(new); + return; + } else if (new->bse_f_offset <= + old->bse_f_offset + old->bse_length) { + /* new overlaps or abuts existing be */ + if (new->bse_mdev == old->bse_mdev) { + /* extend new to fully replace old */ + new->bse_length += new->bse_f_offset - + old->bse_f_offset; + new->bse_f_offset = old->bse_f_offset; + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + } + /* Note that if we never hit the above break, old will not point to a + * valid extent. 
However, in that case &old->bse_node==list. + */ + list_add_tail(&new->bse_node, &old->bse_node); + /* Scan forward for overlaps. If we find any, extend new and + * remove the overlapped extent. + */ + old = list_prepare_entry(new, clist, bse_node); + list_for_each_entry_safe_continue(old, save, clist, bse_node) { + if (end < old->bse_f_offset) + break; + /* new overlaps or abuts old */ + if (new->bse_mdev == old->bse_mdev) { + if (end < old->bse_f_offset + old->bse_length) { + /* extend new to fully cover old */ + end = old->bse_f_offset + old->bse_length; + new->bse_length = end - new->bse_f_offset; + } + list_del(&old->bse_node); + bl->bl_count--; + kfree(old); + } + } + dprintk("%s: after merging\n", __func__); + print_clist(clist, bl->bl_count); +} + static void print_bl_extent(struct pnfs_block_extent *be) { dprintk("PRINT EXTENT extent %p\n", be); @@ -539,6 +606,34 @@ bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, return ret; } +/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ +static struct pnfs_block_extent * +bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) +{ + struct pnfs_block_extent *be, *ret = NULL; + int i; + + dprintk("%s enter with isect %llu\n", __func__, (u64)isect); + for (i = 0; i < EXTENT_LISTS; i++) { + if (ret) + break; + list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { + if (isect >= be->be_f_offset + be->be_length) + break; + if (isect >= be->be_f_offset) { + /* We have found an extent */ + dprintk("%s Get %p (%i)\n", __func__, be, + atomic_read(&be->be_refcnt.refcount)); + kref_get(&be->be_refcnt); + ret = be; + break; + } + } + } + print_bl_extent(ret); + return ret; +} + int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, struct xdr_stream *xdr, @@ -628,3 +723,118 @@ _front_merge(struct pnfs_block_extent *be, struct list_head *head, kfree(storage); return be; } + +static u64 +set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) 
+{ + u64 rv = offset + length; + struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; + struct pnfs_block_extent *children[3]; + struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; + int i = 0, j; + + dprintk("%s(%llu, %llu)\n", __func__, offset, length); + /* Create storage for up to three new extents e1, e2, e3 */ + e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); + e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); + e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); + /* BUG - we are ignoring any failure */ + if (!e1 || !e2 || !e3) + goto out_nosplit; + + spin_lock(&bl->bl_ext_lock); + be = bl_find_get_extent_locked(bl, offset); + rv = be->be_f_offset + be->be_length; + if (be->be_state != PNFS_BLOCK_INVALID_DATA) { + spin_unlock(&bl->bl_ext_lock); + goto out_nosplit; + } + /* Add e* to children, bumping e*'s krefs */ + if (be->be_f_offset != offset) { + _prep_new_extent(e1, be, be->be_f_offset, + offset - be->be_f_offset, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e1; + print_bl_extent(e1); + } else + merge1 = e1; + _prep_new_extent(e2, be, offset, + min(length, be->be_f_offset + be->be_length - offset), + PNFS_BLOCK_READWRITE_DATA); + children[i++] = e2; + print_bl_extent(e2); + if (offset + length < be->be_f_offset + be->be_length) { + _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, + be->be_f_offset + be->be_length - + offset - length, + PNFS_BLOCK_INVALID_DATA); + children[i++] = e3; + print_bl_extent(e3); + } else + merge2 = e3; + + /* Remove be from list, and insert the e* */ + /* We don't get refs on e*, since this list is the base reference + * set when init'ed. 
+ */ + if (i < 3) + children[i] = NULL; + new = children[0]; + list_replace(&be->be_node, &new->be_node); + bl_put_extent(be); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); + for (j = 1; j < i; j++) { + old = new; + new = children[j]; + list_add(&new->be_node, &old->be_node); + } + if (merge2) { + /* This is a HACK, should just create a _back_merge function */ + new = list_entry(new->be_node.next, + struct pnfs_block_extent, be_node); + new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); + } + spin_unlock(&bl->bl_ext_lock); + + /* Since we removed the base reference above, be is now scheduled for + * destruction. + */ + bl_put_extent(be); + dprintk("%s returns %llu after split\n", __func__, rv); + return rv; + + out_nosplit: + kfree(e1); + kfree(e2); + kfree(e3); + dprintk("%s returns %llu without splitting\n", __func__, rv); + return rv; +} + +void +clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, + const struct nfs4_layoutcommit_args *arg, + int status) +{ + struct pnfs_block_short_extent *lce, *save; + + dprintk("%s status %d\n", __func__, status); + list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { + if (likely(!status)) { + u64 offset = lce->bse_f_offset; + u64 end = offset + lce->bse_length; + + do { + offset = set_to_rw(bl, offset, end - offset); + } while (offset < end); + list_del(&lce->bse_node); + + kfree(lce); + } else { + list_del(&lce->bse_node); + spin_lock(&bl->bl_ext_lock); + add_to_commitlist(bl, lce); + spin_unlock(&bl->bl_ext_lock); + } + } +} -- cgit v1.2.3 From 31e6306a4046926b598484f1cacf69309382eac6 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Sat, 30 Jul 2011 20:52:55 -0400 Subject: pnfsblock: note written INVAL areas for layoutcommit Signed-off-by: Peng Tao Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Benny Halevy Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 32 ++++++++++++++ fs/nfs/blocklayout/blocklayout.h 
| 2 + fs/nfs/blocklayout/extents.c | 95 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+) (limited to 'fs/nfs/blocklayout/extents.c') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 2e373826db80..21efef7c2fd2 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -329,6 +329,30 @@ out: return PNFS_NOT_ATTEMPTED; } +static void mark_extents_written(struct pnfs_block_layout *bl, + __u64 offset, __u32 count) +{ + sector_t isect, end; + struct pnfs_block_extent *be; + + dprintk("%s(%llu, %u)\n", __func__, offset, count); + if (count == 0) + return; + isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; + end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); + end >>= SECTOR_SHIFT; + while (isect < end) { + sector_t len; + be = bl_find_get_extent(bl, isect, NULL); + BUG_ON(!be); /* FIXME */ + len = min(end, be->be_f_offset + be->be_length) - isect; + if (be->be_state == PNFS_BLOCK_INVALID_DATA) + bl_mark_for_commit(be, isect, len); /* What if fails? 
*/ + isect += len; + bl_put_extent(be); + } +} + /* This is basically copied from mpage_end_io_read */ static void bl_end_io_write(struct bio *bio, int err) { @@ -355,6 +379,14 @@ static void bl_write_cleanup(struct work_struct *work) dprintk("%s enter\n", __func__); task = container_of(work, struct rpc_task, u.tk_work); wdata = container_of(task, struct nfs_write_data, task); + if (!wdata->task.tk_status) { + /* Marks for LAYOUTCOMMIT */ + /* BUG - this should be called after each bio, not after + * all finish, unless have some way of storing success/failure + */ + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), + wdata->args.offset, wdata->args.count); + } pnfs_ld_write_done(wdata); } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 6a703b79c33d..f27d827960a3 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -201,5 +201,7 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, int status); int bl_add_merge_extent(struct pnfs_block_layout *bl, struct pnfs_block_extent *new); +int bl_mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length); #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c index 7521940dcca5..19fa7b0b8c00 100644 --- a/fs/nfs/blocklayout/extents.c +++ b/fs/nfs/blocklayout/extents.c @@ -217,6 +217,48 @@ int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) return rv; } +/* Assume start, end already sector aligned */ +static int +_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) +{ + struct pnfs_inval_tracking *pos; + u64 expect = 0; + + dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); + list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { + if (pos->it_sector >= end) + continue; + if (!expect) { + if ((pos->it_sector == end - tree->mtt_step_size) && + (pos->it_tags & (1 << tag))) { + expect = pos->it_sector - 
tree->mtt_step_size; + if (pos->it_sector < tree->mtt_step_size || expect < start) + return 1; + continue; + } else { + return 0; + } + } + if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) + return 0; + expect -= tree->mtt_step_size; + if (expect < start) + return 1; + } + return 0; +} + +static int is_range_written(struct pnfs_inval_markings *marks, + sector_t start, sector_t end) +{ + int rv; + + spin_lock(&marks->im_lock); + rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); + spin_unlock(&marks->im_lock); + return rv; +} + /* Marks sectors in [offest, offset_length) as having been initialized. * All lengths are step-aligned, where step is min(pagesize, blocksize). * Notes where partial block is initialized, and helps prepare it for @@ -396,6 +438,59 @@ static void add_to_commitlist(struct pnfs_block_layout *bl, print_clist(clist, bl->bl_count); } +/* Note the range described by offset, length is guaranteed to be contained + * within be. + */ +int bl_mark_for_commit(struct pnfs_block_extent *be, + sector_t offset, sector_t length) +{ + sector_t new_end, end = offset + length; + struct pnfs_block_short_extent *new; + struct pnfs_block_layout *bl = container_of(be->be_inval, + struct pnfs_block_layout, + bl_inval); + + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) + return -ENOMEM; + + mark_written_sectors(be->be_inval, offset, length); + /* We want to add the range to commit list, but it must be + * block-normalized, and verified that the normalized range has + * been entirely written to disk. 
+ */ + new->bse_f_offset = offset; + offset = normalize(offset, bl->bl_blocksize); + if (offset < new->bse_f_offset) { + if (is_range_written(be->be_inval, offset, new->bse_f_offset)) + new->bse_f_offset = offset; + else + new->bse_f_offset = offset + bl->bl_blocksize; + } + new_end = normalize_up(end, bl->bl_blocksize); + if (end < new_end) { + if (is_range_written(be->be_inval, end, new_end)) + end = new_end; + else + end = new_end - bl->bl_blocksize; + } + if (end <= new->bse_f_offset) { + kfree(new); + return 0; + } + new->bse_length = end - new->bse_f_offset; + new->bse_devid = be->be_devid; + new->bse_mdev = be->be_mdev; + + spin_lock(&bl->bl_ext_lock); + /* new will be freed, either by add_to_commitlist if it decides not + * to use it, or after LAYOUTCOMMIT uses it in the commitlist. + */ + add_to_commitlist(bl, new); + spin_unlock(&bl->bl_ext_lock); + return 0; +} + static void print_bl_extent(struct pnfs_block_extent *be) { dprintk("PRINT EXTENT extent %p\n", be); -- cgit v1.2.3