diff options
author | Christoph Hellwig <hch@lst.de> | 2014-09-10 17:23:34 +0200 |
---|---|---|
committer | Trond Myklebust <trond.myklebust@primarydata.com> | 2014-09-10 21:47:03 +0200 |
commit | 8067253c8cc531b6f367b9f5942bdc6168385701 (patch) | |
tree | 78d7663906883132df08a2a1aecf6e385fd766d1 /fs/nfs/blocklayout/blocklayout.h | |
parent | pnfs/blocklayout: don't set pages uptodate (diff) | |
download | linux-8067253c8cc531b6f367b9f5942bdc6168385701.tar.xz linux-8067253c8cc531b6f367b9f5942bdc6168385701.zip |
pnfs/blocklayout: rewrite extent tracking
Currently the block layout driver tracks extents in three separate
data structures:
- the two list of pnfs_block_extent structures returned by the server
- the list of sectors that were in invalid state but have been written to
- a list of pnfs_block_short_extent structures for LAYOUTCOMMIT
All of these share the property that they are not only highly inefficient
data structures, but also that operations on them are even more inefficient
than nessecary.
In addition there are various implementation defects like:
- using an int to track sectors, causing corruption for large offsets
- incorrect normalization of page or block granularity ranges
- insufficient error handling
- incorrect synchronization as extents can be modified while they are in
use
This patch replace all three data with a single unified rbtree structure
tracking all extents, as well as their in-memory state, although we still
need to instance for read-only and read-write extent due to the arcane
client side COW feature in the block layouts spec.
To fix the problem of extent possibly being modified while in use we make
sure to return a copy of the extent for use in the write path - the
extent can only be invalidated by a layout recall or return which has
to wait until the I/O operations finished due to refcounts on the layout
segment.
The new extent tree work similar to the schemes used by block based
filesystems like XFS or ext4.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Diffstat (limited to 'fs/nfs/blocklayout/blocklayout.h')
-rw-r--r-- | fs/nfs/blocklayout/blocklayout.h | 112 |
1 files changed, 23 insertions, 89 deletions
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 9838fb020473..b4f66d875f12 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -63,82 +63,28 @@ enum exstate4 { PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ }; -#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ - -struct my_tree { - sector_t mtt_step_size; /* Internal sector alignment */ - struct list_head mtt_stub; /* Should be a radix tree */ -}; - -struct pnfs_inval_markings { - spinlock_t im_lock; - struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ - sector_t im_block_size; /* Server blocksize in sectors */ - struct list_head im_extents; /* Short extents for INVAL->RW conversion */ -}; - -struct pnfs_inval_tracking { - struct list_head it_link; - int it_sector; - int it_tags; -}; - /* sector_t fields are all in 512-byte sectors */ struct pnfs_block_extent { - struct kref be_refcnt; - struct list_head be_node; /* link into lseg list */ - struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ + union { + struct rb_node be_node; + struct list_head be_list; + }; + struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ struct block_device *be_mdev; sector_t be_f_offset; /* the starting offset in the file */ sector_t be_length; /* the size of the extent */ sector_t be_v_offset; /* the starting offset in the volume */ enum exstate4 be_state; /* the state of this extent */ - struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ -}; - -/* Shortened extent used by LAYOUTCOMMIT */ -struct pnfs_block_short_extent { - struct list_head bse_node; - struct nfs4_deviceid bse_devid; - struct block_device *bse_mdev; - sector_t bse_f_offset; /* the starting offset in the file */ - sector_t bse_length; /* the size of the extent */ +#define EXTENT_WRITTEN 1 +#define EXTENT_COMMITTING 2 + unsigned int be_tag; }; -static inline void -BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) -{ - spin_lock_init(&marks->im_lock); - INIT_LIST_HEAD(&marks->im_tree.mtt_stub); - INIT_LIST_HEAD(&marks->im_extents); - marks->im_block_size = blocksize; - marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, - blocksize); -} - -enum extentclass4 { - RW_EXTENT = 0, /* READWRTE and INVAL */ - RO_EXTENT = 1, /* READ and NONE */ - EXTENT_LISTS = 2, -}; - -static inline int bl_choose_list(enum exstate4 state) -{ - if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) - return RO_EXTENT; - else - return RW_EXTENT; -} - struct pnfs_block_layout { - struct pnfs_layout_hdr bl_layout; - struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ + struct pnfs_layout_hdr bl_layout; + struct rb_root bl_ext_rw; + struct rb_root bl_ext_ro; spinlock_t bl_ext_lock; /* Protects list manipulation */ - struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ - struct list_head bl_commit; /* Needs layout commit */ - struct list_head bl_committing; /* Layout committing */ - unsigned int bl_count; /* entries in bl_commit */ - sector_t bl_blocksize; /* Server blocksize in sectors */ }; #define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) @@ -183,29 +129,17 @@ int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, /* blocklayoutdm.c */ void bl_free_block_dev(struct pnfs_block_dev *bdev); -/* extents.c */ -struct pnfs_block_extent * -bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, - struct pnfs_block_extent **cow_read); -int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length); -void bl_put_extent(struct pnfs_block_extent *be); -struct pnfs_block_extent *bl_alloc_extent(void); -int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); -int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg); -void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - const struct nfs4_layoutcommit_args *arg, - int status); -int bl_add_merge_extent(struct pnfs_block_layout *bl, - struct pnfs_block_extent *new); -int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length, - struct pnfs_block_short_extent *new); -int bl_push_one_short_extent(struct pnfs_inval_markings *marks); -struct pnfs_block_short_extent * -bl_pop_one_short_extent(struct pnfs_inval_markings *marks); -void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); +/* extent_tree.c */ +int ext_tree_insert(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new); +int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, + sector_t end); +int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, + sector_t len); +bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent *ret, bool rw); +int ext_tree_encode_commit(struct pnfs_block_layout *bl, + struct xdr_stream *xdr); +void ext_tree_mark_committed(struct pnfs_block_layout *bl, int status); #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ |