diff options
Diffstat (limited to 'fs/xfs/xfs_buf.c')
-rw-r--r-- | fs/xfs/xfs_buf.c | 109 |
1 files changed, 91 insertions, 18 deletions
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e839907e8492..b21ea2ba768d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -37,6 +37,32 @@ static kmem_zone_t *xfs_buf_zone; #define xb_to_gfp(flags) \ ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) +/* + * Locking orders + * + * xfs_buf_ioacct_inc: + * xfs_buf_ioacct_dec: + * b_sema (caller holds) + * b_lock + * + * xfs_buf_stale: + * b_sema (caller holds) + * b_lock + * lru_lock + * + * xfs_buf_rele: + * b_lock + * pag_buf_lock + * lru_lock + * + * xfs_buftarg_wait_rele + * lru_lock + * b_lock (trylock due to inversion) + * + * xfs_buftarg_isolate + * lru_lock + * b_lock (trylock due to inversion) + */ static inline int xfs_buf_is_vmapped( @@ -749,6 +775,30 @@ _xfs_buf_read( return xfs_buf_submit(bp); } +/* + * If the caller passed in an ops structure and the buffer doesn't have ops + * assigned, set the ops and use them to verify the contents. If the contents + * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no + * recorded errors and is already in XBF_DONE state. + */ +int +xfs_buf_ensure_ops( + struct xfs_buf *bp, + const struct xfs_buf_ops *ops) +{ + ASSERT(bp->b_flags & XBF_DONE); + ASSERT(bp->b_error == 0); + + if (!ops || bp->b_ops) + return 0; + + bp->b_ops = ops; + bp->b_ops->verify_read(bp); + if (bp->b_error) + bp->b_flags &= ~XBF_DONE; + return bp->b_error; +} + xfs_buf_t * xfs_buf_read_map( struct xfs_buftarg *target, @@ -762,26 +812,32 @@ xfs_buf_read_map( flags |= XBF_READ; bp = xfs_buf_get_map(target, map, nmaps, flags); - if (bp) { - trace_xfs_buf_read(bp, flags, _RET_IP_); + if (!bp) + return NULL; - if (!(bp->b_flags & XBF_DONE)) { - XFS_STATS_INC(target->bt_mount, xb_get_read); - bp->b_ops = ops; - _xfs_buf_read(bp, flags); - } else if (flags & XBF_ASYNC) { - /* - * Read ahead call which is already satisfied, - * drop the buffer - */ - xfs_buf_relse(bp); - return NULL; - } else { - /* We do not want read in the flags */ - bp->b_flags &= ~XBF_READ; - } + trace_xfs_buf_read(bp, flags, _RET_IP_); + + if (!(bp->b_flags & XBF_DONE)) { + XFS_STATS_INC(target->bt_mount, xb_get_read); + bp->b_ops = ops; + _xfs_buf_read(bp, flags); + return bp; } + xfs_buf_ensure_ops(bp, ops); + + if (flags & XBF_ASYNC) { + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + xfs_buf_relse(bp); + return NULL; + } + + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + ASSERT(bp->b_ops != NULL || ops == NULL); return bp; } @@ -1006,8 +1062,18 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); - release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + /* + * We grab the b_lock here first to serialise racing xfs_buf_rele() + * calls. The pag_buf_lock being taken on the last reference only + * serialises against racing lookups in xfs_buf_find(). IOWs, the second + * to last reference we drop here is not serialised against the last + * reference until we take bp->b_lock. Hence if we don't grab b_lock + * first, the last "release" reference can win the race to the lock and + * free the buffer before the second-to-last reference is processed, + * leading to a use-after-free scenario. + */ spin_lock(&bp->b_lock); + release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); if (!release) { /* * Drop the in-flight state if the buffer is already on the LRU @@ -1989,6 +2055,13 @@ xfs_buf_delwri_submit_buffers( * is only safely useable for callers that can track I/O completion by higher * level means, e.g. AIL pushing as the @buffer_list is consumed in this * function. + * + * Note: this function will skip buffers it would block on, and in doing so + * leaves them on @buffer_list so they can be retried on a later pass. As such, + * it is up to the caller to ensure that the buffer list is fully submitted or + * cancelled appropriately when they are finished with the list. Failure to + * cancel or resubmit the list until it is empty will result in leaked buffers + * at unmount time. */ int xfs_buf_delwri_submit_nowait( |