Diffstat (limited to 'mm/filemap.c')
 mm/filemap.c | 246
 1 file changed, 172 insertions(+), 74 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index f0ae9a6308cb..f2bb5ff0293d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -41,6 +41,7 @@
 #include <linux/delayacct.h>
 #include <linux/psi.h>
 #include <linux/ramfs.h>
+#include <linux/page_idle.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -987,44 +988,46 @@ void __init pagecache_init(void)
 	page_writeback_init();
 }
 
-/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
-struct wait_page_key {
-	struct page *page;
-	int bit_nr;
-	int page_match;
-};
-
-struct wait_page_queue {
-	struct page *page;
-	int bit_nr;
-	wait_queue_entry_t wait;
-};
-
 static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
 {
+	int ret;
 	struct wait_page_key *key = arg;
 	struct wait_page_queue *wait_page
 		= container_of(wait, struct wait_page_queue, wait);
 
-	if (wait_page->page != key->page)
-		return 0;
-	key->page_match = 1;
-
-	if (wait_page->bit_nr != key->bit_nr)
+	if (!wake_page_match(wait_page, key))
 		return 0;
 
 	/*
-	 * Stop walking if it's locked.
-	 * Is this safe if put_and_wait_on_page_locked() is in use?
-	 * Yes: the waker must hold a reference to this page, and if PG_locked
-	 * has now already been set by another task, that task must also hold
-	 * a reference to the *same usage* of this page; so there is no need
-	 * to walk on to wake even the put_and_wait_on_page_locked() callers.
+	 * If it's an exclusive wait, we get the bit for it, and
+	 * stop walking if we can't.
+	 *
+	 * If it's a non-exclusive wait, then the fact that this
+	 * wake function was called means that the bit already
+	 * was cleared, and we don't care if somebody then
+	 * re-took it.
 	 */
-	if (test_bit(key->bit_nr, &key->page->flags))
-		return -1;
+	ret = 0;
+	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+		if (test_and_set_bit(key->bit_nr, &key->page->flags))
+			return -1;
+		ret = 1;
+	}
+	wait->flags |= WQ_FLAG_WOKEN;
 
-	return autoremove_wake_function(wait, mode, sync, key);
+	wake_up_state(wait->private, mode);
+
+	/*
+	 * Ok, we have successfully done what we're waiting for,
+	 * and we can unconditionally remove the wait entry.
+	 *
+	 * Note that this has to be the absolute last thing we do,
+	 * since after list_del_init(&wait->entry) the wait entry
+	 * might be de-allocated and the process might even have
+	 * exited.
+	 */
+	list_del_init_careful(&wait->entry);
+	return ret;
 }
 
 static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1103,16 +1106,31 @@ enum behavior {
 			 */
 };
 
+/*
+ * Attempt to check (or get) the page bit, and mark the
+ * waiter woken if successful.
+ */
+static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+					struct wait_queue_entry *wait)
+{
+	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+		if (test_and_set_bit(bit_nr, &page->flags))
+			return false;
+	} else if (test_bit(bit_nr, &page->flags))
+		return false;
+
+	wait->flags |= WQ_FLAG_WOKEN;
+	return true;
+}
+
 static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	struct page *page, int bit_nr, int state, enum behavior behavior)
 {
 	struct wait_page_queue wait_page;
 	wait_queue_entry_t *wait = &wait_page.wait;
-	bool bit_is_set;
 	bool thrashing = false;
 	bool delayacct = false;
 	unsigned long pflags;
-	int ret = 0;
 
 	if (bit_nr == PG_locked &&
 	    !PageUptodate(page) && PageWorkingset(page)) {
@@ -1130,48 +1148,47 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	wait_page.page = page;
 	wait_page.bit_nr = bit_nr;
 
-	for (;;) {
-		spin_lock_irq(&q->lock);
+	/*
+	 * Do one last check whether we can get the
+	 * page bit synchronously.
+	 *
+	 * Do the SetPageWaiters() marking before that
+	 * to let any waker we _just_ missed know they
+	 * need to wake us up (otherwise they'll never
+	 * even go to the slow case that looks at the
+	 * page queue), and add ourselves to the wait
+	 * queue if we need to sleep.
+	 *
+	 * This part needs to be done under the queue
+	 * lock to avoid races.
+	 */
+	spin_lock_irq(&q->lock);
+	SetPageWaiters(page);
+	if (!trylock_page_bit_common(page, bit_nr, wait))
+		__add_wait_queue_entry_tail(q, wait);
+	spin_unlock_irq(&q->lock);
 
-		if (likely(list_empty(&wait->entry))) {
-			__add_wait_queue_entry_tail(q, wait);
-			SetPageWaiters(page);
-		}
+	/*
+	 * From now on, all the logic will be based on
+	 * the WQ_FLAG_WOKEN flag, and the page bit
+	 * testing (and setting) will be - or has
+	 * already been - done by the wake function.
+	 *
+	 * We can drop our reference to the page.
+	 */
+	if (behavior == DROP)
+		put_page(page);
 
+	for (;;) {
 		set_current_state(state);
-		spin_unlock_irq(&q->lock);
-
-		bit_is_set = test_bit(bit_nr, &page->flags);
-		if (behavior == DROP)
-			put_page(page);
-
-		if (likely(bit_is_set))
-			io_schedule();
-
-		if (behavior == EXCLUSIVE) {
-			if (!test_and_set_bit_lock(bit_nr, &page->flags))
-				break;
-		} else if (behavior == SHARED) {
-			if (!test_bit(bit_nr, &page->flags))
-				break;
-		}
-
-		if (signal_pending_state(state, current)) {
-			ret = -EINTR;
+		if (signal_pending_state(state, current))
 			break;
-		}
 
-		if (behavior == DROP) {
-			/*
-			 * We can no longer safely access page->flags:
-			 * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
-			 * there is a risk of waiting forever on a page reused
-			 * for something that keeps it locked indefinitely.
-			 * But best check for -EINTR above before breaking.
-			 */
+		if (wait->flags & WQ_FLAG_WOKEN)
 			break;
-		}
+
+		io_schedule();
 	}
 
 	finish_wait(q, wait);
@@ -1190,7 +1207,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	 * bother with signals either.
 	 */
 
-	return ret;
+	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
 }
 
 void wait_on_page_bit(struct page *page, int bit_nr)
@@ -1207,6 +1224,44 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
 }
 EXPORT_SYMBOL(wait_on_page_bit_killable);
 
+static int __wait_on_page_locked_async(struct page *page,
+				       struct wait_page_queue *wait, bool set)
+{
+	struct wait_queue_head *q = page_waitqueue(page);
+	int ret = 0;
+
+	wait->page = page;
+	wait->bit_nr = PG_locked;
+
+	spin_lock_irq(&q->lock);
+	__add_wait_queue_entry_tail(q, &wait->wait);
+	SetPageWaiters(page);
+	if (set)
+		ret = !trylock_page(page);
+	else
+		ret = PageLocked(page);
+	/*
+	 * If we were successful now, we know we're still on the
+	 * waitqueue as we're still under the lock. This means it's
+	 * safe to remove and return success, we know the callback
+	 * isn't going to trigger.
+	 */
+	if (!ret)
+		__remove_wait_queue(q, &wait->wait);
+	else
+		ret = -EIOCBQUEUED;
+	spin_unlock_irq(&q->lock);
+	return ret;
+}
+
+static int wait_on_page_locked_async(struct page *page,
+				     struct wait_page_queue *wait)
+{
+	if (!PageLocked(page))
+		return 0;
+	return __wait_on_page_locked_async(compound_head(page), wait, false);
+}
+
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
  * @page: The page to wait for.
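The net effect of the rework above: the waiter makes one synchronous attempt at the bit under the waitqueue lock, and from then on only watches WQ_FLAG_WOKEN; for an exclusive wait, the waker acquires the bit on the waiter's behalf in wake_page_function() before setting the flag, so the woken task never re-contends it. Below is a minimal userspace model of that handoff, built on C11 atomics and pthreads; every name in it (lock_page_model, unlock_page_model, the single-waiter queue) is invented for illustration, and it is not kernel code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_uint page_flags;		/* bit 0 stands in for PG_locked */
static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t q_cond = PTHREAD_COND_INITIALIZER;
static bool queued, woken;		/* one exclusive waiter, for brevity */

/* trylock_page_bit_common() analogue: one attempt at the bit */
static bool try_get_bit(void)
{
	return !(atomic_fetch_or(&page_flags, 1) & 1);
}

static void lock_page_model(void)
{
	pthread_mutex_lock(&q_lock);
	/* one last synchronous attempt, under the queue lock */
	if (try_get_bit()) {
		pthread_mutex_unlock(&q_lock);
		return;
	}
	queued = true;
	/* from here on we only look at the WQ_FLAG_WOKEN analogue */
	while (!woken)
		pthread_cond_wait(&q_cond, &q_lock);
	woken = false;
	pthread_mutex_unlock(&q_lock);
	/* the bit is already ours: the waker acquired it for us */
}

static void unlock_page_model(void)
{
	atomic_fetch_and(&page_flags, ~1u);	/* clear the bit */
	pthread_mutex_lock(&q_lock);
	/* wake_page_function() analogue: take the bit for the waiter */
	if (queued && try_get_bit()) {
		queued = false;
		woken = true;
		pthread_cond_signal(&q_cond);
	}
	pthread_mutex_unlock(&q_lock);
}

static void *contender(void *arg)
{
	lock_page_model();
	puts("waiter: got the bit");
	unlock_page_model();
	return NULL;
}

int main(void)
{
	pthread_t t;

	lock_page_model();		/* uncontended: synchronous path */
	pthread_create(&t, NULL, contender, NULL);
	usleep(100 * 1000);		/* let the contender queue itself */
	unlock_page_model();		/* clears the bit, hands it over */
	pthread_join(t, NULL);
	return 0;
}

As in the patch, the last synchronous trylock and the queueing happen under the same lock the waker takes, which is what closes the missed-wakeup window.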
@@ -1369,6 +1424,11 @@ int __lock_page_killable(struct page *__page)
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
+int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+{
+	return __wait_on_page_locked_async(page, wait, true);
+}
+
 /*
  * Return values:
  * 1 - page is locked; mmap_lock is still held.
@@ -1589,6 +1649,9 @@ EXPORT_SYMBOL(find_lock_entry);
  * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
  *   page is already in cache. If the page was allocated, unlock it before
  *   returning so the caller can do the same dance.
+ * * %FGP_WRITE - The page will be written
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
+ * * %FGP_NOWAIT - Don't get blocked by page lock
  *
  * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
  * if the %GFP flags specified for %FGP_CREAT are atomic.
@@ -1630,6 +1693,11 @@ repeat:
 
 	if (fgp_flags & FGP_ACCESSED)
 		mark_page_accessed(page);
+	else if (fgp_flags & FGP_WRITE) {
+		/* Clear idle flag for buffer write */
+		if (page_is_idle(page))
+			clear_page_idle(page);
+	}
 
 no_page:
 	if (!page && (fgp_flags & FGP_CREAT)) {
@@ -2028,7 +2096,7 @@ find_page:
 
 		page = find_get_page(mapping, index);
 		if (!page) {
-			if (iocb->ki_flags & IOCB_NOWAIT)
+			if (iocb->ki_flags & IOCB_NOIO)
 				goto would_block;
 			page_cache_sync_readahead(mapping,
 					ra, filp,
@@ -2038,22 +2106,34 @@ find_page:
 				goto no_cached_page;
 		}
 		if (PageReadahead(page)) {
+			if (iocb->ki_flags & IOCB_NOIO) {
+				put_page(page);
+				goto out;
+			}
 			page_cache_async_readahead(mapping,
 					ra, filp, page,
 					index, last_index - index);
 		}
 		if (!PageUptodate(page)) {
-			if (iocb->ki_flags & IOCB_NOWAIT) {
-				put_page(page);
-				goto would_block;
-			}
-
 			/*
 			 * See comment in do_read_cache_page on why
 			 * wait_on_page_locked is used to avoid unnecessarily
 			 * serialisations and why it's safe.
 			 */
-			error = wait_on_page_locked_killable(page);
+			if (iocb->ki_flags & IOCB_WAITQ) {
+				if (written) {
+					put_page(page);
+					goto out;
+				}
+				error = wait_on_page_locked_async(page,
+								iocb->ki_waitq);
+			} else {
+				if (iocb->ki_flags & IOCB_NOWAIT) {
+					put_page(page);
+					goto would_block;
+				}
+				error = wait_on_page_locked_killable(page);
+			}
 			if (unlikely(error))
 				goto readpage_error;
 			if (PageUptodate(page))
@@ -2141,7 +2221,10 @@ page_ok:
 
 page_not_up_to_date:
 		/* Get exclusive access to the page ... */
-		error = lock_page_killable(page);
+		if (iocb->ki_flags & IOCB_WAITQ)
+			error = lock_page_async(page, iocb->ki_waitq);
+		else
+			error = lock_page_killable(page);
 		if (unlikely(error))
 			goto readpage_error;
 
@@ -2160,6 +2243,11 @@ page_not_up_to_date_locked:
 		}
 
 readpage:
+		if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
+			unlock_page(page);
+			put_page(page);
+			goto would_block;
+		}
 		/*
 		 * A previous I/O error may have been due to temporary
 		 * failures, eg. multipath errors.
@@ -2249,9 +2337,19 @@ EXPORT_SYMBOL_GPL(generic_file_buffered_read);
  *
  * This is the "read_iter()" routine for all filesystems
  * that can use the page cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
+ * be returned when no data can be read without waiting for I/O requests
+ * to complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
+ * requests shall be made for the read or for readahead. When no data
+ * can be read, -EAGAIN shall be returned. When readahead would be
+ * triggered, a partial, possibly empty read shall be returned.
+ *
  * Return:
  * * number of bytes copied, even for partial reads
- * * negative error code if nothing was read
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
  */
 ssize_t
 generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
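For context on the flag semantics documented in the comment above: IOCB_NOWAIT is what userspace requests via preadv2(2) with RWF_NOWAIT, while IOCB_NOIO has no userspace flag and is meant for in-kernel callers. A small demonstration of the NOWAIT behaviour follows; the fallback path is an arbitrary placeholder and error handling is trimmed.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[4096];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	const char *path = argc > 1 ? argv[1] : "/etc/hostname";
	int fd = open(path, O_RDONLY);
	ssize_t ret;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* RWF_NOWAIT: fail with EAGAIN rather than block on I/O */
	ret = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
	if (ret < 0 && errno == EAGAIN)
		printf("%s: not cached, a plain read would do I/O\n", path);
	else if (ret >= 0)
		printf("%s: read %zd bytes from the page cache\n", path, ret);
	else
		perror("preadv2");
	close(fd);
	return 0;
}

On a file whose pages are not resident, the call fails fast with EAGAIN, matching the doc comment: no waiting for I/O requests to complete, though readahead may still have been started.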