path: root/fs
author     Linus Torvalds <torvalds@linux-foundation.org>  2017-09-07 05:49:49 +0200
committer  Linus Torvalds <torvalds@linux-foundation.org>  2017-09-07 05:49:49 +0200
commit     d34fc1adf01ff87026da85fb972dc259dc347540 (patch)
tree       27356073d423187157b7cdb69da32b53102fb9e7 /fs
parent     x86/mm: Document how CR4.PCIDE restore works (diff)
parent     mm,fork: introduce MADV_WIPEONFORK (diff)
download   linux-d34fc1adf01ff87026da85fb972dc259dc347540.tar.xz
           linux-d34fc1adf01ff87026da85fb972dc259dc347540.zip
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - various misc bits

 - DAX updates

 - OCFS2

 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (119 commits)
  mm,fork: introduce MADV_WIPEONFORK
  x86,mpx: make mpx depend on x86-64 to free up VMA flag
  mm: add /proc/pid/smaps_rollup
  mm: hugetlb: clear target sub-page last when clearing huge page
  mm: oom: let oom_reap_task and exit_mmap run concurrently
  swap: choose swap device according to numa node
  mm: replace TIF_MEMDIE checks by tsk_is_oom_victim
  mm, oom: do not rely on TIF_MEMDIE for memory reserves access
  z3fold: use per-cpu unbuddied lists
  mm, swap: don't use VMA based swap readahead if HDD is used as swap
  mm, swap: add sysfs interface for VMA based swap readahead
  mm, swap: VMA based swap readahead
  mm, swap: fix swap readahead marking
  mm, swap: add swap readahead hit statistics
  mm/vmalloc.c: don't reinvent the wheel but use existing llist API
  mm/vmstat.c: fix wrong comment
  selftests/memfd: add memfd_create hugetlbfs selftest
  mm/shmem: add hugetlbfs support to memfd_create()
  mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups
  mm/vmalloc.c: halve the number of comparisons performed in pcpu_get_vm_areas()
  ...
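The first patch in the series above, "mm,fork: introduce MADV_WIPEONFORK", adds a madvise() flag under which an anonymous private mapping is not copied into the child on fork() but comes up zero-filled instead. A minimal userspace sketch of that expected behaviour follows; it assumes a kernel carrying this series and defines the uapi constant by hand in case the libc headers predate it.

/*
 * Hedged sketch, not part of this commit: exercise MADV_WIPEONFORK and
 * observe that the child reads zeroes while the parent keeps its data.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef MADV_WIPEONFORK
#define MADV_WIPEONFORK 18	/* uapi value; assumed absent from older headers */
#endif

int main(void)
{
	size_t len = 4096;
	unsigned char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xaa, len);
	if (madvise(p, len, MADV_WIPEONFORK))
		perror("madvise");	/* EINVAL on kernels without the patch */

	if (fork() == 0) {
		/* child: the range was wiped on fork, so this prints 0 */
		printf("child sees 0x%x\n", p[0]);
		_exit(0);
	}
	wait(NULL);
	/* parent: unaffected, still prints 0xaa */
	printf("parent sees 0x%x\n", p[0]);
	return 0;
}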
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c29
-rw-r--r--fs/afs/cache.c43
-rw-r--r--fs/buffer.c31
-rw-r--r--fs/ceph/cache.c31
-rw-r--r--fs/cifs/cache.c31
-rw-r--r--fs/dax.c363
-rw-r--r--fs/ext2/file.c25
-rw-r--r--fs/ext4/file.c48
-rw-r--r--fs/ext4/inode.c15
-rw-r--r--fs/fscache/page.c5
-rw-r--r--fs/hugetlbfs/inode.c30
-rw-r--r--fs/nfs/fscache-index.c40
-rw-r--r--fs/nilfs2/page.c3
-rw-r--r--fs/ocfs2/acl.c2
-rw-r--r--fs/ocfs2/acl.h7
-rw-r--r--fs/ocfs2/alloc.c22
-rw-r--r--fs/ocfs2/alloc.h3
-rw-r--r--fs/ocfs2/cluster/heartbeat.c42
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/file.c7
-rw-r--r--fs/ocfs2/journal.c1
-rw-r--r--fs/ocfs2/move_extents.c2
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/suballoc.c2
-rw-r--r--fs/ocfs2/super.c1
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/proc/base.c2
-rw-r--r--fs/proc/internal.h3
-rw-r--r--fs/proc/meminfo.c10
-rw-r--r--fs/proc/task_mmu.c197
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/sync.c5
-rw-r--r--fs/userfaultfd.c21
-rw-r--r--fs/xfs/xfs_file.c2
35 files changed, 365 insertions, 670 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 103ca5e1267b..64c58eb26159 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
return FSCACHE_CHECKAUX_OKAY;
}
-static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
-{
- struct v9fs_inode *v9inode = cookie_netfs_data;
- struct pagevec pvec;
- pgoff_t first;
- int loop, nr_pages;
-
- pagevec_init(&pvec, 0);
- first = 0;
-
- for (;;) {
- nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
- first,
- PAGEVEC_SIZE - pagevec_count(&pvec));
- if (!nr_pages)
- break;
-
- for (loop = 0; loop < nr_pages; loop++)
- ClearPageFsCache(pvec.pages[loop]);
-
- first = pvec.pages[nr_pages - 1]->index + 1;
-
- pvec.nr = nr_pages;
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
const struct fscache_cookie_def v9fs_cache_inode_index_def = {
.name = "9p.inode",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
.get_attr = v9fs_cache_inode_get_attr,
.get_aux = v9fs_cache_inode_get_aux,
.check_aux = v9fs_cache_inode_check_aux,
- .now_uncached = v9fs_cache_inode_now_uncached,
};
void v9fs_cache_inode_get_cookie(struct inode *inode)
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index 577763c3d88b..1fe855191261 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
const void *buffer,
uint16_t buflen);
-static void afs_vnode_cache_now_uncached(void *cookie_netfs_data);
struct fscache_netfs afs_cache_netfs = {
.name = "afs",
@@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = {
.get_attr = afs_vnode_cache_get_attr,
.get_aux = afs_vnode_cache_get_aux,
.check_aux = afs_vnode_cache_check_aux,
- .now_uncached = afs_vnode_cache_now_uncached,
};
/*
@@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
_leave(" = SUCCESS");
return FSCACHE_CHECKAUX_OKAY;
}
-
-/*
- * indication the cookie is no longer uncached
- * - this function is called when the backing store currently caching a cookie
- * is removed
- * - the netfs should use this to clean up any markers indicating cached pages
- * - this is mandatory for any object that may have data
- */
-static void afs_vnode_cache_now_uncached(void *cookie_netfs_data)
-{
- struct afs_vnode *vnode = cookie_netfs_data;
- struct pagevec pvec;
- pgoff_t first;
- int loop, nr_pages;
-
- _enter("{%x,%x,%Lx}",
- vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version);
-
- pagevec_init(&pvec, 0);
- first = 0;
-
- for (;;) {
- /* grab a bunch of pages to clean */
- nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping,
- first,
- PAGEVEC_SIZE - pagevec_count(&pvec));
- if (!nr_pages)
- break;
-
- for (loop = 0; loop < nr_pages; loop++)
- ClearPageFsCache(pvec.pages[loop]);
-
- first = pvec.pages[nr_pages - 1]->index + 1;
-
- pvec.nr = nr_pages;
- pagevec_release(&pvec);
- cond_resched();
- }
-
- _leave("");
-}
diff --git a/fs/buffer.c b/fs/buffer.c
index 5715dac7821f..50da0e102ca0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1627,20 +1627,17 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
struct pagevec pvec;
pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
pgoff_t end;
- int i;
+ int i, count;
struct buffer_head *bh;
struct buffer_head *head;
end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
pagevec_init(&pvec, 0);
- while (index <= end && pagevec_lookup(&pvec, bd_mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
+ while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) {
+ count = pagevec_count(&pvec);
+ for (i = 0; i < count; i++) {
struct page *page = pvec.pages[i];
- index = page->index;
- if (index > end)
- break;
if (!page_has_buffers(page))
continue;
/*
@@ -1670,7 +1667,9 @@ unlock_page:
}
pagevec_release(&pvec);
cond_resched();
- index++;
+ /* End of range already reached? */
+ if (index > end || !index)
+ break;
}
}
EXPORT_SYMBOL(clean_bdev_aliases);
@@ -3549,10 +3548,10 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
pagevec_init(&pvec, 0);
do {
- unsigned want, nr_pages, i;
+ unsigned nr_pages, i;
- want = min_t(unsigned, end - index, PAGEVEC_SIZE);
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want);
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+ end - 1);
if (nr_pages == 0)
break;
@@ -3573,10 +3572,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
lastoff < page_offset(page))
goto check_range;
- /* Searching done if the page index is out of range. */
- if (page->index >= end)
- goto not_found;
-
lock_page(page);
if (likely(page->mapping == inode->i_mapping) &&
page_has_buffers(page)) {
@@ -3589,12 +3584,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
unlock_page(page);
lastoff = page_offset(page) + PAGE_SIZE;
}
-
- /* Searching done if fewer pages returned than wanted. */
- if (nr_pages < want)
- break;
-
- index = pvec.pages[i - 1]->index + 1;
pagevec_release(&pvec);
} while (index < end);
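Several conversions in this merge (fs/buffer.c above, plus the ext4 and hugetlbfs hunks below) move from open-coded pagevec_lookup() loops to pagevec_lookup_range(), which advances the caller's index itself and never returns a page beyond the requested end. A condensed sketch of the resulting loop shape; the function name is hypothetical and mapping/start/end are placeholder arguments.

/*
 * Hedged sketch of the pagevec_lookup_range() iteration pattern used by
 * the conversions in this merge; example_walk_range() is illustrative only.
 */
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>

static void example_walk_range(struct address_space *mapping,
			       pgoff_t start, pgoff_t end)
{
	struct pagevec pvec;
	pgoff_t index = start;
	int i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/*
			 * No "page->index > end" check and no manual index
			 * bookkeeping: the helper already clamps to 'end'
			 * and advances 'index' past the returned pages.
			 */
			/* ... per-page work (lock_page(), etc.) ... */
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}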
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 337f88673ed9..174d6e6569a8 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
return FSCACHE_CHECKAUX_OKAY;
}
-static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
-{
- struct ceph_inode_info* ci = cookie_netfs_data;
- struct pagevec pvec;
- pgoff_t first;
- int loop, nr_pages;
-
- pagevec_init(&pvec, 0);
- first = 0;
-
- dout("ceph inode 0x%p now uncached", ci);
-
- while (1) {
- nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
- PAGEVEC_SIZE - pagevec_count(&pvec));
-
- if (!nr_pages)
- break;
-
- for (loop = 0; loop < nr_pages; loop++)
- ClearPageFsCache(pvec.pages[loop]);
-
- first = pvec.pages[nr_pages - 1]->index + 1;
-
- pvec.nr = nr_pages;
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
.name = "CEPH.inode",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
.get_attr = ceph_fscache_inode_get_attr,
.get_aux = ceph_fscache_inode_get_aux,
.check_aux = ceph_fscache_inode_check_aux,
- .now_uncached = ceph_fscache_inode_now_uncached,
};
void ceph_fscache_register_inode_cookie(struct inode *inode)
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index 6c665bf4a27c..2c14020e5e1d 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data,
return FSCACHE_CHECKAUX_OKAY;
}
-static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data)
-{
- struct cifsInodeInfo *cifsi = cookie_netfs_data;
- struct pagevec pvec;
- pgoff_t first;
- int loop, nr_pages;
-
- pagevec_init(&pvec, 0);
- first = 0;
-
- cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi);
-
- for (;;) {
- nr_pages = pagevec_lookup(&pvec,
- cifsi->vfs_inode.i_mapping, first,
- PAGEVEC_SIZE - pagevec_count(&pvec));
- if (!nr_pages)
- break;
-
- for (loop = 0; loop < nr_pages; loop++)
- ClearPageFsCache(pvec.pages[loop]);
-
- first = pvec.pages[nr_pages - 1]->index + 1;
-
- pvec.nr = nr_pages;
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
const struct fscache_cookie_def cifs_fscache_inode_object_def = {
.name = "CIFS.uniqueid",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
@@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = {
.get_attr = cifs_fscache_inode_get_attr,
.get_aux = cifs_fscache_inode_get_aux,
.check_aux = cifs_fscache_inode_check_aux,
- .now_uncached = cifs_fscache_inode_now_uncached,
};
diff --git a/fs/dax.c b/fs/dax.c
index ab925dc6647a..6afcacb3a87b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -42,6 +42,9 @@
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+/* The 'colour' (ie low bits) within a PMD of a page offset. */
+#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
+
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
@@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void)
}
fs_initcall(init_dax_wait_table);
+/*
+ * We use lowest available bit in exceptional entry for locking, one bit for
+ * the entry size (PMD) and two more to tell us if the entry is a zero page or
+ * an empty entry that is just used for locking. In total four special bits.
+ *
+ * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
+ * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
+ * block allocation.
+ */
+#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
+#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
+#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
+#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
+#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
+
+static unsigned long dax_radix_sector(void *entry)
+{
+ return (unsigned long)entry >> RADIX_DAX_SHIFT;
+}
+
+static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+{
+ return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
+ ((unsigned long)sector << RADIX_DAX_SHIFT) |
+ RADIX_DAX_ENTRY_LOCK);
+}
+
+static unsigned int dax_radix_order(void *entry)
+{
+ if ((unsigned long)entry & RADIX_DAX_PMD)
+ return PMD_SHIFT - PAGE_SHIFT;
+ return 0;
+}
+
static int dax_is_pmd_entry(void *entry)
{
return (unsigned long)entry & RADIX_DAX_PMD;
@@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry)
static int dax_is_zero_entry(void *entry)
{
- return (unsigned long)entry & RADIX_DAX_HZP;
+ return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
}
static int dax_is_empty_entry(void *entry)
@@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
* the range covered by the PMD map to the same bit lock.
*/
if (dax_is_pmd_entry(entry))
- index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+ index &= ~PG_PMD_COLOUR;
key->mapping = mapping;
key->entry_start = index;
@@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
}
/*
+ * We do not necessarily hold the mapping->tree_lock when we call this
+ * function so it is possible that 'entry' is no longer a valid item in the
+ * radix tree. This is okay because all we really need to do is to find the
+ * correct waitqueue where tasks might be waiting for that old 'entry' and
+ * wake them.
+ */
+static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+ pgoff_t index, void *entry, bool wake_all)
+{
+ struct exceptional_entry_key key;
+ wait_queue_head_t *wq;
+
+ wq = dax_entry_waitqueue(mapping, index, entry, &key);
+
+ /*
+ * Checking for locked entry and prepare_to_wait_exclusive() happens
+ * under mapping->tree_lock, ditto for entry handling in our callers.
+ * So at this point all tasks that could have seen our entry locked
+ * must be in the waitqueue and the following check will see them.
+ */
+ if (waitqueue_active(wq))
+ __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+}
+
+/*
* Check whether the given slot is locked. The function must be called with
* mapping->tree_lock held
*/
@@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
for (;;) {
entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
&slot);
- if (!entry || !radix_tree_exceptional_entry(entry) ||
+ if (!entry ||
+ WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
!slot_locked(mapping, slot)) {
if (slotp)
*slotp = slot;
@@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping,
}
static void put_locked_mapping_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+ pgoff_t index)
{
- if (!radix_tree_exceptional_entry(entry)) {
- unlock_page(entry);
- put_page(entry);
- } else {
- dax_unlock_mapping_entry(mapping, index);
- }
+ dax_unlock_mapping_entry(mapping, index);
}
/*
@@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
static void put_unlocked_mapping_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
- if (!radix_tree_exceptional_entry(entry))
+ if (!entry)
return;
/* We have to wake up next waiter for the radix tree entry lock */
@@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
}
/*
- * Find radix tree entry at given index. If it points to a page, return with
- * the page locked. If it points to the exceptional entry, return with the
- * radix tree entry locked. If the radix tree doesn't contain given index,
- * create empty exceptional entry for the index and return with it locked.
+ * Find radix tree entry at given index. If it points to an exceptional entry,
+ * return it with the radix tree entry locked. If the radix tree doesn't
+ * contain given index, create an empty exceptional entry for the index and
+ * return with it locked.
*
* When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
* either return that locked entry or will return an error. This error will
- * happen if there are any 4k entries (either zero pages or DAX entries)
- * within the 2MiB range that we are requesting.
+ * happen if there are any 4k entries within the 2MiB range that we are
+ * requesting.
*
* We always favor 4k entries over 2MiB entries. There isn't a flow where we
* evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB
@@ -276,18 +334,21 @@ restart:
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
+ if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
+ entry = ERR_PTR(-EIO);
+ goto out_unlock;
+ }
+
if (entry) {
if (size_flag & RADIX_DAX_PMD) {
- if (!radix_tree_exceptional_entry(entry) ||
- dax_is_pte_entry(entry)) {
+ if (dax_is_pte_entry(entry)) {
put_unlocked_mapping_entry(mapping, index,
entry);
entry = ERR_PTR(-EEXIST);
goto out_unlock;
}
} else { /* trying to grab a PTE entry */
- if (radix_tree_exceptional_entry(entry) &&
- dax_is_pmd_entry(entry) &&
+ if (dax_is_pmd_entry(entry) &&
(dax_is_zero_entry(entry) ||
dax_is_empty_entry(entry))) {
pmd_downgrade = true;
@@ -321,7 +382,7 @@ restart:
mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
if (err) {
if (pmd_downgrade)
- put_locked_mapping_entry(mapping, index, entry);
+ put_locked_mapping_entry(mapping, index);
return ERR_PTR(err);
}
spin_lock_irq(&mapping->tree_lock);
@@ -371,52 +432,12 @@ restart:
spin_unlock_irq(&mapping->tree_lock);
return entry;
}
- /* Normal page in radix tree? */
- if (!radix_tree_exceptional_entry(entry)) {
- struct page *page = entry;
-
- get_page(page);
- spin_unlock_irq(&mapping->tree_lock);
- lock_page(page);
- /* Page got truncated? Retry... */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto restart;
- }
- return page;
- }
entry = lock_slot(mapping, slot);
out_unlock:
spin_unlock_irq(&mapping->tree_lock);
return entry;
}
-/*
- * We do not necessarily hold the mapping->tree_lock when we call this
- * function so it is possible that 'entry' is no longer a valid item in the
- * radix tree. This is okay because all we really need to do is to find the
- * correct waitqueue where tasks might be waiting for that old 'entry' and
- * wake them.
- */
-void dax_wake_mapping_entry_waiter(struct address_space *mapping,
- pgoff_t index, void *entry, bool wake_all)
-{
- struct exceptional_entry_key key;
- wait_queue_head_t *wq;
-
- wq = dax_entry_waitqueue(mapping, index, entry, &key);
-
- /*
- * Checking for locked entry and prepare_to_wait_exclusive() happens
- * under mapping->tree_lock, ditto for entry handling in our callers.
- * So at this point all tasks that could have seen our entry locked
- * must be in the waitqueue and the following check will see them.
- */
- if (waitqueue_active(wq))
- __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
-}
-
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
@@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (!entry || !radix_tree_exceptional_entry(entry))
+ if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
goto out;
if (!trunc &&
(radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
@@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
return __dax_invalidate_mapping_entry(mapping, index, false);
}
-/*
- * The user has performed a load from a hole in the file. Allocating
- * a new page in the file would cause excessive storage usage for
- * workloads with sparse files. We allocate a page cache page instead.
- * We'll kick it out of the page cache if it's ever written to,
- * otherwise it will simply fall out of the page cache under memory
- * pressure without ever having been dirtied.
- */
-static int dax_load_hole(struct address_space *mapping, void **entry,
- struct vm_fault *vmf)
-{
- struct inode *inode = mapping->host;
- struct page *page;
- int ret;
-
- /* Hole page already exists? Return it... */
- if (!radix_tree_exceptional_entry(*entry)) {
- page = *entry;
- goto finish_fault;
- }
-
- /* This will replace locked radix tree entry with a hole page */
- page = find_or_create_page(mapping, vmf->pgoff,
- vmf->gfp_mask | __GFP_ZERO);
- if (!page) {
- ret = VM_FAULT_OOM;
- goto out;
- }
-
-finish_fault:
- vmf->page = page;
- ret = finish_fault(vmf);
- vmf->page = NULL;
- *entry = page;
- if (!ret) {
- /* Grab reference for PTE that is now referencing the page */
- get_page(page);
- ret = VM_FAULT_NOPAGE;
- }
-out:
- trace_dax_load_hole(inode, vmf, ret);
- return ret;
-}
-
static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
sector_t sector, size_t size, struct page *to,
unsigned long vaddr)
@@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
unsigned long flags)
{
struct radix_tree_root *page_tree = &mapping->page_tree;
- int error = 0;
- bool hole_fill = false;
void *new_entry;
pgoff_t index = vmf->pgoff;
if (vmf->flags & FAULT_FLAG_WRITE)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- /* Replacing hole page with block mapping? */
- if (!radix_tree_exceptional_entry(entry)) {
- hole_fill = true;
- /*
- * Unmap the page now before we remove it from page cache below.
- * The page is locked so it cannot be faulted in again.
- */
- unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
- PAGE_SIZE, 0);
- error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
- if (error)
- return ERR_PTR(error);
- } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) {
- /* replacing huge zero page with PMD block mapping */
- unmap_mapping_range(mapping,
- (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);
+ if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
+ /* we are replacing a zero page with block mapping */
+ if (dax_is_pmd_entry(entry))
+ unmap_mapping_range(mapping,
+ (vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
+ PMD_SIZE, 0);
+ else /* pte entry */
+ unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+ PAGE_SIZE, 0);
}
spin_lock_irq(&mapping->tree_lock);
new_entry = dax_radix_locked_entry(sector, flags);
- if (hole_fill) {
- __delete_from_page_cache(entry, NULL);
- /* Drop pagecache reference */
- put_page(entry);
- error = __radix_tree_insert(page_tree, index,
- dax_radix_order(new_entry), new_entry);
- if (error) {
- new_entry = ERR_PTR(error);
- goto unlock;
- }
- mapping->nrexceptional++;
- } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
/*
* Only swap our new entry into the radix tree if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
WARN_ON_ONCE(ret != entry);
__radix_tree_replace(page_tree, node, slot,
new_entry, NULL, NULL);
+ entry = new_entry;
}
+
if (vmf->flags & FAULT_FLAG_WRITE)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
- unlock:
+
spin_unlock_irq(&mapping->tree_lock);
- if (hole_fill) {
- radix_tree_preload_end();
- /*
- * We don't need hole page anymore, it has been replaced with
- * locked radix tree entry now.
- */
- if (mapping->a_ops->freepage)
- mapping->a_ops->freepage(entry);
- unlock_page(entry);
- put_page(entry);
- }
- return new_entry;
+ return entry;
}
static inline unsigned long
@@ -727,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev,
spin_lock_irq(&mapping->tree_lock);
entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
/* Entry got punched out / reallocated? */
- if (!entry2 || !radix_tree_exceptional_entry(entry2))
+ if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
goto put_unlocked;
/*
* Entry got reallocated elsewhere? No need to writeback. We have to
@@ -799,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev,
trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
dax_unlock:
dax_read_unlock(id);
- put_locked_mapping_entry(mapping, index, entry);
+ put_locked_mapping_entry(mapping, index);
return ret;
put_unlocked:
@@ -874,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_insert_mapping(struct address_space *mapping,
struct block_device *bdev, struct dax_device *dax_dev,
- sector_t sector, size_t size, void **entryp,
+ sector_t sector, size_t size, void *entry,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
unsigned long vaddr = vmf->address;
- void *entry = *entryp;
void *ret, *kaddr;
pgoff_t pgoff;
int id, rc;
@@ -899,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping,
ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
- *entryp = ret;
trace_dax_insert_mapping(mapping->host, vmf, ret);
- return vm_insert_mixed(vma, vaddr, pfn);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+ else
+ return vm_insert_mixed(vma, vaddr, pfn);
}
-/**
- * dax_pfn_mkwrite - handle first write to DAX page
- * @vmf: The description of the fault
+/*
+ * The user has performed a load from a hole in the file. Allocating a new
+ * page in the file would cause excessive storage usage for workloads with
+ * sparse files. Instead we insert a read-only mapping of the 4k zero page.
+ * If this page is ever written to we will re-fault and change the mapping to
+ * point to real DAX storage instead.
*/
-int dax_pfn_mkwrite(struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+ struct vm_fault *vmf)
{
- struct file *file = vmf->vma->vm_file;
- struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- void *entry, **slot;
- pgoff_t index = vmf->pgoff;
+ unsigned long vaddr = vmf->address;
+ int ret = VM_FAULT_NOPAGE;
+ struct page *zero_page;
+ void *entry2;
- spin_lock_irq(&mapping->tree_lock);
- entry = get_unlocked_mapping_entry(mapping, index, &slot);
- if (!entry || !radix_tree_exceptional_entry(entry)) {
- if (entry)
- put_unlocked_mapping_entry(mapping, index, entry);
- spin_unlock_irq(&mapping->tree_lock);
- trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE);
- return VM_FAULT_NOPAGE;
+ zero_page = ZERO_PAGE(0);
+ if (unlikely(!zero_page)) {
+ ret = VM_FAULT_OOM;
+ goto out;
}
- radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
- entry = lock_slot(mapping, slot);
- spin_unlock_irq(&mapping->tree_lock);
- /*
- * If we race with somebody updating the PTE and finish_mkwrite_fault()
- * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
- * the fault in either case.
- */
- finish_mkwrite_fault(vmf);
- put_locked_mapping_entry(mapping, index, entry);
- trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE);
- return VM_FAULT_NOPAGE;
+
+ entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+ RADIX_DAX_ZERO_PAGE);
+ if (IS_ERR(entry2)) {
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
+out:
+ trace_dax_load_hole(inode, vmf, ret);
+ return ret;
}
-EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
static bool dax_range_is_aligned(struct block_device *bdev,
unsigned int offset, unsigned int length)
@@ -1059,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (map_len > end - pos)
map_len = end - pos;
+ /*
+ * The userspace address for the memory copy has already been
+ * validated via access_ok() in either vfs_read() or
+ * vfs_write(), depending on which operation we are doing.
+ */
if (iov_iter_rw(iter) == WRITE)
map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
@@ -1223,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
major = VM_FAULT_MAJOR;
}
error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
- sector, PAGE_SIZE, &entry, vmf->vma, vmf);
+ sector, PAGE_SIZE, entry, vmf->vma, vmf);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
@@ -1231,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
- vmf_ret = dax_load_hole(mapping, &entry, vmf);
+ vmf_ret = dax_load_hole(mapping, entry, vmf);
goto finish_iomap;
}
/*FALLTHRU*/
@@ -1258,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ put_locked_mapping_entry(mapping, vmf->pgoff);
out:
trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
#ifdef CONFIG_FS_DAX_PMD
-/*
- * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
- * more often than one might expect in the below functions.
- */
-#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
-
static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
- loff_t pos, void **entryp)
+ loff_t pos, void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
const sector_t sector = dax_iomap_sector(iomap, pos);
@@ -1283,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
void *ret = NULL, *kaddr;
long length = 0;
pgoff_t pgoff;
- pfn_t pfn;
+ pfn_t pfn = {};
int id;
if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
@@ -1303,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
goto unlock_fallback;
dax_read_unlock(id);
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
goto fallback;
- *entryp = ret;
trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
@@ -1321,7 +1267,7 @@ fallback:
}
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
- void **entryp)
+ void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK;
@@ -1336,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
if (unlikely(!zero_page))
goto fallback;
- ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
- RADIX_DAX_PMD | RADIX_DAX_HZP);
+ ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+ RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE);
if (IS_ERR(ret))
goto fallback;
- *entryp = ret;
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
@@ -1416,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
goto fallback;
/*
- * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
- * PMD or a HZP entry. If it can't (because a 4k page is already in
- * the tree, for instance), it will return -EEXIST and we just fall
- * back to 4k entries.
+ * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
+ * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page
+ * is already in the tree, for instance), it will return -EEXIST and
+ * we just fall back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
@@ -1452,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
switch (iomap.type) {
case IOMAP_MAPPED:
- result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
+ result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
break;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
break;
- result = dax_pmd_load_hole(vmf, &iomap, &entry);
+ result = dax_pmd_load_hole(vmf, &iomap, entry);
break;
default:
WARN_ON_ONCE(1);
@@ -1481,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
&iomap);
}
unlock_entry:
- put_locked_mapping_entry(mapping, pgoff, entry);
+ put_locked_mapping_entry(mapping, pgoff);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index d34d32bdc944..ff3a3636a5ca 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf)
return ret;
}
-static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct ext2_inode_info *ei = EXT2_I(inode);
- loff_t size;
- int ret;
-
- sb_start_pagefault(inode->i_sb);
- file_update_time(vmf->vma->vm_file);
- down_read(&ei->dax_sem);
-
- /* check that the faulting page hasn't raced with truncate */
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size)
- ret = VM_FAULT_SIGBUS;
- else
- ret = dax_pfn_mkwrite(vmf);
-
- up_read(&ei->dax_sem);
- sb_end_pagefault(inode->i_sb);
- return ret;
-}
-
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
/*
@@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = {
* will always fail and fail back to regular faults.
*/
.page_mkwrite = ext2_dax_fault,
- .pfn_mkwrite = ext2_dax_pfn_mkwrite,
+ .pfn_mkwrite = ext2_dax_fault,
};
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 197653ea6041..57dcaea762c3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -324,41 +324,11 @@ static int ext4_dax_fault(struct vm_fault *vmf)
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}
-/*
- * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
- * handler we check for races agaist truncate. Note that since we cycle through
- * i_mmap_sem, we are sure that also any hole punching that began before we
- * were called is finished by now and so if it included part of the file we
- * are working on, our pte will get unmapped and the check for pte_same() in
- * wp_pfn_shared() fails. Thus fault gets retried and things work out as
- * desired.
- */
-static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf)
-{
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct super_block *sb = inode->i_sb;
- loff_t size;
- int ret;
-
- sb_start_pagefault(sb);
- file_update_time(vmf->vma->vm_file);
- down_read(&EXT4_I(inode)->i_mmap_sem);
- size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (vmf->pgoff >= size)
- ret = VM_FAULT_SIGBUS;
- else
- ret = dax_pfn_mkwrite(vmf);
- up_read(&EXT4_I(inode)->i_mmap_sem);
- sb_end_pagefault(sb);
-
- return ret;
-}
-
static const struct vm_operations_struct ext4_dax_vm_ops = {
.fault = ext4_dax_fault,
.huge_fault = ext4_dax_huge_fault,
.page_mkwrite = ext4_dax_fault,
- .pfn_mkwrite = ext4_dax_pfn_mkwrite,
+ .pfn_mkwrite = ext4_dax_fault,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
@@ -507,12 +477,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
pagevec_init(&pvec, 0);
do {
- int i, num;
+ int i;
unsigned long nr_pages;
- num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1;
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
- (pgoff_t)num);
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
+ &index, end);
if (nr_pages == 0)
break;
@@ -531,9 +500,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
goto out;
}
- if (page->index > end)
- goto out;
-
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping)) {
@@ -576,14 +542,10 @@ next:
unlock_page(page);
}
- /* The no. of pages is less than our desired, we are done. */
- if (nr_pages < num)
- break;
-
- index = pvec.pages[i - 1]->index + 1;
pagevec_release(&pvec);
} while (index <= end);
+ /* There are no pages upto endoff - that would be a hole in there. */
if (whence == SEEK_HOLE && lastoff < endoff) {
found = 1;
*offset = lastoff;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 714396760616..e963508ea35f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1720,13 +1720,12 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
pagevec_init(&pvec, 0);
while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (page->index > end)
- break;
+
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
if (invalidate) {
@@ -1737,7 +1736,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
}
unlock_page(page);
}
- index = pvec.pages[nr_pages - 1]->index + 1;
pagevec_release(&pvec);
}
}
@@ -2348,17 +2346,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
pagevec_init(&pvec, 0);
while (start <= end) {
- nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
- PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping,
+ &start, end);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- if (page->index > end)
- break;
- /* Up to 'end' pages must be contiguous */
- BUG_ON(page->index != start);
bh = head = page_buffers(page);
do {
if (lblk < mpd->map.m_lblk)
@@ -2403,7 +2397,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
pagevec_release(&pvec);
return err;
}
- start++;
}
pagevec_release(&pvec);
}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c8c4f79c7ce1..0ad3fd3ad0b4 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
pagevec_init(&pvec, 0);
next = 0;
do {
- if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
+ if (!pagevec_lookup(&pvec, mapping, &next))
break;
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
- next = page->index;
if (PageFsCache(page)) {
__fscache_wait_on_page_write(cookie, page);
__fscache_uncache_page(cookie, page);
@@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie,
}
pagevec_release(&pvec);
cond_resched();
- } while (++next);
+ } while (next);
_leave("");
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 28d2753be094..7c02b3f738e1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -401,9 +401,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
const pgoff_t end = lend >> huge_page_shift(h);
struct vm_area_struct pseudo_vma;
struct pagevec pvec;
- pgoff_t next;
+ pgoff_t next, index;
int i, freed = 0;
- long lookup_nr = PAGEVEC_SIZE;
bool truncate_op = (lend == LLONG_MAX);
memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
@@ -412,33 +411,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
next = start;
while (next < end) {
/*
- * Don't grab more pages than the number left in the range.
- */
- if (end - next < lookup_nr)
- lookup_nr = end - next;
-
- /*
* When no more pages are found, we are done.
*/
- if (!pagevec_lookup(&pvec, mapping, next, lookup_nr))
+ if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
break;
for (i = 0; i < pagevec_count(&pvec); ++i) {
struct page *page = pvec.pages[i];
u32 hash;
- /*
- * The page (index) could be beyond end. This is
- * only possible in the punch hole case as end is
- * max page offset in the truncate case.
- */
- next = page->index;
- if (next >= end)
- break;
-
+ index = page->index;
hash = hugetlb_fault_mutex_hash(h, current->mm,
&pseudo_vma,
- mapping, next, 0);
+ mapping, index, 0);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
@@ -455,8 +440,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
i_mmap_lock_write(mapping);
hugetlb_vmdelete_list(&mapping->i_mmap,
- next * pages_per_huge_page(h),
- (next + 1) * pages_per_huge_page(h));
+ index * pages_per_huge_page(h),
+ (index + 1) * pages_per_huge_page(h));
i_mmap_unlock_write(mapping);
}
@@ -475,14 +460,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
freed++;
if (!truncate_op) {
if (unlikely(hugetlb_unreserve_pages(inode,
- next, next + 1, 1)))
+ index, index + 1, 1)))
hugetlb_fix_reserve_counts(inode);
}
unlock_page(page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
- ++next;
huge_pagevec_release(&pvec);
cond_resched();
}
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 777b055063f6..3025fe8584a0 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -252,45 +252,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
}
/*
- * Indication from FS-Cache that the cookie is no longer cached
- * - This function is called when the backing store currently caching a cookie
- * is removed
- * - The netfs should use this to clean up any markers indicating cached pages
- * - This is mandatory for any object that may have data
- */
-static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data)
-{
- struct nfs_inode *nfsi = cookie_netfs_data;
- struct pagevec pvec;
- pgoff_t first;
- int loop, nr_pages;
-
- pagevec_init(&pvec, 0);
- first = 0;
-
- dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi);
-
- for (;;) {
- /* grab a bunch of pages to unmark */
- nr_pages = pagevec_lookup(&pvec,
- nfsi->vfs_inode.i_mapping,
- first,
- PAGEVEC_SIZE - pagevec_count(&pvec));
- if (!nr_pages)
- break;
-
- for (loop = 0; loop < nr_pages; loop++)
- ClearPageFsCache(pvec.pages[loop]);
-
- first = pvec.pages[nr_pages - 1]->index + 1;
-
- pvec.nr = nr_pages;
- pagevec_release(&pvec);
- cond_resched();
- }
-}
-
-/*
* Get an extra reference on a read context.
* - This function can be absent if the completion function doesn't require a
* context.
@@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = {
.get_attr = nfs_fscache_inode_get_attr,
.get_aux = nfs_fscache_inode_get_aux,
.check_aux = nfs_fscache_inode_check_aux,
- .now_uncached = nfs_fscache_inode_now_uncached,
.get_context = nfs_fh_get_context,
.put_context = nfs_fh_put_context,
};
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index f11a3ad2df0c..8616c46d33da 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap,
pagevec_init(&pvec, 0);
repeat:
- n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
+ n = pagevec_lookup(&pvec, smap, &index);
if (!n)
return;
- index = pvec.pages[n - 1]->index + 1;
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i], *dpage;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e50a387959bf..40b5cc97f7b0 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -221,7 +221,7 @@ out:
/*
* Set the access or default ACL of an inode.
*/
-int ocfs2_set_acl(handle_t *handle,
+static int ocfs2_set_acl(handle_t *handle,
struct inode *inode,
struct buffer_head *di_bh,
int type,
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 2783a75b3999..7be0bb756286 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -28,13 +28,6 @@ struct ocfs2_acl_entry {
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-int ocfs2_set_acl(handle_t *handle,
- struct inode *inode,
- struct buffer_head *di_bh,
- int type,
- struct posix_acl *acl,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_alloc_context *data_ac);
extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *);
extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fb15a96df0b6..a177eae3aa1a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
/*
* How many free extents have we got before we need more meta data?
*/
-int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct ocfs2_extent_tree *et)
+int ocfs2_num_free_extents(struct ocfs2_extent_tree *et)
{
int retval;
struct ocfs2_extent_list *el = NULL;
@@ -1933,14 +1932,12 @@ out:
* the new changes.
*
* left_rec: the record on the left.
- * left_child_el: is the child list pointed to by left_rec
* right_rec: the record to the right of left_rec
* right_child_el: is the child list pointed to by right_rec
*
* By definition, this only works on interior nodes.
*/
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
- struct ocfs2_extent_list *left_child_el,
struct ocfs2_extent_rec *right_rec,
struct ocfs2_extent_list *right_child_el)
{
@@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
*/
BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
- ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
+ ocfs2_adjust_adjacent_records(&root_el->l_recs[i],
&root_el->l_recs[i + 1], right_el);
}
@@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
el = right_path->p_node[i].el;
right_rec = &el->l_recs[0];
- ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
- right_el);
+ ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el);
ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
@@ -2509,7 +2505,7 @@ out_ret_path:
static int ocfs2_update_edge_lengths(handle_t *handle,
struct ocfs2_extent_tree *et,
- int subtree_index, struct ocfs2_path *path)
+ struct ocfs2_path *path)
{
int i, idx, ret;
struct ocfs2_extent_rec *rec;
@@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle,
if (del_right_subtree) {
ocfs2_unlink_subtree(handle, et, left_path, right_path,
subtree_index, dealloc);
- ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
- left_path);
+ ret = ocfs2_update_edge_lengths(handle, et, left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle,
ocfs2_unlink_subtree(handle, et, left_path, path,
subtree_index, dealloc);
- ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
- left_path);
+ ret = ocfs2_update_edge_lengths(handle, et, left_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
if (mark_unwritten)
flags = OCFS2_EXT_UNWRITTEN;
- free_extents = ocfs2_num_free_extents(osb, et);
+ free_extents = ocfs2_num_free_extents(et);
if (free_extents < 0) {
status = free_extents;
mlog_errno(status);
@@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode,
*ac = NULL;
- num_free_extents = ocfs2_num_free_extents(osb, et);
+ num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 4a5152ec88a3..27b75cf32cfa 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
struct ocfs2_cached_dealloc_ctxt *dealloc,
u64 refcount_loc, bool refcount_tree_locked);
-int ocfs2_num_free_extents(struct ocfs2_super *osb,
- struct ocfs2_extent_tree *et);
+int ocfs2_num_free_extents(struct ocfs2_extent_tree *et);
/*
* how many new metadata chunks would an allocation need at maximum?
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ffe003982d95..56ac07cd35f6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
}
}
-static void o2hb_wait_on_io(struct o2hb_region *reg,
- struct o2hb_bio_wait_ctxt *wc)
+static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc)
{
o2hb_bio_wait_dec(wc, 1);
wait_for_completion(&wc->wc_io_complete);
@@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg,
status = 0;
bail_and_wait:
- o2hb_wait_on_io(reg, &wc);
+ o2hb_wait_on_io(&wc);
if (wc.wc_error && !status)
status = wc.wc_error;
@@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
* before we can go to steady state. This ensures that
* people we find in our steady state have seen us.
*/
- o2hb_wait_on_io(reg, &write_wc);
+ o2hb_wait_on_io(&write_wc);
if (write_wc.wc_error) {
/* Do not re-arm the write timeout on I/O error - we
* can't be sure that the new block ever made it to
@@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data)
o2hb_prepare_block(reg, 0);
ret = o2hb_issue_node_write(reg, &write_wc);
if (ret == 0)
- o2hb_wait_on_io(reg, &write_wc);
+ o2hb_wait_on_io(&write_wc);
else
mlog_errno(ret);
}
@@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid,
}
EXPORT_SYMBOL_GPL(o2hb_unregister_callback);
-int o2hb_check_node_heartbeating(u8 node_num)
-{
- unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
-
- o2hb_fill_node_map(testing_map, sizeof(testing_map));
- if (!test_bit(node_num, testing_map)) {
- mlog(ML_HEARTBEAT,
- "node (%u) does not have heartbeating enabled.\n",
- node_num);
- return 0;
- }
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
-
int o2hb_check_node_heartbeating_no_sem(u8 node_num)
{
unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num)
}
EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback);
-/* Makes sure our local node is configured with a node number, and is
- * heartbeating. */
-int o2hb_check_local_node_heartbeating(void)
-{
- u8 node_num;
-
- /* if this node was set then we have networking */
- node_num = o2nm_this_node();
- if (node_num == O2NM_MAX_NODES) {
- mlog(ML_HEARTBEAT, "this node has not been configured.\n");
- return 0;
- }
-
- return o2hb_check_node_heartbeating(node_num);
-}
-EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating);
-
/*
* this is just a hack until we get the plumbing which flips file systems
* read only and drops the hb ref instead of killing the node dead.
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 3ecb9f337b7d..febe6312ceff 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
spin_unlock(&OCFS2_I(dir)->ip_lock);
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir),
parent_fe_bh);
- num_free_extents = ocfs2_num_free_extents(osb, &et);
+ num_free_extents = ocfs2_num_free_extents(&et);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 66e59d3163ea..6e41fc8fabbe 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -713,13 +713,6 @@ leave:
return status;
}
-int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
- u32 clusters_to_add, int mark_unwritten)
-{
- return __ocfs2_extend_allocation(inode, logical_start,
- clusters_to_add, mark_unwritten);
-}
-
/*
* While a write will already be ordering the data, a truncate will not.
* Thus, we need to explicitly order the zeroed pages.
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index d5e5fa7f0743..36304434eacf 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
ocfs2_schedule_truncate_log_flush(osb, 0);
osb->local_alloc_copy = NULL;
- osb->dirty = 0;
/* queue to recover orphan slots for all offline slots */
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index e52a2852d50d..7eb3b0a6347e 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- num_free_extents = ocfs2_num_free_extents(osb, et);
+ num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 0c39d71c67a1..9a50f222ac97 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -320,7 +320,6 @@ struct ocfs2_super
u64 system_dir_blkno;
u64 bitmap_blkno;
u32 bitmap_cpg;
- u8 *uuid;
char *uuid_str;
u32 uuid_hash;
u8 *vol_label;
@@ -388,9 +387,8 @@ struct ocfs2_super
unsigned int osb_resv_level;
unsigned int osb_dir_resv_level;
- /* Next three fields are for local node slot recovery during
+ /* Next two fields are for local node slot recovery during
* mount. */
- int dirty;
struct ocfs2_dinode *local_alloc_copy;
struct ocfs2_quota_recovery *quota_rec;
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index f8933cb53d68..ab156e35ec00 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2851,7 +2851,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
int *credits)
{
int ret = 0, meta_add = 0;
- int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
+ int num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6ad3533940ba..71f22c8fbffd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode,
BUG_ON(clusters_to_add != 0 && data_ac == NULL);
- num_free_extents = ocfs2_num_free_extents(osb, et);
+ num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83005f486451..3f936be379a9 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
if (dirty) {
/* Recovery will be completed after we've mounted the
* rest of the volume. */
- osb->dirty = 1;
osb->local_alloc_copy = local_alloc;
local_alloc = NULL;
}
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index f70c3778d600..5fdf269ba82e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
*credits += 1;
/* count in the xattr tree change. */
- num_free_extents = ocfs2_num_free_extents(osb, xt_et);
+ num_free_extents = ocfs2_num_free_extents(xt_et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 98fd8f6df851..e5d89a0d0b8a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2931,6 +2931,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
+ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
@@ -3324,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_tid_smaps_operations),
+ REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2b89071630..2cbfcd32e884 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *);
/*
* task_[no]mmu.c
*/
+struct mem_size_stats;
struct proc_maps_private {
struct inode *inode;
struct task_struct *task;
struct mm_struct *mm;
+ struct mem_size_stats *rollup;
#ifdef CONFIG_MMU
struct vm_area_struct *tail_vma;
#endif
@@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations;
extern const struct file_operations proc_pid_numa_maps_operations;
extern const struct file_operations proc_tid_numa_maps_operations;
extern const struct file_operations proc_pid_smaps_operations;
+extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_tid_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 509a61668d90..cdd979724c74 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]);
show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]);
show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]);
- show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK));
+ show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK));
#ifdef CONFIG_HIGHMEM
show_val_kb(m, "HighTotal: ", i.totalhigh);
@@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "SUnreclaim: ",
global_node_page_state(NR_SLAB_UNRECLAIMABLE));
seq_printf(m, "KernelStack: %8lu kB\n",
- global_page_state(NR_KERNEL_STACK_KB));
+ global_zone_page_state(NR_KERNEL_STACK_KB));
show_val_kb(m, "PageTables: ",
- global_page_state(NR_PAGETABLE));
+ global_zone_page_state(NR_PAGETABLE));
#ifdef CONFIG_QUICKLIST
show_val_kb(m, "Quicklists: ", quicklist_total_size());
#endif
@@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "NFS_Unstable: ",
global_node_page_state(NR_UNSTABLE_NFS));
show_val_kb(m, "Bounce: ",
- global_page_state(NR_BOUNCE));
+ global_zone_page_state(NR_BOUNCE));
show_val_kb(m, "WritebackTmp: ",
global_node_page_state(NR_WRITEBACK_TEMP));
show_val_kb(m, "CommitLimit: ", vm_commit_limit());
@@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#ifdef CONFIG_CMA
show_val_kb(m, "CmaTotal: ", totalcma_pages);
show_val_kb(m, "CmaFree: ",
- global_page_state(NR_FREE_CMA_PAGES));
+ global_zone_page_state(NR_FREE_CMA_PAGES));
#endif
hugetlb_report_meminfo(m);
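
The meminfo changes track the split of the old global_page_state() helper: per-zone counters (enum zone_stat_item) are now read with global_zone_page_state(), while per-node counters keep using global_node_page_state(). A hedged sketch of the distinction, assuming the 4.14-era counter names and the show_val_kb() helper already used in this file:

	/* Zone-level counters, e.g. NR_MLOCK, NR_PAGETABLE, NR_BOUNCE. */
	show_val_kb(m, "Mlocked:        ", global_zone_page_state(NR_MLOCK));
	/* Node-level counters, e.g. NR_FILE_DIRTY, NR_WRITEBACK. */
	show_val_kb(m, "Dirty:          ", global_node_page_state(NR_FILE_DIRTY));
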
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fe8f3265e877..a290966f91ec 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -253,6 +253,7 @@ static int proc_map_release(struct inode *inode, struct file *file)
if (priv->mm)
mmdrop(priv->mm);
+ kfree(priv->rollup);
return seq_release_private(inode, file);
}
@@ -279,6 +280,23 @@ static int is_stack(struct proc_maps_private *priv,
vma->vm_end >= vma->vm_mm->start_stack;
}
+static void show_vma_header_prefix(struct seq_file *m,
+ unsigned long start, unsigned long end,
+ vm_flags_t flags, unsigned long long pgoff,
+ dev_t dev, unsigned long ino)
+{
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+ seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
+ start,
+ end,
+ flags & VM_READ ? 'r' : '-',
+ flags & VM_WRITE ? 'w' : '-',
+ flags & VM_EXEC ? 'x' : '-',
+ flags & VM_MAYSHARE ? 's' : 'p',
+ pgoff,
+ MAJOR(dev), MINOR(dev), ino);
+}
+
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
@@ -301,17 +319,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
start = vma->vm_start;
end = vma->vm_end;
-
- seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
- seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
- start,
- end,
- flags & VM_READ ? 'r' : '-',
- flags & VM_WRITE ? 'w' : '-',
- flags & VM_EXEC ? 'x' : '-',
- flags & VM_MAYSHARE ? 's' : 'p',
- pgoff,
- MAJOR(dev), MINOR(dev), ino);
+ show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
/*
* Print the dentry name for named mappings, and a
@@ -430,6 +438,7 @@ const struct file_operations proc_tid_maps_operations = {
#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
+ bool first;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
@@ -443,7 +452,9 @@ struct mem_size_stats {
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
+ unsigned long first_vma_start;
u64 pss;
+ u64 pss_locked;
u64 swap_pss;
bool check_shmem_swap;
};
@@ -652,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
[ilog2(VM_ARCH_1)] = "ar",
+ [ilog2(VM_WIPEONFORK)] = "wf",
[ilog2(VM_DONTDUMP)] = "dd",
#ifdef CONFIG_MEM_SOFT_DIRTY
[ilog2(VM_SOFTDIRTY)] = "sd",
@@ -719,18 +731,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
+ struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
- struct mem_size_stats mss;
+ struct mem_size_stats mss_stack;
+ struct mem_size_stats *mss;
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
#ifdef CONFIG_HUGETLB_PAGE
.hugetlb_entry = smaps_hugetlb_range,
#endif
.mm = vma->vm_mm,
- .private = &mss,
};
+ int ret = 0;
+ bool rollup_mode;
+ bool last_vma;
+
+ if (priv->rollup) {
+ rollup_mode = true;
+ mss = priv->rollup;
+ if (mss->first) {
+ mss->first_vma_start = vma->vm_start;
+ mss->first = false;
+ }
+ last_vma = !m_next_vma(priv, vma);
+ } else {
+ rollup_mode = false;
+ memset(&mss_stack, 0, sizeof(mss_stack));
+ mss = &mss_stack;
+ }
- memset(&mss, 0, sizeof mss);
+ smaps_walk.private = mss;
#ifdef CONFIG_SHMEM
if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
@@ -748,9 +778,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
!(vma->vm_flags & VM_WRITE)) {
- mss.swap = shmem_swapped;
+ mss->swap = shmem_swapped;
} else {
- mss.check_shmem_swap = true;
+ mss->check_shmem_swap = true;
smaps_walk.pte_hole = smaps_pte_hole;
}
}
@@ -758,54 +788,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
/* mmap_sem is held in m_start */
walk_page_vma(vma, &smaps_walk);
+ if (vma->vm_flags & VM_LOCKED)
+ mss->pss_locked += mss->pss;
+
+ if (!rollup_mode) {
+ show_map_vma(m, vma, is_pid);
+ } else if (last_vma) {
+ show_vma_header_prefix(
+ m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
+ seq_pad(m, ' ');
+ seq_puts(m, "[rollup]\n");
+ } else {
+ ret = SEQ_SKIP;
+ }
- show_map_vma(m, vma, is_pid);
-
- seq_printf(m,
- "Size: %8lu kB\n"
- "Rss: %8lu kB\n"
- "Pss: %8lu kB\n"
- "Shared_Clean: %8lu kB\n"
- "Shared_Dirty: %8lu kB\n"
- "Private_Clean: %8lu kB\n"
- "Private_Dirty: %8lu kB\n"
- "Referenced: %8lu kB\n"
- "Anonymous: %8lu kB\n"
- "LazyFree: %8lu kB\n"
- "AnonHugePages: %8lu kB\n"
- "ShmemPmdMapped: %8lu kB\n"
- "Shared_Hugetlb: %8lu kB\n"
- "Private_Hugetlb: %7lu kB\n"
- "Swap: %8lu kB\n"
- "SwapPss: %8lu kB\n"
- "KernelPageSize: %8lu kB\n"
- "MMUPageSize: %8lu kB\n"
- "Locked: %8lu kB\n",
- (vma->vm_end - vma->vm_start) >> 10,
- mss.resident >> 10,
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
- mss.shared_clean >> 10,
- mss.shared_dirty >> 10,
- mss.private_clean >> 10,
- mss.private_dirty >> 10,
- mss.referenced >> 10,
- mss.anonymous >> 10,
- mss.lazyfree >> 10,
- mss.anonymous_thp >> 10,
- mss.shmem_thp >> 10,
- mss.shared_hugetlb >> 10,
- mss.private_hugetlb >> 10,
- mss.swap >> 10,
- (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
- vma_kernel_pagesize(vma) >> 10,
- vma_mmu_pagesize(vma) >> 10,
- (vma->vm_flags & VM_LOCKED) ?
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
-
- arch_show_smap(m, vma);
- show_smap_vma_flags(m, vma);
+ if (!rollup_mode)
+ seq_printf(m,
+ "Size: %8lu kB\n"
+ "KernelPageSize: %8lu kB\n"
+ "MMUPageSize: %8lu kB\n",
+ (vma->vm_end - vma->vm_start) >> 10,
+ vma_kernel_pagesize(vma) >> 10,
+ vma_mmu_pagesize(vma) >> 10);
+
+
+ if (!rollup_mode || last_vma)
+ seq_printf(m,
+ "Rss: %8lu kB\n"
+ "Pss: %8lu kB\n"
+ "Shared_Clean: %8lu kB\n"
+ "Shared_Dirty: %8lu kB\n"
+ "Private_Clean: %8lu kB\n"
+ "Private_Dirty: %8lu kB\n"
+ "Referenced: %8lu kB\n"
+ "Anonymous: %8lu kB\n"
+ "LazyFree: %8lu kB\n"
+ "AnonHugePages: %8lu kB\n"
+ "ShmemPmdMapped: %8lu kB\n"
+ "Shared_Hugetlb: %8lu kB\n"
+ "Private_Hugetlb: %7lu kB\n"
+ "Swap: %8lu kB\n"
+ "SwapPss: %8lu kB\n"
+ "Locked: %8lu kB\n",
+ mss->resident >> 10,
+ (unsigned long)(mss->pss >> (10 + PSS_SHIFT)),
+ mss->shared_clean >> 10,
+ mss->shared_dirty >> 10,
+ mss->private_clean >> 10,
+ mss->private_dirty >> 10,
+ mss->referenced >> 10,
+ mss->anonymous >> 10,
+ mss->lazyfree >> 10,
+ mss->anonymous_thp >> 10,
+ mss->shmem_thp >> 10,
+ mss->shared_hugetlb >> 10,
+ mss->private_hugetlb >> 10,
+ mss->swap >> 10,
+ (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)),
+ (unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
+
+ if (!rollup_mode) {
+ arch_show_smap(m, vma);
+ show_smap_vma_flags(m, vma);
+ }
m_cache_vma(m, vma);
- return 0;
+ return ret;
}
static int show_pid_smap(struct seq_file *m, void *v)
@@ -837,6 +884,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file)
return do_maps_open(inode, file, &proc_pid_smaps_op);
}
+static int pid_smaps_rollup_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ struct proc_maps_private *priv;
+ int ret = do_maps_open(inode, file, &proc_pid_smaps_op);
+
+ if (ret < 0)
+ return ret;
+ seq = file->private_data;
+ priv = seq->private;
+ priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL);
+ if (!priv->rollup) {
+ proc_map_release(inode, file);
+ return -ENOMEM;
+ }
+ priv->rollup->first = true;
+ return 0;
+}
+
static int tid_smaps_open(struct inode *inode, struct file *file)
{
return do_maps_open(inode, file, &proc_tid_smaps_op);
@@ -849,6 +915,13 @@ const struct file_operations proc_pid_smaps_operations = {
.release = proc_map_release,
};
+const struct file_operations proc_pid_smaps_rollup_operations = {
+ .open = pid_smaps_rollup_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = proc_map_release,
+};
+
const struct file_operations proc_tid_smaps_operations = {
.open = tid_smaps_open,
.read = seq_read,
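
After this change /proc/<pid>/smaps_rollup presents a single pseudo-VMA labelled [rollup] whose fields are the sums of the corresponding per-VMA smaps values (Rss, Pss, Swap, ...). A minimal userspace reader, runnable as-is on a kernel built with CONFIG_PROC_PAGE_MONITOR:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/smaps_rollup", "r");

	if (!f) {
		perror("/proc/self/smaps_rollup");
		return 1;
	}
	/* One header line ending in "[rollup]", then the summed fields. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
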
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 2ef7ce75c062..3ac1f2387083 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
if (!pages)
goto out_free;
- nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages);
+ nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages);
if (nr != lpages)
goto out_free_pages; /* leave if some pages were missing */
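
The ramfs hunk reflects the new find_get_pages() calling convention: the start index is passed by reference and advanced past the last page returned, so batched lookups can resume from the updated index. A sketch of that loop, hedged in that it assumes the 4.14 semantics of the helper; each page reference is dropped with put_page():

	struct page *pages[16];
	pgoff_t index = 0;
	unsigned int i, nr;

	do {
		nr = find_get_pages(mapping, &index, ARRAY_SIZE(pages), pages);
		for (i = 0; i < nr; i++) {
			/* ... inspect pages[i] ... */
			put_page(pages[i]);
		}
		/* 'index' already points past the last page found. */
	} while (nr == ARRAY_SIZE(pages));
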
diff --git a/fs/sync.c b/fs/sync.c
index 27d6b8bbcb6a..2e3fd7d94d2d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -335,11 +335,6 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
goto out_put;
mapping = f.file->f_mapping;
- if (!mapping) {
- ret = -EINVAL;
- goto out_put;
- }
-
ret = 0;
if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
ret = file_fdatawait_range(f.file, offset, endbyte);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 886085b47c75..5419e7da82ba 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -178,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg)
static inline struct uffd_msg userfault_msg(unsigned long address,
unsigned int flags,
- unsigned long reason)
+ unsigned long reason,
+ unsigned int features)
{
struct uffd_msg msg;
msg_init(&msg);
@@ -202,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
* write protect fault.
*/
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+ if (features & UFFD_FEATURE_THREAD_ID)
+ msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
return msg;
}
@@ -370,6 +373,9 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));
+ if (ctx->features & UFFD_FEATURE_SIGBUS)
+ goto out;
+
/*
* If it's already released don't get it. This avoids to loop
* in __get_user_pages if userfaultfd_release waits on the
@@ -419,7 +425,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
+ uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+ ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
@@ -1194,7 +1201,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
struct uffdio_register __user *user_uffdio_register;
unsigned long vm_flags, new_flags;
bool found;
- bool non_anon_pages;
+ bool basic_ioctls;
unsigned long start, end, vma_end;
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1260,7 +1267,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* Search for not compatible vmas.
*/
found = false;
- non_anon_pages = false;
+ basic_ioctls = false;
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
cond_resched();
@@ -1299,8 +1306,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/*
* Note vmas containing huge pages
*/
- if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
- non_anon_pages = true;
+ if (is_vm_hugetlb_page(cur))
+ basic_ioctls = true;
found = true;
}
@@ -1371,7 +1378,7 @@ out_unlock:
* userland which ioctls methods are guaranteed to
* succeed on this range.
*/
- if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
+ if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
UFFD_API_RANGE_IOCTLS,
&user_uffdio_register->ioctls))
ret = -EFAULT;
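
The userfaultfd changes are driven by two new opt-in features: UFFD_FEATURE_SIGBUS (an unhandled fault in a registered range raises SIGBUS instead of blocking the faulting thread, per the early "goto out" in handle_userfault) and UFFD_FEATURE_THREAD_ID (the fault message carries the faulting thread's pid in msg.arg.pagefault.feat.ptid). A small userspace sketch of the feature handshake; error handling is minimal and the granted bits should be checked before relying on either feature:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	struct uffdio_api api = {
		.api      = UFFD_API,
		.features = UFFD_FEATURE_SIGBUS | UFFD_FEATURE_THREAD_ID,
	};
	int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (ufd < 0) {
		perror("userfaultfd");
		return 1;
	}
	if (ioctl(ufd, UFFDIO_API, &api) == -1) {
		perror("UFFDIO_API");	/* e.g. kernel without these features */
		return 1;
	}
	printf("features granted: %#llx\n", (unsigned long long)api.features);
	close(ufd);
	return 0;
}
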
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0debbc7e3f03..ec3e44fcf771 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1101,7 +1101,7 @@ xfs_filemap_pfn_mkwrite(
if (vmf->pgoff >= size)
ret = VM_FAULT_SIGBUS;
else if (IS_DAX(inode))
- ret = dax_pfn_mkwrite(vmf);
+ ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
sb_end_pagefault(inode->i_sb);
return ret;
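
The XFS hunk retires dax_pfn_mkwrite(): write-protect faults on DAX pfn mappings now go through the same dax_iomap_fault() path as ordinary faults. For an iomap-based filesystem the resulting handler shape is roughly the sketch below (the myfs_* names are placeholders, not from this patch):

static int myfs_filemap_pfn_mkwrite(struct vm_fault *vmf)
{
	/* Sketch: single DAX fault path, PTE-sized, driven by the fs iomap ops. */
	return dax_iomap_fault(vmf, PE_SIZE_PTE, &myfs_iomap_ops);
}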