binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot

In both binfmt_elf and binfmt_elf_fdpic, use a new helper dump_vma_snapshot() to take a snapshot of the VMA list (including the gate VMA, if we have one) while protected by the mmap_lock, and then use that snapshot instead of walking the VMA list without locking. An alternative approach would be to keep the mmap_lock held across the entire core dumping operation; however, keeping the mmap_lock locked while we may be blocked for an unbounded amount of time (e.g. because we're dumping to a FUSE filesystem or so) isn't really optimal; the mmap_lock blocks things like the ->release handler of userfaultfd, and we don't really want critical system daemons to grind to a halt just because someone "gifted" them SCM_RIGHTS to an eternally-locked userfaultfd, or something like that. Since both the normal ELF code and the FDPIC ELF code need this functionality (and if any other binfmt wants to add coredump support in the future, they'd probably need it, too), implement this with a common helper in fs/coredump.c. A downside of this approach is that we now need a bigger amount of kernel memory per userspace VMA in the normal ELF case, and that we need O(n) kernel memory in the FDPIC ELF case at all; but 40 bytes per VMA shouldn't be terribly bad. There currently is a data race between stack expansion and anything that reads ->vm_start or ->vm_end under the mmap_lock held in read mode; to mitigate that for core dumping, take the mmap_lock in write mode when taking a snapshot of the VMA hierarchy. (If we only took the mmap_lock in read mode, we could end up with a corrupted core dump if someone does get_user_pages_remote() concurrently. Not really a major problem, but taking the mmap_lock either way works here, so we might as well avoid the issue.) (This doesn't do anything about the existing data races with stack expansion in other mm code.) Signed-off-by: Jann Horn <jannh@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Cc: Christoph Hellwig <hch@lst.de> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: "Eric W . Biederman" <ebiederm@xmission.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Hugh Dickins <hughd@google.com> Link: http://lkml.kernel.org/r/20200827114932.3572699-6-jannh@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Jann Horn <jannh@google.com> 2020-10-16 05:12:54 +0200
committer: Linus Torvalds <torvalds@linux-foundation.org> 2020-10-16 20:11:21 +0200
commit: a07279c9a8cd7dbd321640ff7210591599ee00a4 (patch)
tree: c1f27d5713449e3ec7762dcaddae067a19f2c145 /fs/coredump.c
parent: coredump: rework elf/elf_fdpic vma_dump_size() into common helper (diff)
download: linux-a07279c9a8cd7dbd321640ff7210591599ee00a4.tar.xz
linux-a07279c9a8cd7dbd321640ff7210591599ee00a4.zip
1 files changed, 80 insertions, 1 deletions
diff --git a/fs/coredump.c b/fs/coredump.c
index 4ef4c49a65b7..0cd9056d79cc 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -971,7 +971,8 @@ static bool always_dump_vma(struct vm_area_struct *vma)
 /*
  * Decide how much of @vma's contents should be included in a core dump.
  */
-unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags)
+static unsigned long vma_dump_size(struct vm_area_struct *vma,
+				   unsigned long mm_flags)
 {
 #define FILTER(type)	(mm_flags & (1UL << MMF_DUMP_##type))
 
@@ -1037,3 +1038,81 @@ unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags)
 whole:
 	return vma->vm_end - vma->vm_start;
 }
+
+static struct vm_area_struct *first_vma(struct task_struct *tsk,
+					struct vm_area_struct *gate_vma)
+{
+	struct vm_area_struct *ret = tsk->mm->mmap;
+
+	if (ret)
+		return ret;
+	return gate_vma;
+}
+
+/*
+ * Helper function for iterating across a vma list.  It ensures that the caller
+ * will visit `gate_vma' prior to terminating the search.
+ */
+static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
+				       struct vm_area_struct *gate_vma)
+{
+	struct vm_area_struct *ret;
+
+	ret = this_vma->vm_next;
+	if (ret)
+		return ret;
+	if (this_vma == gate_vma)
+		return NULL;
+	return gate_vma;
+}
+
+/*
+ * Under the mmap_lock, take a snapshot of relevant information about the task's
+ * VMAs.
+ */
+int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
+		      struct core_vma_metadata **vma_meta,
+		      size_t *vma_data_size_ptr)
+{
+	struct vm_area_struct *vma, *gate_vma;
+	struct mm_struct *mm = current->mm;
+	int i;
+	size_t vma_data_size = 0;
+
+	/*
+	 * Once the stack expansion code is fixed to not change VMA bounds
+	 * under mmap_lock in read mode, this can be changed to take the
+	 * mmap_lock in read mode.
+	 */
+	if (mmap_write_lock_killable(mm))
+		return -EINTR;
+
+	gate_vma = get_gate_vma(mm);
+	*vma_count = mm->map_count + (gate_vma ? 1 : 0);
+
+	*vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
+	if (!*vma_meta) {
+		mmap_write_unlock(mm);
+		return -ENOMEM;
+	}
+
+	for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
+			vma = next_vma(vma, gate_vma), i++) {
+		struct core_vma_metadata *m = (*vma_meta) + i;
+
+		m->start = vma->vm_start;
+		m->end = vma->vm_end;
+		m->flags = vma->vm_flags;
+		m->dump_size = vma_dump_size(vma, cprm->mm_flags);
+
+		vma_data_size += m->dump_size;
+	}
+
+	mmap_write_unlock(mm);
+
+	if (WARN_ON(i != *vma_count))
+		return -EFAULT;
+
+	*vma_data_size_ptr = vma_data_size;
+	return 0;
+}
author	Jann Horn <jannh@google.com>	2020-10-16 05:12:54 +0200
committer	Linus Torvalds <torvalds@linux-foundation.org>	2020-10-16 20:11:21 +0200
commit	a07279c9a8cd7dbd321640ff7210591599ee00a4 (patch)
tree	c1f27d5713449e3ec7762dcaddae067a19f2c145 /fs/coredump.c
parent	coredump: rework elf/elf_fdpic vma_dump_size() into common helper (diff)
download	linux-a07279c9a8cd7dbd321640ff7210591599ee00a4.tar.xz linux-a07279c9a8cd7dbd321640ff7210591599ee00a4.zip