From 18d585f0f2d4c9dc7dfe6e69dcae4933d5a428c9 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.de>
Date: Thu, 12 Mar 2015 16:25:46 -0700
Subject: ocfs2: make append_dio an incompat feature

It turns out that making this feature ro_compat isn't quite enough to
prevent accidental corruption on mount from older kernels.  Ocfs2 (like
other file systems) will process orphaned inodes even when the user mounts
in 'ro' mode.  So for the case of a filesystem not knowing the append_dio
feature, mounting the filesystem could result in orphaned-for-dio files
being deleted, which we clearly don't want.

So instead, turn this into an incompat flag.

Btw, this is kind of my fault - initially I asked that we add a flag to
cover the feature and even suggested that we use an ro flag.  It wasn't
until I was looking through our commits for v4.0-rc1 that I realized we
actually want this to be incompat.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Cc: Joseph Qi <joseph.qi@huawei.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/ocfs2.h    |  2 +-
 fs/ocfs2/ocfs2_fs.h | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 8490c64d34fe..460c6c37e683 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -502,7 +502,7 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
 
 static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
 {
-	if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
 		return 1;
 	return 0;
 }
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 20e37a3ed26f..db64ce2d4667 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -102,11 +102,11 @@
 					 | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
 					 | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
 					 | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG	\
-					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
+					 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \
+					 | OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	(OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
 					 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
-					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
-					 | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
+					 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
 
 /*
  * Heartbeat-only devices are missing journals and other files.  The
@@ -178,6 +178,11 @@
  */
 #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO	0x4000
 
+/*
+ * Append Direct IO support
+ */
+#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO	0x8000
+
 /*
  * backup superblock flag is used to indicate that this volume
  * has backup superblocks.
@@ -200,10 +205,6 @@
 #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA	0x0002
 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA	0x0004
 
-/*
- * Append Direct IO support
- */
-#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO	0x0008
 
 /* The byte offset of the first backup block will be 1G.
  * The following will be 4G, 16G, 64G, 256G and 1T.
-- 
cgit v1.2.3


From 8792f7772f4f40ffc68bad5f28311205584b734d Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Date: Thu, 12 Mar 2015 16:25:49 -0700
Subject: drivers/rtc/rtc-s3c.c: add .needs_src_clk to s3c6410 RTC data

Commit df9e26d093d3 ("rtc: s3c: add support for RTC of Exynos3250 SoC")
added an "rtc_src" DT property to specify the clock used as a source to
the S3C real-time clock.

Not all SoCs needs this so commit eaf3a659086e ("drivers/rtc/rtc-s3c.c:
fix initialization failure without rtc source clock") changed to check
the struct s3c_rtc_data .needs_src_clk to conditionally grab the clock.

But that commit didn't update the data for each IP version so the RTC
broke on the boards that needs a source clock. This is the case of at
least Exynos5250 and Exynos5440 which uses the s3c6410 RTC IP block.

This commit fixes the S3C rtc on the Exynos5250 Snow and Exynos5420
Peach Pit and Pi Chromebooks.

Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Chanwoo Choi <cw00.choi@samsung.com>
Cc: Doug Anderson <dianders@chromium.org>
Cc: Olof Johansson <olof@lixom.net>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Tyler Baker <tyler.baker@linaro.org>
Cc: Alessandro Zummo <a.zummo@towertech.it>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rtc/rtc-s3c.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c
index 4241eeab3386..f4cf6851fae9 100644
--- a/drivers/rtc/rtc-s3c.c
+++ b/drivers/rtc/rtc-s3c.c
@@ -849,6 +849,7 @@ static struct s3c_rtc_data const s3c2443_rtc_data = {
 
 static struct s3c_rtc_data const s3c6410_rtc_data = {
 	.max_user_freq		= 32768,
+	.needs_src_clk		= true,
 	.irq_handler		= s3c6410_rtc_irq,
 	.set_freq		= s3c6410_rtc_setfreq,
 	.enable_tick		= s3c6410_rtc_enable_tick,
-- 
cgit v1.2.3


From e009d5dc0a94a7133e5f1c083732d760bfd038e6 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Thu, 12 Mar 2015 16:25:52 -0700
Subject: mm, oom: do not fail __GFP_NOFAIL allocation if oom killer is
 disabled

Tetsuo Handa has pointed out that __GFP_NOFAIL allocations might fail
after OOM killer is disabled if the allocation is performed by a kernel
thread.  This behavior was introduced from the very beginning by
7f33d49a2ed5 ("mm, PM/Freezer: Disable OOM killer when tasks are frozen").
 This means that the basic contract for the allocation request is broken
and the context requesting such an allocation might blow up unexpectedly.

There are basically two ways forward.

1) move oom_killer_disable after kernel threads are frozen.  This has a
   risk that the OOM victim wouldn't be able to finish because it would
   depend on an already frozen kernel thread.  This would be really tricky
   to debug.

2) do not fail GFP_NOFAIL allocation no matter what and risk a
   potential Freezable kernel threads will loop and fail the suspend.
   Incidental allocations after kernel threads are frozen will at least
   dump a warning - if we are lucky and the serial console is still active
   of course...

This patch implements the later option because it is safer.  We would see
warning rather than allocation failures for the kernel threads which would
blow up otherwise and have a higher chances to identify __GFP_NOFAIL users
from deeper pm code.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: David Rientjes <rientjes@gooogle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7abfa70cdc1a..40e29429e7b0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2373,7 +2373,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
+	if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
+			|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
 		*did_some_progress = 1;
 out:
 	oom_zonelist_unlock(ac->zonelist, gfp_mask);
-- 
cgit v1.2.3


From 44fc80573cc760a7154f41fd0a958ee10eba1a81 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 12 Mar 2015 16:25:54 -0700
Subject: mm, hugetlb: close race when setting PageTail for gigantic pages

Now that gigantic pages are dynamically allocatable, care must be taken to
ensure that p->first_page is valid before setting PageTail.

If this isn't done, then it is possible to race and have compound_head()
return NULL.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a9ac6c26832..c41b2a0ee273 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -917,7 +917,6 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 	__SetPageHead(page);
 	__ClearPageReserved(page);
 	for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
-		__SetPageTail(p);
 		/*
 		 * For gigantic hugepages allocated through bootmem at
 		 * boot, it's safer to be consistent with the not-gigantic
@@ -933,6 +932,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
 		__ClearPageReserved(p);
 		set_page_count(p, 0);
 		p->first_page = page;
+		/* Make sure p->first_page is always valid for PageTail() */
+		smp_wmb();
+		__SetPageTail(p);
 	}
 }
 
-- 
cgit v1.2.3


From 850fc430f47aad52092deaaeb32b99f97f0e6aca Mon Sep 17 00:00:00 2001
From: Danesh Petigara <dpetigara@broadcom.com>
Date: Thu, 12 Mar 2015 16:25:57 -0700
Subject: mm: cma: fix CMA aligned offset calculation

The CMA aligned offset calculation is incorrect for non-zero order_per_bit
values.

For example, if cma->order_per_bit=1, cma->base_pfn= 0x2f800000 and
align_order=12, the function returns a value of 0x17c00 instead of 0x400.

This patch fixes the CMA aligned offset calculation.

The previous calculation was wrong and would return too-large values for
the offset, so that when cma_alloc looks for free pages in the bitmap with
the requested alignment > order_per_bit, it starts too far into the bitmap
and so CMA allocations will fail despite there actually being plenty of
free pages remaining.  It will also probably have the wrong alignment.
With this change, we will get the correct offset into the bitmap.

One affected user is powerpc KVM, which has kvm_cma->order_per_bit set to
KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, or 18 - 12 = 6.

[gregory.0xf0@gmail.com: changelog additions]
Signed-off-by: Danesh Petigara <dpetigara@broadcom.com>
Reviewed-by: Gregory Fong <gregory.0xf0@gmail.com>
Acked-by: Michal Nazarewicz <mina86@mina86.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/cma.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/mm/cma.c b/mm/cma.c
index 75016fd1de90..68ecb7a42983 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -64,15 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
 	return (1UL << (align_order - cma->order_per_bit)) - 1;
 }
 
+/*
+ * Find a PFN aligned to the specified order and return an offset represented in
+ * order_per_bits.
+ */
 static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
 {
-	unsigned int alignment;
-
 	if (align_order <= cma->order_per_bit)
 		return 0;
-	alignment = 1UL << (align_order - cma->order_per_bit);
-	return ALIGN(cma->base_pfn, alignment) -
-		(cma->base_pfn >> cma->order_per_bit);
+
+	return (ALIGN(cma->base_pfn, (1UL << align_order))
+		- cma->base_pfn) >> cma->order_per_bit;
 }
 
 static unsigned long cma_bitmap_maxno(struct cma *cma)
-- 
cgit v1.2.3


From 283ee1482f349d6c0c09dfb725db5880afc56813 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Thu, 12 Mar 2015 16:26:00 -0700
Subject: nilfs2: fix deadlock of segment constructor during recovery

According to a report from Yuxuan Shui, nilfs2 in kernel 3.19 got stuck
during recovery at mount time.  The code path that caused the deadlock was
as follows:

  nilfs_fill_super()
    load_nilfs()
      nilfs_salvage_orphan_logs()
        * Do roll-forwarding, attach segment constructor for recovery,
          and kick it.

        nilfs_segctor_thread()
          nilfs_segctor_thread_construct()
           * A lock is held with nilfs_transaction_lock()
             nilfs_segctor_do_construct()
               nilfs_segctor_drop_written_files()
                 iput()
                   iput_final()
                     write_inode_now()
                       writeback_single_inode()
                         __writeback_single_inode()
                           do_writepages()
                             nilfs_writepage()
                               nilfs_construct_dsync_segment()
                                 nilfs_transaction_lock() --> deadlock

This can happen if commit 7ef3ff2fea8b ("nilfs2: fix deadlock of segment
constructor over I_SYNC flag") is applied and roll-forward recovery was
performed at mount time.  The roll-forward recovery can happen if datasync
write is done and the file system crashes immediately after that.  For
instance, we can reproduce the issue with the following steps:

 < nilfs2 is mounted on /nilfs (device: /dev/sdb1) >
 # dd if=/dev/zero of=/nilfs/test bs=4k count=1 && sync
 # dd if=/dev/zero of=/nilfs/test conv=notrunc oflag=dsync bs=4k
 count=1 && reboot -nfh
 < the system will immediately reboot >
 # mount -t nilfs2 /dev/sdb1 /nilfs

The deadlock occurs because iput() can run segment constructor through
writeback_single_inode() if MS_ACTIVE flag is not set on sb->s_flags.  The
above commit changed segment constructor so that it calls iput()
asynchronously for inodes with i_nlink == 0, but that change was
imperfect.

This fixes the another deadlock by deferring iput() in segment constructor
even for the case that mount is not finished, that is, for the case that
MS_ACTIVE flag is not set.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Reported-by: Yuxuan Shui <yshuiv7@gmail.com>
Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nilfs2/segment.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 469086b9f99b..0c3f303baf32 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
 	struct nilfs_inode_info *ii, *n;
+	int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
 	int defer_iput = false;
 
 	spin_lock(&nilfs->ns_inode_lock);
@@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
 		brelse(ii->i_bh);
 		ii->i_bh = NULL;
 		list_del_init(&ii->i_dirty);
-		if (!ii->vfs_inode.i_nlink) {
+		if (!ii->vfs_inode.i_nlink || during_mount) {
 			/*
-			 * Defer calling iput() to avoid a deadlock
-			 * over I_SYNC flag for inodes with i_nlink == 0
+			 * Defer calling iput() to avoid deadlocks if
+			 * i_nlink == 0 or mount is not yet finished.
 			 */
 			list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
 			defer_iput = true;
-- 
cgit v1.2.3


From 65b9ab888cd7bd14b314e9238ce6d4886df846fa Mon Sep 17 00:00:00 2001
From: Chen Gang <762976180@qq.com>
Date: Thu, 12 Mar 2015 16:26:03 -0700
Subject: arch/c6x/include/asm/pgtable.h: define dummy pgprot_writecombine for
 !MMU

When !MMU, asm-generic will not define default pgprot_writecombine, so c6x
needs to define it by itself.  The related error:

    CC [M]  fs/pstore/ram_core.o
  fs/pstore/ram_core.c: In function 'persistent_ram_vmap':
  fs/pstore/ram_core.c:399:10: error: implicit declaration of function 'pgprot_writecombine' [-Werror=implicit-function-declaration]
     prot = pgprot_writecombine(PAGE_KERNEL);
            ^
  fs/pstore/ram_core.c:399:8: error: incompatible types when assigning to type 'pgprot_t {aka struct <anonymous>}' from type 'int'
     prot = pgprot_writecombine(PAGE_KERNEL);
          ^

Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/c6x/include/asm/pgtable.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/c6x/include/asm/pgtable.h b/arch/c6x/include/asm/pgtable.h
index 78d4483ba40c..ec4db6df5e0d 100644
--- a/arch/c6x/include/asm/pgtable.h
+++ b/arch/c6x/include/asm/pgtable.h
@@ -67,6 +67,11 @@ extern unsigned long empty_zero_page;
  */
 #define pgtable_cache_init()   do { } while (0)
 
+/*
+ * c6x is !MMU, so define the simpliest implementation
+ */
+#define pgprot_writecombine pgprot_noncached
+
 #include <asm-generic/pgtable.h>
 
 #endif /* _ASM_C6X_PGTABLE_H */
-- 
cgit v1.2.3


From 5b8bf30721980b254be7a07315c353b3a3175b74 Mon Sep 17 00:00:00 2001
From: gchen gchen <xili_gchen_5257@hotmail.com>
Date: Thu, 12 Mar 2015 16:26:05 -0700
Subject: mm/nommu.c: export symbol max_mapnr

Several modules may need max_mapnr, so export, the related error with
allmodconfig under c6x:

  MODPOST 3327 modules
  ERROR: "max_mapnr" [fs/pstore/ramoops.ko] undefined!
  ERROR: "max_mapnr" [drivers/media/v4l2-core/videobuf2-dma-contig.ko] undefined!

Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/nommu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/nommu.c b/mm/nommu.c
index 3e67e7538ecf..3fba2dc97c44 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -62,6 +62,7 @@ void *high_memory;
 EXPORT_SYMBOL(high_memory);
 struct page *mem_map;
 unsigned long max_mapnr;
+EXPORT_SYMBOL(max_mapnr);
 unsigned long highest_memmap_pfn;
 struct percpu_counter vm_committed_as;
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-- 
cgit v1.2.3


From b3c1030d50bad39383fcfa6721bd3c35463b3f3f Mon Sep 17 00:00:00 2001
From: "Suzuki K. Poulose" <suzuki.poulose@arm.com>
Date: Thu, 12 Mar 2015 16:26:08 -0700
Subject: fanotify: fix event filtering with FAN_ONDIR set

With FAN_ONDIR set, the user can end up getting events, which it hasn't
marked.  This was revealed with fanotify04 testcase failure on
Linux-4.0-rc1, and is a regression from 3.19, revealed with 66ba93c0d7fe6
("fanotify: don't set FAN_ONDIR implicitly on a marks ignored mask").

   # /opt/ltp/testcases/bin/fanotify04
   [ ... ]
  fanotify04    7  TPASS  :  event generated properly for type 100000
  fanotify04    8  TFAIL  :  fanotify04.c:147: got unexpected event 30
  fanotify04    9  TPASS  :  No event as expected

The testcase sets the adds the following marks : FAN_OPEN | FAN_ONDIR for
a fanotify on a dir.  Then does an open(), followed by close() of the
directory and expects to see an event FAN_OPEN(0x20).  However, the
fanotify returns (FAN_OPEN|FAN_CLOSE_NOWRITE(0x10)).  This happens due to
the flaw in the check for event_mask in fanotify_should_send_event() which
does:

	if (event_mask & marks_mask & ~marks_ignored_mask)
		return true;

where, event_mask == (FAN_ONDIR | FAN_CLOSE_NOWRITE),
       marks_mask == (FAN_ONDIR | FAN_OPEN),
       marks_ignored_mask == 0

Fix this by masking the outgoing events to the user, as we already take
care of FAN_ONDIR and FAN_EVENT_ON_CHILD.

Signed-off-by: Suzuki K. Poulose <suzuki.poulose@arm.com>
Tested-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Eric Paris <eparis@redhat.com>
Cc: Will Deacon <will.deacon@arm.com>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fanotify/fanotify.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 9a66ff79ff27..d2f97ecca6a5 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -143,7 +143,8 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
 	    !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
 		return false;
 
-	if (event_mask & marks_mask & ~marks_ignored_mask)
+	if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask &
+				 ~marks_ignored_mask)
 		return true;
 
 	return false;
-- 
cgit v1.2.3


From a5af5aa8b67dfdba36c853b70564fd2dfe73d478 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Thu, 12 Mar 2015 16:26:11 -0700
Subject: kasan, module, vmalloc: rework shadow allocation for modules

Current approach in handling shadow memory for modules is broken.

Shadow memory could be freed only after memory shadow corresponds it is no
longer used.  vfree() called from interrupt context could use memory its
freeing to store 'struct llist_node' in it:

    void vfree(const void *addr)
    {
    ...
        if (unlikely(in_interrupt())) {
            struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred);
            if (llist_add((struct llist_node *)addr, &p->list))
                    schedule_work(&p->wq);

Later this list node used in free_work() which actually frees memory.
Currently module_memfree() called in interrupt context will free shadow
before freeing module's memory which could provoke kernel crash.

So shadow memory should be freed after module's memory.  However, such
deallocation order could race with kasan_module_alloc() in module_alloc().

Free shadow right before releasing vm area.  At this point vfree()'d
memory is not used anymore and yet not available for other allocations.
New VM_KASAN flag used to indicate that vm area has dynamically allocated
shadow memory so kasan frees shadow only if it was previously allocated.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h   |  5 +++--
 include/linux/vmalloc.h |  1 +
 kernel/module.c         |  2 --
 mm/kasan/kasan.c        | 14 +++++++++++---
 mm/vmalloc.c            |  1 +
 5 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 72ba725ddf9c..5fa48a21d73e 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -5,6 +5,7 @@
 
 struct kmem_cache;
 struct page;
+struct vm_struct;
 
 #ifdef CONFIG_KASAN
 
@@ -52,7 +53,7 @@ void kasan_slab_free(struct kmem_cache *s, void *object);
 #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
 
 int kasan_module_alloc(void *addr, size_t size);
-void kasan_module_free(void *addr);
+void kasan_free_shadow(const struct vm_struct *vm);
 
 #else /* CONFIG_KASAN */
 
@@ -82,7 +83,7 @@ static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {}
 static inline void kasan_slab_free(struct kmem_cache *s, void *object) {}
 
 static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
-static inline void kasan_module_free(void *addr) {}
+static inline void kasan_free_shadow(const struct vm_struct *vm) {}
 
 #endif /* CONFIG_KASAN */
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 7d7acb35603d..0ec598381f97 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -17,6 +17,7 @@ struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 #define VM_VPAGES		0x00000010	/* buffer for pages was vmalloc'ed */
 #define VM_UNINITIALIZED	0x00000020	/* vm_struct is not fully initialized */
 #define VM_NO_GUARD		0x00000040      /* don't add guard page */
+#define VM_KASAN		0x00000080      /* has allocated kasan shadow memory */
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
diff --git a/kernel/module.c b/kernel/module.c
index cc93cf68653c..b3d634ed06c9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -56,7 +56,6 @@
 #include <linux/async.h>
 #include <linux/percpu.h>
 #include <linux/kmemleak.h>
-#include <linux/kasan.h>
 #include <linux/jump_label.h>
 #include <linux/pfn.h>
 #include <linux/bsearch.h>
@@ -1814,7 +1813,6 @@ static void unset_module_init_ro_nx(struct module *mod) { }
 void __weak module_memfree(void *module_region)
 {
 	vfree(module_region);
-	kasan_module_free(module_region);
 }
 
 void __weak module_arch_cleanup(struct module *mod)
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 78fee632a7ee..936d81661c47 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -29,6 +29,7 @@
 #include <linux/stacktrace.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/vmalloc.h>
 #include <linux/kasan.h>
 
 #include "kasan.h"
@@ -414,12 +415,19 @@ int kasan_module_alloc(void *addr, size_t size)
 			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
 			PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
 			__builtin_return_address(0));
-	return ret ? 0 : -ENOMEM;
+
+	if (ret) {
+		find_vm_area(addr)->flags |= VM_KASAN;
+		return 0;
+	}
+
+	return -ENOMEM;
 }
 
-void kasan_module_free(void *addr)
+void kasan_free_shadow(const struct vm_struct *vm)
 {
-	vfree(kasan_mem_to_shadow(addr));
+	if (vm->flags & VM_KASAN)
+		vfree(kasan_mem_to_shadow(vm->addr));
 }
 
 static void register_global(struct kasan_global *global)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35b25e1340ca..49abccf29a29 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1418,6 +1418,7 @@ struct vm_struct *remove_vm_area(const void *addr)
 		spin_unlock(&vmap_area_lock);
 
 		vmap_debug_free_range(va->va_start, va->va_end);
+		kasan_free_shadow(vm);
 		free_unmap_vmap_area(va);
 		vm->size -= PAGE_SIZE;
 
-- 
cgit v1.2.3


From d3733e5c98e952d419e77fa721912f09d15a2806 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Thu, 12 Mar 2015 16:26:14 -0700
Subject: kasan, module: move MODULE_ALIGN macro into <linux/moduleloader.h>

include/linux/moduleloader.h is more suitable place for this macro.
Also change alignment to PAGE_SIZE for CONFIG_KASAN=n as such
alignment already assumed in several places.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h        | 4 ----
 include/linux/moduleloader.h | 8 ++++++++
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 5fa48a21d73e..5bb074431eb0 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -50,15 +50,11 @@ void kasan_krealloc(const void *object, size_t new_size);
 void kasan_slab_alloc(struct kmem_cache *s, void *object);
 void kasan_slab_free(struct kmem_cache *s, void *object);
 
-#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
-
 int kasan_module_alloc(void *addr, size_t size);
 void kasan_free_shadow(const struct vm_struct *vm);
 
 #else /* CONFIG_KASAN */
 
-#define MODULE_ALIGN 1
-
 static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
 
 static inline void kasan_enable_current(void) {}
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index f7556261fe3c..4d0cb9bba93e 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -84,4 +84,12 @@ void module_arch_cleanup(struct module *mod);
 
 /* Any cleanup before freeing mod->module_init */
 void module_arch_freeing_init(struct module *mod);
+
+#ifdef CONFIG_KASAN
+#include <linux/kasan.h>
+#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
+#else
+#define MODULE_ALIGN PAGE_SIZE
+#endif
+
 #endif
-- 
cgit v1.2.3


From a5a6579db33af91f4f5134e14be758dc71c1b694 Mon Sep 17 00:00:00 2001
From: Jeff Vander Stoep <jeffv@google.com>
Date: Thu, 12 Mar 2015 16:26:17 -0700
Subject: mm: reorder can_do_mlock to fix audit denial

A userspace call to mmap(MAP_LOCKED) may result in the successful locking
of memory while also producing a confusing audit log denial.  can_do_mlock
checks capable and rlimit.  If either of these return positive
can_do_mlock returns true.  The capable check leads to an LSM hook used by
apparmour and selinux which produce the audit denial.  Reordering so
rlimit is checked first eliminates the denial on success, only recording a
denial when the lock is unsuccessful as a result of the denial.

Signed-off-by: Jeff Vander Stoep <jeffv@google.com>
Acked-by: Nick Kralevich <nnk@google.com>
Cc: Jeff Vander Stoep <jeffv@google.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Paul Cassella <cassella@cray.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mlock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/mlock.c b/mm/mlock.c
index 73cf0987088c..8a54cd214925 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -26,10 +26,10 @@
 
 int can_do_mlock(void)
 {
-	if (capable(CAP_IPC_LOCK))
-		return 1;
 	if (rlimit(RLIMIT_MEMLOCK) != 0)
 		return 1;
+	if (capable(CAP_IPC_LOCK))
+		return 1;
 	return 0;
 }
 EXPORT_SYMBOL(can_do_mlock);
-- 
cgit v1.2.3


From 7feee590bb18ffc42636975f74c2c3120ce1901c Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 12 Mar 2015 16:26:19 -0700
Subject: memcg: disable hierarchy support if bound to the legacy cgroup
 hierarchy

If the memory cgroup controller is initially mounted in the scope of the
default cgroup hierarchy and then remounted to a legacy hierarchy, it will
still have hierarchy support enabled, which is incorrect.  We should
disable hierarchy support if bound to the legacy cgroup hierarchy.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9fe07692eaad..b34ef4a32a3b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5232,7 +5232,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
 	 * on for the root memcg is enough.
 	 */
 	if (cgroup_on_dfl(root_css->cgroup))
-		mem_cgroup_from_css(root_css)->use_hierarchy = true;
+		root_mem_cgroup->use_hierarchy = true;
+	else
+		root_mem_cgroup->use_hierarchy = false;
 }
 
 static u64 memory_current_read(struct cgroup_subsys_state *css,
-- 
cgit v1.2.3