From 05eb6e7263185a6bb0de9501ccf2addc52429414 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 2 Aug 2016 14:03:01 -0700 Subject: radix-tree: account nodes to memcg only if explicitly requested Radix trees may be used not only for storing page cache pages, so unconditionally accounting radix tree nodes to the current memory cgroup is bad: if a radix tree node is used for storing data shared among different cgroups we risk pinning dead memory cgroups forever. So let's only account radix tree nodes if it was explicitly requested by passing __GFP_ACCOUNT to INIT_RADIX_TREE. Currently, we only want to account page cache entries, so mark mapping->page_tree so. Fixes: 58e698af4c63 ("radix-tree: account radix_tree_node to memory cgroup") Link: http://lkml.kernel.org/r/1470057188-7864-1-git-send-email-vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 61b8fb529cef..1b7bf7314141 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -277,10 +277,11 @@ radix_tree_node_alloc(struct radix_tree_root *root) /* * Even if the caller has preloaded, try to allocate from the - * cache first for the new node to get accounted. + * cache first for the new node to get accounted to the memory + * cgroup. 
*/ ret = kmem_cache_alloc(radix_tree_node_cachep, - gfp_mask | __GFP_ACCOUNT | __GFP_NOWARN); + gfp_mask | __GFP_NOWARN); if (ret) goto out; @@ -303,8 +304,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) kmemleak_update_trace(ret); goto out; } - ret = kmem_cache_alloc(radix_tree_node_cachep, - gfp_mask | __GFP_ACCOUNT); + ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); out: BUG_ON(radix_tree_is_internal_node(ret)); return ret; @@ -351,6 +351,12 @@ static int __radix_tree_preload(gfp_t gfp_mask, int nr) struct radix_tree_node *node; int ret = -ENOMEM; + /* + * Nodes preloaded by one cgroup can be be used by another cgroup, so + * they should never be accounted to any particular memory cgroup. + */ + gfp_mask &= ~__GFP_ACCOUNT; + preempt_disable(); rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < nr) { -- cgit v1.2.3 From 901d805c33fc4c029fc6b2d94ee5fb7d30278045 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Tue, 2 Aug 2016 14:03:10 -0700 Subject: UBSAN: fix typo in format string handle_object_size_mismatch() used %pk to format a kernel pointer with pr_err(). This seemed to be a misspelling for %pK, but using this to format a kernel pointer does not make much sense here. Therefore use %p instead, like in handle_missaligned_access(). 
Link: http://lkml.kernel.org/r/20160730083010.11569-1-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Acked-by: Andrey Ryabinin Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/ubsan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/ubsan.c b/lib/ubsan.c index 8799ae5e2e42..fb0409df1bcf 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -308,7 +308,7 @@ static void handle_object_size_mismatch(struct type_mismatch_data *data, return; ubsan_prologue(&data->location, &flags); - pr_err("%s address %pk with insufficient space\n", + pr_err("%s address %p with insufficient space\n", type_check_kinds[data->type_check_kind], (void *) ptr); pr_err("for an object of type %s\n", data->type->type_name); -- cgit v1.2.3 From 6b1d174b0c27b5de421eda55c2731f32b6bd9852 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Aug 2016 14:04:04 -0700 Subject: ratelimit: extend to print suppressed messages on release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the ratelimiting facility to print the amount of suppressed lines when it is being released. This use case is aimed at short-termed, burst-like users for which we want to output the suppressed lines stats only once, after it has been disposed of. For an example, see /dev/kmsg usage in a follow-on patch. Also, change the printk() line we issue on release to not use "callbacks" as it is misleading: we're not suppressing callbacks but printk() calls. This has been separated from a previous patch by Linus. 
Link: http://lkml.kernel.org/r/20160716061745.15795-2-bp@alien8.de Signed-off-by: Borislav Petkov Cc: Dave Young Cc: Franck Bui Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Uwe Kleine-König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ratelimit.h | 38 +++++++++++++++++++++++++++++++++----- lib/ratelimit.c | 10 ++++++---- 2 files changed, 39 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/include/linux/ratelimit.h b/include/linux/ratelimit.h index 18102529254e..57c9e0622a38 100644 --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -2,11 +2,15 @@ #define _LINUX_RATELIMIT_H #include +#include #include #define DEFAULT_RATELIMIT_INTERVAL (5 * HZ) #define DEFAULT_RATELIMIT_BURST 10 +/* issue num suppressed message on exit */ +#define RATELIMIT_MSG_ON_RELEASE BIT(0) + struct ratelimit_state { raw_spinlock_t lock; /* protect the state */ @@ -15,6 +19,7 @@ struct ratelimit_state { int printed; int missed; unsigned long begin; + unsigned long flags; }; #define RATELIMIT_STATE_INIT(name, interval_init, burst_init) { \ @@ -34,12 +39,35 @@ struct ratelimit_state { static inline void ratelimit_state_init(struct ratelimit_state *rs, int interval, int burst) { + memset(rs, 0, sizeof(*rs)); + raw_spin_lock_init(&rs->lock); - rs->interval = interval; - rs->burst = burst; - rs->printed = 0; - rs->missed = 0; - rs->begin = 0; + rs->interval = interval; + rs->burst = burst; +} + +static inline void ratelimit_default_init(struct ratelimit_state *rs) +{ + return ratelimit_state_init(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); +} + +static inline void ratelimit_state_exit(struct ratelimit_state *rs) +{ + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) + return; + + if (rs->missed) { + pr_warn("%s: %d output lines suppressed due to ratelimiting\n", + current->comm, rs->missed); + rs->missed = 0; + } +} + +static inline void +ratelimit_set_flags(struct 
ratelimit_state *rs, unsigned long flags) +{ + rs->flags = flags; } extern struct ratelimit_state printk_ratelimit_state; diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 2c5de86460c5..08f8043cac61 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -46,12 +46,14 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) rs->begin = jiffies; if (time_is_before_jiffies(rs->begin + rs->interval)) { - if (rs->missed) - printk(KERN_WARNING "%s: %d callbacks suppressed\n", - func, rs->missed); + if (rs->missed) { + if (!(rs->flags & RATELIMIT_MSG_ON_RELEASE)) { + pr_warn("%s: %d callbacks suppressed\n", func, rs->missed); + rs->missed = 0; + } + } rs->begin = jiffies; rs->printed = 0; - rs->missed = 0; } if (rs->burst && rs->burst > rs->printed) { rs->printed++; -- cgit v1.2.3 From f003a1f182bb821f13775338a4bf8711830f927a Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Tue, 2 Aug 2016 14:04:13 -0700 Subject: lib/iommu-helper: skip to next segment When a large enough area in the iommu bitmap is found but would span a boundary we continue the search starting from the next bit position. For large allocations this can lead to several useless invocations of bitmap_find_next_zero_area() and iommu_is_span_boundary(). Continue the search from the start of the next segment (which is the next bit position such that we'll not cross the same segment boundary again). 
Link: http://lkml.kernel.org/r/alpine.LFD.2.20.1606081910070.3211@schleppi Signed-off-by: Sebastian Ott Reviewed-by: Gerald Schaefer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/iommu-helper.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index c27e269210c4..a816f3a80625 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -29,8 +29,7 @@ again: index = bitmap_find_next_zero_area(map, size, start, nr, align_mask); if (index < size) { if (iommu_is_span_boundary(index, nr, shift, boundary_size)) { - /* we could do more effectively */ - start = index + 1; + start = ALIGN(shift + index, boundary_size) - shift; goto again; } bitmap_set(map, index, nr); -- cgit v1.2.3 From a9bfd3321713ecec86282dd2bec04212189f91f1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 2 Aug 2016 14:04:16 -0700 Subject: crc32: use ktime_get_ns() for measurement The crc32 test function measures the elapsed time in nanoseconds, but uses 'struct timespec' for that. We want to remove timespec from the kernel for y2038 compatibility, and ktime_get_ns() also helps make the code simpler here. It is also slightly better to use monotonic time, as we are only interested in the time difference. Link: http://lkml.kernel.org/r/20160617143932.3289626-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: "David S . 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/crc32.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/crc32.c b/lib/crc32.c index 9a907d489d95..7fbd1a112b9d 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -979,7 +979,6 @@ static int __init crc32c_test(void) int i; int errors = 0; int bytes = 0; - struct timespec start, stop; u64 nsec; unsigned long flags; @@ -999,20 +998,17 @@ static int __init crc32c_test(void) local_irq_save(flags); local_irq_disable(); - getnstimeofday(&start); + nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { if (test[i].crc32c_le != __crc32c_le(test[i].crc, test_buf + test[i].start, test[i].length)) errors++; } - getnstimeofday(&stop); + nsec = ktime_get_ns() - nsec; local_irq_restore(flags); local_irq_enable(); - nsec = stop.tv_nsec - start.tv_nsec + - 1000000000 * (stop.tv_sec - start.tv_sec); - pr_info("crc32c: CRC_LE_BITS = %d\n", CRC_LE_BITS); if (errors) @@ -1065,7 +1061,6 @@ static int __init crc32_test(void) int i; int errors = 0; int bytes = 0; - struct timespec start, stop; u64 nsec; unsigned long flags; @@ -1088,7 +1083,7 @@ static int __init crc32_test(void) local_irq_save(flags); local_irq_disable(); - getnstimeofday(&start); + nsec = ktime_get_ns(); for (i = 0; i < 100; i++) { if (test[i].crc_le != crc32_le(test[i].crc, test_buf + test[i].start, test[i].length)) @@ -1098,14 +1093,11 @@ static int __init crc32_test(void) test[i].start, test[i].length)) errors++; } - getnstimeofday(&stop); + nsec = ktime_get_ns() - nsec; local_irq_restore(flags); local_irq_enable(); - nsec = stop.tv_nsec - start.tv_nsec + - 1000000000 * (stop.tv_sec - start.tv_sec); - pr_info("crc32: CRC_LE_BITS = %d, CRC_BE BITS = %d\n", CRC_LE_BITS, CRC_BE_BITS); -- cgit v1.2.3 From a4691deabf284a601149a067525759939cc563b2 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 2 Aug 2016 14:07:30 -0700 Subject: kcov: allow more fine-grained coverage instrumentation For more 
targeted fuzzing, it's better to disable kernel-wide instrumentation and instead enable it on a per-subsystem basis. This follows the pattern of UBSAN and allows you to compile in the kcov driver without instrumenting the whole kernel. To instrument a part of the kernel, you can use either # for a single file in the current directory KCOV_INSTRUMENT_filename.o := y or # for all the files in the current directory (excluding subdirectories) KCOV_INSTRUMENT := y or # (same as above) ccflags-y += $(CFLAGS_KCOV) or # for all the files in the current directory (including subdirectories) subdir-ccflags-y += $(CFLAGS_KCOV) Link: http://lkml.kernel.org/r/1464008380-11405-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Vegard Nossum Cc: Dmitry Vyukov Cc: Quentin Casasnovas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 11 +++++++++++ scripts/Makefile.lib | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f07842e2d69f..cc02f282d05b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -719,6 +719,17 @@ config KCOV For more details, see Documentation/kcov.txt. +config KCOV_INSTRUMENT_ALL + bool "Instrument all code by default" + depends on KCOV + default y if KCOV + help + If you are doing generic system call fuzzing (like e.g. syzkaller), + then you will want to instrument the whole kernel and you should + say y here. If you are doing more targeted fuzzing (like e.g. + filesystem fuzzing with AFL) then you will want to enable coverage + for more specific subsets of files, and should say n here. 
+ config DEBUG_SHIRQ bool "Debug shared IRQ handlers" depends on DEBUG_KERNEL diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index e7df0f5db7ec..76494e15417b 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -138,7 +138,7 @@ endif ifeq ($(CONFIG_KCOV),y) _c_flags += $(if $(patsubst n%,, \ - $(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \ + $(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)$(CONFIG_KCOV_INSTRUMENT_ALL)), \ $(CFLAGS_KCOV)) endif -- cgit v1.2.3