From d75f773c86a2b8b7278e2c33343b46a4024bc002 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 25 Mar 2019 21:32:28 +0200 Subject: treewide: Switch printk users from %pf and %pF to %ps and %pS, respectively %pF and %pf are functionally equivalent to %pS and %ps conversion specifiers. The former are deprecated, therefore switch the current users to use the preferred variant. The changes have been produced by the following command: git grep -l '%p[fF]' | grep -v '^\(tools\|Documentation\)/' | \ while read i; do perl -i -pe 's/%pf/%ps/g; s/%pF/%pS/g;' $i; done And verifying the result. Link: http://lkml.kernel.org/r/20190325193229.23390-1-sakari.ailus@linux.intel.com Cc: Andy Shevchenko Cc: linux-arm-kernel@lists.infradead.org Cc: sparclinux@vger.kernel.org Cc: linux-um@lists.infradead.org Cc: xen-devel@lists.xenproject.org Cc: linux-acpi@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: drbd-dev@lists.linbit.com Cc: linux-block@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-nvdimm@lists.01.org Cc: linux-pci@vger.kernel.org Cc: linux-scsi@vger.kernel.org Cc: linux-btrfs@vger.kernel.org Cc: linux-f2fs-devel@lists.sourceforge.net Cc: linux-mm@kvack.org Cc: ceph-devel@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Sakari Ailus Acked-by: David Sterba (for btrfs) Acked-by: Mike Rapoport (for mm/memblock.c) Acked-by: Bjorn Helgaas (for drivers/pci) Acked-by: Rafael J. Wysocki Signed-off-by: Petr Mladek --- init/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index c86a1c8f19f4..3192c6cc5382 100644 --- a/init/main.c +++ b/init/main.c @@ -826,7 +826,7 @@ trace_initcall_start_cb(void *data, initcall_t fn) { ktime_t *calltime = (ktime_t *)data; - printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); + printk(KERN_DEBUG "calling %pS @ %i\n", fn, task_pid_nr(current)); *calltime = ktime_get(); } @@ -840,7 +840,7 @@ trace_initcall_finish_cb(void *data, initcall_t fn, int ret) rettime = ktime_get(); delta = ktime_sub(rettime, *calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; - printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", + printk(KERN_DEBUG "initcall %pS returned %d after %lld usecs\n", fn, ret, duration); } @@ -897,7 +897,7 @@ int __init_or_module do_one_initcall(initcall_t fn) strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf)); local_irq_enable(); } - WARN(msgbuf[0], "initcall %pF returned with %s\n", fn, msgbuf); + WARN(msgbuf[0], "initcall %pS returned with %s\n", fn, msgbuf); add_latent_entropy(); return ret; -- cgit v1.2.3 From 5dd50aaeb1853ee0953b60fa6d1143d95429ae7b Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 5 Nov 2018 17:40:31 +0000 Subject: Make anon_inodes unconditional Make the anon_inodes facility unconditional so that it can be used by core VFS code and pidfd code. Signed-off-by: David Howells Signed-off-by: Al Viro [christian@brauner.io: adapt commit message to mention pidfds] Signed-off-by: Christian Brauner --- arch/arm/kvm/Kconfig | 1 - arch/arm64/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/kvm/Kconfig | 1 - drivers/base/Kconfig | 1 - drivers/char/tpm/Kconfig | 1 - drivers/dma-buf/Kconfig | 1 - drivers/gpio/Kconfig | 1 - drivers/iio/Kconfig | 1 - drivers/infiniband/Kconfig | 1 - drivers/vfio/Kconfig | 1 - fs/Makefile | 2 +- fs/notify/fanotify/Kconfig | 1 - fs/notify/inotify/Kconfig | 1 - init/Kconfig | 10 ---------- 18 files changed, 1 insertion(+), 27 deletions(-) (limited to 'init') diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 3f5320f46de2..f591026347a5 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on MMU && OF select PREEMPT_NOTIFIERS - select ANON_INODES select ARM_GIC select ARM_GIC_V3 select ARM_GIC_V3_ITS diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a3f85624313e..a67121d419a2 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM depends on OF select MMU_NOTIFIER select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 4528bc9c3cb1..eac25aef21e0 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM depends on MIPS_FP_SUPPORT select EXPORT_UASM select PREEMPT_NOTIFIERS - select ANON_INODES select KVM_GENERIC_DIRTYLOG_READ_PROTECT select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index bfdde04e4905..f53997a8ca62 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL select SRCU diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 767453faacfc..1816ee48eadd 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -21,7 +21,6 @@ config KVM prompt "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select PREEMPT_NOTIFIERS - select ANON_INODES select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_EVENTFD diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5ad92419be19..7a70fb58b2d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -44,7 +44,6 @@ config X86 # select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI - select ANON_INODES select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 72fa955f4a15..fc042419e670 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -27,7 +27,6 @@ config KVM depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER - select ANON_INODES select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select IRQ_BYPASS_MANAGER diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 059700ea3521..03f067da12ee 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -174,7 +174,6 @@ source "drivers/base/regmap/Kconfig" config DMA_SHARED_BUFFER bool default n - select ANON_INODES select IRQ_WORK help This option enables the framework for buffer-sharing between diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index 536e55d3919f..f3e4bc490cf0 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -157,7 +157,6 @@ config TCG_CRB config TCG_VTPM_PROXY tristate "VTPM Proxy Interface" depends on TCG_TPM - select ANON_INODES ---help--- This driver proxies for an emulated TPM (vTPM) running in userspace. A device /dev/vtpmx is provided that creates a device pair diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig index 2e5a0faa2cb1..3fc9c2efc583 100644 --- a/drivers/dma-buf/Kconfig +++ b/drivers/dma-buf/Kconfig @@ -3,7 +3,6 @@ menu "DMABUF options" config SYNC_FILE bool "Explicit Synchronization Framework" default n - select ANON_INODES select DMA_SHARED_BUFFER ---help--- The Sync File Framework adds explicit syncronization via diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 3f50526a771f..0f91600c27ae 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -12,7 +12,6 @@ config ARCH_HAVE_CUSTOM_GPIO_H menuconfig GPIOLIB bool "GPIO Support" - select ANON_INODES help This enables GPIO support through the generic GPIO library. You only need to enable this, if you also want to enable diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig index d08aeb41cd07..1dec0fecb6ef 100644 --- a/drivers/iio/Kconfig +++ b/drivers/iio/Kconfig @@ -4,7 +4,6 @@ menuconfig IIO tristate "Industrial I/O support" - select ANON_INODES help The industrial I/O subsystem provides a unified framework for drivers for many different types of embedded sensors using a diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a1fb840de45d..d318bab25860 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -25,7 +25,6 @@ config INFINIBAND_USER_MAD config INFINIBAND_USER_ACCESS tristate "InfiniBand userspace access (verbs and CM)" - select ANON_INODES depends on MMU ---help--- Userspace InfiniBand access support. This enables the diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 9de5ed38da83..3798d77d131c 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -22,7 +22,6 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM || ARM64) - select ANON_INODES help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. diff --git a/fs/Makefile b/fs/Makefile index 427fec226fae..35945f8139e6 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o -obj-$(CONFIG_ANON_INODES) += anon_inodes.o +obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 735bfb2e9190..521dc91d2cb5 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig @@ -1,7 +1,6 @@ config FANOTIFY bool "Filesystem wide access notification" select FSNOTIFY - select ANON_INODES select EXPORTFS default n ---help--- diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index b981fc0c8379..0161c74e76e2 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -1,6 +1,5 @@ config INOTIFY_USER bool "Inotify support for userspace" - select ANON_INODES select FSNOTIFY default y ---help--- diff --git a/init/Kconfig b/init/Kconfig index 4592bf7997c0..be8f97e37a76 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1171,9 +1171,6 @@ config LD_DEAD_CODE_DATA_ELIMINATION config SYSCTL bool -config ANON_INODES - bool - config HAVE_UID16 bool @@ -1378,14 +1375,12 @@ config HAVE_FUTEX_CMPXCHG config EPOLL bool "Enable eventpoll support" if EXPERT default y - select ANON_INODES help Disabling this option will cause the kernel to be built without support for epoll family of system calls. config SIGNALFD bool "Enable signalfd() system call" if EXPERT - select ANON_INODES default y help Enable the signalfd() system call that allows to receive signals @@ -1395,7 +1390,6 @@ config SIGNALFD config TIMERFD bool "Enable timerfd() system call" if EXPERT - select ANON_INODES default y help Enable the timerfd() system call that allows to receive timer @@ -1405,7 +1399,6 @@ config TIMERFD config EVENTFD bool "Enable eventfd() system call" if EXPERT - select ANON_INODES default y help Enable the eventfd() system call that allows to receive both @@ -1516,7 +1509,6 @@ config KALLSYMS_BASE_RELATIVE # syscall, maps, verifier config BPF_SYSCALL bool "Enable bpf() system call" - select ANON_INODES select BPF select IRQ_WORK default n @@ -1533,7 +1525,6 @@ config BPF_JIT_ALWAYS_ON config USERFAULTFD bool "Enable userfaultfd() system call" - select ANON_INODES depends on MMU help Enable the userfaultfd() system call that allows to intercept and @@ -1600,7 +1591,6 @@ config PERF_EVENTS bool "Kernel performance events and counters" default y if PROFILING depends on HAVE_PERF_EVENTS - select ANON_INODES select IRQ_WORK select SRCU help -- cgit v1.2.3 From 6041186a32585fc7a1d0f6cfe2f138b05fdc3c82 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 18 Apr 2019 17:50:44 -0700 Subject: init: initialize jump labels before command line option parsing When a module option, or core kernel argument, toggles a static-key it requires jump labels to be initialized early. While x86, PowerPC, and ARM64 arrange for jump_label_init() to be called before parse_args(), ARM does not. Kernel command line: rdinit=/sbin/init page_alloc.shuffle=1 panic=-1 console=ttyAMA0,115200 page_alloc.shuffle=1 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at ./include/linux/jump_label.h:303 page_alloc_shuffle+0x12c/0x1ac static_key_enable(): static key 'page_alloc_shuffle_key+0x0/0x4' used before call to jump_label_init() Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 5.1.0-rc4-next-20190410-00003-g3367c36ce744 #1 Hardware name: ARM Integrator/CP (Device Tree) [] (unwind_backtrace) from [] (show_stack+0x10/0x18) [] (show_stack) from [] (dump_stack+0x18/0x24) [] (dump_stack) from [] (__warn+0xe0/0x108) [] (__warn) from [] (warn_slowpath_fmt+0x44/0x6c) [] (warn_slowpath_fmt) from [] (page_alloc_shuffle+0x12c/0x1ac) [] (page_alloc_shuffle) from [] (shuffle_store+0x28/0x48) [] (shuffle_store) from [] (parse_args+0x1f4/0x350) [] (parse_args) from [] (start_kernel+0x1c0/0x488) Move the fallback call to jump_label_init() to occur before parse_args(). The redundant calls to jump_label_init() in other archs are left intact in case they have static key toggling use cases that are even earlier than option parsing. Link: http://lkml.kernel.org/r/155544804466.1032396.13418949511615676665.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Guenter Roeck Reviewed-by: Kees Cook Cc: Mathieu Desnoyers Cc: Thomas Gleixner Cc: Mike Rapoport Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'init') diff --git a/init/main.c b/init/main.c index 598e278b46f7..7d4025d665eb 100644 --- a/init/main.c +++ b/init/main.c @@ -582,6 +582,8 @@ asmlinkage __visible void __init start_kernel(void) page_alloc_init(); pr_notice("Kernel command line: %s\n", boot_command_line); + /* parameters may set static keys */ + jump_label_init(); parse_early_param(); after_dashes = parse_args("Booting kernel", static_command_line, __start___param, @@ -591,8 +593,6 @@ asmlinkage __visible void __init start_kernel(void) parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, NULL, set_init_arg); - jump_label_init(); - /* * These use large bootmem allocations and must precede * kmem_cache_init() -- cgit v1.2.3 From d55535232c3dbde9a523a9d10d68670f5fe5dec3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 19 Apr 2019 23:27:05 -0400 Subject: random: move rand_initialize() earlier Right now rand_initialize() is run as an early_initcall(), but it only depends on timekeeping_init() (for mixing ktime_get_real() into the pools). However, the call to boot_init_stack_canary() for stack canary initialization runs earlier, which triggers a warning at boot: random: get_random_bytes called from start_kernel+0x357/0x548 with crng_init=0 Instead, this moves rand_initialize() to after timekeeping_init(), and moves canary initialization here as well. Note that this warning may still remain for machines that do not have UEFI RNG support (which initializes the RNG pools during setup_arch()), or for x86 machines without RDRAND (or booting without "random.trust=on" or CONFIG_RANDOM_TRUST_CPU=y). Signed-off-by: Kees Cook Signed-off-by: Theodore Ts'o --- drivers/char/random.c | 5 ++--- include/linux/random.h | 1 + init/main.c | 21 ++++++++++++++------- 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'init') diff --git a/drivers/char/random.c b/drivers/char/random.c index e247c45b2772..8757ed493b11 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1783,7 +1783,7 @@ EXPORT_SYMBOL(get_random_bytes_arch); * data into the pool to prepare it for use. The pool is not cleared * as that can only decrease the entropy in the pool. */ -static void init_std_data(struct entropy_store *r) +static void __init init_std_data(struct entropy_store *r) { int i; ktime_t now = ktime_get_real(); @@ -1810,7 +1810,7 @@ static void init_std_data(struct entropy_store *r) * take care not to overwrite the precious per platform data * we were given. */ -static int rand_initialize(void) +int __init rand_initialize(void) { init_std_data(&input_pool); init_std_data(&blocking_pool); @@ -1822,7 +1822,6 @@ static int rand_initialize(void) } return 0; } -early_initcall(rand_initialize); #ifdef CONFIG_BLOCK void rand_initialize_disk(struct gendisk *disk) diff --git a/include/linux/random.h b/include/linux/random.h index 445a0ea4ff49..13aeaf5a4bd4 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -36,6 +36,7 @@ extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; extern void get_random_bytes(void *buf, int nbytes); extern int wait_for_random_bytes(void); +extern int __init rand_initialize(void); extern bool rng_is_initialized(void); extern int add_random_ready_callback(struct random_ready_callback *rdy); extern void del_random_ready_callback(struct random_ready_callback *rdy); diff --git a/init/main.c b/init/main.c index 598e278b46f7..55243bc0a067 100644 --- a/init/main.c +++ b/init/main.c @@ -564,13 +564,6 @@ asmlinkage __visible void __init start_kernel(void) page_address_init(); pr_notice("%s", linux_banner); setup_arch(&command_line); - /* - * Set up the the initial canary and entropy after arch - * and after adding latent and command line entropy. - */ - add_latent_entropy(); - add_device_randomness(command_line, strlen(command_line)); - boot_init_stack_canary(); mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); @@ -655,6 +648,20 @@ asmlinkage __visible void __init start_kernel(void) hrtimers_init(); softirq_init(); timekeeping_init(); + + /* + * For best initial stack canary entropy, prepare it after: + * - setup_arch() for any UEFI RNG entropy and boot cmdline access + * - timekeeping_init() for ktime entropy used in rand_initialize() + * - rand_initialize() to get any arch-specific entropy like RDRAND + * - add_latent_entropy() to get any latent entropy + * - adding command line entropy + */ + rand_initialize(); + add_latent_entropy(); + add_device_randomness(command_line, strlen(command_line)); + boot_init_stack_canary(); + time_init(); printk_safe_init(); perf_event_init(); -- cgit v1.2.3 From 43d8ce9d65a54846d378545770991e65838981e0 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 26 Apr 2019 15:04:29 -0400 Subject: Provide in-kernel headers to make extending kernel easier Introduce in-kernel headers which are made available as an archive through proc (/proc/kheaders.tar.xz file). This archive makes it possible to run eBPF and other tracing programs that need to extend the kernel for tracing purposes without any dependency on the file system having headers. A github PR is sent for the corresponding BCC patch at: https://github.com/iovisor/bcc/pull/2312 On Android and embedded systems, it is common to switch kernels but not have kernel headers available on the file system. Further once a different kernel is booted, any headers stored on the file system will no longer be useful. This is an issue even well known to distros. By storing the headers as a compressed archive within the kernel, we can avoid these issues that have been a hindrance for a long time. The best way to use this feature is by building it in. Several users have a need for this, when they switch debug kernels, they do not want to update the filesystem or worry about it where to store the headers on it. However, the feature is also buildable as a module in case the user desires it not being part of the kernel image. This makes it possible to load and unload the headers from memory on demand. A tracing program can load the module, do its operations, and then unload the module to save kernel memory. The total memory needed is 3.3MB. By having the archive available at a fixed location independent of filesystem dependencies and conventions, all debugging tools can directly refer to the fixed location for the archive, without concerning with where the headers on a typical filesystem which significantly simplifies tooling that needs kernel headers. The code to read the headers is based on /proc/config.gz code and uses the same technique to embed the headers. Other approaches were discussed such as having an in-memory mountable filesystem, but that has drawbacks such as requiring an in-kernel xz decompressor which we don't have today, and requiring usage of 42 MB of kernel memory to host the decompressed headers at anytime. Also this approach is simpler than such approaches. Reviewed-by: Masahiro Yamada Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- init/Kconfig | 10 ++++++ kernel/.gitignore | 1 + kernel/Makefile | 10 ++++++ kernel/gen_ikh_data.sh | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kheaders.c | 74 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 184 insertions(+) create mode 100755 kernel/gen_ikh_data.sh create mode 100644 kernel/kheaders.c (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index 4592bf7997c0..47c0db6e63a5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -580,6 +580,16 @@ config IKCONFIG_PROC This option enables access to the kernel configuration file through /proc/config.gz. +config IKHEADERS_PROC + tristate "Enable kernel header artifacts through /proc/kheaders.tar.xz" + depends on PROC_FS + help + This option enables access to the kernel header and other artifacts that + are generated during the build process. These can be used to build eBPF + tracing programs, or similar programs. If you build the headers as a + module, a module called kheaders.ko is built which can be loaded on-demand + to get access to the headers. + config LOG_BUF_SHIFT int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" range 12 25 diff --git a/kernel/.gitignore b/kernel/.gitignore index 6e699100872f..34d1e77ee9df 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -1,5 +1,6 @@ # # Generated files # +kheaders.md5 timeconst.h hz.bc diff --git a/kernel/Makefile b/kernel/Makefile index 6c57e78817da..12399614c350 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -70,6 +70,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o @@ -121,3 +122,12 @@ $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) + +$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz + +quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz +cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ +$(obj)/kheaders_data.tar.xz: FORCE + $(call cmd,genikh) + +clean-files := kheaders_data.tar.xz kheaders.md5 diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_ikh_data.sh new file mode 100755 index 000000000000..591a94f7b387 --- /dev/null +++ b/kernel/gen_ikh_data.sh @@ -0,0 +1,89 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This script generates an archive consisting of kernel headers +# for CONFIG_IKHEADERS_PROC. +set -e +spath="$(dirname "$(readlink -f "$0")")" +kroot="$spath/.." +outdir="$(pwd)" +tarfile=$1 +cpio_dir=$outdir/$tarfile.tmp + +# Script filename relative to the kernel source root +# We add it to the archive because it is small and any changes +# to this script will also cause a rebuild of the archive. +sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" + +src_file_list=" +include/ +arch/$SRCARCH/include/ +$sfile +" + +obj_file_list=" +include/ +arch/$SRCARCH/include/ +" + +# Support incremental builds by skipping archive generation +# if timestamps of files being archived are not changed. + +# This block is useful for debugging the incremental builds. +# Uncomment it for debugging. +# iter=1 +# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; +# else; iter=$(($(cat /tmp/iter) + 1)); fi +# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter +# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter + +# include/generated/compile.h is ignored because it is touched even when none +# of the source files changed. This causes pointless regeneration, so let us +# ignore them for md5 calculation. +pushd $kroot > /dev/null +src_files_md5="$(find $src_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" +popd > /dev/null +obj_files_md5="$(find $obj_file_list -type f | + grep -v "include/generated/compile.h" | + xargs ls -lR | md5sum | cut -d ' ' -f1)" + +if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi +if [ -f kernel/kheaders.md5 ] && + [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && + [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && + [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + exit +fi + +if [ "${quiet}" != "silent_" ]; then + echo " GEN $tarfile" +fi + +rm -rf $cpio_dir +mkdir $cpio_dir + +pushd $kroot > /dev/null +for f in $src_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir +popd > /dev/null + +# The second CPIO can complain if files already exist which can +# happen with out of tree builds. Just silence CPIO for now. +for f in $obj_file_list; + do find "$f" ! -name "*.cmd" ! -name ".*"; +done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 + +# Remove comments except SDPX lines +find $cpio_dir -type f -print0 | + xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' + +tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null + +echo "$src_files_md5" > kernel/kheaders.md5 +echo "$obj_files_md5" >> kernel/kheaders.md5 +echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 + +rm -rf $cpio_dir diff --git a/kernel/kheaders.c b/kernel/kheaders.c new file mode 100644 index 000000000000..70ae6052920d --- /dev/null +++ b/kernel/kheaders.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide kernel headers useful to build tracing programs + * such as for running eBPF tracing tools. + * + * (Borrowed code from kernel/configs.c) + */ + +#include +#include +#include +#include +#include + +/* + * Define kernel_headers_data and kernel_headers_data_end, within which the + * compressed kernel headers are stored. The file is first compressed with xz. + */ + +asm ( +" .pushsection .rodata, \"a\" \n" +" .global kernel_headers_data \n" +"kernel_headers_data: \n" +" .incbin \"kernel/kheaders_data.tar.xz\" \n" +" .global kernel_headers_data_end \n" +"kernel_headers_data_end: \n" +" .popsection \n" +); + +extern char kernel_headers_data; +extern char kernel_headers_data_end; + +static ssize_t +ikheaders_read_current(struct file *file, char __user *buf, + size_t len, loff_t *offset) +{ + return simple_read_from_buffer(buf, len, offset, + &kernel_headers_data, + &kernel_headers_data_end - + &kernel_headers_data); +} + +static const struct file_operations ikheaders_file_ops = { + .read = ikheaders_read_current, + .llseek = default_llseek, +}; + +static int __init ikheaders_init(void) +{ + struct proc_dir_entry *entry; + + /* create the current headers file */ + entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, + &ikheaders_file_ops); + if (!entry) + return -ENOMEM; + + proc_set_size(entry, + &kernel_headers_data_end - + &kernel_headers_data); + return 0; +} + +static void __exit ikheaders_cleanup(void) +{ + remove_proc_entry("kheaders.tar.xz", NULL); +} + +module_init(ikheaders_init); +module_exit(ikheaders_cleanup); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joel Fernandes"); +MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel"); -- cgit v1.2.3 From bc0c60457c35c895b5591f85046b1f13da5487c4 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 26 Apr 2019 15:04:30 -0400 Subject: init/config: Do not select BUILD_BIN2C for IKCONFIG Since commit 13610aa908dc ("kernel/configs: use .incbin directive to embed config_data.gz"), IKCONFIG no longer uses BUILD_BIN2C so prevent it from being selected in Kconfig. Reviewed-by: Masahiro Yamada Signed-off-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- init/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index 47c0db6e63a5..26a364a95b57 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -562,7 +562,6 @@ config BUILD_BIN2C config IKCONFIG tristate "Kernel .config support" - select BUILD_BIN2C ---help--- This option enables the complete Linux kernel ".config" file contents to be saved in the kernel. It provides documentation -- cgit v1.2.3 From 4fc19708b165c1c152fa1f12f6600e66184b7786 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 26 Apr 2019 16:22:46 -0700 Subject: x86/alternatives: Initialize temporary mm for patching To prevent improper use of the PTEs that are used for text patching, the next patches will use a temporary mm struct. Initailize it by copying the init mm. The address that will be used for patching is taken from the lower area that is usually used for the task memory. Doing so prevents the need to frequently synchronize the temporary-mm (e.g., when BPF programs are installed), since different PGDs are used for the task memory. Finally, randomize the address of the PTEs to harden against exploits that use these PTEs. Suggested-by: Andy Lutomirski Tested-by: Masami Hiramatsu Signed-off-by: Nadav Amit Signed-off-by: Rick Edgecombe Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Rik van Riel Cc: Thomas Gleixner Cc: akpm@linux-foundation.org Cc: ard.biesheuvel@linaro.org Cc: deneen.t.dock@intel.com Cc: kernel-hardening@lists.openwall.com Cc: kristen@linux.intel.com Cc: linux_dti@icloud.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190426232303.28381-8-nadav.amit@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 3 +++ arch/x86/include/asm/text-patching.h | 2 ++ arch/x86/kernel/alternative.c | 3 +++ arch/x86/mm/init.c | 37 ++++++++++++++++++++++++++++++++++++ init/main.c | 3 +++ 5 files changed, 48 insertions(+) (limited to 'init') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2779ace16d23..702db5904753 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1021,6 +1021,9 @@ static inline void __meminit init_trampoline_default(void) /* Default trampoline pgd value */ trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; } + +void __init poking_init(void); + # ifdef CONFIG_RANDOMIZE_MEMORY void __meminit init_trampoline(void); # else diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index f8fc8e86cf01..a75eed841eed 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -39,5 +39,7 @@ extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); extern int after_bootmem; +extern __ro_after_init struct mm_struct *poking_mm; +extern __ro_after_init unsigned long poking_addr; #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 0a814d73547a..11d5c710a94f 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -679,6 +679,9 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, return addr; } +__ro_after_init struct mm_struct *poking_mm; +__ro_after_init unsigned long poking_addr; + static void *__text_poke(void *addr, const void *opcode, size_t len) { unsigned long flags; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8dacdb96899e..fd10d91a6115 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,7 @@ #include #include #include +#include /* * We need to define the tracepoints somewhere, and tlb.c @@ -701,6 +703,41 @@ void __init init_mem_mapping(void) early_memtest(0, max_pfn_mapped << PAGE_SHIFT); } +/* + * Initialize an mm_struct to be used during poking and a pointer to be used + * during patching. + */ +void __init poking_init(void) +{ + spinlock_t *ptl; + pte_t *ptep; + + poking_mm = copy_init_mm(); + BUG_ON(!poking_mm); + + /* + * Randomize the poking address, but make sure that the following page + * will be mapped at the same PMD. We need 2 pages, so find space for 3, + * and adjust the address if the PMD ends after the first one. + */ + poking_addr = TASK_UNMAPPED_BASE; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % + (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); + + if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) + poking_addr += PAGE_SIZE; + + /* + * We need to trigger the allocation of the page-tables that will be + * needed for poking now. Later, poking may be performed in an atomic + * section, which might cause allocation to fail. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + BUG_ON(!ptep); + pte_unmap_unlock(ptep, ptl); +} + /* * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. diff --git a/init/main.c b/init/main.c index 7d4025d665eb..95dd9406ee31 100644 --- a/init/main.c +++ b/init/main.c @@ -504,6 +504,8 @@ void __init __weak thread_stack_cache_init(void) void __init __weak mem_encrypt_init(void) { } +void __init __weak poking_init(void) { } + bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); @@ -737,6 +739,7 @@ asmlinkage __visible void __init start_kernel(void) taskstats_init_early(); delayacct_init(); + poking_init(); check_bugs(); acpi_subsystem_init(); -- cgit v1.2.3 From caa841360134f863987f2d4f77b8dc2fbb7596f8 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sat, 4 May 2019 18:11:24 -0700 Subject: x86/mm: Initialize PGD cache during mm initialization Poking-mm initialization might require to duplicate the PGD in early stage. Initialize the PGD cache earlier to prevent boot failures. Reported-by: kernel test robot Signed-off-by: Nadav Amit Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rick Edgecombe Cc: Rik van Riel Cc: Stephen Rothwell Cc: Thomas Gleixner Fixes: 4fc19708b165 ("x86/alternatives: Initialize temporary mm for patching") Link: http://lkml.kernel.org/r/20190505011124.39692-1-namit@vmware.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 10 ++++++---- include/asm-generic/pgtable.h | 2 ++ init/main.c | 3 +++ 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'init') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7bd01709a091..c8177045b7d4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, static struct kmem_cache *pgd_cache; -static int __init pgd_cache_init(void) +void __init pgd_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use * shared kernel pmd. And this requires a whole page for pgd. */ if (!SHARED_KERNEL_PMD) - return 0; + return; /* * when PAE kernel is not running as a Xen domain, it uses @@ -390,9 +390,7 @@ static int __init pgd_cache_init(void) */ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, SLAB_PANIC, NULL); - return 0; } -core_initcall(pgd_cache_init); static inline pgd_t *_pgd_alloc(void) { @@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd) } #else +void __init pgd_cache_init(void) +{ +} + static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index fa782fba51ee..75d9d68a6de7 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1126,6 +1126,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, static inline void init_espfix_bsp(void) { } #endif +extern void __init pgd_cache_init(void); + #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { diff --git a/init/main.c b/init/main.c index 95dd9406ee31..9dc2f3b4f753 100644 --- a/init/main.c +++ b/init/main.c @@ -506,6 +506,8 @@ void __init __weak mem_encrypt_init(void) { } void __init __weak poking_init(void) { } +void __init __weak pgd_cache_init(void) { } + bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); @@ -537,6 +539,7 @@ static void __init mm_init(void) init_espfix_bsp(); /* Should be run after espfix64 is set up. */ pti_init(); + pgd_cache_init(); } void __init __weak arch_call_rest_init(void) -- cgit v1.2.3 From 54c7a8916a887f357088f99e9c3a7720cd57d2c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:17 -0700 Subject: initramfs: free initrd memory if opening /initrd.image fails Patch series "initramfs tidyups". I've spent some time chasing down behavior in initramfs and found plenty of opportunity to improve the code. A first stab on that is contained in this series. This patch (of 7): We free the initrd memory for all successful or error cases except for the case where opening /initrd.image fails, which looks like an oversight. Steven said: : This also changes the behaviour when CONFIG_INITRAMFS_FORCE is enabled : - specifically it means that the initrd is freed (previously it was : ignored and never freed). But that seems like reasonable behaviour and : the previous behaviour looks like another oversight. Link: http://lkml.kernel.org/r/20190213174621.29297-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Steven Price Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Alexander Viro Cc: Russell King Cc: Will Deacon Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'init') diff --git a/init/initramfs.c b/init/initramfs.c index 4749e1115eef..c322e1099f43 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -612,13 +612,12 @@ static int __init populate_rootfs(void) printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); - if (!err) { - free_initrd(); + if (!err) goto done; - } else { - clean_rootfs(); - unpack_to_rootfs(__initramfs_start, __initramfs_size); - } + + clean_rootfs(); + unpack_to_rootfs(__initramfs_start, __initramfs_size); + printk(KERN_INFO "rootfs image is not initramfs (%s)" "; looks like an initrd\n", err); fd = ksys_open("/initrd.image", @@ -632,7 +631,6 @@ static int __init populate_rootfs(void) written, initrd_end - initrd_start); ksys_close(fd); - free_initrd(); } done: /* empty statement */; @@ -642,9 +640,9 @@ static int __init populate_rootfs(void) initrd_end - initrd_start); if (err) printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); - free_initrd(); #endif } + free_initrd(); flush_delayed_fput(); return 0; } -- cgit v1.2.3 From 23091e287355440fb680868c23bcada594d3f399 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:21 -0700 Subject: initramfs: cleanup initrd freeing Factor the kexec logic into a separate helper, and then inline the rest of free_initrd into the only caller. Link: http://lkml.kernel.org/r/20190213174621.29297-4-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 53 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'init') diff --git a/init/initramfs.c b/init/initramfs.c index c322e1099f43..5fda9557a134 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -518,37 +518,35 @@ extern unsigned long __initramfs_size; #include #include -static void __init free_initrd(void) -{ #ifdef CONFIG_KEXEC_CORE +static bool kexec_free_initrd(void) +{ unsigned long crashk_start = (unsigned long)__va(crashk_res.start); unsigned long crashk_end = (unsigned long)__va(crashk_res.end); -#endif - if (do_retain_initrd) - goto skip; -#ifdef CONFIG_KEXEC_CORE /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ - if (initrd_start < crashk_end && initrd_end > crashk_start) { - /* - * Initialize initrd memory region since the kexec boot does - * not do. - */ - memset((void *)initrd_start, 0, initrd_end - initrd_start); - if (initrd_start < crashk_start) - free_initrd_mem(initrd_start, crashk_start); - if (initrd_end > crashk_end) - free_initrd_mem(crashk_end, initrd_end); - } else -#endif - free_initrd_mem(initrd_start, initrd_end); -skip: - initrd_start = 0; - initrd_end = 0; + if (initrd_start >= crashk_end || initrd_end <= crashk_start) + return false; + + /* + * Initialize initrd memory region since the kexec boot does not do. + */ + memset((void *)initrd_start, 0, initrd_end - initrd_start); + if (initrd_start < crashk_start) + free_initrd_mem(initrd_start, crashk_start); + if (initrd_end > crashk_end) + free_initrd_mem(crashk_end, initrd_end); + return true; } +#else +static inline bool kexec_free_initrd(void) +{ + return false; +} +#endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_BLK_DEV_RAM #define BUF_SIZE 1024 @@ -642,7 +640,16 @@ static int __init populate_rootfs(void) printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); #endif } - free_initrd(); + + /* + * If the initrd region is overlapped with crashkernel reserved region, + * free only memory that is not part of crashkernel region. + */ + if (!do_retain_initrd && !kexec_free_initrd()) + free_initrd_mem(initrd_start, initrd_end); + initrd_start = 0; + initrd_end = 0; + flush_delayed_fput(); return 0; } -- cgit v1.2.3 From 7c184ecd262fe64fe8cf4e099e0f7cefe88d88b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:24 -0700 Subject: initramfs: factor out a helper to populate the initrd image This will allow for cleaner code sharing in the caller. Link: http://lkml.kernel.org/r/20190213174621.29297-5-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'init') diff --git a/init/initramfs.c b/init/initramfs.c index 5fda9557a134..e3de626dbd98 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -597,6 +597,28 @@ static void __init clean_rootfs(void) } #endif +#ifdef CONFIG_BLK_DEV_RAM +static void populate_initrd_image(char *err) +{ + ssize_t written; + int fd; + + unpack_to_rootfs(__initramfs_start, __initramfs_size); + + printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", + err); + fd = ksys_open("/initrd.image", O_WRONLY | O_CREAT, 0700); + if (fd < 0) + return; + + written = xwrite(fd, (char *)initrd_start, initrd_end - initrd_start); + if (written != initrd_end - initrd_start) + pr_err("/initrd.image: incomplete write (%zd != %ld)\n", + written, initrd_end - initrd_start); + ksys_close(fd); +} +#endif /* CONFIG_BLK_DEV_RAM */ + static int __init populate_rootfs(void) { /* Load the built in initramfs */ @@ -606,7 +628,6 @@ static int __init populate_rootfs(void) /* If available load the bootloader supplied initrd */ if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { #ifdef CONFIG_BLK_DEV_RAM - int fd; printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); @@ -614,22 +635,7 @@ static int __init populate_rootfs(void) goto done; clean_rootfs(); - unpack_to_rootfs(__initramfs_start, __initramfs_size); - - printk(KERN_INFO "rootfs image is not initramfs (%s)" - "; looks like an initrd\n", err); - fd = ksys_open("/initrd.image", - O_WRONLY|O_CREAT, 0700); - if (fd >= 0) { - ssize_t written = xwrite(fd, (char *)initrd_start, - initrd_end - initrd_start); - - if (written != initrd_end - initrd_start) - pr_err("/initrd.image: incomplete write (%zd != %ld)\n", - written, initrd_end - initrd_start); - - ksys_close(fd); - } + populate_initrd_image(err); done: /* empty statement */; #else -- cgit v1.2.3 From afef7889c480ed134247f16c2ebdeabd75e77fd0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:27 -0700 Subject: initramfs: cleanup populate_rootfs The code for kernels that support ramdisks or not is mostly the same. Unify it by using an IS_ENABLED for the info message, and moving the error message into a stub for populate_initrd_image. [cai@lca.pw: fix a compilation error] Link: http://lkml.kernel.org/r/20190328014806.36375-1-cai@lca.pw Link: http://lkml.kernel.org/r/20190213174621.29297-6-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Qian Cai Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) (limited to 'init') diff --git a/init/initramfs.c b/init/initramfs.c index e3de626dbd98..32f940473d67 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -595,7 +595,11 @@ static void __init clean_rootfs(void) ksys_close(fd); kfree(buf); } -#endif +#else +static inline void clean_rootfs(void) +{ +} +#endif /* CONFIG_BLK_DEV_RAM */ #ifdef CONFIG_BLK_DEV_RAM static void populate_initrd_image(char *err) @@ -617,6 +621,11 @@ static void populate_initrd_image(char *err) written, initrd_end - initrd_start); ksys_close(fd); } +#else +static void populate_initrd_image(char *err) +{ + printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); +} #endif /* CONFIG_BLK_DEV_RAM */ static int __init populate_rootfs(void) @@ -625,28 +634,22 @@ static int __init populate_rootfs(void) char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic("%s", err); /* Failed to decompress INTERNAL initramfs */ - /* If available load the bootloader supplied initrd */ - if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { -#ifdef CONFIG_BLK_DEV_RAM + + if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) + goto done; + + if (IS_ENABLED(CONFIG_BLK_DEV_RAM)) printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); - err = unpack_to_rootfs((char *)initrd_start, - initrd_end - initrd_start); - if (!err) - goto done; + else + printk(KERN_INFO "Unpacking initramfs...\n"); + err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); + if (err) { clean_rootfs(); populate_initrd_image(err); - done: - /* empty statement */; -#else - printk(KERN_INFO "Unpacking initramfs...\n"); - err = unpack_to_rootfs((char *)initrd_start, - initrd_end - initrd_start); - if (err) - printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); -#endif } +done: /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. -- cgit v1.2.3 From d8ae8a3765bfa1f9bf977e2496fcc9cf64fbfabd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:30 -0700 Subject: initramfs: move the legacy keepinitrd parameter to core code No need to handle the freeing disable in arch code when we already have a core hook (and a different name for the option) for it. Link: http://lkml.kernel.org/r/20190213174621.29297-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Catalin Marinas [arm64] Acked-by: Mike Rapoport Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 7 +++++++ arch/arm/Kconfig | 1 + arch/arm/mm/init.c | 25 ++++++------------------- arch/arm64/Kconfig | 1 + arch/arm64/mm/init.c | 17 ++--------------- arch/unicore32/Kconfig | 1 + arch/unicore32/mm/init.c | 14 +------------- init/initramfs.c | 9 +++++++++ 8 files changed, 28 insertions(+), 47 deletions(-) (limited to 'init') diff --git a/arch/Kconfig b/arch/Kconfig index 5e43fcbad4ca..f11f0698b148 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -245,6 +245,13 @@ config ARCH_HAS_FORTIFY_SOURCE An architecture should select this when it can successfully build and run with CONFIG_FORTIFY_SOURCE. +# +# Select if the arch provides a historic keepinit alias for the retain_initrd +# command line option +# +config ARCH_HAS_KEEPINITRD + bool + # Select if arch has all set_memory_ro/rw/x/nx() functions in asm/cacheflush.h config ARCH_HAS_SET_MEMORY bool diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index dc9855c4a3b4..a11dfcc2a130 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -9,6 +9,7 @@ config ARM select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_KEEPINITRD select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL if ARM_LPAE diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index c2daabbe0af0..68dcd5f8d7c6 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -695,27 +695,14 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd; - void free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) { - if (start == initrd_start) - start = round_down(start, PAGE_SIZE); - if (end == initrd_end) - end = round_up(end, PAGE_SIZE); + if (start == initrd_start) + start = round_down(start, PAGE_SIZE); + if (end == initrd_end) + end = round_up(end, PAGE_SIZE); - poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area((void *)start, (void *)end, -1, "initrd"); - } + poison_init_mem((void *)start, PAGE_ALIGN(end) - start); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; -} - -__setup("keepinitrd", keepinitrd_setup); #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3f957443f286..e24dc16453aa 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -21,6 +21,7 @@ config ARM64 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV + select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 40e2d7e5efcb..007c05a4cce0 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -578,24 +578,11 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd __initdata; - void __init free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); - memblock_free(__virt_to_phys(start), end - start); - } -} - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; + free_reserved_area((void *)start, (void *)end, 0, "initrd"); + memblock_free(__virt_to_phys(start), end - start); } - -__setup("keepinitrd", keepinitrd_setup); #endif /* diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 2445dfcf6444..afe4949cfc2d 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -3,6 +3,7 @@ config UNICORE32 def_bool y select ARCH_32BIT_OFF_T select ARCH_HAS_DEVMEM_IS_ALLOWED + select ARCH_HAS_KEEPINITRD select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_KERNEL_GZIP diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 74b6a2e29809..c0573feb5064 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -294,20 +294,8 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd; - void free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } - -__setup("keepinitrd", keepinitrd_setup); #endif diff --git a/init/initramfs.c b/init/initramfs.c index 32f940473d67..cb3d17735c66 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -513,6 +513,15 @@ static int __init retain_initrd_param(char *str) } __setup("retain_initrd", retain_initrd_param); +#ifdef CONFIG_ARCH_HAS_KEEPINITRD +static int __init keepinitrd_setup(char *__unused) +{ + do_retain_initrd = 1; + return 1; +} +__setup("keepinitrd", keepinitrd_setup); +#endif + extern char __initramfs_start[]; extern unsigned long __initramfs_size; #include -- cgit v1.2.3 From 4afd58e14dd415e456fd236755373f52e6055ec7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:34 -0700 Subject: initramfs: provide a generic free_initrd_mem implementation For most architectures free_initrd_mem just expands to the same free_reserved_area call. Provide that as a generic implementation marked __weak. Link: http://lkml.kernel.org/r/20190213174621.29297-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Geert Uytterhoeven [m68k] Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 8 -------- arch/arc/mm/init.c | 7 ------- arch/c6x/mm/init.c | 7 ------- arch/h8300/mm/init.c | 8 -------- arch/m68k/mm/init.c | 7 ------- arch/microblaze/mm/init.c | 7 ------- arch/nds32/mm/init.c | 7 ------- arch/nios2/mm/init.c | 7 ------- arch/openrisc/mm/init.c | 7 ------- arch/parisc/mm/init.c | 7 ------- arch/powerpc/mm/mem.c | 7 ------- arch/sh/mm/init.c | 7 ------- arch/um/kernel/mem.c | 7 ------- arch/unicore32/mm/init.c | 7 ------- init/initramfs.c | 5 +++++ 15 files changed, 5 insertions(+), 100 deletions(-) (limited to 'init') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index a42fc5c4db89..97f4940f11e3 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -291,11 +291,3 @@ free_initmem(void) { free_initmem_default(-1); } - -#ifdef CONFIG_BLK_DEV_INITRD -void -free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index e1ab2d7f1d64..c357a3bd1532 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -214,10 +214,3 @@ void __ref free_initmem(void) { free_initmem_default(-1); } - -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index fe582c3a1794..6fd43ec53507 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -69,13 +69,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void __init free_initmem(void) { free_initmem_default(-1); diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 0f04a5e9aa4f..ea635c9025fe 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -103,14 +103,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void free_initmem(void) { diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 8868a4c9adae..778cacb7d57b 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -147,10 +147,3 @@ void __init mem_init(void) init_pointer_tables(); mem_init_print_info(NULL); } - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 7e97d44f6538..b675bc666e68 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -186,13 +186,6 @@ void __init setup_memory(void) paging_init(); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void free_initmem(void) { free_initmem_default(-1); diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 1d03633f89a9..9a7065c1fb83 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -257,13 +257,6 @@ void free_initmem(void) free_initmem_default(-1); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 16cea5776b87..60736a725883 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -82,13 +82,6 @@ void __init mmu_init(void) flush_tlb_all(); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void __ref free_initmem(void) { free_initmem_default(-1); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index caeb4184e8a6..08df7f0b1d96 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -224,13 +224,6 @@ void __init mem_init(void) return; } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void free_initmem(void) { free_initmem_default(-1); diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 3b0f9eab7f2c..11ec1f1221a6 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -917,10 +917,3 @@ void flush_tlb_all(void) spin_unlock(&sid_lock); } #endif - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index cd525d709072..20266898f3a8 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -338,13 +338,6 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - /* * This is called when a page has been modified by the kernel. * It just marks the page as not i-cache clean. We do the i-cache diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 70621324db41..3e68d98af1bd 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -408,13 +408,6 @@ void free_initmem(void) free_initmem_default(-1); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 99aa11bf53d1..a9c9a94c096f 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -188,13 +188,6 @@ void free_initmem(void) { } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - /* Allocate and free page tables. */ pgd_t *pgd_alloc(struct mm_struct *mm) diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index c0573feb5064..6e352de80038 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -292,10 +292,3 @@ void free_initmem(void) { free_initmem_default(-1); } - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/init/initramfs.c b/init/initramfs.c index cb3d17735c66..fcb759a106be 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -527,6 +527,11 @@ extern unsigned long __initramfs_size; #include #include +void __weak free_initrd_mem(unsigned long start, unsigned long end) +{ + free_reserved_area((void *)start, (void *)end, -1, "initrd"); +} + #ifdef CONFIG_KEXEC_CORE static bool kexec_free_initrd(void) { -- cgit v1.2.3 From f94f7434cbbb02f7eb55ed5ad66284023c47968f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:37 -0700 Subject: initramfs: poison freed initrd memory Various architectures including x86 poison the freed initrd memory. Do the same in the generic free_initrd_mem implementation and switch a few more architectures that are identical to the generic code over to it now. Link: http://lkml.kernel.org/r/20190213174621.29297-9-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/init.c | 8 -------- arch/s390/mm/init.c | 8 -------- arch/sparc/mm/init_32.c | 8 -------- arch/sparc/mm/init_64.c | 8 -------- init/initramfs.c | 3 ++- 5 files changed, 2 insertions(+), 33 deletions(-) (limited to 'init') diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index bbb196ad5f26..8a038b30d3c4 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -504,14 +504,6 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - void (*free_init_pages_eva)(void *begin, void *end) = NULL; void __ref free_initmem(void) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 7cf48eefec8f..5f48fc7e61d5 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -157,14 +157,6 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - unsigned long memory_block_size_bytes(void) { /* diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index a8ff29821bdb..417f89d5e0b2 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -299,14 +299,6 @@ void free_initmem (void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - void sparc_flush_page_to_ram(struct page *page) { unsigned long vaddr = (unsigned long)page_address(page); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index bc2aaa47bc8a..4b099dd7a767 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2572,14 +2572,6 @@ void free_initmem(void) } } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - pgprot_t PAGE_KERNEL __read_mostly; EXPORT_SYMBOL(PAGE_KERNEL); diff --git a/init/initramfs.c b/init/initramfs.c index fcb759a106be..435a428c2af1 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -529,7 +529,8 @@ extern unsigned long __initramfs_size; void __weak free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, -1, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #ifdef CONFIG_KEXEC_CORE -- cgit v1.2.3 From 997aef68af3ef1f2cb97da1c0b41a5afa87f63e2 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:18:40 -0700 Subject: init: provide a generic free_initmem implementation Patch series "provide a generic free_initmem implementation", v2. Many architectures implement free_initmem() in exactly the same or very similar way: they wrap the call to free_initmem_default() with sometimes different 'poison' parameter. These patches switch those architectures to use a generic implementation that does free_initmem_default(POISON_FREE_INITMEM). This was inspired by Christoph's patches for free_initrd_mem [1] and I shamelessly copied changelog entries from his patches :) [1] https://lore.kernel.org/lkml/20190213174621.29297-1-hch@lst.de/ This patch (of 2): For most architectures free_initmem just a wrapper for the same free_initmem_default(-1) call. Provide that as a generic implementation marked __weak. Link: http://lkml.kernel.org/r/1550515285-17446-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Christoph Hellwig Cc: Palmer Dabbelt Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 6 ------ arch/arc/mm/init.c | 8 -------- arch/c6x/mm/init.c | 5 ----- arch/h8300/mm/init.c | 6 ------ arch/microblaze/mm/init.c | 5 ----- arch/nds32/mm/init.c | 5 ----- arch/nios2/mm/init.c | 5 ----- arch/openrisc/mm/init.c | 5 ----- arch/sh/mm/init.c | 5 ----- arch/unicore32/mm/init.c | 5 ----- arch/xtensa/mm/init.c | 5 ----- init/main.c | 5 +++++ 12 files changed, 5 insertions(+), 60 deletions(-) (limited to 'init') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 97f4940f11e3..e2cbec3789e8 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -285,9 +285,3 @@ mem_init(void) memblock_free_all(); mem_init_print_info(NULL); } - -void -free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index c357a3bd1532..02b7a3b20d7c 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -206,11 +206,3 @@ void __init mem_init(void) memblock_free_all(); mem_init_print_info(NULL); } - -/* - * free_initmem: Free all the __init memory. - */ -void __ref free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 6fd43ec53507..573242b160e1 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -68,8 +68,3 @@ void __init mem_init(void) mem_init_print_info(NULL); } - -void __init free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index ea635c9025fe..1eab16b1a0bc 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -102,9 +102,3 @@ void __init mem_init(void) mem_init_print_info(NULL); } - -void -free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index b675bc666e68..a015a951c8b7 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -186,11 +186,6 @@ void __init setup_memory(void) paging_init(); } -void free_initmem(void) -{ - free_initmem_default(-1); -} - void __init mem_init(void) { high_memory = (void *)__va(memory_start + lowmem_size - 1); diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 9a7065c1fb83..1a4ab1b7525f 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -252,11 +252,6 @@ void __init mem_init(void) return; } -void free_initmem(void) -{ - free_initmem_default(-1); -} - void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 60736a725883..2c609c2516b2 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -82,11 +82,6 @@ void __init mmu_init(void) flush_tlb_all(); } -void __ref free_initmem(void) -{ - free_initmem_default(-1); -} - #define __page_aligned(order) __aligned(PAGE_SIZE << (order)) pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned(PGD_ORDER); pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned(PTE_ORDER); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 08df7f0b1d96..abe87e54e231 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -223,8 +223,3 @@ void __init mem_init(void) mem_init_done = 1; return; } - -void free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3e68d98af1bd..aeb9f45c7a39 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -403,11 +403,6 @@ void __init mem_init(void) mem_init_done = 1; } -void free_initmem(void) -{ - free_initmem_default(-1); -} - #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock) diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 6e352de80038..b4442f3060ce 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -287,8 +287,3 @@ void __init mem_init(void) sysctl_overcommit_memory = OVERCOMMIT_ALWAYS; } } - -void free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index d49861099684..b51746f2b80b 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -216,11 +216,6 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void free_initmem(void) -{ - free_initmem_default(-1); -} - static void __init parse_memmap_one(char *p) { char *oldp; diff --git a/init/main.c b/init/main.c index 33c87e91dc37..26234570a324 100644 --- a/init/main.c +++ b/init/main.c @@ -1074,6 +1074,11 @@ static inline void mark_readonly(void) } #endif +void __weak free_initmem(void) +{ + free_initmem_default(-1); +} + static int __ref kernel_init(void *unused) { int ret; -- cgit v1.2.3 From f40399992a245c852ad446e265d1567010db5e10 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:18:46 -0700 Subject: init: free_initmem: poison freed init memory Various architectures including x86 poison the freed init memory. Do the same in the generic free_initmem implementation and switch sparc32 architecture that is identical to the generic code over to it now. Link: http://lkml.kernel.org/r/1550515285-17446-4-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Christoph Hellwig Cc: Palmer Dabbelt Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/mm/init_32.c | 5 ----- init/main.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'init') diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 417f89d5e0b2..046ab116cc8c 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -294,11 +294,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -void free_initmem (void) -{ - free_initmem_default(POISON_FREE_INITMEM); -} - void sparc_flush_page_to_ram(struct page *page) { unsigned long vaddr = (unsigned long)page_address(page); diff --git a/init/main.c b/init/main.c index 26234570a324..5a2c69b4d7b3 100644 --- a/init/main.c +++ b/init/main.c @@ -1076,7 +1076,7 @@ static inline void mark_readonly(void) void __weak free_initmem(void) { - free_initmem_default(-1); + free_initmem_default(POISON_FREE_INITMEM); } static int __ref kernel_init(void *unused) -- cgit v1.2.3 From e900a918b0984ec8f2eb150b8477a47b75d17692 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 14 May 2019 15:41:28 -0700 Subject: mm: shuffle initial free memory to improve memory-side-cache utilization Patch series "mm: Randomize free memory", v10. This patch (of 3): Randomization of the page allocator improves the average utilization of a direct-mapped memory-side-cache. Memory side caching is a platform capability that Linux has been previously exposed to in HPC (high-performance computing) environments on specialty platforms. In that instance it was a smaller pool of high-bandwidth-memory relative to higher-capacity / lower-bandwidth DRAM. Now, this capability is going to be found on general purpose server platforms where DRAM is a cache in front of higher latency persistent memory [1]. Robert offered an explanation of the state of the art of Linux interactions with memory-side-caches [2], and I copy it here: It's been a problem in the HPC space: http://www.nersc.gov/research-and-development/knl-cache-mode-performance-coe/ A kernel module called zonesort is available to try to help: https://software.intel.com/en-us/articles/xeon-phi-software and this abandoned patch series proposed that for the kernel: https://lkml.kernel.org/r/20170823100205.17311-1-lukasz.daniluk@intel.com Dan's patch series doesn't attempt to ensure buffers won't conflict, but also reduces the chance that the buffers will. This will make performance more consistent, albeit slower than "optimal" (which is near impossible to attain in a general-purpose kernel). That's better than forcing users to deploy remedies like: "To eliminate this gradual degradation, we have added a Stream measurement to the Node Health Check that follows each job; nodes are rebooted whenever their measured memory bandwidth falls below 300 GB/s." A replacement for zonesort was merged upstream in commit cc9aec03e58f ("x86/numa_emulation: Introduce uniform split capability"). With this numa_emulation capability, memory can be split into cache sized ("near-memory" sized) numa nodes. A bind operation to such a node, and disabling workloads on other nodes, enables full cache performance. However, once the workload exceeds the cache size then cache conflicts are unavoidable. While HPC environments might be able to tolerate time-scheduling of cache sized workloads, for general purpose server platforms, the oversubscribed cache case will be the common case. The worst case scenario is that a server system owner benchmarks a workload at boot with an un-contended cache only to see that performance degrade over time, even below the average cache performance due to excessive conflicts. Randomization clips the peaks and fills in the valleys of cache utilization to yield steady average performance. Here are some performance impact details of the patches: 1/ An Intel internal synthetic memory bandwidth measurement tool, saw a 3X speedup in a contrived case that tries to force cache conflicts. The contrived cased used the numa_emulation capability to force an instance of the benchmark to be run in two of the near-memory sized numa nodes. If both instances were placed on the same emulated they would fit and cause zero conflicts. While on separate emulated nodes without randomization they underutilized the cache and conflicted unnecessarily due to the in-order allocation per node. 2/ A well known Java server application benchmark was run with a heap size that exceeded cache size by 3X. The cache conflict rate was 8% for the first run and degraded to 21% after page allocator aging. With randomization enabled the rate levelled out at 11%. 3/ A MongoDB workload did not observe measurable difference in cache-conflict rates, but the overall throughput dropped by 7% with randomization in one case. 4/ Mel Gorman ran his suite of performance workloads with randomization enabled on platforms without a memory-side-cache and saw a mix of some improvements and some losses [3]. While there is potentially significant improvement for applications that depend on low latency access across a wide working-set, the performance may be negligible to negative for other workloads. For this reason the shuffle capability defaults to off unless a direct-mapped memory-side-cache is detected. Even then, the page_alloc.shuffle=0 parameter can be specified to disable the randomization on those systems. Outside of memory-side-cache utilization concerns there is potentially security benefit from randomization. Some data exfiltration and return-oriented-programming attacks rely on the ability to infer the location of sensitive data objects. The kernel page allocator, especially early in system boot, has predictable first-in-first out behavior for physical pages. Pages are freed in physical address order when first onlined. Quoting Kees: "While we already have a base-address randomization (CONFIG_RANDOMIZE_MEMORY), attacks against the same hardware and memory layouts would certainly be using the predictability of allocation ordering (i.e. for attacks where the base address isn't important: only the relative positions between allocated memory). This is common in lots of heap-style attacks. They try to gain control over ordering by spraying allocations, etc. I'd really like to see this because it gives us something similar to CONFIG_SLAB_FREELIST_RANDOM but for the page allocator." While SLAB_FREELIST_RANDOM reduces the predictability of some local slab caches it leaves vast bulk of memory to be predictably in order allocated. However, it should be noted, the concrete security benefits are hard to quantify, and no known CVE is mitigated by this randomization. Introduce shuffle_free_memory(), and its helper shuffle_zone(), to perform a Fisher-Yates shuffle of the page allocator 'free_area' lists when they are initially populated with free memory at boot and at hotplug time. Do this based on either the presence of a page_alloc.shuffle=Y command line parameter, or autodetection of a memory-side-cache (to be added in a follow-on patch). The shuffling is done in terms of CONFIG_SHUFFLE_PAGE_ORDER sized free pages where the default CONFIG_SHUFFLE_PAGE_ORDER is MAX_ORDER-1 i.e. 10, 4MB this trades off randomization granularity for time spent shuffling. MAX_ORDER-1 was chosen to be minimally invasive to the page allocator while still showing memory-side cache behavior improvements, and the expectation that the security implications of finer granularity randomization is mitigated by CONFIG_SLAB_FREELIST_RANDOM. The performance impact of the shuffling appears to be in the noise compared to other memory initialization work. This initial randomization can be undone over time so a follow-on patch is introduced to inject entropy on page free decisions. It is reasonable to ask if the page free entropy is sufficient, but it is not enough due to the in-order initial freeing of pages. At the start of that process putting page1 in front or behind page0 still keeps them close together, page2 is still near page1 and has a high chance of being adjacent. As more pages are added ordering diversity improves, but there is still high page locality for the low address pages and this leads to no significant impact to the cache conflict rate. [1]: https://itpeernetwork.intel.com/intel-optane-dc-persistent-memory-operating-modes/ [2]: https://lkml.kernel.org/r/AT5PR8401MB1169D656C8B5E121752FC0F8AB120@AT5PR8401MB1169.NAMPRD84.PROD.OUTLOOK.COM [3]: https://lkml.org/lkml/2018/10/12/309 [dan.j.williams@intel.com: fix shuffle enable] Link: http://lkml.kernel.org/r/154943713038.3858443.4125180191382062871.stgit@dwillia2-desk3.amr.corp.intel.com [cai@lca.pw: fix SHUFFLE_PAGE_ALLOCATOR help texts] Link: http://lkml.kernel.org/r/20190425201300.75650-1-cai@lca.pw Link: http://lkml.kernel.org/r/154899811738.3165233.12325692939590944259.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Signed-off-by: Qian Cai Reviewed-by: Kees Cook Acked-by: Michal Hocko Cc: Dave Hansen Cc: Keith Busch Cc: Robert Elliott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 10 ++ include/linux/list.h | 17 +++ include/linux/mmzone.h | 1 + init/Kconfig | 24 ++++ mm/Makefile | 7 +- mm/memory_hotplug.c | 3 + mm/page_alloc.c | 6 +- mm/shuffle.c | 184 ++++++++++++++++++++++++ mm/shuffle.h | 52 +++++++ 9 files changed, 302 insertions(+), 2 deletions(-) create mode 100644 mm/shuffle.c create mode 100644 mm/shuffle.h (limited to 'init') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 43176340c73d..5be4d3ff5e70 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3174,6 +3174,16 @@ This will also cause panics on machine check exceptions. Useful together with panic=30 to trigger a reboot. + page_alloc.shuffle= + [KNL] Boolean flag to control whether the page allocator + should randomize its free lists. The randomization may + be automatically enabled if the kernel detects it is + running on a platform with a direct-mapped memory-side + cache, and this parameter can be used to + override/disable that behavior. The state of the flag + can be read from sysfs at: + /sys/module/page_alloc/parameters/shuffle. + page_owner= [KNL] Boot-time page_owner enabling option. Storage of the information about who allocated each page is disabled in default. With this switch, diff --git a/include/linux/list.h b/include/linux/list.h index 9e9a6403dbe4..d3b4db895340 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -150,6 +150,23 @@ static inline void list_replace_init(struct list_head *old, INIT_LIST_HEAD(old); } +/** + * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position + * @entry1: the location to place entry2 + * @entry2: the location to place entry1 + */ +static inline void list_swap(struct list_head *entry1, + struct list_head *entry2) +{ + struct list_head *pos = entry2->prev; + + list_del(entry2); + list_replace(entry1, entry2); + if (pos == entry1) + pos = entry2; + list_add(entry1, pos); +} + /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5a4aedc160bd..1fb5a04530aa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1271,6 +1271,7 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +#define pfn_present pfn_valid #endif /* CONFIG_SPARSEMEM */ /* diff --git a/init/Kconfig b/init/Kconfig index 82b84e5ee30d..8b9ffe236e4f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1752,6 +1752,30 @@ config SLAB_FREELIST_HARDENED sacrifies to harden the kernel slab allocator against common freelist exploit methods. +config SHUFFLE_PAGE_ALLOCATOR + bool "Page allocator randomization" + default SLAB_FREELIST_RANDOM && ACPI_NUMA + help + Randomization of the page allocator improves the average + utilization of a direct-mapped memory-side-cache. See section + 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI + 6.2a specification for an example of how a platform advertises + the presence of a memory-side-cache. There are also incidental + security benefits as it reduces the predictability of page + allocations to compliment SLAB_FREELIST_RANDOM, but the + default granularity of shuffling on the "MAX_ORDER - 1" i.e, + 10th order of pages is selected based on cache utilization + benefits on x86. + + While the randomization improves cache utilization it may + negatively impact workloads on platforms without a cache. For + this reason, by default, the randomization is enabled only + after runtime detection of a direct-mapped memory-side-cache. + Otherwise, the randomization may be force enabled with the + 'page_alloc.shuffle' kernel command line parameter. + + Say Y if unsure. + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP diff --git a/mm/Makefile b/mm/Makefile index d210cc9d6f80..ac5e5ba78874 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page_alloc.o page-writeback.o \ + maccess.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ @@ -41,6 +41,11 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ interval_tree.o list_lru.o workingset.o \ debug.o $(mmu-y) +# Give 'page_alloc' its own module-parameter namespace +page-alloc-y := page_alloc.o +page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o + +obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6c0c4f48638e..328878b6799d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -39,6 +39,7 @@ #include #include "internal.h" +#include "shuffle.h" /* * online_page_callback contains pointer to current page onlining function. @@ -891,6 +892,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone->zone_pgdat->node_present_pages += onlined_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); + shuffle_zone(zone); + if (onlined_pages) { node_states_set_node(nid, &arg); if (need_zonelists_rebuild) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4b2d5f50431d..548f8f5d3295 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,6 +72,7 @@ #include #include #include "internal.h" +#include "shuffle.h" /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); @@ -1874,9 +1875,9 @@ _deferred_grow_zone(struct zone *zone, unsigned int order) void __init page_alloc_init_late(void) { struct zone *zone; + int nid; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - int nid; /* There will be num_node_state(N_MEMORY) threads */ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); @@ -1900,6 +1901,9 @@ void __init page_alloc_init_late(void) /* Discard memblock private memory */ memblock_discard(); + for_each_node_state(nid, N_MEMORY) + shuffle_free_memory(NODE_DATA(nid)); + for_each_populated_zone(zone) set_zone_contiguous(zone); } diff --git a/mm/shuffle.c b/mm/shuffle.c new file mode 100644 index 000000000000..bc0419a61fbe --- /dev/null +++ b/mm/shuffle.c @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Intel Corporation. All rights reserved. + +#include +#include +#include +#include +#include +#include "internal.h" +#include "shuffle.h" + +DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key); +static unsigned long shuffle_state __ro_after_init; + +/* + * Depending on the architecture, module parameter parsing may run + * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents, + * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE + * attempts to turn on the implementation, but aborts if it finds + * SHUFFLE_FORCE_DISABLE already set. + */ +__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl) +{ + if (ctl == SHUFFLE_FORCE_DISABLE) + set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state); + + if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) { + if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state)) + static_branch_disable(&page_alloc_shuffle_key); + } else if (ctl == SHUFFLE_ENABLE + && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state)) + static_branch_enable(&page_alloc_shuffle_key); +} + +static bool shuffle_param; +extern int shuffle_show(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state) + ? 'Y' : 'N'); +} + +static __meminit int shuffle_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + if (shuffle_param) + page_alloc_shuffle(SHUFFLE_ENABLE); + else + page_alloc_shuffle(SHUFFLE_FORCE_DISABLE); + return 0; +} +module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400); + +/* + * For two pages to be swapped in the shuffle, they must be free (on a + * 'free_area' lru), have the same order, and have the same migratetype. + */ +static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order) +{ + struct page *page; + + /* + * Given we're dealing with randomly selected pfns in a zone we + * need to ask questions like... + */ + + /* ...is the pfn even in the memmap? */ + if (!pfn_valid_within(pfn)) + return NULL; + + /* ...is the pfn in a present section or a hole? */ + if (!pfn_present(pfn)) + return NULL; + + /* ...is the page free and currently on a free_area list? */ + page = pfn_to_page(pfn); + if (!PageBuddy(page)) + return NULL; + + /* + * ...is the page on the same list as the page we will + * shuffle it with? + */ + if (page_order(page) != order) + return NULL; + + return page; +} + +/* + * Fisher-Yates shuffle the freelist which prescribes iterating through an + * array, pfns in this case, and randomly swapping each entry with another in + * the span, end_pfn - start_pfn. + * + * To keep the implementation simple it does not attempt to correct for sources + * of bias in the distribution, like modulo bias or pseudo-random number + * generator bias. I.e. the expectation is that this shuffling raises the bar + * for attacks that exploit the predictability of page allocations, but need not + * be a perfect shuffle. + */ +#define SHUFFLE_RETRY 10 +void __meminit __shuffle_zone(struct zone *z) +{ + unsigned long i, flags; + unsigned long start_pfn = z->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(z); + const int order = SHUFFLE_ORDER; + const int order_pages = 1 << order; + + spin_lock_irqsave(&z->lock, flags); + start_pfn = ALIGN(start_pfn, order_pages); + for (i = start_pfn; i < end_pfn; i += order_pages) { + unsigned long j; + int migratetype, retry; + struct page *page_i, *page_j; + + /* + * We expect page_i, in the sub-range of a zone being added + * (@start_pfn to @end_pfn), to more likely be valid compared to + * page_j randomly selected in the span @zone_start_pfn to + * @spanned_pages. + */ + page_i = shuffle_valid_page(i, order); + if (!page_i) + continue; + + for (retry = 0; retry < SHUFFLE_RETRY; retry++) { + /* + * Pick a random order aligned page in the zone span as + * a swap target. If the selected pfn is a hole, retry + * up to SHUFFLE_RETRY attempts find a random valid pfn + * in the zone. + */ + j = z->zone_start_pfn + + ALIGN_DOWN(get_random_long() % z->spanned_pages, + order_pages); + page_j = shuffle_valid_page(j, order); + if (page_j && page_j != page_i) + break; + } + if (retry >= SHUFFLE_RETRY) { + pr_debug("%s: failed to swap %#lx\n", __func__, i); + continue; + } + + /* + * Each migratetype corresponds to its own list, make sure the + * types match otherwise we're moving pages to lists where they + * do not belong. + */ + migratetype = get_pageblock_migratetype(page_i); + if (get_pageblock_migratetype(page_j) != migratetype) { + pr_debug("%s: migratetype mismatch %#lx\n", __func__, i); + continue; + } + + list_swap(&page_i->lru, &page_j->lru); + + pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j); + + /* take it easy on the zone lock */ + if ((i % (100 * order_pages)) == 0) { + spin_unlock_irqrestore(&z->lock, flags); + cond_resched(); + spin_lock_irqsave(&z->lock, flags); + } + } + spin_unlock_irqrestore(&z->lock, flags); +} + +/** + * shuffle_free_memory - reduce the predictability of the page allocator + * @pgdat: node page data + */ +void __meminit __shuffle_free_memory(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + shuffle_zone(z); +} diff --git a/mm/shuffle.h b/mm/shuffle.h new file mode 100644 index 000000000000..644c8ee97b9e --- /dev/null +++ b/mm/shuffle.h @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Intel Corporation. All rights reserved. +#ifndef _MM_SHUFFLE_H +#define _MM_SHUFFLE_H +#include + +/* + * SHUFFLE_ENABLE is called from the command line enabling path, or by + * platform-firmware enabling that indicates the presence of a + * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from + * the command line path and overrides any previous or future + * SHUFFLE_ENABLE. + */ +enum mm_shuffle_ctl { + SHUFFLE_ENABLE, + SHUFFLE_FORCE_DISABLE, +}; + +#define SHUFFLE_ORDER (MAX_ORDER-1) + +#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR +DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); +extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl); +extern void __shuffle_free_memory(pg_data_t *pgdat); +static inline void shuffle_free_memory(pg_data_t *pgdat) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return; + __shuffle_free_memory(pgdat); +} + +extern void __shuffle_zone(struct zone *z); +static inline void shuffle_zone(struct zone *z) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return; + __shuffle_zone(z); +} +#else +static inline void shuffle_free_memory(pg_data_t *pgdat) +{ +} + +static inline void shuffle_zone(struct zone *z) +{ +} + +static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl) +{ +} +#endif +#endif /* _MM_SHUFFLE_H */ -- cgit v1.2.3 From 5d59aa8f9ce972b472201aed86e904bb75879ff0 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Fri, 17 May 2019 14:31:47 -0700 Subject: initramfs: don't free a non-existent initrd Since commit 54c7a8916a88 ("initramfs: free initrd memory if opening /initrd.image fails"), the kernel has unconditionally attempted to free the initrd even if it doesn't exist. In the non-existent case this causes a boot-time splat if CONFIG_DEBUG_VIRTUAL is enabled due to a call to virt_to_phys() with a NULL address. Instead we should check that the initrd actually exists and only attempt to free it if it does. Link: http://lkml.kernel.org/r/20190516143125.48948-1-steven.price@arm.com Fixes: 54c7a8916a88 ("initramfs: free initrd memory if opening /initrd.image fails") Signed-off-by: Steven Price Reported-by: Mark Rutland Tested-by: Mark Rutland Reviewed-by: Mike Rapoport Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'init') diff --git a/init/initramfs.c b/init/initramfs.c index 435a428c2af1..178130fd61c2 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -669,7 +669,7 @@ done: * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ - if (!do_retain_initrd && !kexec_free_initrd()) + if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) free_initrd_mem(initrd_start, initrd_end); initrd_start = 0; initrd_end = 0; -- cgit v1.2.3